feat: joblib cache (#749)

qew21 · web-flow · commit 83a041148ff9 · 2025-04-04T12:08:18.000+08:00
* cache function

* fix test

* bin cache

* fix test

* fix test

* fix test

* cache for different source

* cache for localenv

* remove unnecessary log

* reformat

* remove unrelated modifications
diff --git a/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt b/rdagent/components/coder/data_science/feature/eval_tests/feature_test.txt
@@ -28,10 +28,22 @@ X_loaded = deepcopy(X)
 y_loaded = deepcopy(y)
 X_test_loaded = deepcopy(X_test)
 
+import sys
+import reprlib
+from joblib.memory import MemorizedFunc
+
+
+def get_original_code(func):
+    if isinstance(func, MemorizedFunc):
+        return func.func.__code__
+    return func.__code__
+
+
 def debug_info_print(func):
     def wrapper(*args, **kwargs):
+        original_code = get_original_code(func)
         def local_trace(frame, event, arg):
-            if event == "return" and frame.f_code == func.__code__:
+            if event == "return" and frame.f_code == original_code:
                 print("\n" + "="*20 + "Running feat_eng code, local variable values:" + "="*20)
                 for k, v in frame.f_locals.items():
                     printed = aRepr.repr(v)
diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml
@@ -39,10 +39,15 @@ feature_coder:
     ```python
     {{ data_loader_code }}
     ```
-    3. **Additional Guidance:**
+    4. **Additional Guidance:**
       - If a previous attempt exists, improve upon it without repeating mistakes.
       - If errors indicate a missing file, find a way to download it or implement an alternative solution.
       - You should avoid using logging module to output information in your generated code, and instead use the print() function.
+    5. You should use the following cache decorator to cache the results of the function:
+    ```python
+    from joblib import Memory
+    memory = Memory(location='/tmp/cache', verbose=0)
+    @memory.cache```
 
     ## Output Format
     {% if out_spec %}
diff --git a/rdagent/components/coder/data_science/model/eval_tests/model_test.txt b/rdagent/components/coder/data_science/model/eval_tests/model_test.txt
@@ -35,6 +35,17 @@ print(f"test_ids length: {len(test_ids)}")
 
 train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=42)
 
+
+import sys
+import reprlib
+from joblib.memory import MemorizedFunc
+
+
+def get_original_code(func):
+    if isinstance(func, MemorizedFunc):
+        return func.func.__code__
+    return func.__code__
+
 print("train_X:", aRepr.repr(train_X))
 print("train_y:", aRepr.repr(train_y))
 print("val_X:", aRepr.repr(val_X))
@@ -46,10 +57,12 @@ print(f"val_X.shape: {val_X.shape}" if hasattr(val_X, 'shape') else f"val_X leng
 print(f"val_y.shape: {val_y.shape}" if hasattr(val_y, 'shape') else f"val_y length: {len(val_y)}")
 
 
+
 def debug_info_print(func):
     def wrapper(*args, **kwargs):
+        original_code = get_original_code(func)
         def local_trace(frame, event, arg):
-            if event == "return" and frame.f_code == func.__code__:
+            if event == "return" and frame.f_code == original_code:
                 print("\n" + "="*20 + "Running model training code, local variable values:" + "="*20)
                 for k, v in frame.f_locals.items():
                     printed = aRepr.repr(v)
diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml
@@ -40,6 +40,11 @@ model_coder:
     {{ feature_code }}
     2. You should avoid using logging module to output information in your generated code, and instead use the print() function.
     3. If the model can both be implemented by PyTorch and Tensorflow, please use pytorch for broader compatibility.
+    4. You should use the following cache decorator to cache the results of the function:
+    ```python
+    from joblib import Memory
+    memory = Memory(location='/tmp/cache', verbose=0)
+    @memory.cache```
 
     ## Output Format
     {% if out_spec %}
diff --git a/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt b/rdagent/components/coder/data_science/raw_data_loader/eval_tests/data_loader_test.txt
@@ -9,12 +9,22 @@ from load_data import load_data
 
 import sys
 import reprlib
+from joblib.memory import MemorizedFunc
+
+
+def get_original_code(func):
+    if isinstance(func, MemorizedFunc):
+        return func.func.__code__
+    return func.__code__
+
+
 def debug_info_print(func):
     aRepr = reprlib.Repr()
     aRepr.maxother=300
     def wrapper(*args, **kwargs):
+        original_code = get_original_code(func)
         def local_trace(frame, event, arg):
-            if event == "return" and frame.f_code == func.__code__:
+            if event == "return" and frame.f_code == original_code:
                 print("\n" + "="*20 + "Running data_load code, local variable values:" + "="*20)
                 for k, v in frame.f_locals.items():
                     printed = aRepr.repr(v)
diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
@@ -334,6 +334,11 @@ data_loader_coder:
     ## Guidelines
     1. Ensure that the dataset is loaded strictly from `/kaggle/input/`, following the exact folder structure described in the **Data Folder Description**, and do not attempt to load data from the current directory (`./`).
     2. You should avoid using logging module to output information in your generated code, and instead use the print() function.
+    3. You should use the following cache decorator to cache the results of the function:
+    ```python
+    from joblib import Memory
+    memory = Memory(location='/tmp/cache', verbose=0)
+    @memory.cache```
     
     ## Exploratory Data Analysis (EDA) part(Required):
     - Before returning the data, you should always add an EDA part describing the data to help the following steps understand the data better.
diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py
@@ -1,5 +1,5 @@
 """
-The motiviation of the utils is for environment management
+The motivation of the utils is for environment management
 
 Tries to create uniform environment for the agent to run;
 - All the code and data is expected included in one folder
@@ -210,8 +210,8 @@ def cached_run(
         target_folder = Path(RD_AGENT_SETTINGS.pickle_cache_folder_path_str) / f"utils.env.run"
         target_folder.mkdir(parents=True, exist_ok=True)
 
-        # we must add the information of data (beyound code) into the key.
-        # Otherwise, all commands operating on data will become invalue (e.g. rm -r submission.csv)
+        # we must add the information of data (beyond code) into the key.
+        # Otherwise, all commands operating on data will become invalid (e.g. rm -r submission.csv)
         # So we recursively walk in the folder and add the sorted relative filename list as part of the key.
         data_key = []
         for path in Path(local_path).rglob("*"):
@@ -292,7 +292,7 @@ class LocalConf(EnvConf):
 
 class LocalEnv(Env[ASpecificLocalConf]):
     """
-    Sometimes local environment may be more convinient for testing
+    Sometimes local environment may be more convenient for testing
     """
 
     def prepare(self) -> None: ...
@@ -311,6 +311,9 @@ def _run_ret_code(
         if self.conf.extra_volumes is not None:
             for lp, rp in self.conf.extra_volumes.items():
                 volumes[lp] = rp
+            cache_path = "/tmp/sample" if "/sample/" in "".join(self.conf.extra_volumes.keys()) else "/tmp/full"
+            Path(cache_path).mkdir(parents=True, exist_ok=True)
+            volumes[cache_path] = "/tmp/cache"
         for lp, rp in running_extra_volume.items():
             volumes[lp] = rp
 
@@ -605,9 +608,13 @@ def _run_ret_code(
         if local_path is not None:
             local_path = os.path.abspath(local_path)
             volumes[local_path] = {"bind": self.conf.mount_path, "mode": "rw"}
+
         if self.conf.extra_volumes is not None:
             for lp, rp in self.conf.extra_volumes.items():
                 volumes[lp] = {"bind": rp, "mode": self.conf.extra_volume_mode}
+            cache_path = "/tmp/sample" if "/sample/" in "".join(self.conf.extra_volumes.keys()) else "/tmp/full"
+            Path(cache_path).mkdir(parents=True, exist_ok=True)
+            volumes[cache_path] = {"bind": "/tmp/cache", "mode": "rw"}
         for lp, rp in running_extra_volume.items():
             volumes[lp] = {"bind": rp, "mode": self.conf.extra_volume_mode}