Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: joblib cache #749

Merged
merged 12 commits into from
Apr 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,22 @@ X_loaded = deepcopy(X)
y_loaded = deepcopy(y)
X_test_loaded = deepcopy(X_test)

import sys
import reprlib
from joblib.memory import MemorizedFunc


def get_original_code(func):
    """Return the ``__code__`` object of the function behind ``func``.

    When ``func`` is a ``MemorizedFunc`` the code object is taken from its
    ``func`` attribute; otherwise it is read from ``func`` directly.
    """
    target = func.func if isinstance(func, MemorizedFunc) else func
    return target.__code__


def debug_info_print(func):
def wrapper(*args, **kwargs):
original_code = get_original_code(func)
def local_trace(frame, event, arg):
if event == "return" and frame.f_code == func.__code__:
if event == "return" and frame.f_code == original_code:
print("\n" + "="*20 + "Running feat_eng code, local variable values:" + "="*20)
for k, v in frame.f_locals.items():
printed = aRepr.repr(v)
Expand Down
7 changes: 6 additions & 1 deletion rdagent/components/coder/data_science/feature/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -39,10 +39,15 @@ feature_coder:
```python
{{ data_loader_code }}
```
3. **Additional Guidance:**
4. **Additional Guidance:**
- If a previous attempt exists, improve upon it without repeating mistakes.
- If errors indicate a missing file, find a way to download it or implement an alternative solution.
- You should avoid using logging module to output information in your generated code, and instead use the print() function.
5. You should use the following cache decorator to cache the results of the function:
```python
from joblib import Memory
memory = Memory(location='/tmp/cache', verbose=0)
@memory.cache
```

## Output Format
{% if out_spec %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,17 @@ print(f"test_ids length: {len(test_ids)}")

train_X, val_X, train_y, val_y = train_test_split(X, y, test_size=0.8, random_state=42)


import sys
import reprlib
from joblib.memory import MemorizedFunc


def get_original_code(func):
    """Fetch the code object to compare trace frames against.

    Plain callables yield their own ``__code__``; a ``MemorizedFunc``
    yields the ``__code__`` of the callable stored in its ``func`` attribute.
    """
    if not isinstance(func, MemorizedFunc):
        return func.__code__
    wrapped = func.func
    return wrapped.__code__

print("train_X:", aRepr.repr(train_X))
print("train_y:", aRepr.repr(train_y))
print("val_X:", aRepr.repr(val_X))
Expand All @@ -46,10 +57,12 @@ print(f"val_X.shape: {val_X.shape}" if hasattr(val_X, 'shape') else f"val_X leng
print(f"val_y.shape: {val_y.shape}" if hasattr(val_y, 'shape') else f"val_y length: {len(val_y)}")



def debug_info_print(func):
def wrapper(*args, **kwargs):
original_code = get_original_code(func)
def local_trace(frame, event, arg):
if event == "return" and frame.f_code == func.__code__:
if event == "return" and frame.f_code == original_code:
print("\n" + "="*20 + "Running model training code, local variable values:" + "="*20)
for k, v in frame.f_locals.items():
printed = aRepr.repr(v)
Expand Down
5 changes: 5 additions & 0 deletions rdagent/components/coder/data_science/model/prompts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ model_coder:
{{ feature_code }}
2. You should avoid using logging module to output information in your generated code, and instead use the print() function.
3. If the model can both be implemented by PyTorch and Tensorflow, please use pytorch for broader compatibility.
4. You should use the following cache decorator to cache the results of the function:
```python
from joblib import Memory
memory = Memory(location='/tmp/cache', verbose=0)
@memory.cache
```

## Output Format
{% if out_spec %}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,22 @@ from load_data import load_data

import sys
import reprlib
from joblib.memory import MemorizedFunc


def get_original_code(func):
    """Resolve ``func`` to a code object.

    A ``MemorizedFunc`` is first replaced by its ``func`` attribute, then
    the ``__code__`` of the resulting object is returned.
    """
    unwrapped = func
    if isinstance(unwrapped, MemorizedFunc):
        # Look through the cache wrapper to the callable it holds.
        unwrapped = unwrapped.func
    return unwrapped.__code__


def debug_info_print(func):
aRepr = reprlib.Repr()
aRepr.maxother=300
def wrapper(*args, **kwargs):
original_code = get_original_code(func)
def local_trace(frame, event, arg):
if event == "return" and frame.f_code == func.__code__:
if event == "return" and frame.f_code == original_code:
print("\n" + "="*20 + "Running data_load code, local variable values:" + "="*20)
for k, v in frame.f_locals.items():
printed = aRepr.repr(v)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -334,6 +334,11 @@ data_loader_coder:
## Guidelines
1. Ensure that the dataset is loaded strictly from `/kaggle/input/`, following the exact folder structure described in the **Data Folder Description**, and do not attempt to load data from the current directory (`./`).
2. You should avoid using logging module to output information in your generated code, and instead use the print() function.
3. You should use the following cache decorator to cache the results of the function:
```python
from joblib import Memory
memory = Memory(location='/tmp/cache', verbose=0)
@memory.cache
```

## Exploratory Data Analysis (EDA) part(Required):
- Before returning the data, you should always add an EDA part describing the data to help the following steps understand the data better.
Expand Down
15 changes: 11 additions & 4 deletions rdagent/utils/env.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""
The motiviation of the utils is for environment management
The motivation of the utils is for environment management

Tries to create uniform environment for the agent to run;
- All the code and data is expected included in one folder
Expand Down Expand Up @@ -210,8 +210,8 @@ def cached_run(
target_folder = Path(RD_AGENT_SETTINGS.pickle_cache_folder_path_str) / f"utils.env.run"
target_folder.mkdir(parents=True, exist_ok=True)

# we must add the information of data (beyound code) into the key.
# Otherwise, all commands operating on data will become invalue (e.g. rm -r submission.csv)
# we must add the information of data (beyond code) into the key.
# Otherwise, all commands operating on data will become invalid (e.g. rm -r submission.csv)
# So we recursively walk in the folder and add the sorted relative filename list as part of the key.
data_key = []
for path in Path(local_path).rglob("*"):
Expand Down Expand Up @@ -292,7 +292,7 @@ class LocalConf(EnvConf):

class LocalEnv(Env[ASpecificLocalConf]):
"""
Sometimes local environment may be more convinient for testing
Sometimes local environment may be more convenient for testing
"""

def prepare(self) -> None: ...
Expand All @@ -311,6 +311,9 @@ def _run_ret_code(
if self.conf.extra_volumes is not None:
for lp, rp in self.conf.extra_volumes.items():
volumes[lp] = rp
cache_path = "/tmp/sample" if "/sample/" in "".join(self.conf.extra_volumes.keys()) else "/tmp/full"
Path(cache_path).mkdir(parents=True, exist_ok=True)
volumes[cache_path] = "/tmp/cache"
for lp, rp in running_extra_volume.items():
volumes[lp] = rp

Expand Down Expand Up @@ -605,9 +608,13 @@ def _run_ret_code(
if local_path is not None:
local_path = os.path.abspath(local_path)
volumes[local_path] = {"bind": self.conf.mount_path, "mode": "rw"}

if self.conf.extra_volumes is not None:
for lp, rp in self.conf.extra_volumes.items():
volumes[lp] = {"bind": rp, "mode": self.conf.extra_volume_mode}
cache_path = "/tmp/sample" if "/sample/" in "".join(self.conf.extra_volumes.keys()) else "/tmp/full"
Path(cache_path).mkdir(parents=True, exist_ok=True)
volumes[cache_path] = {"bind": "/tmp/cache", "mode": "rw"}
for lp, rp in running_extra_volume.items():
volumes[lp] = {"bind": rp, "mode": self.conf.extra_volume_mode}

Expand Down