feat: replace hard-coded cache paths with dynamic cache_path config (#952)

you-n-g · web-flow · commit db568947f108 · 2025-06-12T17:44:31.000+08:00
* feat: replace hard-coded cache paths with dynamic cache_path config

* style: reorder wait_retry import and format chmod list

* refactor: pass workspace_path to chmod command and use DockerConf check
diff --git a/rdagent/components/coder/data_science/feature/prompts.yaml b/rdagent/components/coder/data_science/feature/prompts.yaml
@@ -46,10 +46,10 @@ feature_coder:
     5. You should use the following cache decorator to cache the results of the function:
     ```python
     from joblib import Memory
-    memory = Memory(location='./cache', verbose=0)
+    memory = Memory(location='{% include "scenarios.data_science.share:scen.cache_path" %}', verbose=0)
     @memory.cache```
     6. Coding tricks:
-      - If the input consists of a batch of file paths and you need to modify the file contents to complete your feature engineering task, you can accomplish your feature engineering task by modifying these files and creating new files in a subfolder within "./cache" (this path is persistent, otherwise you may lose your created file). Then the new file paths are returned.
+      - If the input consists of a batch of file paths and you need to modify the file contents to complete your feature engineering task, you can accomplish your feature engineering task by modifying these files and creating new files in a subfolder within "{% include "scenarios.data_science.share:scen.cache_path" %}" (this path is persistent, otherwise you may lose your created file). Then the new file paths are returned.
 
     {% include "scenarios.data_science.share:guidelines.coding" %}
 
diff --git a/rdagent/components/coder/data_science/model/prompts.yaml b/rdagent/components/coder/data_science/model/prompts.yaml
@@ -43,7 +43,7 @@ model_coder:
     4. You should use the following cache decorator to cache the results of the function:
     ```python
     from joblib import Memory
-    memory = Memory(location='./cache', verbose=0)
+    memory = Memory(location='{% include "scenarios.data_science.share:scen.cache_path" %}', verbose=0)
     @memory.cache``
     {% include "scenarios.data_science.share:guidelines.coding" %}
 
diff --git a/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml b/rdagent/components/coder/data_science/raw_data_loader/prompts.yaml
@@ -273,7 +273,7 @@ data_loader_coder:
     3. You should use the following cache decorator to cache the results of the function:
     ```python
     from joblib import Memory
-    memory = Memory(location='./cache', verbose=0)
+    memory = Memory(location='{% include "scenarios.data_science.share:scen.cache_path" %}', verbose=0)
     @memory.cache```
     {% include "scenarios.data_science.share:guidelines.coding" %}
     
diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml
@@ -61,7 +61,8 @@ describe: # some template to describe some object
 scen:  # customizable
   role: |-
     You are a Kaggle Grandmaster and expert ML engineer with deep expertise in statistics, machine learning, and competition optimization.
-  input_path: "./input/"
+  input_path: "./workspace_input/"
+  cache_path: "./workspace_cache/"
 
 component_description:
   DataLoadSpec: |-
diff --git a/rdagent/utils/env.py b/rdagent/utils/env.py
@@ -40,6 +40,7 @@
 from rdagent.core.experiment import RD_AGENT_SETTINGS
 from rdagent.log import rdagent_logger as logger
 from rdagent.oai.llm_utils import md5_hash
+from rdagent.utils.agent.tpl import T
 from rdagent.utils.workflow import wait_retry
 
 
@@ -240,15 +241,32 @@ def run_ret_code(
         # FIXME: the input path and cache path is hard coded here.
         # We don't want to change the content in input and cache path.
         # Otherwise, it may produce large amount of warnings.
+        def _get_chmod_cmd(workspace_path: str) -> str:
+            """Build the `chmod -R 777` shell statement for the top-level entries of `workspace_path`.
+
+            The scenario's cache/input folders are skipped; the result ends in "; " so callers can chain a command.
+            """
+
+            def _get_path_stem(path: str) -> str | None:
+                # Only a relative path sits under the workspace; its first component is the folder name.
+                p = Path(path)
+                return p.parts[0] if not p.is_absolute() and p.parts else None
+
+            cache_stem = _get_path_stem(T("scenarios.data_science.share:scen.cache_path").r())
+            input_stem = _get_path_stem(T("scenarios.data_science.share:scen.input_path").r())
+            # An absolute (or empty) path has no stem; skip it instead of emitting a literal "! -name None".
+            excludes = "".join(f" ! -name {stem}" for stem in (cache_stem, input_stem) if stem is not None)
+            return f"chmod -R 777 $(find {workspace_path} -mindepth 1 -maxdepth 1{excludes}); "
+
         entry_add_timeout = (
             f"/bin/sh -c 'timeout --kill-after=10 {self.conf.running_timeout_period} {entry}; "
             + "entry_exit_code=$?; "
             + (
-                f"chmod -R 777 $(find {self.conf.mount_path} -mindepth 1 -maxdepth 1 ! -name cache ! -name input); "
+                f"{_get_chmod_cmd(self.conf.mount_path)}"
                 # We don't have to change the permission of the cache and input folder to remove it
                 # + f"if [ -d {self.conf.mount_path}/cache ]; then chmod 777 {self.conf.mount_path}/cache; fi; " +
                 #     f"if [ -d {self.conf.mount_path}/input ]; then chmod 777 {self.conf.mount_path}/input; fi; "
-                if hasattr(self.conf, "mount_path")
+                if isinstance(self.conf, DockerConf)
                 else ""
             )
             + "exit $entry_exit_code'"
@@ -409,7 +427,7 @@ def _run_ret_code(
                 volumes[lp] = rp
             cache_path = "/tmp/sample" if "/sample/" in "".join(self.conf.extra_volumes.keys()) else "/tmp/full"
             Path(cache_path).mkdir(parents=True, exist_ok=True)
-            volumes[cache_path] = "./cache"
+            volumes[cache_path] = T("scenarios.data_science.share:scen.cache_path").r()
         for lp, rp in running_extra_volume.items():
             volumes[lp] = rp
 
@@ -821,7 +839,7 @@ def _run_ret_code(
                 volumes[lp] = {"bind": rp, "mode": self.conf.extra_volume_mode}
             cache_path = "/tmp/sample" if "/sample/" in "".join(self.conf.extra_volumes.keys()) else "/tmp/full"
             Path(cache_path).mkdir(parents=True, exist_ok=True)
-            volumes[cache_path] = {"bind": "./cache", "mode": "rw"}
+            volumes[cache_path] = {"bind": T("scenarios.data_science.share:scen.cache_path").r(), "mode": "rw"}
         for lp, rp in running_extra_volume.items():
             volumes[lp] = {"bind": rp, "mode": self.conf.extra_volume_mode}