Skip to content

Commit 67d0e01

Browse files
authored
feat: archive python and csv files in workspace to maintain results (#814)
* Also archive the workspace * Remove everything except Python, CSV, and Markdown files from the workspace to avoid a large workspace dump * Fix CI
1 parent 76d8536 commit 67d0e01

File tree

1 file changed

+32
-6
lines changed

1 file changed

+32
-6
lines changed

rdagent/app/data_science/loop.py

Lines changed: 32 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -22,6 +22,7 @@
2222
from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
2323
from rdagent.components.workflow.conf import BasePropSetting
2424
from rdagent.components.workflow.rd_loop import RDLoop
25+
from rdagent.core.conf import RD_AGENT_SETTINGS
2526
from rdagent.core.exception import CoderError, RunnerError
2627
from rdagent.core.proposal import ExperimentFeedback
2728
from rdagent.core.scenario import Scenario
@@ -175,21 +176,46 @@ def record(self, prev_out: dict[str, Any]):
175176
and Path(DS_RD_SETTING.log_archive_path).is_dir()
176177
):
177178
start_archive_datetime = datetime.now()
178-
logger.info(f"Archiving log folder after loop {self.loop_idx}")
179-
tar_path = (
179+
logger.info(f"Archiving log and workspace folder after loop {self.loop_idx}")
180+
mid_log_tar_path = (
180181
Path(
181182
DS_RD_SETTING.log_archive_temp_path
182183
if DS_RD_SETTING.log_archive_temp_path
183184
else DS_RD_SETTING.log_archive_path
184185
)
185186
/ "mid_log.tar"
186187
)
187-
subprocess.run(["tar", "-cf", str(tar_path), "-C", (Path().cwd() / "log"), "."], check=True)
188+
mid_workspace_tar_path = (
189+
Path(
190+
DS_RD_SETTING.log_archive_temp_path
191+
if DS_RD_SETTING.log_archive_temp_path
192+
else DS_RD_SETTING.log_archive_path
193+
)
194+
/ "mid_workspace.tar"
195+
)
196+
subprocess.run(["tar", "-cf", str(mid_log_tar_path), "-C", (Path().cwd() / "log"), "."], check=True)
197+
198+
# remove all files and folders in the workspace except for .py, .md, and .csv files to avoid large workspace dump
199+
for workspace_id in Path(RD_AGENT_SETTINGS.workspace_path).iterdir():
200+
for file_and_folder in workspace_id.iterdir():
201+
if file_and_folder.is_dir():
202+
shutil.rmtree(file_and_folder)
203+
elif file_and_folder.is_file() and file_and_folder.suffix not in [".py", ".md", ".csv"]:
204+
file_and_folder.unlink()
205+
206+
subprocess.run(
207+
["tar", "-cf", str(mid_workspace_tar_path), "-C", (RD_AGENT_SETTINGS.workspace_path), "."], check=True
208+
)
188209
if DS_RD_SETTING.log_archive_temp_path is not None:
189-
shutil.move(tar_path, Path(DS_RD_SETTING.log_archive_path) / "mid_log.tar")
190-
tar_path = Path(DS_RD_SETTING.log_archive_path) / "mid_log.tar"
210+
shutil.move(mid_log_tar_path, Path(DS_RD_SETTING.log_archive_path) / "mid_log.tar")
211+
mid_log_tar_path = Path(DS_RD_SETTING.log_archive_path) / "mid_log.tar"
212+
shutil.move(mid_workspace_tar_path, Path(DS_RD_SETTING.log_archive_path) / "mid_workspace.tar")
213+
mid_workspace_tar_path = Path(DS_RD_SETTING.log_archive_path) / "mid_workspace.tar"
214+
shutil.copy(
215+
mid_log_tar_path, Path(DS_RD_SETTING.log_archive_path) / "mid_log_bak.tar"
216+
) # keep a backup in case the process is killed while the preceding archive step is running
191217
shutil.copy(
192-
tar_path, Path(DS_RD_SETTING.log_archive_path) / "mid_log_bak.tar"
218+
mid_workspace_tar_path, Path(DS_RD_SETTING.log_archive_path) / "mid_workspace_bak.tar"
193219
) # keep a backup in case the process is killed while the preceding archive step is running
194220
self.timer.add_duration(datetime.now() - start_archive_datetime)
195221

0 commit comments

Comments
 (0)