Skip to content

Commit 39a3741

Browse files
author
Xu
committed
fix evolving history
1 parent 7b1708b commit 39a3741

File tree

8 files changed

+32
-59
lines changed

8 files changed

+32
-59
lines changed

rdagent/components/coder/CoSTEER/evaluators.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ class CoSTEERSingleFeedback(Feedback):
4242
return_checking: str | None # including every check in the testing (constraints about the generated value)
4343
# value_feedback, shape_feedback, value_generated_flag
4444
code: str
45-
final_decision: bool
45+
final_decision: bool | None
4646

4747
@staticmethod
4848
def val_and_update_init_dict(data: dict) -> dict:

rdagent/components/coder/CoSTEER/evolving_strategy.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020

2121
class MultiProcessEvolvingStrategy(EvolvingStrategy):
22-
KEY_CHANGE_SUMMARY = "__change_summary__" # Optional key for the summary of the change of evolving subjects
22+
KEY_CHANGE_SUMMARY = "__change_summary__" # Optional key for the summary of the change of evolving subjects
2323

2424
def __init__(self, scen: Scenario, settings: CoSTEERSettings):
2525
super().__init__(scen)

rdagent/components/coder/factor_coder/config.py

Lines changed: 1 addition & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,7 @@
44
from pydantic_settings import SettingsConfigDict
55

66
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
7-
from rdagent.utils.env import (
8-
CondaConf,
9-
Env,
10-
LocalEnv,
11-
)
7+
from rdagent.utils.env import CondaConf, Env, LocalEnv
128

139

1410
class FactorCoSTEERSettings(CoSTEERSettings):

rdagent/components/coder/model_coder/conf.py

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,7 @@
33
from pydantic_settings import SettingsConfigDict
44

55
from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
6-
from rdagent.utils.env import (
7-
Env,
8-
QlibCondaConf,
9-
QlibCondaEnv,
10-
QTDockerEnv,
11-
)
6+
from rdagent.utils.env import Env, QlibCondaConf, QlibCondaEnv, QTDockerEnv
127

138

149
class ModelCoSTEERSettings(CoSTEERSettings):

rdagent/core/experiment.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -153,8 +153,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
153153
{}
154154
) # The code injected into the folder, store them in the variable to reproduce the former result
155155
self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex
156-
self.ws_ckp: bytes | None = None # In-memory checkpoint data created by ``create_ws_ckp``.
157-
self.change_summary: str | None = None # The change from the previous version of workspace
156+
self.ws_ckp: bytes | None = None # In-memory checkpoint data created by ``create_ws_ckp``.
157+
self.change_summary: str | None = None # The change from the previous version of workspace
158158

159159
@staticmethod
160160
def _format_code_dict(code_dict: dict[str, str]) -> str:

rdagent/scenarios/data_science/dev/runner/__init__.py

Lines changed: 9 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -50,20 +50,12 @@ def implement_one_task(
5050
if prev_task_feedback is None:
5151
# if there is no prev_task_feedback, this is the first loop; we make no changes and go to the evaluators directly.
5252
return {}
53-
54-
# Get previous runner loops
53+
54+
# Get evolving history
5555
task_info = target_task.get_task_information()
5656
queried_former_failed_knowledge = (
5757
queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
58-
)
59-
queried_former_failed_knowledge = (
60-
[
61-
knowledge
62-
for knowledge in queried_former_failed_knowledge[0]
63-
if knowledge.implementation.file_dict.get("main.py") != workspace.file_dict.get("main.py")
64-
],
65-
queried_former_failed_knowledge[1],
66-
)
58+
)[0]
6759

6860
# Set output agent
6961
if self.settings.diff_mode:
@@ -73,7 +65,7 @@ def implement_one_task(
7365
output_spec = PythonBatchEditOut.get_spec(with_del=False)
7466
extract_output_fn = PythonBatchEditOut.extract_output
7567

76-
if prev_task_feedback.final_decision is False:
68+
if prev_task_feedback.acceptable is False:
7769
task_information_str = target_task.get_task_information()
7870
# Use system_debugger for error fixing and debugging
7971
system_prompt = T(".prompts:DSCoSTEER.system_debugger").r(
@@ -97,8 +89,10 @@ def implement_one_task(
9789
user_prompt = T(".prompts:DSCoSTEER.user").r(
9890
code=workspace.all_codes,
9991
feedback=prev_task_feedback,
100-
hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
101-
queried_former_failed_knowledge=queried_former_failed_knowledge[0],
92+
hyperparameter_tuning_suggestion=(
93+
prev_task_feedback.hyperparameter_tuning_suggestion if prev_task_feedback.acceptable else None
94+
),
95+
queried_former_failed_knowledge=queried_former_failed_knowledge,
10296
)
10397

10498
code = session.build_chat_completion(user_prompt=user_prompt)
@@ -117,7 +111,7 @@ def implement_one_task(
117111
)
118112
change_summary = session.build_chat_completion(user_prompt=user_prompt)
119113
code_batch_edit.update({"__change_summary__": change_summary})
120-
114+
121115
return code_batch_edit
122116

123117
def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):

rdagent/scenarios/data_science/dev/runner/eval.py

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,7 @@ def __str__(self) -> str:
6262
parts.append(str(self.hyperparameter_tuning_suggestion))
6363
return "\n".join(parts)
6464

65+
6566
class DSRunnerEvaluator(CoSTEEREvaluator):
6667

6768
def evaluate(
@@ -96,15 +97,7 @@ def evaluate(
9697
task_info = target_task.get_task_information()
9798
queried_former_failed_knowledge = (
9899
queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
99-
)
100-
queried_former_failed_knowledge = (
101-
[
102-
knowledge
103-
for knowledge in queried_former_failed_knowledge[0]
104-
if knowledge.implementation.file_dict.get("main.py") != implementation.file_dict.get("main.py")
105-
],
106-
queried_former_failed_knowledge[1],
107-
)
100+
)[0]
108101

109102
# execute workflow
110103
result = implementation.run(env=env, entry="python -m coverage run main.py")
@@ -193,16 +186,17 @@ def evaluate(
193186
time_spent=f"{implementation.running_info.running_time:.2f} seconds",
194187
timeout=f"{env.conf.running_timeout_period} seconds",
195188
percent_of_timeout_used=f"{time_spent_ratio * 100:.2f}%",
196-
queried_former_failed_knowledge=queried_former_failed_knowledge[0],
189+
queried_former_failed_knowledge=queried_former_failed_knowledge,
197190
)
198191

199192
feedback = build_cls_from_json_with_retry(
200193
DSRunnerFeedback,
201194
system_prompt=system_prompt,
202195
user_prompt=user_prompt,
203-
init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
196+
# init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
204197
)
205198
feedback.score = score_df.to_string() if score_ret_code == 0 else None
199+
feedback.final_decision = feedback.acceptable and (not feedback.hyperparameter_tuning_decision)
206200

207201
if feedback and not DS_RD_SETTING.coder_on_whole_pipeline:
208202
# remove unused files

rdagent/scenarios/data_science/dev/runner/prompts.yaml

Lines changed: 12 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -25,13 +25,10 @@ DSCoSTEER_eval:
2525
3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission.
2626
If the code does not satisfy the requirements:
2727
- Set "acceptable" to false.
28-
- Set "final_decision" to false.
29-
{% if enable_hyperparameter_tuning_check %}- set "hyperparameter_tuning_decision" to false.
30-
- Set "hyperparameter_tuning_suggestion" to an empty string.
3128
If the code satisfy the requirements:
3229
- Set "acceptable" to true.
33-
- Proceed to the next evaluation.
3430
31+
{% if enable_hyperparameter_tuning_check %}
3532
# Evaluation 2: Hyperparameter
3633
## Evaluation Description
3734
The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
@@ -45,7 +42,6 @@ DSCoSTEER_eval:
4542
3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence. If there are no obvious and impactful opportunities and the code runs well, please accept it.
4643
If the code satisfies the requirements:
4744
- Set "hyperparameter_tuning_decision" to true.
48-
- Set "final_decision" to false.
4945
- Provide a reasonable suggestion in "hyperparameter_tuning_suggestion". The "hyperparameter_tuning_suggestion" should begin with a clear observation, followed by your suggestion. For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still going down and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] We recommend increasing epochs to 100 to avoid underfitting and further improve model performance."
5046
If the code does not satisfy the requirements:
5147
- Set "hyperparameter_tuning_decision" to false.
@@ -59,10 +55,11 @@ DSCoSTEER_eval:
5955
"execution": "Describe whether the whole code base executed successfully and generated the final submission. Include any errors or issues encountered, and retain all error messages and traceback details.",
6056
"return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission",
6157
"code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
62-
"acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,{% if enable_hyperparameter_tuning_check %}
58+
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
59+
{% if enable_hyperparameter_tuning_check %}
6360
"hyperparameter_tuning_decision": <true/false>,
64-
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
65-
"final_decision": <true/false>,
61+
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
62+
{% endif %}
6663
}
6764
```
6865
{% else %}
@@ -101,14 +98,13 @@ DSCoSTEER_eval:
10198
"acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
10299
{% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
103100
"hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
104-
"final_decision": <true/false>,
105101
}
106102
```
107103
{% endif %}
108104
# NOTE: when is_sub_enabled == False, we don't perform any checking of the return value, so this is just a placeholder for now.
109105

110106
user: |-
111-
# Code base
107+
# Current Code base
112108
{{ code }}
113109
114110
## Stdout of code execution and testing
@@ -121,10 +117,9 @@ DSCoSTEER_eval:
121117
122118
{% if queried_former_failed_knowledge|length != 0 %}
123119
# Evolving History
124-
{% for former_failed_knowledge in queried_former_failed_knowledge %} ## Attempt {{ loop.index }}:
120+
{% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
125121
### Summary of Changes
126122
{{ former_failed_knowledge.implementation.change_summary }}
127-
### Feedbacks
128123
{{ former_failed_knowledge.feedback }}
129124
{% endfor %}
130125
{% endif %}
@@ -138,7 +133,6 @@ DSCoSTEER:
138133
1. Code base.
139134
2. Task description, which is the task the code is trying to solve.
140135
3. Feedback generated during the execution of the whole workflow.
141-
4. Suggestions for hyperparameter tuning.
142136
Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
143137
144138
## Task description
@@ -191,10 +185,10 @@ DSCoSTEER:
191185
{% endif %}
192186
193187
user: |-
194-
# Code Base
188+
# Current Code Base
195189
{{ code }}
196190
197-
## Feedback
191+
## Feedback of Current Code Base
198192
{{ feedback }}
199193
200194
{% if hyperparameter_tuning_suggestion is not none %}
@@ -204,10 +198,10 @@ DSCoSTEER:
204198
205199
{% if queried_former_failed_knowledge|length != 0 %}
206200
# Evolving History
207-
{% for former_failed_knowledge in queried_former_failed_knowledge %} ## Attempt {{ loop.index }}:
201+
{% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
208202
### Summary of Changes
209203
{{ former_failed_knowledge.implementation.change_summary }}
210-
### Feedbacks
211-
{{ former_failed_knowledge.feedback }}
204+
### Validation Scores
205+
{{ former_failed_knowledge.feedback.score }}
212206
{% endfor %}
213207
{% endif %}

0 commit comments

Comments
 (0)