fix: pass unfiltered evolving history to DS runner; derive final_decision in code

Xu · Xu · commit 39a37410b18f · 2025-08-02T09:46:11.000Z
diff --git a/rdagent/components/coder/CoSTEER/evaluators.py b/rdagent/components/coder/CoSTEER/evaluators.py
@@ -42,7 +42,7 @@ class CoSTEERSingleFeedback(Feedback):
     return_checking: str | None  # including every check in the testing (constraints about the generated value)
     # value_feedback, shape_feedback, value_generated_flag
     code: str
-    final_decision: bool
+    final_decision: bool | None
 
     @staticmethod
     def val_and_update_init_dict(data: dict) -> dict:
diff --git a/rdagent/components/coder/CoSTEER/evolving_strategy.py b/rdagent/components/coder/CoSTEER/evolving_strategy.py
@@ -19,7 +19,7 @@
 
 
 class MultiProcessEvolvingStrategy(EvolvingStrategy):
-    KEY_CHANGE_SUMMARY = "__change_summary__" # Optional key for the summary of the change of evolving subjects
+    KEY_CHANGE_SUMMARY = "__change_summary__"  # Optional key for the summary of the change of evolving subjects
 
     def __init__(self, scen: Scenario, settings: CoSTEERSettings):
         super().__init__(scen)
diff --git a/rdagent/components/coder/factor_coder/config.py b/rdagent/components/coder/factor_coder/config.py
@@ -4,11 +4,7 @@
 from pydantic_settings import SettingsConfigDict
 
 from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
-from rdagent.utils.env import (
-    CondaConf,
-    Env,
-    LocalEnv,
-)
+from rdagent.utils.env import CondaConf, Env, LocalEnv
 
 
 class FactorCoSTEERSettings(CoSTEERSettings):
diff --git a/rdagent/components/coder/model_coder/conf.py b/rdagent/components/coder/model_coder/conf.py
@@ -3,12 +3,7 @@
 from pydantic_settings import SettingsConfigDict
 
 from rdagent.components.coder.CoSTEER.config import CoSTEERSettings
-from rdagent.utils.env import (
-    Env,
-    QlibCondaConf,
-    QlibCondaEnv,
-    QTDockerEnv,
-)
+from rdagent.utils.env import Env, QlibCondaConf, QlibCondaEnv, QTDockerEnv
 
 
 class ModelCoSTEERSettings(CoSTEERSettings):
diff --git a/rdagent/core/experiment.py b/rdagent/core/experiment.py
@@ -153,8 +153,8 @@ def __init__(self, *args: Any, **kwargs: Any) -> None:
             {}
         )  # The code injected into the folder, store them in the variable to reproduce the former result
         self.workspace_path: Path = RD_AGENT_SETTINGS.workspace_path / uuid.uuid4().hex
-        self.ws_ckp: bytes | None = None # In-memory checkpoint data created by ``create_ws_ckp``.
-        self.change_summary: str | None = None # The change from the previous version of workspace
+        self.ws_ckp: bytes | None = None  # In-memory checkpoint data created by ``create_ws_ckp``.
+        self.change_summary: str | None = None  # The change from the previous version of workspace
 
     @staticmethod
     def _format_code_dict(code_dict: dict[str, str]) -> str:
diff --git a/rdagent/scenarios/data_science/dev/runner/__init__.py b/rdagent/scenarios/data_science/dev/runner/__init__.py
@@ -50,20 +50,12 @@ def implement_one_task(
         if prev_task_feedback is None:
             # if no prev_task_feedback, it is the first loop; we do not make any changes and goto evaluators directly.
             return {}
-        
-        # Get previous runner loops
+
+        # Get evolving history
         task_info = target_task.get_task_information()
         queried_former_failed_knowledge = (
             queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
-        )
-        queried_former_failed_knowledge = (
-            [
-                knowledge
-                for knowledge in queried_former_failed_knowledge[0]
-                if knowledge.implementation.file_dict.get("main.py") != workspace.file_dict.get("main.py")
-            ],
-            queried_former_failed_knowledge[1],
-        )
+        )[0]
 
         # Set output agent
         if self.settings.diff_mode:
@@ -73,7 +65,7 @@ def implement_one_task(
             output_spec = PythonBatchEditOut.get_spec(with_del=False)
             extract_output_fn = PythonBatchEditOut.extract_output
 
-        if prev_task_feedback.final_decision is False:
+        if prev_task_feedback.acceptable is False:
             task_information_str = target_task.get_task_information()
             # Use system_debugger for error fixing and debugging
             system_prompt = T(".prompts:DSCoSTEER.system_debugger").r(
@@ -97,8 +89,10 @@ def implement_one_task(
         user_prompt = T(".prompts:DSCoSTEER.user").r(
             code=workspace.all_codes,
             feedback=prev_task_feedback,
-            hyperparameter_tuning_suggestion=prev_task_feedback.hyperparameter_tuning_suggestion,
-            queried_former_failed_knowledge=queried_former_failed_knowledge[0],
+            hyperparameter_tuning_suggestion=(
+                prev_task_feedback.hyperparameter_tuning_suggestion if prev_task_feedback.acceptable else None
+            ),
+            queried_former_failed_knowledge=queried_former_failed_knowledge,
         )
 
         code = session.build_chat_completion(user_prompt=user_prompt)
@@ -117,7 +111,7 @@ def implement_one_task(
         )
         change_summary = session.build_chat_completion(user_prompt=user_prompt)
         code_batch_edit.update({"__change_summary__": change_summary})
-        
+
         return code_batch_edit
 
     def assign_code_list_to_evo(self, code_list: list[dict[str, str]], evo):
diff --git a/rdagent/scenarios/data_science/dev/runner/eval.py b/rdagent/scenarios/data_science/dev/runner/eval.py
@@ -62,6 +62,7 @@ def __str__(self) -> str:
             parts.append(str(self.hyperparameter_tuning_suggestion))
         return "\n".join(parts)
 
+
 class DSRunnerEvaluator(CoSTEEREvaluator):
 
     def evaluate(
@@ -96,15 +97,7 @@ def evaluate(
         task_info = target_task.get_task_information()
         queried_former_failed_knowledge = (
             queried_knowledge.task_to_former_failed_traces[task_info] if queried_knowledge is not None else []
-        )
-        queried_former_failed_knowledge = (
-            [
-                knowledge
-                for knowledge in queried_former_failed_knowledge[0]
-                if knowledge.implementation.file_dict.get("main.py") != implementation.file_dict.get("main.py")
-            ],
-            queried_former_failed_knowledge[1],
-        )
+        )[0]
 
         # execute workflow
         result = implementation.run(env=env, entry="python -m coverage run main.py")
@@ -193,16 +186,17 @@ def evaluate(
             time_spent=f"{implementation.running_info.running_time:.2f} seconds",
             timeout=f"{env.conf.running_timeout_period} seconds",
             percent_of_timeout_used=f"{time_spent_ratio * 100:.2f}%",
-            queried_former_failed_knowledge=queried_former_failed_knowledge[0],
+            queried_former_failed_knowledge=queried_former_failed_knowledge,
         )
 
         feedback = build_cls_from_json_with_retry(
             DSRunnerFeedback,
             system_prompt=system_prompt,
             user_prompt=user_prompt,
-            init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
+            # init_kwargs_update_func=DSRunnerFeedback.val_and_update_init_dict,
         )
         feedback.score = score_df.to_string() if score_ret_code == 0 else None
+        feedback.final_decision = feedback.acceptable and (not feedback.hyperparameter_tuning_decision)
 
         if feedback and not DS_RD_SETTING.coder_on_whole_pipeline:
             # remove unused files
diff --git a/rdagent/scenarios/data_science/dev/runner/prompts.yaml b/rdagent/scenarios/data_science/dev/runner/prompts.yaml
@@ -25,13 +25,10 @@ DSCoSTEER_eval:
     3. Confirm that the prediction file (`submission.csv`) is generated using only the test dataset, and its format matches the sample submission.
     If the code does not satisfy the requirements:
     - Set "acceptable" to false.
-    - Set "final_decision" to false.
-    {% if enable_hyperparameter_tuning_check %}- set "hyperparameter_tuning_decision" to false.
-    - Set "hyperparameter_tuning_suggestion" to an empty string.
     If the code satisfy the requirements:
     - Set "acceptable" to true.
-    - Proceed to the next evaluation.
 
+    {% if enable_hyperparameter_tuning_check %}
     # Evaluation 2: Hyperparameter
     ## Evaluation Description
     The user will provide you the time spent on the whole code execution and the timeout of the code execution. You should decide whether the hyperparameter is reasonable based on the time.
@@ -45,7 +42,6 @@ DSCoSTEER_eval:
     3. Your suggestion should have a strong chance of improving the model's performance. Focus on the most obvious and impactful opportunities for quick improvement by leveraging more training time. Don't explore hyperparameters with low confidence.  If there are no obvious and impactful opportunities and the code runs well, please accept it.
     If the code satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to true.
-    - Set "final_decision" to false.
     - Provide a reasonable suggestion in "hyperparameter_tuning_suggestion". The "hyperparameter_tuning_suggestion" should begin with a clear observation, followed by your suggestion. For example: "[Observation] The maximum number of epochs was reached, but the validation loss is still going down and early stopping was not activated. Only 15% of the allowed time was used. [Suggestion] We recommend increasing epochs to 100 to avoid underfitting and further improve model performance."
     If the code does not satisfy the requirements:
     - Set "hyperparameter_tuning_decision" to false.
@@ -59,10 +55,11 @@ DSCoSTEER_eval:
         "execution": "Describe whether the whole code base executed successfully and generating the final submission. Include any errors or issues encountered, and retain all error messages and traceback details.",
         "return_checking": "Verify the generated files, particularly the submission file. Ensure that its format matches the sample submission",
         "code": "Provide feedback on code quality, readability, and adherence to the given specifications.",
-        "acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,{% if enable_hyperparameter_tuning_check %}
+        "acceptable": <true/false: if the solution has passed execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
+        {% if enable_hyperparameter_tuning_check %}
         "hyperparameter_tuning_decision": <true/false>,
-        "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
-        "final_decision": <true/false>,
+        "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,
+        {% endif %}
     }
     ```
     {% else %}
@@ -101,14 +98,13 @@ DSCoSTEER_eval:
         "acceptable": <true/false: if the solution has paased execution, return_checking, and code verification, then it is a valid solution and acceptable. Otherwise it is not acceptable.>,
         {% if enable_hyperparameter_tuning_check %}"hyperparameter_tuning_decision": <true/false>,
         "hyperparameter_tuning_suggestion": <suggestion in plain text for hyperparameter tuning>,{% endif %}
-        "final_decision": <true/false>,
     }
     ```
     {% endif %}
 # NOTE: when is_sub_enabled == False, we don't have any checking about the return. So it is just placeholder currently
 
   user: |-
-    # Code base
+    # Current Code base
     {{ code }}
 
     ## Stdout of code execution and testing
@@ -121,10 +117,9 @@ DSCoSTEER_eval:
     
     {% if queried_former_failed_knowledge|length != 0 %}
     # Evolving History
-    {% for former_failed_knowledge in queried_former_failed_knowledge %} ## Attempt {{ loop.index }}:
+    {% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
     ### Summary of Changes
     {{ former_failed_knowledge.implementation.change_summary }}
-    ### Feedbacks
     {{ former_failed_knowledge.feedback }}
     {% endfor %}
     {% endif %}
@@ -138,7 +133,6 @@ DSCoSTEER:
     1. Code base.
     2. Task description, which is the task the code is trying to solve.
     3. Feedback generated during the execution of the whole workflow.
-    4. Suggestions for hyperparameter tuning.
     Your job is to debug the whole code base, try to correct the errors, and ensure that the workflow can execute successfully on the full dataset.
 
     ## Task description
@@ -191,10 +185,10 @@ DSCoSTEER:
     {% endif %}
 
   user: |-
-    # Code Base
+    # Current Code Base
     {{ code }}
 
-    ## Feedback
+    ## Feedback of Current Code Base
     {{ feedback }}
 
     {% if hyperparameter_tuning_suggestion is not none %}
@@ -204,10 +198,10 @@ DSCoSTEER:
 
     {% if queried_former_failed_knowledge|length != 0 %}
     # Evolving History
-    {% for former_failed_knowledge in queried_former_failed_knowledge %} ## Attempt {{ loop.index }}:
+    {% for former_failed_knowledge in queried_former_failed_knowledge %}## Attempt {{ loop.index }}:
     ### Summary of Changes
     {{ former_failed_knowledge.implementation.change_summary }}
-    ### Feedbacks
-    {{ former_failed_knowledge.feedback }}
+    ### Validation Scores
+    {{ former_failed_knowledge.feedback.score }}
     {% endfor %}
     {% endif %}