feat: merge failed and successful traces together (#766)

WinstonLiyt · web-flow · commit 3a2aa8cf0102 · 2025-04-08T17:04:55.000+08:00
* merge failed and successful traces together

* delete the task description from the trace display

* prune unnecessary info for the proposal stage
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/naive.py b/rdagent/scenarios/data_science/proposal/exp_gen/naive.py
@@ -20,18 +20,9 @@ def gen(self, trace: DSTrace) -> DSExperiment:
             exp=sota_exp, heading="Best of previous exploration of the scenario"
         )
 
-        sota_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="sota")
-        failed_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="failed")[
-            -DS_RD_SETTING.max_trace_hist :
-        ]
-
-        sota_exp_and_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
-            exp_and_feedback_list=sota_exp_feedback_list,
-            success=True,
-        )
-        failed_exp_and_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
-            exp_and_feedback_list=failed_exp_feedback_list,
-            success=False,
+        exp_and_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
+            exp_and_feedback_list=trace.experiment_and_feedback_list_after_init(return_type="all"),
+            type="all",
         )
 
         sys_prompt = T(".naive:naive_gen.system").r()
@@ -40,8 +31,7 @@ def gen(self, trace: DSTrace) -> DSExperiment:
             competition_desc=competition_desc,
             sota_exp_desc=sota_exp_desc,
             scenario_desc=scenario_desc,
-            sota_exp_and_feedback_list_desc=sota_exp_and_feedback_list_desc,
-            failed_exp_and_feedback_list_desc=failed_exp_and_feedback_list_desc,
+            exp_and_feedback_list_desc=exp_and_feedback_list_desc,
         )
 
         task = build_cls_from_json_with_retry(
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/naive.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/naive.yaml
@@ -2,7 +2,7 @@ naive_gen:
   system: |-
     You are a Kaggle Grandmaster and expert ML engineer with deep expertise in statistics, machine learning, and competition optimization.
     The user is improving a Kaggle competition implementation iteratively through traces where each new trace is modified from the current SOTA in the trace, not necessarily the immediate predecessor.
-    You will be given a competition scenario, previous SOTA(best) and failed experiments and feedbacks, the current SOTA implementation and feedback, and a list of identified problems.
+    You will be given a competition scenario, previous SOTA (best) and failed experiments and feedbacks, the current SOTA implementation and feedback, and a list of identified problems.
 
     ## Guidelines
     Here are guidelines to aid your task design. You don't need to answer all the questions.
@@ -27,11 +27,8 @@ naive_gen:
     # Competition Description
     {{ competition_desc }}
 
-    # Previous Failed Experiments and Feedbacks:
-    {{ failed_exp_and_feedback_list_desc }}
-
-    # Previous SOTA Experiments and Feedbacks:
-    {{ sota_exp_and_feedback_list_desc }}
+    # Previous Experiments and Feedbacks:
+    {{ exp_and_feedback_list_desc }}
 
     # Current SOTA Implementation
     {{ sota_exp_desc }}
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts.yaml
@@ -216,14 +216,8 @@ direct_exp_gen:
     }
 
   user: |-
-    # All former successful experiments and their feedbacks
-    Below are all the experiments that surpassed the previous SOTA solutions along with their feedback. The current SOTA solution is the latest among these successful trials:
-    {{ sota_exp_and_feedback_list_desc }}
-
-    {% if failed_exp_and_feedback_list_desc %}
-    # Several latest failed experiments and their feedbacks
-    The user has conducted several recent experiments on this scenario, but they either encountered execution errors or failed to surpass the SOTA performance. The details of these failed experiments and their results are as follows:
-    {{ failed_exp_and_feedback_list_desc }}
+    # All former experiments and their feedbacks
+    {{ exp_and_feedback_list_desc }}
     
     {% if targets == "Model" %}
     Based on the feedback from previous experiment failures, if the failure was due to exceeding the time limit or memory constraints, start with the smallest model size or choose alternative algorithms or methods with significantly lower time or space complexity instead of using a neural network. You can then iteratively refine and optimize the model in later stages.
@@ -245,8 +239,6 @@ direct_exp_gen:
     When building the model, if the runtime permits, consider incorporating hyperparameter search methods to improve performance.
     {% endif %}
     
-    {% endif %}
-    
     {% if last_exp_diff %}
     # Here are the differences between the latest version of implementation and the current best version of implementation
     It is presented in diff format, highlighting changes from the best version to the latest version.
@@ -280,14 +272,8 @@ component_gen:
     {{ component_output_format }}
 
   user: |-
-    Here's the former SOTA experiments and their feedbacks:
-    {{ sota_exp_and_feedback_list_desc }}
-
-    Also, here's the former failed experiments and their feedbacks:
-    {{ failed_exp_and_feedback_list_desc }}
-
-    All former trials and their feedbacks are provided in pandas DataFrame format. The user has already made several hypothesis on this scenario and did several evaluation on them:
-    {{ component_and_feedback_df }}
+    Here are the former experiments and their feedbacks:
+    {{ exp_and_feedback_list_desc }}
     
     Please choose the most proper component to focus on based on the information above. Please balance the exploration and exploitation.
     Avoid selecting the same component more than 5 times in a row to ensure that the chosen component is not overly repetitive.
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml b/rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml
@@ -52,11 +52,8 @@ feedback_problem:
     # Scenario Description
     {{ scenario_desc }}
     
-    # Previous SOTA Experiments and Feedbacks:
-    {{ sota_exp_and_feedback_list_desc }}
-
-    # Previous Failed Experiments and Feedbacks:
-    {{ failed_exp_and_feedback_list_desc }}
+    # Previous Experiments and Feedbacks:
+    {{ exp_and_feedback_list_desc }}
 
     # Current SOTA Implementation
     {{ sota_exp_desc }}
@@ -115,11 +112,8 @@ hypothesis_gen:
     # Scenario Description
     {{ scenario_desc }}
 
-    # Previous SOTA Experiments and Feedbacks:
-    {{ sota_exp_and_feedback_list_desc }}
-
-    # Previous Failed Experiments and Feedbacks:
-    {{ failed_exp_and_feedback_list_desc }}
+    # Previous Experiments and Feedbacks:
+    {{ exp_and_feedback_list_desc }}
 
     # Current SOTA Implementation
     {{ sota_exp_desc }}
diff --git a/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py b/rdagent/scenarios/data_science/proposal/exp_gen/proposal.py
@@ -83,26 +83,11 @@ def gen(self, trace: DSTrace) -> DSExperiment:
             generate_diff_from_dict(sota_exp.experiment_workspace.file_dict, last_exp.experiment_workspace.file_dict)
         )  # we use file_dict for hitting the cache when replicate the experiment in another machine.
 
-        sota_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="sota")
-        failed_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="failed")[
-            -DS_RD_SETTING.max_trace_hist :
-        ]
         all_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="all")
-        trace_component_to_feedback_df = pd.DataFrame(columns=["component", "hypothesis", "decision"])
-        for index, (exp, fb) in enumerate(all_exp_feedback_list):
-            trace_component_to_feedback_df.loc[f"trial {index + 1}"] = [
-                exp.hypothesis.component,
-                exp.hypothesis.hypothesis,
-                fb.decision,
-            ]
 
-        sota_exp_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
-            exp_and_feedback_list=sota_exp_feedback_list,
-            success=True,
-        )
-        failed_exp_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
-            exp_and_feedback_list=failed_exp_feedback_list,
-            success=False,
+        exp_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
+            exp_and_feedback_list=all_exp_feedback_list,
+            type="all",
         )
 
         # Generate component using template with proper context
@@ -120,13 +105,7 @@ def gen(self, trace: DSTrace) -> DSExperiment:
         )
 
         component_user_prompt = T(".prompts:component_gen.user").r(
-            sota_exp_and_feedback_list_desc=sota_exp_feedback_list_desc,
-            failed_exp_and_feedback_list_desc=failed_exp_feedback_list_desc,
-            component_and_feedback_df=(
-                trace_component_to_feedback_df.to_string()
-                if len(trace_component_to_feedback_df) > 0
-                else "No experiment and feedback provided"
-            ),
+            exp_and_feedback_list_desc=exp_feedback_list_desc,
         )
 
         resp_dict_component: dict = json.loads(
@@ -172,8 +151,7 @@ def gen(self, trace: DSTrace) -> DSExperiment:
             user_prompt = T(".prompts:direct_exp_gen.user").r(
                 targets=component_info["target_name"],
                 sota_exp_desc=sota_exp_desc,
-                sota_exp_and_feedback_list_desc=sota_exp_feedback_list_desc,
-                failed_exp_and_feedback_list_desc=failed_exp_feedback_list_desc,
+                exp_and_feedback_list_desc=exp_feedback_list_desc,
                 last_exp_diff=last_exp_diff,
             )
 
@@ -262,8 +240,7 @@ def identify_scenario_problem(self, scenario_desc: str, competition_desc: str, s
     def identify_feedback_problem(
         self,
         scenario_desc: str,
-        sota_exp_feedback_list_desc: str,
-        failed_exp_feedback_list_desc: str,
+        exp_feedback_list_desc: str,
         sota_exp_desc: str,
         pipeline: bool,
     ) -> Dict:
@@ -273,8 +250,7 @@ def identify_feedback_problem(
         )
         user_prompt = T(".prompts_v2:feedback_problem.user").r(
             scenario_desc=scenario_desc,
-            sota_exp_and_feedback_list_desc=sota_exp_feedback_list_desc,
-            failed_exp_and_feedback_list_desc=failed_exp_feedback_list_desc,
+            exp_and_feedback_list_desc=exp_feedback_list_desc,
             sota_exp_desc=sota_exp_desc,
         )
         response = APIBackend().build_messages_and_create_chat_completion(
@@ -289,8 +265,7 @@ def hypothesis_gen(
         self,
         component_desc: str,
         scenario_desc: str,
-        sota_exp_feedback_list_desc: str,
-        failed_exp_feedback_list_desc: str,
+        exp_feedback_list_desc: str,
         sota_exp_desc: str,
         problems: list,
         pipeline: bool,
@@ -303,8 +278,7 @@ def hypothesis_gen(
         )
         user_prompt = T(".prompts_v2:hypothesis_gen.user").r(
             scenario_desc=scenario_desc,
-            sota_exp_and_feedback_list_desc=sota_exp_feedback_list_desc,
-            failed_exp_and_feedback_list_desc=failed_exp_feedback_list_desc,
+            exp_and_feedback_list_desc=exp_feedback_list_desc,
             sota_exp_desc=sota_exp_desc,
             problems=json.dumps(problems, indent=2),
         )
@@ -428,18 +402,9 @@ def gen(self, trace: DSTrace, pipeline: bool = False) -> DSExperiment:
             exp=sota_exp, heading="Best of previous exploration of the scenario"
         )
 
-        sota_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="sota")
-        failed_exp_feedback_list = trace.experiment_and_feedback_list_after_init(return_type="failed")[
-            -DS_RD_SETTING.max_trace_hist :
-        ]
-
-        sota_exp_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
-            exp_and_feedback_list=sota_exp_feedback_list,
-            success=True,
-        )
-        failed_exp_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
-            exp_and_feedback_list=failed_exp_feedback_list,
-            success=False,
+        exp_feedback_list_desc = T("scenarios.data_science.share:describe.trace").r(
+            exp_and_feedback_list=trace.experiment_and_feedback_list_after_init(return_type="all"),
+            type="all",
         )
 
         # Step 1: Identify problems
@@ -450,8 +415,7 @@ def gen(self, trace: DSTrace, pipeline: bool = False) -> DSExperiment:
         )
         fb_problems = self.identify_feedback_problem(
             scenario_desc=scenario_desc,
-            sota_exp_feedback_list_desc=sota_exp_feedback_list_desc,
-            failed_exp_feedback_list_desc=failed_exp_feedback_list_desc,
+            exp_feedback_list_desc=exp_feedback_list_desc,
             sota_exp_desc=sota_exp_desc,
             pipeline=pipeline,
         )
@@ -461,8 +425,7 @@ def gen(self, trace: DSTrace, pipeline: bool = False) -> DSExperiment:
         hypothesis_dict = self.hypothesis_gen(
             component_desc=component_desc,
             scenario_desc=scenario_desc,
-            sota_exp_feedback_list_desc=sota_exp_feedback_list_desc,
-            failed_exp_feedback_list_desc=failed_exp_feedback_list_desc,
+            exp_feedback_list_desc=exp_feedback_list_desc,
             sota_exp_desc=sota_exp_desc,
             problems=all_problems,
             pipeline=pipeline,
diff --git a/rdagent/scenarios/data_science/share.yaml b/rdagent/scenarios/data_science/share.yaml
@@ -43,24 +43,47 @@ describe: # some template to describe some object
 
   trace: |-
     {% if exp_and_feedback_list|length == 0 %}
-    No previous {% if success %}successful{% else %}failed{% endif %} trial available.
+    No previous 
+    {% if type == "success" %}
+    successful
+    {% elif type == "failure" %}
+    failed
     {% else %}
-    {% if success %}
-    ## {{ heading | default('Trace of the successful trial') }}
+    successful or failed
+    {% endif %} trial available.
     {% else %}
+    {% if type == "success" %}
+    ## {{ heading | default('Trace of the successful trial') }}
+    {% elif type == "failure" %}
     ## {{ heading | default('Trace of the failed trial') }}
+    {% else %}
+    ## {{ heading | default('Trace of all trials') }}
+    {% endif %}
+
+    Before the current trial, several 
+    {% if type == "success" %}
+    successful
+    {% elif type == "failure" %}
+    failed
+    {% else %}
+    successful or failed
+    {% endif %} trials are listed below.
+    {% if type == "success" %}
+    The current SOTA method is the combination of the best solutions of these trials.
     {% endif %}
-    Before current trial, several {% if success %}successful{% else %}failed{% endif %} trials are listed below. {% if success %}The current SOTA method is the combination of the best solutions of these trials.{% endif %} The trace order is from the earliest to the latest please focus more on the later trials.
+
+    The trace order is from the earliest to the latest. Please focus more on the later trials.
+
     {% for exp_and_feedback in exp_and_feedback_list %}
     ### Experiment index: {{ loop.index }}
     The experiment is designed based on hypothesis: {{ exp_and_feedback[0].hypothesis }}
-    ### Task of experiment
-    {{ exp_and_feedback[0].pending_tasks_list[0][0].get_task_information() }}
+
     {% if exp_and_feedback[0].result is none %}
     Experiment score: Running buggy
     {% else %}
     Experiment score: {{ exp_and_feedback[0].result.loc["ensemble"].iloc[0] }}
     {% endif %}
+
     Experiment feedback decision: {{ exp_and_feedback[1].decision }}
     Reason: {{ exp_and_feedback[1].reason }}
     {% endfor %}