
Commit efedddf

RolandMinruiXu authored

feat: add drafting pipeline (#832)

* init commit
* add drafting prompt
* complete the drafting
* remove scenario problems from proposal
* rename prompts_drafting.yaml
* fix bug
* fix DSHypothesis print bug
* add failed drafting exp to prompt
* fix small bug
* use get_task_information() for task design
* resolve all comments
* add problem_desc to pseudo hypothesis

Co-authored-by: Xu <[email protected]>
1 parent a3d5473 commit efedddf

File tree

8 files changed (+180, -66 lines)


rdagent/scenarios/data_science/dev/prompts.yaml

Lines changed: 1 addition & 1 deletion
@@ -41,7 +41,7 @@ exp_feedback:
   - Begin your `reasoning` with `[Experiment Analysis]`, clearly stating why the current experiment's result surpasses or falls short compared to the SOTA.
   NOTES:
   - The experiments focus on the comparison of the final ensemble results (Don't reject the results because they are still not perfect)
-
+  - If the `ensemble` score does not exceed the best individual model or single fold, it is still acceptable unless the gap is significant.
   Step 4: Analyze Code With Similar Validation Results
   - If the current `ensemble` validation score is similar to the SOTA `ensemble` validation score, give the decision based on the comparison between the current experiment and SOTA.
   - The current code should replace the best result if the code is:

rdagent/scenarios/data_science/proposal/exp_gen/__init__.py

Lines changed: 7 additions & 1 deletion
@@ -3,7 +3,10 @@
 from rdagent.core.utils import import_class
 from rdagent.scenarios.data_science.experiment.experiment import DSExperiment
 from rdagent.scenarios.data_science.proposal.exp_gen.base import DSHypothesis, DSTrace
-from rdagent.scenarios.data_science.proposal.exp_gen.draft import DSDraftExpGen
+from rdagent.scenarios.data_science.proposal.exp_gen.draft import (
+    DSDraftExpGen,
+    DSDraftV2ExpGen,
+)
 from rdagent.scenarios.data_science.proposal.exp_gen.proposal import (
     DSProposalV1ExpGen,
     DSProposalV2ExpGen,
@@ -29,6 +32,9 @@ def gen(self, trace: DSTrace, selection: tuple[int, ...] = (-1,)) -> DSExperiment
         if DS_RD_SETTING.proposal_version not in ["v1", "v2"]:
             return import_class(DS_RD_SETTING.proposal_version)(scen=self.scen).gen(trace=trace)

+        if trace.sota_experiment() is None:
+            return DSDraftV2ExpGen(scen=self.scen).gen(trace=trace)
+
         if DS_RD_SETTING.coder_on_whole_pipeline:
             return DSProposalV2ExpGen(scen=self.scen).gen(trace=trace, pipeline=True)

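With this change, any trace that has no SOTA experiment yet is routed to the new drafting generator before the usual proposal flow. A minimal sketch of the resulting dispatch, where `Trace` and the returned strings are hypothetical stand-ins for `DSTrace` and the real `DSDraftV2ExpGen(...).gen(...)` / `DSProposalV2ExpGen(...).gen(...)` calls:

```python
# Hypothetical sketch of the generator dispatch after this commit.
from typing import Optional


class Trace:
    """Simplified stand-in for DSTrace."""

    def __init__(self, sota: Optional[str] = None) -> None:
        self.sota = sota

    def sota_experiment(self) -> Optional[str]:
        return self.sota


def gen(trace: Trace) -> str:
    if trace.sota_experiment() is None:
        return "draft"    # no working solution yet: draft the first pipeline
    return "propose"      # otherwise iterate on the SOTA via proposals


assert gen(Trace()) == "draft"
assert gen(Trace(sota="exp-1")) == "propose"
```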
rdagent/scenarios/data_science/proposal/exp_gen/base.py

Lines changed: 7 additions & 7 deletions
@@ -13,13 +13,13 @@ def __init__(
         self,
         component: COMPONENT,
         hypothesis: str = "",
-        reason: str = "",
-        concise_reason: str = "",
-        concise_observation: str = "",
-        concise_justification: str = "",
-        concise_knowledge: str = "",
-        problem_name: str = "",
-        problem_desc: str = "",
+        reason: str | None = None,
+        concise_reason: str | None = None,
+        concise_observation: str | None = None,
+        concise_justification: str | None = None,
+        concise_knowledge: str | None = None,
+        problem_name: str | None = None,
+        problem_desc: str | None = None,
         problem_label: Literal["SCENARIO_PROBLEM", "FEEDBACK_PROBLEM"] = "FEEDBACK_PROBLEM",
     ) -> None:
         super().__init__(

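Switching the defaults from empty strings to `None` lets downstream rendering distinguish a field that was never provided from one that is intentionally empty, which is what the "fix DSHypothesis print bug" item in the commit message refers to. A small illustrative sketch; `render` is a hypothetical helper, not RD-Agent's actual printer:

```python
# Hypothetical illustration of why `None` defaults beat "" defaults here:
# a renderer can skip fields that were never set instead of emitting
# empty "Reason:" lines for every hypothesis.
def render(hypothesis: str, reason: str | None = None) -> str:
    parts = [f"Hypothesis: {hypothesis}"]
    if reason is not None:  # only present when the caller supplied it
        parts.append(f"Reason: {reason}")
    return "\n".join(parts)


print(render("Switch to a gradient-boosted model"))  # no dangling "Reason:" line
print(render("Shrink the model", reason="The previous run hit the time limit"))
```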
rdagent/scenarios/data_science/proposal/exp_gen/draft.py

Lines changed: 76 additions & 1 deletion
@@ -1,10 +1,11 @@
 import json
-from typing import TYPE_CHECKING
+from typing import TYPE_CHECKING, Dict

 from rdagent.app.data_science.conf import DS_RD_SETTING
 from rdagent.components.coder.data_science.ensemble.exp import EnsembleTask
 from rdagent.components.coder.data_science.feature.exp import FeatureTask
 from rdagent.components.coder.data_science.model.exp import ModelTask
+from rdagent.components.coder.data_science.pipeline.exp import PipelineTask
 from rdagent.components.coder.data_science.raw_data_loader.exp import DataLoaderTask
 from rdagent.components.coder.data_science.workflow.exp import WorkflowTask
 from rdagent.core.proposal import ExpGen, Hypothesis
@@ -116,3 +117,77 @@ def gen(
         # exp.experiment_workspace.inject_code_from_folder(last_successful_exp.experiment_workspace.workspace_path)
         exp.experiment_workspace.inject_code_from_file_dict(last_successful_exp.experiment_workspace)
         return exp
+
+
+class DSDraftV2ExpGen(ExpGen):
+    def task_gen(
+        self,
+        scenario_desc: str,
+        scen_problems: dict,
+        component_desc: str,
+        drafting_trace_desc: str,
+    ) -> DSExperiment:
+        scen_problems_text = ""
+        for i, (problem_name, problem_dict) in enumerate(scen_problems.items()):
+            scen_problems_text += f"## Problem Name: {problem_name}\n"
+            scen_problems_text += f"- Problem Description: {problem_dict['problem']}\n\n"
+        sys_prompt = T(".prompts_drafting:task_draft.system").r(
+            task_spec=T("scenarios.data_science.share:component_spec.Pipeline").r(),
+            component_desc=component_desc,
+        )
+        user_prompt = T(".prompts_drafting:task_draft.user").r(
+            scenario_desc=scenario_desc,
+            scen_problems=scen_problems_text,
+            drafting_trace_desc=drafting_trace_desc,
+        )
+        response = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            json_mode=True,
+            json_target_type=Dict[str, str],
+        )
+        task_dict = json.loads(response)
+        task_design = task_dict.get("task_design", "Description not provided")
+        task = PipelineTask(name="Workflow", description=task_design)
+
+        # we use a pseudo hypothesis here
+        pseudo_hypothesis = DSHypothesis(
+            component="Pipeline",
+            hypothesis="This is a pseudo hypothesis for drafting the first competition implementation. Your result should not be influenced by this hypothesis.",
+            problem_name="This is a pseudo problem name for drafting. The corresponding problem description includes several problems together.",
+            problem_desc=scen_problems_text,
+        )
+        exp = DSExperiment(pending_tasks_list=[[task]], hypothesis=pseudo_hypothesis)
+        return exp
+
+    def gen(self, trace: DSTrace) -> DSExperiment:
+        # Prepare
+        last_exp = trace.last_exp()
+        if not isinstance(last_exp, DSExperiment):
+            eda_output = None
+        else:
+            eda_output = last_exp.experiment_workspace.file_dict.get("EDA.md", None)
+
+        component_desc = T("scenarios.data_science.share:component_description_in_pipeline").r()
+        scenario_desc = trace.scen.get_scenario_all_desc(eda_output=eda_output)
+        drafting_trace_desc = T("scenarios.data_science.share:describe.drafting_trace").r(
+            exp_and_feedback_list=trace.experiment_and_feedback_list_after_init(return_type="all"),
+        )
+
+        # Step 1: Identify Scenario Problems
+        sys_prompt = T(".prompts_drafting:scenario_problem.system").r()
+        user_prompt = T(".prompts_drafting:scenario_problem.user").r(scenario_desc=scenario_desc)
+        response = APIBackend().build_messages_and_create_chat_completion(
+            user_prompt=user_prompt,
+            system_prompt=sys_prompt,
+            json_mode=True,
+            json_target_type=Dict[str, Dict[str, str]],
+        )
+        scen_problems = json.loads(response)
+
+        # Step 2: Design Task
+        return self.task_gen(
+            scenario_desc=scenario_desc,
+            scen_problems=scen_problems,
+            component_desc=component_desc,
+            drafting_trace_desc=drafting_trace_desc,
+        )
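DSDraftV2ExpGen is a two-step LLM loop: first extract scenario problems from the competition description, then turn them into one concrete pipeline task. A condensed sketch of that flow under stated assumptions; `call_llm` is a hypothetical stand-in for `APIBackend().build_messages_and_create_chat_completion`:

```python
# Condensed sketch of the two-step drafting flow (simplified from DSDraftV2ExpGen).
import json
from typing import Dict


def call_llm(system_prompt: str, user_prompt: str) -> str:
    raise NotImplementedError  # placeholder for the real LLM backend


def draft_first_task(scenario_desc: str) -> str:
    # Step 1: extract scenario problems as {name: {"problem": ..., "reason": ...}}.
    problems: Dict[str, Dict[str, str]] = json.loads(
        call_llm("identify scenario problems", scenario_desc)
    )
    # Flatten the problems into the text block that task_gen builds.
    problems_text = "".join(
        f"## Problem Name: {name}\n- Problem Description: {d['problem']}\n\n"
        for name, d in problems.items()
    )
    # Step 2: ask for one detailed task design that covers all the problems.
    task_dict = json.loads(call_llm("design the pipeline task", problems_text))
    return task_dict.get("task_design", "Description not provided")
```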
rdagent/scenarios/data_science/proposal/exp_gen/prompts_drafting.yaml

Lines changed: 72 additions & 0 deletions
@@ -0,0 +1,72 @@
+scenario_problem:
+  system: |-
+    {% include "scenarios.data_science.share:scen.role" %}
+    The user is creating a Kaggle competition implementation iteratively and this is the first iteration. You will be given the Kaggle competition scenario.
+    Your task is to analyze the given information and extract the **Scenario Problems** from the given materials to aid the implementation.
+
+    ## Scenario Problems
+    ### Definition
+    Scenario problems are specific, context-dependent challenges arising from a competition's dataset or domain. They fall into two categories:
+    1. Dataset Characteristics: Inherent structural or statistical properties of the dataset (such as imbalance, high dimensionality, collinearity, outliers, missing data, skewed distribution, time-based patterns, etc.).
+    2. Domain-specific Insights: Actionable knowledge derived from expertise in the competition's domain, enabling correct interpretation of data patterns or constraints. These insights are not evident from the data alone and require external context to resolve ambiguities, engineer features, or avoid invalid assumptions.
+
+    ### Specification
+    1. The problem should be specific and fine-grained. Avoid general or vague statements.
+    2. The problem should be technical or methodological. Focus on design and implementation flaws.
+    3. The problem should be strictly aligned with the improvement of the target metric. **IF THE PROBLEM IS SOLVED, THEN THE TARGET METRIC WILL IMPROVE**.
+
+    ### Output Format
+    For each identified problem, strictly adhere to the following JSON schema. Your final output should be a dict containing all the identified problems, without anything else.
+    {
+        "problem name 1": {
+            "problem": "Description of the first issue in no more than three sentences.",
+            "reason": "Brief explanation of why this is a problem, based on evidence from provided materials, in no more than three sentences."
+        },
+        "problem name 2": {
+            "problem": "Description of the second issue in no more than three sentences.",
+            "reason": "Brief explanation of why this is a problem, based on evidence from provided materials, in no more than three sentences."
+        }
+    }
+
+  user: |-
+    # Scenario Description
+    {{ scenario_desc }}
+
+
+task_draft:
+  system: |-
+    {% include "scenarios.data_science.share:scen.role" %}
+    The user is creating a Kaggle competition implementation iteratively and this is the first iteration.
+    You will be given a competition scenario and a list of identified scenario problems from the given competition scenario.
+    In addition, if there are any previous failed experiments, you will receive their task designs and failures. Please read them carefully to build a better understanding.
+    Your role is to design a very detailed task with specific steps and instructions to implement the competition solution and address the identified scenario problems. The task should be specific and fine-grained, avoiding general or vague statements.
+
+    # Task Design
+    ## Task Specification
+    {{ task_spec }}
+
+    ## Task Design Guidelines
+    Here are guidelines **YOU MUST FOLLOW** in your task design:
+    1. The task should be concise, with several steps of only a few sentences each.
+    2. DO NOT write any code in the task description.
+    3. DO NOT use any phrases like "for example" or "e.g." in the task description. Clearly give a decision (such as the specific method or model name) in the task description.
+    4. DO NOT use vague statements like "choose a proper model" or "optimize the pipeline". Instead, specify the exact step and change to be made.
+    5. Your task design should try to cover **ALL** the identified scenario problems, and it must not contain conflicting ideas. If conflicting identified problems suggest conflicting ideas, prioritize the most impactful or feasible option; if multiple solutions exist for a problem, select the most impactful or feasible option only.
+    6. Carefully read and analyze the previous failed experiments, if any, so that similar mistakes are not repeated, and incorporate the lessons learned into the new task design.
+
+    ## Task Output Format:
+    Design a specific and detailed Pipeline task based on the given competition scenario and scenario problems. The output should be detailed enough to directly implement the corresponding code.
+    The output should follow JSON format. The schema is as follows:
+    {
+        "task_design": "A precise and comprehensive description of the main workflow script (`main.py`)."
+    }
+
+  user: |-
+    # Scenario Description
+    {{ scenario_desc }}
+
+    # Identified Scenario Problems
+    {{ scen_problems }}
+
+    # Previous Failed Experiments
+    {{ drafting_trace_desc }}
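For reference, here is a hypothetical response that satisfies the scenario_problem output schema, parsed the same way DSDraftV2ExpGen.gen parses it (the competition details are invented for illustration):

```python
# Hypothetical example of a response matching json_target_type=Dict[str, Dict[str, str]].
import json
from typing import Dict

response = """{
    "severe class imbalance": {
        "problem": "Positive labels make up under 2% of the training rows.",
        "reason": "An unweighted objective would ignore the minority class that the target metric rewards."
    }
}"""

scen_problems: Dict[str, Dict[str, str]] = json.loads(response)
for name, detail in scen_problems.items():
    print(f"## Problem Name: {name}")
    print(f"- Problem Description: {detail['problem']}")
```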

rdagent/scenarios/data_science/proposal/exp_gen/prompts_v2.yaml

Lines changed: 5 additions & 32 deletions
@@ -1,33 +1,3 @@
-scenario_problem:
-  system: |-
-    {% include "scenarios.data_science.share:scen.role" %}
-    You will be given the scenario description and the current SOTA implementation and feedback.
-    Your task is to analyze the given information and extract the **Scenario Problems** from the given materials.
-
-    ## Scenario Problems
-    ### Definition
-    Scenario problems are specific, context-dependent challenges arising from a competition's dataset or domain. They fall into two categories:
-    1. Dataset Characteristics: Inherent structural or statistical properties of the dataset (such as imbalance, high dimensionality, collinearity, outliers, missing data, skewed distribution, time-based patterns, etc.).
-    2. Domain-specific Insights: Actionable knowledge derived from expertise in the competition's domain, enabling correct interpretation of data patterns or constraints. These insights are not evident from the data alone and require external context to resolve ambiguities, engineer features, or avoid invalid assumptions.
-
-    ### Specification
-    {{ problem_spec }}
-
-    ### Core Analysis Dimensions
-    1. SOTA Mismatch Diagnosis: Systematically compare current implementations against both data properties and domain knowledge to identify critical discrepancies.
-    2. Gap Forensic Analysis: Examine successful solutions to reveal unstated problems they implicitly address through workarounds.
-    3. Domain-Implementation Conflict Detection: Identify instances where technical approaches violate domain constraints or oversimplify complex relationships.
-
-    ### Output Format
-    {{ problem_output_format }}
-
-  user: |-
-    # Scenario Description
-    {{ scenario_desc }}
-
-    # Current SOTA Implementation
-    {{ sota_exp_desc }}
-
 feedback_problem:
   system: |-
     {% include "scenarios.data_science.share:scen.role" %}
@@ -81,12 +51,15 @@ hypothesis_gen:
     2. Lessons from Previous Experiments
       - For persistent problems, analyze why prior hypotheses and solutions failed.
       - Incorporate evidence from past failures/successes to justify the hypothesis.
+      - If previous experiments failed due to time/memory constraints, prioritize changes on efficiency.
     3. Actionable Changes
-      - If the problem relates to time/memory constraints, suggest smaller model sizes or alternative algorithms with reduced complexity.
+      - If the problem relates to time/memory constraints, consider smaller model sizes or alternative algorithms with reduced complexity.
       - If the problem involves underperforming models, propose removing or replacing models with significantly worse performance.
      - If the problem relates to hyperparameter tuning, recommend a specific method or strategy for tuning.
+    4. Priority Note on Time/Memory Constraints
+      - If time/memory constraints exist, they must be prioritized above all other problems. In such cases, do not include any other problems in the response dictionary.
     {% if enable_idea_pool %}
-    4. Idea Reference
+    5. Idea Reference
       - Each idea is a method, technique or trick that contributes to high performance from other competition implementations under similar problems. You are free to use them as an inspiration for your hypothesis proposal.
     {% endif %}

rdagent/scenarios/data_science/proposal/exp_gen/proposal.py

Lines changed: 1 addition & 24 deletions
@@ -224,23 +224,6 @@ def _f(user_prompt):


 class DSProposalV2ExpGen(ExpGen):
-    def identify_scenario_problem(self, scenario_desc: str, sota_exp_desc: str) -> Dict:
-        sys_prompt = T(".prompts_v2:scenario_problem.system").r(
-            problem_spec=T(".prompts_v2:specification.problem").r(),
-            problem_output_format=T(".prompts_v2:output_format.problem").r(),
-        )
-        user_prompt = T(".prompts_v2:scenario_problem.user").r(
-            scenario_desc=scenario_desc,
-            sota_exp_desc=sota_exp_desc,
-        )
-        response = APIBackend().build_messages_and_create_chat_completion(
-            user_prompt=user_prompt,
-            system_prompt=sys_prompt,
-            json_mode=True,
-            json_target_type=Dict[str, Dict[str, str]],
-        )
-        return json.loads(response)
-
     def identify_feedback_problem(self, scenario_desc: str, exp_feedback_list_desc: str, sota_exp_desc: str) -> Dict:
         sys_prompt = T(".prompts_v2:feedback_problem.system").r(
@@ -457,20 +440,14 @@ def gen(self, trace: DSTrace, pipeline: bool = False) -> DSExperiment:
         )

         # Step 1: Identify problems
-        scen_problems = self.identify_scenario_problem(
-            scenario_desc=scenario_desc,
-            sota_exp_desc=sota_exp_desc,
-        )
-        for problem_name in scen_problems:
-            scen_problems[problem_name]["label"] = "SCENARIO_PROBLEM"
         fb_problems = self.identify_feedback_problem(
             scenario_desc=scenario_desc,
             exp_feedback_list_desc=exp_feedback_list_desc,
             sota_exp_desc=sota_exp_desc,
         )
         for problem_name in fb_problems:
             fb_problems[problem_name]["label"] = "FEEDBACK_PROBLEM"
-        all_problems = {**scen_problems, **fb_problems}
+        all_problems = fb_problems

         # Step 1.5: Sample ideas from idea pool
         if DS_RD_SETTING.enable_knowledge_base:

rdagent/scenarios/data_science/share.yaml

Lines changed: 11 additions & 0 deletions
@@ -79,7 +79,18 @@ describe: # some template to describe some object
       Reason: {{ exp_and_feedback[1].reason }}
       {% endfor %}
       {% endif %}
+
+  drafting_trace: |-
+    {% if exp_and_feedback_list|length == 0 %}
+    No previous drafting experiments available.
+    {% else %}
+    {% for exp_and_feedback in exp_and_feedback_list %}
+    ## Drafting Experiment {{ loop.index }}
+    Task Design: {{ exp_and_feedback[0].pending_tasks_list[0][0].get_task_information() }}
+    Failure: {{ exp_and_feedback[1].reason }}

+    {% endfor %}
+    {% endif %}

 scen: # customizable
   role: |-

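To see what the new drafting_trace template produces, here is a sketch that renders a simplified version with plain jinja2; the real template dereferences experiment and feedback objects (`pending_tasks_list`, `get_task_information()`, `.reason`), for which plain tuples stand in here:

```python
# Sketch: rendering a simplified drafting_trace template with jinja2.
from jinja2 import Template

template = Template(
    "{% if records|length == 0 %}"
    "No previous drafting experiments available."
    "{% else %}"
    "{% for task, failure in records %}"
    "## Drafting Experiment {{ loop.index }}\n"
    "Task Design: {{ task }}\n"
    "Failure: {{ failure }}\n\n"
    "{% endfor %}"
    "{% endif %}"
)

print(template.render(records=[]))
print(template.render(records=[("Train a single LightGBM pipeline", "Run exceeded the time limit")]))
```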