feat: integrate Context7 MCP documentation search functionality in coder #1141

Open

Hoder-zyf wants to merge 30 commits into main from yifei/context7
Showing changes from 29 of 30 commits.

Commits
4fee7e6
add prompt in pipeline_eval, parameter to control and more required …
Hoder-zyf Jul 29, 2025
cdb4902
init commit, enable mcp first try(not done yet)
Hoder-zyf Jul 29, 2025
d117c66
remove unused things
Hoder-zyf Jul 29, 2025
d24ad8a
update config and keys to include error message(prompt for error_mess…
Hoder-zyf Jul 30, 2025
13bb813
update error message prompt
Hoder-zyf Jul 31, 2025
c52553a
solve error_message; config load
Hoder-zyf Jul 31, 2025
0bbdee8
make lint and translate; TODOS:
Hoder-zyf Jul 31, 2025
ca64f3e
feat: add CoSTEERSingleFeedback.merge and refactor multi evaluator
you-n-g Aug 1, 2025
c17e31c
test version not done yet
Hoder-zyf Aug 1, 2025
d1e6687
complete call logic
Hoder-zyf Aug 1, 2025
31813c7
clear unnecessary codes
Hoder-zyf Aug 1, 2025
521e517
add speical case for timm library
Hoder-zyf Aug 1, 2025
0133ef1
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 2, 2025
233e81e
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 2, 2025
4fdc054
make auto-lint
Hoder-zyf Aug 2, 2025
b3ee31c
add ui config
Hoder-zyf Aug 3, 2025
329ed42
lint
Hoder-zyf Aug 3, 2025
b0ef22a
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 4, 2025
4a0a29f
lint
Hoder-zyf Aug 4, 2025
6bcd6da
add full code and refine prompt
Hoder-zyf Aug 5, 2025
09f44d8
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 5, 2025
aba3f2b
change displayed packages(experimental)
Hoder-zyf Aug 5, 2025
494c52e
fix a small bug
Hoder-zyf Aug 6, 2025
fd00fb2
refactor to the origin package_info
Hoder-zyf Aug 7, 2025
fd897b6
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 7, 2025
222eefe
make lint and merge main 0807
Hoder-zyf Aug 7, 2025
75fcb6a
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 7, 2025
a3da121
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 8, 2025
6a21e13
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 8, 2025
5a4424c
Merge branch 'main' into yifei/context7
Hoder-zyf Aug 13, 2025
2 changes: 1 addition & 1 deletion .gitignore
@@ -177,4 +177,4 @@ rdagent/app/benchmark/factor/example.json

# UI Server resources
videos/
static/
static/
4 changes: 4 additions & 0 deletions rdagent/app/data_science/conf.py
@@ -140,6 +140,10 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
"""Enable hypothesis critique and rewrite stages for improving hypothesis quality"""
enable_scale_check: bool = False

#### mcp in coder
enable_context7: bool = True
[Review comment — Contributor] would enable_api_doc_agent be a more understandable and general name?

"""enable the use of context7 as mcp to search for relevant documents of current implementation errors"""

    #### enable runner code change summary
    runner_enable_code_change_summary: bool = True

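As the evaluator code later in this diff shows, the coder reads this flag through the module-level settings object. A minimal sketch of that gating (the import path for `DS_RD_SETTING` is an assumption inferred from the call sites in `pipeline/eval.py`):

```python
from rdagent.app.data_science.conf import DS_RD_SETTING

# Gate the documentation search exactly as the evaluator does below
if DS_RD_SETTING.enable_context7:
    print("Context7 MCP documentation search is enabled for the coder")
```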
63 changes: 45 additions & 18 deletions rdagent/components/coder/CoSTEER/evaluators.py
@@ -76,6 +76,27 @@ def val_and_update_init_dict(data: dict) -> dict:
                    raise ValueError(f"'{attr}' must be a string, not {type(data[attr])}")
        return data

    @classmethod
    def merge(cls, feedback_li: list["CoSTEERSingleFeedback"]) -> "CoSTEERSingleFeedback":
        # NOTE:
        # We don't know the detailed design of each feedback here; we only know they are CoSTEERSingleFeedback.
        # So we merge them based solely on CoSTEERSingleFeedback's attributes.
        # **Some information may therefore be lost when the feedbacks are of different subtypes.**
        # If you have a more sophisticated subclass of CoSTEERSingleFeedback, override this method
        # to avoid that loss of information.

        fb = deepcopy(feedback_li[0])

        # Aggregate the final_decision across all the evaluators
        fb.final_decision = all(_fb.final_decision for _fb in feedback_li)
        for attr in "execution", "return_checking", "code":
            setattr(
                fb,
                attr,
                "\n\n".join([getattr(_fb, attr) for _fb in feedback_li if getattr(_fb, attr) is not None]),
            )
        return fb

    def __str__(self) -> str:
        return f"""------------------Execution------------------
{self.execution}
@@ -230,7 +251,18 @@ def evaluate(
        **kwargs,
    ) -> CoSTEERMultiFeedback:
        eval_l = self.single_evaluator if isinstance(self.single_evaluator, list) else [self.single_evaluator]

        # 1) Evaluate each sub_task
        task_li_feedback_li = []
        # task_li_feedback_li: List[List[CoSTEERSingleFeedback]]
        # Example:
        # If there are 2 evaluators and 3 sub_tasks in evo, and each evaluator's evaluate returns a list of 3 CoSTEERSingleFeedbacks,
        # then task_li_feedback_li will be:
        # [
        #     [feedback_1_1, feedback_1_2, feedback_1_3],  # results from the 1st evaluator for all sub_tasks
        #     [feedback_2_1, feedback_2_2, feedback_2_3],  # results from the 2nd evaluator for all sub_tasks
        # ]
        # where feedback_i_j is the feedback from the i-th evaluator for the j-th sub_task.
        for ev in eval_l:
            multi_implementation_feedback = multiprocessing_wrapper(
                [
@@ -248,27 +280,22 @@
                n=RD_AGENT_SETTINGS.multi_proc_n,
            )
            task_li_feedback_li.append(multi_implementation_feedback)
        # merge the feedbacks

        # 2) Merge the feedbacks along the sub_tasks to aggregate the multiple evaluation feedbacks
        merged_task_feedback = []
        # task_li_feedback_li[0] is a list of feedbacks of different tasks for the 1st evaluator
        for task_id, fb in enumerate(task_li_feedback_li[0]):
            fb = deepcopy(fb)  # deep copy to make it more robust

            fb.final_decision = all(
                task_li_feedback[task_id].final_decision for task_li_feedback in task_li_feedback_li
            )
            for attr in "execution", "return_checking", "code":
                setattr(
                    fb,
                    attr,
                    "\n\n".join(
                        [
                            getattr(task_li_feedback[task_id], attr)
                            for task_li_feedback in task_li_feedback_li
                            if getattr(task_li_feedback[task_id], attr) is not None
                        ]
                    ),
                )
            fb = fb.merge([fb_li[task_id] for fb_li in task_li_feedback_li])
            merged_task_feedback.append(fb)
        # merged_task_feedback: List[CoSTEERSingleFeedback]
        # Example:
        # [
        #     CoSTEERSingleFeedback(final_decision=True, execution="...", return_checking="...", code="..."),
        #     CoSTEERSingleFeedback(final_decision=False, execution="...", return_checking="...", code="..."),
        #     ...
        # ]
        # Each element corresponds to the merged feedback for one sub-task across all evaluators.
        # merged_task_feedback[i] is the merged feedback for the i-th sub_task

        final_decision = [
            None if single_feedback is None else single_feedback.final_decision
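For reference, a minimal sketch of how the new `CoSTEERSingleFeedback.merge` behaves. The feedback texts are invented; keyword construction follows the field names used elsewhere in this diff:

```python
from rdagent.components.coder.CoSTEER.evaluators import CoSTEERSingleFeedback

# Two hypothetical evaluator results for the same sub-task
fb_a = CoSTEERSingleFeedback(
    execution="ran to completion",
    return_checking="submission.csv matches the expected format",
    code="[Code analysis] reasonable baseline",
    final_decision=True,
)
fb_b = CoSTEERSingleFeedback(
    execution="ran to completion",
    return_checking=None,  # None attributes are skipped by the join
    code="[Code analysis] no validation split",
    final_decision=False,
)

merged = CoSTEERSingleFeedback.merge([fb_a, fb_b])
assert merged.final_decision is False  # all() over the individual decisions
assert merged.code == "[Code analysis] reasonable baseline\n\n[Code analysis] no validation split"
```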
131 changes: 129 additions & 2 deletions rdagent/components/coder/data_science/pipeline/eval.py
@@ -1,7 +1,9 @@
# test successfully running.
# (GPT) if it aligns with the spec & rationality of the spec.
import json
import asyncio
import concurrent.futures
import re
from dataclasses import dataclass
from pathlib import Path

import pandas as pd
@@ -18,14 +20,104 @@
from rdagent.components.coder.data_science.conf import get_clear_ws_cmd, get_ds_env
from rdagent.components.coder.data_science.share.notebook import NotebookConverter
from rdagent.components.coder.data_science.utils import remove_eda_part
from rdagent.components.mcp import query_context7
from rdagent.core.experiment import FBWorkspace, Task
from rdagent.log import rdagent_logger as logger
from rdagent.scenarios.data_science.test_eval import get_test_eval
from rdagent.utils.agent.tpl import T
from rdagent.utils.agent.workflow import build_cls_from_json_with_retry

DIRNAME = Path(__file__).absolute().resolve().parent

PipelineSingleFeedback = CoSTEERSingleFeedback

@dataclass
class DSCoderFeedback(CoSTEERSingleFeedback):
    """
    Feedback for Data Science CoSTEER evaluation.
    This feedback is used to evaluate the code and execution of the Data Science CoSTEER task.
    """

    requires_documentation_search: bool | None = None
    error_message: str | None = None

    @staticmethod
    def val_and_update_init_dict(data: dict) -> dict:
        # First call the parent class validation method to handle the base fields
        data = CoSTEERSingleFeedback.val_and_update_init_dict(data)

        # Validate the new fields
        if "requires_documentation_search" in data:
            if isinstance(data["requires_documentation_search"], str):
                if data["requires_documentation_search"] in ("false", "False"):
                    data["requires_documentation_search"] = False
                elif data["requires_documentation_search"] in ("true", "True"):
                    data["requires_documentation_search"] = True
                else:
                    raise ValueError(
                        f"'requires_documentation_search' string value must be 'true', 'True', 'false', or 'False', not '{data['requires_documentation_search']}'"
                    )
            elif data["requires_documentation_search"] is not None and not isinstance(
                data["requires_documentation_search"], bool
            ):
                raise ValueError(
                    f"'requires_documentation_search' must be a boolean, string, or None, not {type(data['requires_documentation_search'])}"
                )

        if "error_message" in data:
            if data["error_message"] is not None and not isinstance(data["error_message"], str):
                raise ValueError(f"'error_message' must be a string or None, not {type(data['error_message'])}")

        return data

    def __str__(self) -> str:
        base_str = super().__str__()

        if self.requires_documentation_search is not None:
            base_str += f"-------------------Documentation Search Required------------------\n{self.requires_documentation_search}\n"

        if self.error_message is not None:
            # Check whether error_message contains Context7 documentation results
            if "### API Documentation Reference:" in self.error_message:
                base_str += f"-------------------Error Analysis & Documentation Search Results ------------------\n{self.error_message}\n"
            else:
                base_str += f"-------------------Error Message------------------\n{self.error_message}\n"

        return base_str

    @classmethod
    def merge(cls, feedback_li: list[CoSTEERSingleFeedback]) -> "DSCoderFeedback":
        # Call the parent class merge method to handle the base fields
        merged_fb = super().merge(feedback_li)

        # Convert to DSCoderFeedback type if needed
        if not isinstance(merged_fb, DSCoderFeedback):
            merged_fb = DSCoderFeedback(
                execution=merged_fb.execution,
                return_checking=merged_fb.return_checking,
                code=merged_fb.code,
                final_decision=merged_fb.final_decision,
            )

        # Merge error_message fields
        error_messages = [
            fb.error_message for fb in feedback_li if isinstance(fb, DSCoderFeedback) and fb.error_message is not None
        ]
        if error_messages:
            merged_fb.error_message = "\n\n".join(error_messages)

        # Merge requires_documentation_search fields (True if any is True)
        requires_search = [
            fb.requires_documentation_search
            for fb in feedback_li
            if isinstance(fb, DSCoderFeedback) and fb.requires_documentation_search is not None
        ]
        if requires_search:
            merged_fb.requires_documentation_search = any(requires_search)

        return merged_fb


PipelineSingleFeedback = DSCoderFeedback
PipelineMultiFeedback = CoSTEERMultiFeedback


@@ -51,6 +143,8 @@ def evaluate(
execution="This task has failed too many times, skip implementation.",
return_checking="This task has failed too many times, skip implementation.",
code="This task has failed too many times, skip implementation.",
error_message="This task has failed too many times, skip implementation.",
requires_documentation_search=False,
final_decision=False,
)

@@ -176,6 +270,9 @@ def evaluate(
        else:
            eda_output = implementation.file_dict.get("EDA.md", None)

        # extract enable_context7 from the settings
        enable_context7 = DS_RD_SETTING.enable_context7

        queried_similar_successful_knowledge = (
            queried_knowledge.task_to_similar_task_successful_knowledge[target_task.get_task_information()]
            if queried_knowledge is not None
@@ -185,6 +282,7 @@
        system_prompt = T(".prompts:pipeline_eval.system").r(
            is_sub_enabled=test_eval.is_sub_enabled(self.scen.competition),
            debug_mode=DS_RD_SETTING.sample_data_by_LLM,
            enable_context7=enable_context7,
            mle_check=DS_RD_SETTING.sample_data_by_LLM,
            queried_similar_successful_knowledge=queried_similar_successful_knowledge,
        )
@@ -204,6 +302,35 @@
            user_prompt=user_prompt,
            init_kwargs_update_func=PipelineSingleFeedback.val_and_update_init_dict,
        )

        if enable_context7 and wfb.requires_documentation_search is True:
            try:

                def run_context7_sync():
                    """Run the Context7 query in a new event loop."""
                    # Create a new event loop to avoid conflicts with an existing loop
                    new_loop = asyncio.new_event_loop()
                    asyncio.set_event_loop(new_loop)
                    try:
                        return new_loop.run_until_complete(
                            query_context7(error_message=wfb.error_message, full_code=implementation.all_codes)
                        )
                    finally:
                        new_loop.close()

                # Execute in a thread pool to avoid event-loop conflicts
                with concurrent.futures.ThreadPoolExecutor(max_workers=1) as executor:
                    future = executor.submit(run_context7_sync)
                    context7_result = future.result(timeout=120)  # 120s timeout, sufficient for the retry mechanism

                if context7_result:
                    logger.info("Context7: Documentation search completed successfully")
                    wfb.error_message += f"\n\n### API Documentation Reference:\nThe following API documentation was retrieved based on the error. This provides factual information about API changes or parameter specifications only:\n\n{context7_result}"
                else:
                    logger.warning("Context7: Documentation search failed or no results found")
            except Exception as e:
                logger.error(f"Context7: Query failed - {str(e)}")

        if score_ret_code != 0 and wfb.final_decision is True:
            wfb.final_decision = False
            wfb.return_checking += "\n" + score_check_text
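To make the new fields concrete, here is a small sketch of the round trip the evaluator performs: the raw JSON parsed from the LLM is normalized by `val_and_update_init_dict` (wired in above via `init_kwargs_update_func`) and then used to build the feedback object. All field values are invented, and the sketch assumes the base validator accepts these fields as given:

```python
from rdagent.components.coder.data_science.pipeline.eval import DSCoderFeedback

# Raw dict as parsed from the LLM's JSON answer (values invented)
raw = {
    "requires_documentation_search": "True",  # string form is coerced to bool
    "execution": "Failed with AttributeError during model construction.",
    "return_checking": "No submission file was produced.",
    "code": "[Code analysis] API name does not exist in the pinned library version.",
    "error_message": "### TRACEBACK: AttributeError: module 'timm' has no attribute 'create_model_v2'",
    "final_decision": False,
}

clean = DSCoderFeedback.val_and_update_init_dict(raw)
assert clean["requires_documentation_search"] is True

wfb = DSCoderFeedback(**clean)
print(wfb)  # __str__ appends the documentation-search and error-message sections
```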
13 changes: 11 additions & 2 deletions rdagent/components/coder/data_science/pipeline/prompts.yaml
@@ -232,10 +232,15 @@ pipeline_eval:
  - Notes:
    - Model performance is not evaluated in this step; focus solely on successful execution.
    - Warnings are acceptable if they do not interfere with successful code execution.
    - **Environment Constraint**: The coding environment is fixed and pre-configured. No package installation or modification is allowed. Code must use only existing pre-installed packages.
  - If the code executes successfully:
    - Proceed to Step 2.
    - Proceed to Step 2 and skip the remaining steps in Step 1.
[Review comment — Contributor] Step 2 and Step 1 are missing.
  - If the code does not execute successfully:
    - Set the "final_decision" to false and write complete analysis in the "execution" field.
    - Set the "final_decision" to false.
    {% if enable_context7 %}
    - Given that the package/environment is fixed and unchangeable, first go through the code and the execution output. If the problem can be solved by looking up official documentation to confirm feature/API availability, compatible usage, or official alternatives in the fixed environment, set "requires_documentation_search" to true.
    {% endif %}
    - Write complete analysis in the "execution" field.

  ### Competition Alignment
  - Goal: Confirm strict adherence to the competition's evaluation rules and experimental setup.
@@ -309,9 +314,13 @@ pipeline_eval:
  Please respond with your feedback in the following JSON format without anything else.
  ```json
  {
  {% if enable_context7 %}
    "requires_documentation_search": <true/false>,
  {% endif %}
    "execution": "Describe whether the code executed successfully. Include any errors or issues encountered, and append all error messages and full traceback details without summarizing or omitting any information. If errors occurred, analyze the root causes: (1) Are they fundamental algorithmic/approach issues, or (2) Implementation details that can be easily fixed, or (3) Environment/dependency problems?",
    "return_checking": "Examine the generated files by cross-referencing the code logic and stdout output. Verify: (1) Format matches required submission format (index, column names, CSV content); (2) **File generation authenticity**: Is the file genuinely produced by successful model execution, or is it a result of exception handling/fallback mechanisms? Cite specific code sections and stdout evidence.",
    "code": "Begin explicitly with [Code analysis] or [Evaluation error]. Provide structured analysis: (1) **Technical Appropriateness**: Does the chosen approach (algorithms, data processing, validation strategy) match this problem's data characteristics and competition requirements? (2) **Effective Components**: What specific parts work well and why are they effective for this problem type? (3) **Issues & Improvements**: Identify concrete problems and suggest actionable improvement directions (without providing actual code). (4) **Code Quality**: Assess readability, structure, and adherence to specifications.",
    "error_message": "If the code execution has problems, extract the error information in the following format, otherwise set to empty string: ### TRACEBACK: <full relevant traceback extracted from execution output> ### SUPPLEMENTARY_INFO: <only if TRACEBACK is unclear - copy exact code fragments: import statements, variable=value assignments, function calls with parameters as they appear in code>",
    "final_decision": <true/false>
  }
  ```
Expand Down
8 changes: 8 additions & 0 deletions rdagent/components/mcp/__init__.py
@@ -0,0 +1,8 @@
"""MCP (Model Context Protocol) integration for RD-Agent.

This module provides context7 functionality for documentation search.
"""

from .context7 import query_context7

__all__ = ["query_context7"]