
Commit 8b2f22c

chore: custom data refine (#864)
* chore: print up to 100 columns in simple mode
* fix: check content for model dump
* chore: add show_nan_columns config
1 parent 2b9427a commit 8b2f22c

4 files changed: +38 -17 lines changed

rdagent/app/data_science/conf.py

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     rule_base_eval: bool = False
     sample_data: bool = True
     use_raw_description: bool = False
+    show_nan_columns: bool = False

     #### model dump
     enable_model_dump: bool = False
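
The new `show_nan_columns` flag defaults to `False`, so existing configurations keep their current behavior. Below is a minimal sketch of how a boolean field on a pydantic-settings class can be flipped through an environment variable; the standalone `DemoDataScienceSetting` class and the `DS_` prefix are assumptions for illustration, not the actual `KaggleBasePropSetting` wiring.

```python
# Hypothetical, self-contained illustration of toggling a boolean settings flag.
# DemoDataScienceSetting and the DS_ env prefix are assumptions for this sketch;
# the real field lives on DataScienceBasePropSetting(KaggleBasePropSetting).
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoDataScienceSetting(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="DS_")

    show_nan_columns: bool = False  # mirrors the field added in this commit


if __name__ == "__main__":
    print(DemoDataScienceSetting().show_nan_columns)  # False by default

    os.environ["DS_SHOW_NAN_COLUMNS"] = "true"
    print(DemoDataScienceSetting().show_nan_columns)  # True once the env var is set
```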

rdagent/components/coder/data_science/share/eval.py

Lines changed: 23 additions & 11 deletions
@@ -48,6 +48,18 @@ def evaluate(

         # 2) check the result and stdout after reruning the model.

+        # Read the content of files submission.csv and scores.csv before execution
+        submission_content_before = (
+            (implementation.workspace_path / "submission.csv").read_text()
+            if (implementation.workspace_path / "submission.csv").exists()
+            else None
+        )
+        scores_content_before = (
+            (implementation.workspace_path / "scores.csv").read_text()
+            if (implementation.workspace_path / "scores.csv").exists()
+            else None
+        )
+
         # Remove the files submission.csv and scores.csv
         implementation.execute(env=env, entry=get_clear_ws_cmd(stage="before_inference"))

@@ -70,17 +82,17 @@ def evaluate(
                 final_decision=False,
             )

-        # Read the content of files submission.csv and scores.csv before execution
-        submission_content_before = (
-            (implementation.workspace_path / "submission.csv").read_text()
-            if (implementation.workspace_path / "submission.csv").exists()
-            else None
-        )
-        scores_content_before = (
-            (implementation.workspace_path / "scores.csv").read_text()
-            if (implementation.workspace_path / "scores.csv").exists()
-            else None
-        )
+        # Check if scores contain NaN (values)
+        score_df = pd.read_csv((implementation.workspace_path / "scores.csv"), index_col=0)
+        if score_df.isnull().values.any():
+            nan_locations = score_df[score_df.isnull().any(axis=1)]
+            err_msg = f"\n[Error] The scores dataframe contains NaN values at the following locations:\n{nan_locations}"
+            return CoSTEERSingleFeedback(
+                execution=err_msg,
+                return_checking=err_msg,
+                code=err_msg,
+                final_decision=False,
+            )

         assert submission_content_before is not None
         assert scores_content_before is not None
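
The second hunk removes the post-run re-read (the originals are now captured before the workspace is cleared, per the first hunk) and instead exits early when `scores.csv` contains NaN. Below is a standalone sketch of that check using the same pandas calls; the helper name, file layout, and demo data are illustrative, not from the repository.

```python
# Standalone sketch of the NaN check added to the evaluator: load scores.csv
# with its index column and report the offending rows if any value is NaN.
from pathlib import Path

import pandas as pd


def nan_report(scores_path: Path) -> str | None:
    """Return an error message if the scores file contains NaN values, else None."""
    score_df = pd.read_csv(scores_path, index_col=0)
    if score_df.isnull().values.any():
        nan_locations = score_df[score_df.isnull().any(axis=1)]
        return f"[Error] The scores dataframe contains NaN values at the following locations:\n{nan_locations}"
    return None


if __name__ == "__main__":
    # Tiny illustrative scores file (not taken from any competition).
    demo = Path("scores.csv")
    pd.DataFrame({"metric": [0.81, float("nan")]}, index=["model_a", "ensemble"]).to_csv(demo)
    print(nan_report(demo))  # prints the error message because 'ensemble' has a NaN metric
```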

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -166,7 +166,9 @@ def get_runtime_environment(self) -> str:
         return stdout

     def _get_data_folder_description(self) -> str:
-        return describe_data_folder_v2(Path(DS_RD_SETTING.local_data_path) / self.competition)
+        return describe_data_folder_v2(
+            Path(DS_RD_SETTING.local_data_path) / self.competition, show_nan_columns=DS_RD_SETTING.show_nan_columns
+        )


 class KaggleScen(DataScienceScen):

rdagent/scenarios/data_science/scen/utils.py

Lines changed: 11 additions & 5 deletions
@@ -268,7 +268,7 @@ def _walk(path: Path):
         yield p


-def preview_csv(p: Path, file_name: str, simple=True) -> str:
+def preview_csv(p: Path, file_name: str, simple=True, show_nan_columns=False) -> str:
     """Generate a textual preview of a csv file

     Args:
@@ -287,7 +287,7 @@ def preview_csv(p: Path, file_name: str, simple=True) -> str:

     if simple:
         cols = df.columns.tolist()
-        sel_cols = 15
+        sel_cols = min(len(cols), 100)
         cols_str = ", ".join(cols[:sel_cols])
         res = f"The columns are: {cols_str}"
         if len(cols) > sel_cols:
@@ -312,6 +312,10 @@ def preview_csv(p: Path, file_name: str, simple=True) -> str:
             out.append(
                 f"{name} has {df[col].nunique()} unique values. Some example values: {df[col].value_counts().head(4).index.tolist()}"
             )
+    if show_nan_columns:
+        nan_cols = [col for col in df.columns.tolist() if df[col].isnull().any()]
+        if nan_cols:
+            out.append(f"Columns containing NaN values: {', '.join(nan_cols)}")

     return "\n".join(out)

@@ -346,7 +350,7 @@ def preview_json(p: Path, file_name: str):
     return f"-> {file_name} has auto-generated json schema:\n" + builder.to_json(indent=2)


-def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
+def describe_data_folder_v2(base_path, include_file_details=True, simple=False, show_nan_columns=False):
     """
     Generate a textual preview of a directory, including an overview of the directory
     structure and previews of individual files
@@ -359,7 +363,7 @@ def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
         file_name = str(fn.relative_to(base_path))

         if fn.suffix == ".csv":
-            out.append(preview_csv(fn, file_name, simple=simple))
+            out.append(preview_csv(fn, file_name, simple=simple, show_nan_columns=show_nan_columns))
         elif fn.suffix == ".json":
             out.append(preview_json(fn, file_name))
         elif fn.suffix in plaintext_files:
@@ -374,7 +378,9 @@ def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
374378

375379
# if the result is very long we generate a simpler version
376380
if len(result) > 6_000 and not simple:
377-
return describe_data_folder_v2(base_path, include_file_details=include_file_details, simple=True)
381+
return describe_data_folder_v2(
382+
base_path, include_file_details=include_file_details, simple=True, show_nan_columns=show_nan_columns
383+
)
378384
# if still too long, we truncate
379385
if len(result) > 6_000 and simple:
380386
return result[:6_000] + "\n... (truncated)"
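
Taken together, the utils.py hunks cap the simple-mode column listing at 100 columns (instead of the former hard-coded 15) and, when `show_nan_columns` is enabled, append the list of columns that contain NaN; `describe_data_folder_v2` threads the flag through, including its simple-mode fallback re-call. The sketch below condenses just those two preview behaviors; it is not the repository's full `preview_csv`, and the row/column summary line is illustrative.

```python
# Condensed sketch of the two preview_csv behaviors touched by this commit:
# the simple-mode column cap (now min(len(cols), 100)) and the optional
# NaN-column listing. Not the repository's full implementation.
from pathlib import Path

import pandas as pd


def preview_csv_sketch(p: Path, file_name: str, simple: bool = True, show_nan_columns: bool = False) -> str:
    df = pd.read_csv(p)
    out = [f"-> {file_name} has {df.shape[0]} rows and {df.shape[1]} columns."]  # illustrative summary line

    if simple:
        cols = df.columns.tolist()
        sel_cols = min(len(cols), 100)  # previously a hard-coded 15
        cols_str = ", ".join(cols[:sel_cols])
        res = f"The columns are: {cols_str}"
        if len(cols) > sel_cols:
            res += f" ... and {len(cols) - sel_cols} more columns"
        out.append(res)

    if show_nan_columns:
        nan_cols = [col for col in df.columns.tolist() if df[col].isnull().any()]
        if nan_cols:
            out.append(f"Columns containing NaN values: {', '.join(nan_cols)}")

    return "\n".join(out)
```

With `sel_cols = min(len(cols), 100)`, wide tabular datasets get a much fuller column listing before truncation, while the 6,000-character guard in `describe_data_folder_v2` still bounds the overall folder description.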
