
Commit 8b2f22c

chore: custom data refine (#864)
* chore: print up to 100 columns in simple mode
* fix: check content for model dump
* chore: add show_nan_columns config
1 parent 2b9427a commit 8b2f22c

4 files changed: +38 -17 lines changed

rdagent/app/data_science/conf.py

Lines changed: 1 addition & 0 deletions
@@ -44,6 +44,7 @@ class DataScienceBasePropSetting(KaggleBasePropSetting):
     rule_base_eval: bool = False
     sample_data: bool = True
     use_raw_description: bool = False
+    show_nan_columns: bool = False

     #### model dump
     enable_model_dump: bool = False
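
The new `show_nan_columns` flag defaults to `False`, so existing configurations keep their current behavior. Below is a minimal sketch of how a boolean field on a pydantic-settings class can be flipped through an environment variable; the standalone `DemoDataScienceSetting` class and the `DS_` prefix are assumptions for illustration, not the actual `KaggleBasePropSetting` wiring.

```python
# Hypothetical, self-contained illustration of toggling a boolean settings flag.
# DemoDataScienceSetting and the DS_ env prefix are assumptions for this sketch;
# the real field lives on DataScienceBasePropSetting(KaggleBasePropSetting).
import os

from pydantic_settings import BaseSettings, SettingsConfigDict


class DemoDataScienceSetting(BaseSettings):
    model_config = SettingsConfigDict(env_prefix="DS_")

    show_nan_columns: bool = False  # mirrors the field added in this commit


if __name__ == "__main__":
    print(DemoDataScienceSetting().show_nan_columns)  # False by default

    os.environ["DS_SHOW_NAN_COLUMNS"] = "true"
    print(DemoDataScienceSetting().show_nan_columns)  # True once the env var is set
```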

rdagent/components/coder/data_science/share/eval.py

Lines changed: 23 additions & 11 deletions
@@ -48,6 +48,18 @@ def evaluate(

         # 2) check the result and stdout after reruning the model.

+        # Read the content of files submission.csv and scores.csv before execution
+        submission_content_before = (
+            (implementation.workspace_path / "submission.csv").read_text()
+            if (implementation.workspace_path / "submission.csv").exists()
+            else None
+        )
+        scores_content_before = (
+            (implementation.workspace_path / "scores.csv").read_text()
+            if (implementation.workspace_path / "scores.csv").exists()
+            else None
+        )
+
         # Remove the files submission.csv and scores.csv
         implementation.execute(env=env, entry=get_clear_ws_cmd(stage="before_inference"))

@@ -70,17 +82,17 @@ def evaluate(
                 final_decision=False,
             )

-        # Read the content of files submission.csv and scores.csv before execution
-        submission_content_before = (
-            (implementation.workspace_path / "submission.csv").read_text()
-            if (implementation.workspace_path / "submission.csv").exists()
-            else None
-        )
-        scores_content_before = (
-            (implementation.workspace_path / "scores.csv").read_text()
-            if (implementation.workspace_path / "scores.csv").exists()
-            else None
-        )
+        # Check if scores contain NaN (values)
+        score_df = pd.read_csv((implementation.workspace_path / "scores.csv"), index_col=0)
+        if score_df.isnull().values.any():
+            nan_locations = score_df[score_df.isnull().any(axis=1)]
+            err_msg = f"\n[Error] The scores dataframe contains NaN values at the following locations:\n{nan_locations}"
+            return CoSTEERSingleFeedback(
+                execution=err_msg,
+                return_checking=err_msg,
+                code=err_msg,
+                final_decision=False,
+            )

         assert submission_content_before is not None
         assert scores_content_before is not None
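
The second hunk removes the post-run re-read (the originals are now captured before the workspace is cleared, per the first hunk) and instead exits early when `scores.csv` contains NaN. Below is a standalone sketch of that check using the same pandas calls; the helper name, file layout, and demo data are illustrative, not from the repository.

```python
# Standalone sketch of the NaN check added to the evaluator: load scores.csv
# with its index column and report the offending rows if any value is NaN.
from pathlib import Path

import pandas as pd


def nan_report(scores_path: Path) -> str | None:
    """Return an error message if the scores file contains NaN values, else None."""
    score_df = pd.read_csv(scores_path, index_col=0)
    if score_df.isnull().values.any():
        nan_locations = score_df[score_df.isnull().any(axis=1)]
        return f"[Error] The scores dataframe contains NaN values at the following locations:\n{nan_locations}"
    return None


if __name__ == "__main__":
    # Tiny illustrative scores file (not taken from any competition).
    demo = Path("scores.csv")
    pd.DataFrame({"metric": [0.81, float("nan")]}, index=["model_a", "ensemble"]).to_csv(demo)
    print(nan_report(demo))  # prints the error message because 'ensemble' has a NaN metric
```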

rdagent/scenarios/data_science/scen/__init__.py

Lines changed: 3 additions & 1 deletion
@@ -166,7 +166,9 @@ def get_runtime_environment(self) -> str:
         return stdout

     def _get_data_folder_description(self) -> str:
-        return describe_data_folder_v2(Path(DS_RD_SETTING.local_data_path) / self.competition)
+        return describe_data_folder_v2(
+            Path(DS_RD_SETTING.local_data_path) / self.competition, show_nan_columns=DS_RD_SETTING.show_nan_columns
+        )


 class KaggleScen(DataScienceScen):

rdagent/scenarios/data_science/scen/utils.py

Lines changed: 11 additions & 5 deletions
@@ -268,7 +268,7 @@ def _walk(path: Path):
         yield p


-def preview_csv(p: Path, file_name: str, simple=True) -> str:
+def preview_csv(p: Path, file_name: str, simple=True, show_nan_columns=False) -> str:
     """Generate a textual preview of a csv file

     Args:
@@ -287,7 +287,7 @@ def preview_csv(p: Path, file_name: str, simple=True) -> str:

     if simple:
         cols = df.columns.tolist()
-        sel_cols = 15
+        sel_cols = min(len(cols), 100)
         cols_str = ", ".join(cols[:sel_cols])
         res = f"The columns are: {cols_str}"
         if len(cols) > sel_cols:
@@ -312,6 +312,10 @@ def preview_csv(p: Path, file_name: str, simple=True) -> str:
             out.append(
                 f"{name} has {df[col].nunique()} unique values. Some example values: {df[col].value_counts().head(4).index.tolist()}"
             )
+    if show_nan_columns:
+        nan_cols = [col for col in df.columns.tolist() if df[col].isnull().any()]
+        if nan_cols:
+            out.append(f"Columns containing NaN values: {', '.join(nan_cols)}")

     return "\n".join(out)

@@ -346,7 +350,7 @@ def preview_json(p: Path, file_name: str):
     return f"-> {file_name} has auto-generated json schema:\n" + builder.to_json(indent=2)


-def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
+def describe_data_folder_v2(base_path, include_file_details=True, simple=False, show_nan_columns=False):
     """
     Generate a textual preview of a directory, including an overview of the directory
     structure and previews of individual files
@@ -359,7 +363,7 @@ def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
         file_name = str(fn.relative_to(base_path))

         if fn.suffix == ".csv":
-            out.append(preview_csv(fn, file_name, simple=simple))
+            out.append(preview_csv(fn, file_name, simple=simple, show_nan_columns=show_nan_columns))
         elif fn.suffix == ".json":
             out.append(preview_json(fn, file_name))
         elif fn.suffix in plaintext_files:
@@ -374,7 +378,9 @@ def describe_data_folder_v2(base_path, include_file_details=True, simple=False):
374378

375379
# if the result is very long we generate a simpler version
376380
if len(result) > 6_000 and not simple:
377-
return describe_data_folder_v2(base_path, include_file_details=include_file_details, simple=True)
381+
return describe_data_folder_v2(
382+
base_path, include_file_details=include_file_details, simple=True, show_nan_columns=show_nan_columns
383+
)
378384
# if still too long, we truncate
379385
if len(result) > 6_000 and simple:
380386
return result[:6_000] + "\n... (truncated)"
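
Taken together, the utils.py hunks cap the simple-mode column listing at 100 columns (instead of the former hard-coded 15) and, when `show_nan_columns` is enabled, append the list of columns that contain NaN; `describe_data_folder_v2` threads the flag through, including its simple-mode fallback re-call. The sketch below condenses just those two preview behaviors; it is not the repository's full `preview_csv`, and the row/column summary line is illustrative.

```python
# Condensed sketch of the two preview_csv behaviors touched by this commit:
# the simple-mode column cap (now min(len(cols), 100)) and the optional
# NaN-column listing. Not the repository's full implementation.
from pathlib import Path

import pandas as pd


def preview_csv_sketch(p: Path, file_name: str, simple: bool = True, show_nan_columns: bool = False) -> str:
    df = pd.read_csv(p)
    out = [f"-> {file_name} has {df.shape[0]} rows and {df.shape[1]} columns."]  # illustrative summary line

    if simple:
        cols = df.columns.tolist()
        sel_cols = min(len(cols), 100)  # previously a hard-coded 15
        cols_str = ", ".join(cols[:sel_cols])
        res = f"The columns are: {cols_str}"
        if len(cols) > sel_cols:
            res += f" ... and {len(cols) - sel_cols} more columns"
        out.append(res)

    if show_nan_columns:
        nan_cols = [col for col in df.columns.tolist() if df[col].isnull().any()]
        if nan_cols:
            out.append(f"Columns containing NaN values: {', '.join(nan_cols)}")

    return "\n".join(out)
```

With `sel_cols = min(len(cols), 100)`, wide tabular datasets get a much fuller column listing before truncation, while the 6,000-character guard in `describe_data_folder_v2` still bounds the overall folder description.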
