Skip to content

Commit f434315

Browse files
authored
✨ feat: support easy task aggr (#228)
1 parent a684e7a commit f434315

File tree

2 files changed

+22
-10
lines changed

2 files changed

+22
-10
lines changed

src/aggregators/aggregate_results.py

Lines changed: 22 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -84,6 +84,20 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
8484
results = defaultdict(lambda: defaultdict(lambda: defaultdict(dict)))
8585

8686
# Current layout: results/<exp>/<model>__<service>/run-N/<category>__<task>/
87+
# Some pipelines include a task-set suffix in the service directory name (e.g., "filesystem-easy").
88+
# Normalize such names back to the canonical service keys used by tasks/: filesystem, github, notion, playwright, postgres.
89+
90+
def normalize_service_name(name: str) -> str:
    """Return the canonical service key for a service directory name.

    A single trailing task-set suffix ("-easy" or "-standard") is removed,
    then known variant names are mapped onto their canonical service.

    Args:
        name: Service part of a "<model>__<service>" results directory name,
            for example "filesystem-easy" or "playwright_webarena".

    Returns:
        The name without its task-set suffix, with variant names replaced by
        the canonical service key. Names that match neither rule are returned
        unchanged.
    """
    task_set_suffixes = ("-easy", "-standard")
    # Variant service names that should be reported under another service.
    service_aliases = {"playwright_webarena": "playwright"}

    base = name
    if name.endswith(task_set_suffixes):
        # Neither suffix contains a second "-", so splitting on the last "-"
        # removes exactly the suffix and nothing else.
        base = name.rsplit("-", 1)[0]

    return service_aliases.get(base, base)
87101
for model_service_dir in exp_dir.iterdir():
88102
if not model_service_dir.is_dir() or "__" not in model_service_dir.name:
89103
continue
@@ -944,13 +958,15 @@ def main():
944958
# Print validation report with summary table
945959
print_validation_report(complete_models, incomplete_models, invalid_models,
946960
all_tasks, args.k, single_run_models, results)
947-
948-
if not complete_models:
961+
962+
# Determine which models to include in output (strict: only complete models)
963+
models_for_output = dict(complete_models)
964+
if not models_for_output:
949965
return 1
950966

951967
# Calculate metrics
952968
print("\n📊 Calculating metrics...")
953-
summary = calculate_metrics(complete_models, all_tasks, args.k, single_run_models)
969+
summary = calculate_metrics(models_for_output, all_tasks, args.k, single_run_models)
954970
summary["experiment_name"] = args.exp_name
955971
summary["task_set"] = args.task_set
956972

@@ -962,12 +978,12 @@ def main():
962978

963979
# Generate model_results
964980
print("📁 Generating model_results...")
965-
generate_model_results(exp_dir, complete_models, all_tasks)
966-
print(f" Created {len(complete_models)} model directories")
981+
generate_model_results(exp_dir, models_for_output, all_tasks)
982+
print(f" Created {len(models_for_output)} model directories")
967983

968984
# Generate task_results
969985
print("📁 Generating task_results...")
970-
generate_task_results(exp_dir, complete_models, all_tasks)
986+
generate_task_results(exp_dir, models_for_output, all_tasks)
971987
print(f" Created {total_tasks} task files")
972988

973989
# Generate README

src/errors.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,14 +46,10 @@ def standardize_error_message(error: str, mcp_service: Optional[str] = None) ->
4646
"connection refused" in error_str.lower() or "econnrefused" in error_str.lower()
4747
):
4848
base_msg = "Connection refused"
49-
elif "authentication" in error_str.lower() or "unauthorized" in error_str.lower():
50-
base_msg = "Authentication failed"
5149
elif "not found" in error_str.lower():
5250
base_msg = "Resource not found"
5351
elif "already exists" in error_str.lower():
5452
base_msg = "Resource already exists"
55-
elif "mcp" in error_str.lower() and "error" in error_str.lower():
56-
base_msg = "MCP service error"
5753
else:
5854
# Return original message if no standardization applies
5955
return error_str

0 commit comments

Comments
 (0)