@@ -84,6 +84,20 @@ def collect_results(exp_dir: Path, k: int) -> Dict[str, Dict[str, Any]]:
8484 results = defaultdict (lambda : defaultdict (lambda : defaultdict (dict )))
8585
8686 # Current layout: results/<exp>/<model>__<service>/run-N/<category>__<task>/
87+ # Some pipelines append a task-set suffix to the service directory name (e.g., "filesystem-easy").
88+ # Normalize such names back to the canonical service keys used by tasks/ (filesystem, github, notion, playwright, postgres).
89+
def normalize_service_name(name: str) -> str:
    """Return the canonical service key for a service directory name.

    A trailing task-set suffix ("-easy" or "-standard") is removed, and the
    "playwright_webarena" variant is reported as "playwright". Any other
    name is returned unchanged.

    Args:
        name: Service part of a ``<model>__<service>`` directory name,
            e.g. "filesystem-easy".

    Returns:
        The normalized service name, e.g. "filesystem".
    """
    # Task-set suffixes that some pipelines append to the service directory.
    task_set_suffixes = ("-easy", "-standard")

    # str.endswith accepts a tuple, so one call covers every known suffix.
    if name.endswith(task_set_suffixes):
        # Neither suffix contains another "-", so dropping everything after
        # the last "-" removes exactly the suffix.
        base = name.rsplit("-", 1)[0]
    else:
        base = name

    # Map variant names to the canonical service.
    if base == "playwright_webarena":
        return "playwright"
    return base
87101 for model_service_dir in exp_dir .iterdir ():
88102 if not model_service_dir .is_dir () or "__" not in model_service_dir .name :
89103 continue
@@ -944,13 +958,15 @@ def main():
944958 # Print validation report with summary table
945959 print_validation_report (complete_models , incomplete_models , invalid_models ,
946960 all_tasks , args .k , single_run_models , results )
947-
948- if not complete_models :
961+
962+ # Determine which models to include in output (strict: only complete models)
963+ models_for_output = dict (complete_models )
964+ if not models_for_output :
949965 return 1
950966
951967 # Calculate metrics
952968 print ("\n 📊 Calculating metrics..." )
953- summary = calculate_metrics (complete_models , all_tasks , args .k , single_run_models )
969+ summary = calculate_metrics (models_for_output , all_tasks , args .k , single_run_models )
954970 summary ["experiment_name" ] = args .exp_name
955971 summary ["task_set" ] = args .task_set
956972
@@ -962,12 +978,12 @@ def main():
962978
963979 # Generate model_results
964980 print ("📁 Generating model_results..." )
965- generate_model_results (exp_dir , complete_models , all_tasks )
966- print (f" Created { len (complete_models )} model directories" )
981+ generate_model_results (exp_dir , models_for_output , all_tasks )
982+ print (f" Created { len (models_for_output )} model directories" )
967983
968984 # Generate task_results
969985 print ("📁 Generating task_results..." )
970- generate_task_results (exp_dir , complete_models , all_tasks )
986+ generate_task_results (exp_dir , models_for_output , all_tasks )
971987 print (f" Created { total_tasks } task files" )
972988
973989 # Generate README
0 commit comments