
Commit 379db26

[BFCL] Improve Warning Message when Aggregating Results (#517)
As mentioned in #506, this PR makes the warning messages more informative so that users know what actions to take when aggregating leaderboard results.

Co-authored-by: CharlieJCJ <[email protected]>
1 parent 453f71c commit 379db26

2 files changed: +124, -12 lines changed

berkeley-function-call-leaderboard/eval_checker/eval_runner.py
Lines changed: 1 addition & 1 deletion

@@ -404,7 +404,7 @@ def runner(model_names, test_categories, api_sanity_check):
     # This is helpful when you only want to run the evaluation for a subset of models and test categories.
     update_leaderboard_table_with_score_file(LEADERBOARD_TABLE, OUTPUT_PATH)
     # Write the leaderboard table to a file
-    generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH)
+    generate_leaderboard_csv(LEADERBOARD_TABLE, OUTPUT_PATH, model_names, test_categories)

     # Clean up the executable expected output files
     # They should be re-generated the next time the evaluation is run
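
Since the two new arguments default to None, existing callers of generate_leaderboard_csv keep working unchanged, and the category-completeness check only runs when a model list is passed in. A minimal sketch of both call styles follows; the import path and the placeholder values are assumptions for illustration, not taken from the repository:

# Minimal sketch, assuming eval_runner_helper is importable from the working directory.
from eval_runner_helper import generate_leaderboard_csv

leaderboard_table = {}                        # normally filled by update_leaderboard_table_with_score_file
output_path = "../score/"                     # score-folder path, assumed for illustration
model_names = ["gorilla-openfunctions-v2"]    # example model subset
test_categories = ["simple", "rest"]          # example category subset

# Old-style call: aggregates scores only, no completeness warnings.
generate_leaderboard_csv(leaderboard_table, output_path)

# New-style call: additionally reports categories that are missing or unevaluated
# for the listed models, with suggested commands to fill the gaps.
generate_leaderboard_csv(leaderboard_table, output_path, model_names, test_categories)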

berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py
Lines changed: 123 additions & 11 deletions

@@ -615,6 +615,8 @@
 # Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/
 V100_x8_PRICE_PER_HOUR = 22.032

+RED_FONT = "\033[91m"
+RESET = "\033[0m"

 def extract_after_test(input_string):
     parts = input_string.split("_test_")[1].split("_result")[0].split(".json")[0]

@@ -792,10 +794,7 @@ def display_api_status_error(rest_error, executable_error, display_success=False
     if display_success:
         print("🟢 All API Status Test Passed!")
         return None
-
-    RED_FONT = "\033[91m"
-    RESET = "\033[0m"
-
+
     print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n")

     if rest_error:

@@ -953,7 +952,10 @@ def get_metric(model_name, cost_data, latency_data):
     return cost, mean_latency, std_latency, percentile_95_latency


-def generate_leaderboard_csv(leaderboard_table, output_path):
+def generate_leaderboard_csv(
+    leaderboard_table, output_path, eval_models=None, eval_categories=None
+):
+    print("📈 Aggregating data to generate leaderboard score table...")
     data = []
     for model_name, value in leaderboard_table.items():
         model_name_escaped = model_name.replace("_", "/")

@@ -1027,12 +1029,6 @@ def generate_leaderboard_csv(leaderboard_table, output_path):
             model_name_escaped, cost_data, latency_data
         )

-        if overall_accuracy["total_count"] != 1700:
-            print("-" * 100)
-            print(
-                f"❗️Warning: Total count for {model_name} is {overall_accuracy['total_count']}"
-            )
-
         data.append(
             [
                 "N/A",

@@ -1083,6 +1079,122 @@ def generate_leaderboard_csv(leaderboard_table, output_path):
         else:
             f.write(",".join(row))

+    if eval_models:
+        category_status = check_model_category_status(score_path=output_path)
+        check_all_category_present(
+            category_status, eval_models=eval_models, eval_categories=eval_categories
+        )
+
+
+def check_model_category_status(score_path):
+    result_path = score_path.replace("score", "result")
+
+    leaderboard_categories = [
+        "simple",
+        "multiple_function",
+        "parallel_function",
+        "parallel_multiple_function",
+        "executable_simple",
+        "executable_multiple_function",
+        "executable_parallel_function",
+        "executable_parallel_multiple_function",
+        "java",
+        "javascript",
+        "rest",
+        "relevance",
+    ]
+
+    category_status = {}
+
+    # Check for all models in MODEL_METADATA_MAPPING
+    for model_name in MODEL_METADATA_MAPPING.keys():
+        category_status[model_name] = {
+            category: {"generated": False, "evaluated": False}
+            for category in leaderboard_categories
+        }
+
+        # Check result folder
+        result_subdir = os.path.join(result_path, model_name)
+        if os.path.exists(result_subdir):
+            for result_file in os.listdir(result_subdir):
+                test_category = result_file.split("_test_")[1].split("_result")[0]
+                if test_category in category_status[model_name]:
+                    category_status[model_name][test_category]["generated"] = True
+
+        # Check score folder
+        score_subdir = os.path.join(score_path, model_name)
+        if os.path.exists(score_subdir):
+            for score_file in os.listdir(score_subdir):
+                test_category = score_file.split("_score.json")[0]
+                if test_category in category_status[model_name]:
+                    category_status[model_name][test_category]["evaluated"] = True
+
+    return category_status
+
+
+def check_all_category_present(category_status, eval_models=None, eval_categories=None):
+    found_issues = False
+    first_time = True
+    commands = []
+
+    for model_name, categories in category_status.items():
+        if eval_models and model_name not in eval_models:
+            continue
+
+        not_generated = [
+            cat
+            for cat, status in categories.items()
+            if not status["generated"]
+            and (not eval_categories or cat in eval_categories)
+        ]
+        not_evaluated = [
+            cat
+            for cat, status in categories.items()
+            if not status["evaluated"]
+            and (not eval_categories or cat in eval_categories)
+        ]
+
+        if not_generated or not_evaluated:
+            found_issues = True
+            if first_time:
+                print(f"We are checking models: {eval_models} and categories: {eval_categories}")
+                print(f"\n{RED_FONT}{'=' * 30} Model Category Status {'=' * 30}{RESET}")
+                first_time = False
+
+            print(f"{RED_FONT}Model: {model_name}{RESET}")
+            if not_generated:
+                print(f"\n Missing results for {len(not_generated)} categories:")
+                for cat in not_generated:
+                    print(f" - {cat}")
+                commands.append("cd ..")
+                commands.append(
+                    f"python openfunctions_evaluation.py --model {model_name} --test-category {' '.join(not_generated)}"
+                )
+
+            if not_evaluated:
+                print(f"\n Unevaluated results for {len(not_evaluated)} categories:")
+                for cat in not_evaluated:
+                    print(f" - {cat}")
+
+            all_categories = set(not_generated + not_evaluated)
+            if all_categories:
+                commands.append("cd eval_checker")
+                commands.append(
+                    f"python eval_runner.py --model {model_name} --test-category {' '.join(all_categories)}"
+                )
+
+    if found_issues:
+        print(f"\n{RED_FONT}{'=' * 40} Recommended Actions {'=' * 40}{RESET}\n")
+        print(
+            "To address these issues, run the following commands from the current directory:"
+        )
+        print("\n" + " && \\\n".join(commands))
+        print(f"\n{RED_FONT}{'=' * 100}{RESET}\n")
+    else:
+        print("🎉 All categories are present and evaluated for all models!\n")
+
+    return found_issues
+

 def update_leaderboard_table_with_score_file(leaderboard_table, score_path):

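The two new helpers can also be exercised on their own when inspecting a score folder. Below is a minimal standalone sketch, assuming it is run from the eval_checker directory; the score path and the model/category subsets are illustrative assumptions:

# Minimal sketch, run from eval_checker/; the path and subsets below are assumptions.
from eval_runner_helper import check_model_category_status, check_all_category_present

score_path = "../score/"                       # assumed score-folder location
status = check_model_category_status(score_path=score_path)

# Limit the report to the models/categories that were just evaluated.
# Prints the missing/unevaluated categories plus the recommended
# openfunctions_evaluation.py and eval_runner.py commands, and returns
# True when anything still needs attention.
has_issues = check_all_category_present(
    status,
    eval_models=["gorilla-openfunctions-v2"],  # example subset
    eval_categories=["simple", "rest"],        # example subset
)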