615 | 615 | # Reference: https://azure.microsoft.com/en-us/pricing/details/machine-learning/
616 | 616 | V100_x8_PRICE_PER_HOUR = 22.032
617 | 617 |
| 618 | +RED_FONT = "\033[91m" |
| 619 | +RESET = "\033[0m" |
618 | 620 |
619 | 621 | def extract_after_test(input_string):
620 | 622 | parts = input_string.split("_test_")[1].split("_result")[0].split(".json")[0]

@@ -792,10 +794,7 @@ def display_api_status_error(rest_error, executable_error, display_success=False
792 | 794 | if display_success:
793 | 795 | print("🟢 All API Status Test Passed!")
794 | 796 | return None
795 | | - |
796 | | - RED_FONT = "\033[91m" |
797 | | - RESET = "\033[0m" |
798 | | - |
| 797 | + |
799 | 798 | print(f"\n{RED_FONT}{'-' * 18} Executable Categories' Error Bounds Based on API Health Status {'-' * 18}{RESET}\n")
800 | 799 |
801 | 800 | if rest_error:

@@ -953,7 +952,10 @@ def get_metric(model_name, cost_data, latency_data):
953 | 952 | return cost, mean_latency, std_latency, percentile_95_latency
954 | 953 |
955 | 954 |
956 | | -def generate_leaderboard_csv(leaderboard_table, output_path): |
| 955 | +def generate_leaderboard_csv( |
| 956 | + leaderboard_table, output_path, eval_models=None, eval_categories=None |
| 957 | +): |
| 958 | + print("📈 Aggregating data to generate leaderboard score table...") |
957 | 959 | data = []
958 | 960 | for model_name, value in leaderboard_table.items():
959 | 961 | model_name_escaped = model_name.replace("_", "/")

@@ -1027,12 +1029,6 @@ def generate_leaderboard_csv(leaderboard_table, output_path):
1027 | 1029 | model_name_escaped, cost_data, latency_data
1028 | 1030 | )
1029 | 1031 |
1030 | | - if overall_accuracy["total_count"] != 1700: |
1031 | | - print("-" * 100) |
1032 | | - print( |
1033 | | - f"❗️Warning: Total count for {model_name} is {overall_accuracy['total_count']}" |
1034 | | - ) |
1035 | | - |
1036 | 1032 | data.append(
1037 | 1033 | [
1038 | 1034 | "N/A",

@@ -1083,6 +1079,122 @@ def generate_leaderboard_csv(leaderboard_table, output_path):
1083 | 1079 | else:
1084 | 1080 | f.write(",".join(row))
1085 | 1081 |
| 1082 | + if eval_models: |
| 1083 | + category_status = check_model_category_status(score_path=output_path) |
| 1084 | + check_all_category_present( |
| 1085 | + category_status, eval_models=eval_models, eval_categories=eval_categories |
| 1086 | + ) |
| 1087 | + |
| 1088 | + |
| 1089 | +def check_model_category_status(score_path): |
| 1090 | + result_path = score_path.replace("score", "result") |
| 1091 | + |
| 1092 | + leaderboard_categories = [ |
| 1093 | + "simple", |
| 1094 | + "multiple_function", |
| 1095 | + "parallel_function", |
| 1096 | + "parallel_multiple_function", |
| 1097 | + "executable_simple", |
| 1098 | + "executable_multiple_function", |
| 1099 | + "executable_parallel_function", |
| 1100 | + "executable_parallel_multiple_function", |
| 1101 | + "java", |
| 1102 | + "javascript", |
| 1103 | + "rest", |
| 1104 | + "relevance", |
| 1105 | + ] |
| 1106 | + |
| 1107 | + category_status = {} |
| 1108 | + |
| 1109 | + # Check for all models in MODEL_METADATA_MAPPING |
| 1110 | + for model_name in MODEL_METADATA_MAPPING.keys(): |
| 1111 | + category_status[model_name] = { |
| 1112 | + category: {"generated": False, "evaluated": False} |
| 1113 | + for category in leaderboard_categories |
| 1114 | + } |
| 1115 | + |
| 1116 | + # Check result folder |
| 1117 | + result_subdir = os.path.join(result_path, model_name) |
| 1118 | + if os.path.exists(result_subdir): |
| 1119 | + for result_file in os.listdir(result_subdir): |
| 1120 | + test_category = result_file.split("_test_")[1].split("_result")[0] |
| 1121 | + if test_category in category_status[model_name]: |
| 1122 | + category_status[model_name][test_category]["generated"] = True |
| 1123 | + |
| 1124 | + # Check score folder |
| 1125 | + score_subdir = os.path.join(score_path, model_name) |
| 1126 | + if os.path.exists(score_subdir): |
| 1127 | + for score_file in os.listdir(score_subdir): |
| 1128 | + test_category = score_file.split("_score.json")[0] |
| 1129 | + if test_category in category_status[model_name]: |
| 1130 | + category_status[model_name][test_category]["evaluated"] = True |
| 1131 | + |
| 1132 | + return category_status |
| 1133 | + |
| 1134 | + |
| 1135 | +def check_all_category_present(category_status, eval_models=None, eval_categories=None): |
| 1136 | + found_issues = False |
| 1137 | + first_time = True |
| 1138 | + commands = [] |
| 1139 | + |
| 1140 | + for model_name, categories in category_status.items(): |
| 1141 | + if eval_models and model_name not in eval_models: |
| 1142 | + continue |
| 1143 | + |
| 1144 | + not_generated = [ |
| 1145 | + cat |
| 1146 | + for cat, status in categories.items() |
| 1147 | + if not status["generated"] |
| 1148 | + and (not eval_categories or cat in eval_categories) |
| 1149 | + ] |
| 1150 | + not_evaluated = [ |
| 1151 | + cat |
| 1152 | + for cat, status in categories.items() |
| 1153 | + if not status["evaluated"] |
| 1154 | + and (not eval_categories or cat in eval_categories) |
| 1155 | + ] |
| 1156 | + |
| 1157 | + if not_generated or not_evaluated: |
| 1158 | + found_issues = True |
| 1159 | + if first_time: |
| 1160 | + print(f"We are checking models: {eval_models} and categories: {eval_categories}") |
| 1161 | + print(f"\n{RED_FONT}{'=' * 30} Model Category Status {'=' * 30}{RESET}") |
| 1162 | + first_time = False |
| 1163 | + |
| 1164 | + print(f"{RED_FONT}Model: {model_name}{RESET}") |
| 1165 | + if not_generated: |
| 1166 | + print(f"\n Missing results for {len(not_generated)} categories:") |
| 1167 | + for cat in not_generated: |
| 1168 | + print(f" - {cat}") |
| 1169 | + commands.append("cd ..") |
| 1170 | + commands.append( |
| 1171 | + f"python openfunctions_evaluation.py --model {model_name} --test-category {' '.join(not_generated)}" |
| 1172 | + ) |
| 1173 | + |
| 1174 | + if not_evaluated: |
| 1175 | + print(f"\n Unevaluated results for {len(not_evaluated)} categories:") |
| 1176 | + for cat in not_evaluated: |
| 1177 | + print(f" - {cat}") |
| 1178 | + |
| 1179 | + all_categories = set(not_generated + not_evaluated) |
| 1180 | + if all_categories: |
| 1181 | + commands.append("cd eval_checker") |
| 1182 | + commands.append( |
| 1183 | + f"python eval_runner.py --model {model_name} --test-category {' '.join(all_categories)}" |
| 1184 | + ) |
| 1185 | + |
| 1186 | + if found_issues: |
| 1187 | + print(f"\n{RED_FONT}{'=' * 40} Recommended Actions {'=' * 40}{RESET}\n") |
| 1188 | + print( |
| 1189 | + "To address these issues, run the following commands from the current directory:" |
| 1190 | + ) |
| 1191 | + print("\n" + " && \\\n".join(commands)) |
| 1192 | + print(f"\n{RED_FONT}{'=' * 100}{RESET}\n") |
| 1193 | + else: |
| 1194 | + print("🎉 All categories are present and evaluated for all models!\n") |
| 1195 | + |
| 1196 | + return found_issues |
| 1197 | + |
1086 | 1198 |
1087 | 1199 | def update_leaderboard_table_with_score_file(leaderboard_table, score_path):
1088 | 1200 |
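
For reference, here is a minimal sketch of how the completeness check added in this commit can be driven end to end. The module name (`eval_runner_helper`), folder paths, and model/category values below are illustrative assumptions, not part of the diff; the helpers only rely on sibling `score/<model>/` and `result/<model>/` folders, per `score_path.replace("score", "result")`.

```python
# Hypothetical driver (illustrative sketch; not part of this commit).
# Assumes the new helpers are importable from a module named eval_runner_helper
# and that scores/results live under sibling "score/<model>/" and "result/<model>/"
# folders, as implied by score_path.replace("score", "result").
from eval_runner_helper import (
    check_all_category_present,
    check_model_category_status,
)

# Scan the result and score folders for every model in MODEL_METADATA_MAPPING and
# record, per leaderboard category, whether results were generated and evaluated.
status = check_model_category_status(score_path="./score/")

# Limit the report to the models/categories this run targeted; any gaps are printed
# in red along with the regeneration commands, and True is returned if found.
incomplete = check_all_category_present(
    status,
    eval_models=["gorilla-openfunctions-v2"],  # illustrative model key
    eval_categories=["simple", "rest"],        # illustrative categories
)
if incomplete:
    raise SystemExit(1)
```

With the new signature, `generate_leaderboard_csv` runs this same check automatically after writing the CSV whenever `eval_models` is passed.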