Commit a739bd0 (1 parent: 4c243b8)

3 files changed: +130 additions, -114 deletions

berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ def multi_threaded_inference(
             )
             break  # Success, exit the loop
         except Exception as e:
-            raise e
+            # raise e
             if retry_count < RETRY_LIMIT and (
                 "rate limit reached" in str(e).lower()
                 or (hasattr(e, "status_code") and (e.status_code in {429, 503, 500}))
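
With `raise e` commented out, an exception now falls through to the rate-limit check below it, so transient endpoint failures can be retried instead of killing the worker thread. A minimal standalone sketch of that retry pattern (the RETRY_LIMIT value, the backoff delay, and `call_endpoint` are illustrative assumptions, not the exact BFCL implementation):

import time

RETRY_LIMIT = 3  # assumed value; the real constant lives elsewhere in the BFCL codebase


def call_with_retry(call_endpoint, *args, **kwargs):
    """Retry an endpoint call on rate-limit style errors (illustrative sketch)."""
    retry_count = 0
    while True:
        try:
            return call_endpoint(*args, **kwargs)  # success, exit the loop
        except Exception as e:
            retriable = "rate limit reached" in str(e).lower() or (
                hasattr(e, "status_code") and e.status_code in {429, 503, 500}
            )
            if retry_count < RETRY_LIMIT and retriable:
                retry_count += 1
                time.sleep(2 ** retry_count)  # assumed exponential backoff
            else:
                raise  # give up: non-retriable error or retry budget exhausted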

berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py

Lines changed: 67 additions & 67 deletions
@@ -482,73 +482,6 @@ def ast_file_runner(
 
 
 #### Main runner function ####
-def runner(model_names, test_categories, result_dir, score_dir):
-
-    # State udpated by each eval subtask.
-    state = dict(
-        # A dictionary to store the evaluation scores.
-        # Key is model name, value is a dictionary with keys as test category
-        # and values as a dictionary with accuracy and total count.
-        leaderboard_table={},
-    )
-
-    # Get a list of all entries in the folder
-    entries = result_dir.iterdir()
-
-    # Filter out the subdirectories
-    subdirs = [entry for entry in entries if entry.is_dir()]
-
-    # Traverse each subdirectory
-    for subdir in tqdm(subdirs, desc="Number of models evaluated"):
-
-        model_name = subdir.relative_to(result_dir).name
-        if model_names is not None and model_name not in model_names:
-            continue
-
-        model_name_escaped = model_name.replace("_", "/")
-
-        print(f"🦍 Model: {model_name}")
-
-        # Find and process all JSON files in the subdirectory
-        for model_result_json in subdir.glob("*.json"):
-            test_category = extract_test_category(model_result_json)
-            if test_category not in test_categories:
-                continue
-
-            handler = get_handler(model_name_escaped)
-
-            # We don't evaluate chatable and SQL models in our current
-            # leaderboard.
-            if (
-                is_chatable(test_category)
-                or is_executable(test_category)
-                or is_memory_prereq(test_category)
-                or "conflict" in test_category
-            ):
-                continue
-
-            model_result = load_file(model_result_json, sort_by_id=True)
-
-            state = evaluate_task(
-                test_category,
-                result_dir,
-                score_dir,
-                model_result,
-                model_name,
-                handler,
-                state,
-            )
-
-    # This function reads all the score files from local folder and updates the
-    # leaderboard table. This is helpful when you only want to run the
-    # evaluation for a subset of models and test categories.
-    update_leaderboard_table_with_local_score_file(state["leaderboard_table"], score_dir)
-    # Write the leaderboard table to a file
-    generate_leaderboard_csv(
-        state["leaderboard_table"], score_dir, model_names, test_categories
-    )
-
-
 def evaluate_task(
     test_category,
     result_dir,
@@ -623,6 +556,73 @@ def evaluate_task(
     return state
 
 
+def runner(model_names, test_categories, result_dir, score_dir):
+
+    # State udpated by each eval subtask.
+    state = dict(
+        # A dictionary to store the evaluation scores.
+        # Key is model name, value is a dictionary with keys as test category
+        # and values as a dictionary with accuracy and total count.
+        leaderboard_table={},
+    )
+
+    # Get a list of all entries in the folder
+    entries = result_dir.iterdir()
+
+    # Filter out the subdirectories
+    subdirs = [entry for entry in entries if entry.is_dir()]
+
+    # Traverse each subdirectory
+    for subdir in tqdm(subdirs, desc="Number of models evaluated"):
+
+        model_name = subdir.relative_to(result_dir).name
+        if model_names is not None and model_name not in model_names:
+            continue
+
+        model_name_escaped = model_name.replace("_", "/")
+
+        print(f"🦍 Model: {model_name}")
+
+        # Find and process all JSON files in the subdirectory
+        for model_result_json in subdir.glob("*.json"):
+            test_category = extract_test_category(model_result_json)
+            if test_category not in test_categories:
+                continue
+
+            handler = get_handler(model_name_escaped)
+
+            # We don't evaluate chatable and SQL models in our current
+            # leaderboard.
+            if (
+                is_chatable(test_category)
+                or is_executable(test_category)
+                or is_memory_prereq(test_category)
+                or "conflict" in test_category
+            ):
+                continue
+
+            model_result = load_file(model_result_json, sort_by_id=True)
+
+            state = evaluate_task(
+                test_category,
+                result_dir,
+                score_dir,
+                model_result,
+                model_name,
+                handler,
+                state,
+            )
+
+    # This function reads all the score files from local folder and updates the
+    # leaderboard table. This is helpful when you only want to run the
+    # evaluation for a subset of models and test categories.
+    update_leaderboard_table_with_local_score_file(state["leaderboard_table"], score_dir)
+    # Write the leaderboard table to a file
+    generate_leaderboard_csv(
+        state["leaderboard_table"], score_dir, model_names, test_categories
+    )
+
+
 def main(model, test_categories, result_dir, score_dir):
     if result_dir is None:
         result_dir = RESULT_PATH
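
This hunk only relocates runner below evaluate_task; its body is unchanged. For orientation, a hedged sketch of how it is typically invoked (the folder layout, model name, and category names below are assumptions for illustration, not values taken from this diff):

from pathlib import Path

from bfcl.eval_checker.eval_runner import runner

model_names = ["gorilla-openfunctions-v2"]   # one subdirectory of result_dir per model (assumed name)
test_categories = ["simple", "multiple"]     # concrete category names, not collection names (assumed)
result_dir = Path("./result")                # assumed local result folder
score_dir = Path("./score")                  # assumed local score folder

# runner walks result_dir/<model>/*.json, skips categories it does not evaluate,
# calls evaluate_task(...) for each result file, then refreshes the leaderboard
# table from local score files and writes the CSV via generate_leaderboard_csv(...).
runner(model_names, test_categories, result_dir, score_dir)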

berkeley-function-call-leaderboard/bfcl/utils.py

Lines changed: 62 additions & 46 deletions
@@ -22,6 +22,9 @@
 from bfcl.constants.executable_backend_config import MULTI_TURN_FUNC_DOC_FILE_MAPPING
 
 
+#### Helper functions to extract/parse/complete test category from different formats ####
+
+
 def extract_test_category(input_string: Union[str, Path]) -> str:
     """
     Extract the test category from a given file name.
@@ -51,6 +54,18 @@ def extract_test_category_from_id(test_entry_id: str, remove_prereq: bool = Fals
     return test_entry_id.rsplit("_", 1)[0]
 
 
+def extract_memory_backend_type(test_category):
+    """
+    This function extracts the memory backend type from the test category.
+    The test category should be in the form of `memory_kv` or `memory_knowledge_graph`, etc.
+    """
+    if not is_memory(test_category):
+        raise ValueError(f"Test category {test_category} is not a memory category.")
+
+    # Split the test category by underscores and extract the backend type
+    return test_category[len("memory_") :]
+
+
 def find_file_by_category(
     test_category: str,
     folder_path: Path,
@@ -101,6 +116,25 @@ def get_file_name_by_category(
     return file_name
 
 
+def parse_test_category_argument(test_category_args: list[str]) -> list[str]:
+    test_name_total = set()
+
+    for test_category in test_category_args:
+        if test_category in TEST_COLLECTION_MAPPING:
+            for test_name in TEST_COLLECTION_MAPPING[test_category]:
+                test_name_total.add(test_name)
+        elif test_category in ALL_CATEGORIES:
+            test_name_total.add(test_category)
+        else:
+            # Invalid test category name
+            raise Exception(f"Invalid test category name provided: {test_category}")
+
+    return sorted(list(test_name_total))
+
+
+#### Predicate functions to check the test category ####
+
+
 def is_web_search(test_category):
     return "web_search" in test_category
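
The relocated parse_test_category_argument expands collection names through TEST_COLLECTION_MAPPING, passes concrete names in ALL_CATEGORIES through unchanged, and rejects anything else, returning a sorted, de-duplicated list. A small usage sketch (the collection and category names are assumptions, and the printed expansion is illustrative only):

from bfcl.utils import parse_test_category_argument

# "multi_turn" is assumed to be a collection key in TEST_COLLECTION_MAPPING and
# "simple" a concrete category in ALL_CATEGORIES; the exact expansion depends on
# the real mapping.
categories = parse_test_category_argument(["multi_turn", "simple"])
print(categories)  # e.g. ['multi_turn_base', ..., 'simple'] -- sorted, duplicates removed via the set

# An unknown name is rejected outright:
# parse_test_category_argument(["not_a_category"])  # raises Exception("Invalid test category name provided: ...")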

@@ -164,16 +198,8 @@ def is_sql(test_category):
 def contain_multi_turn_interaction(test_category):
     return is_multi_turn(test_category) or is_agentic(test_category)
 
-def extract_memory_backend_type(test_category):
-    """
-    This function extracts the memory backend type from the test category.
-    The test category should be in the form of `memory_kv` or `memory_knowledge_graph`, etc.
-    """
-    if not is_memory(test_category):
-        raise ValueError(f"Test category {test_category} is not a memory category.")
 
-    # Split the test category by underscores and extract the backend type
-    return test_category[len("memory_") :]
+#### Helper functions to load/write the dataset files ####
 
 
 def load_file(file_path, sort_by_id=False):
@@ -188,6 +214,29 @@ def load_file(file_path, sort_by_id=False):
     return result
 
 
+def load_dataset_entry(test_category: str) -> list[dict]:
+    """
+    This function retrieves the dataset entry for a given test category.
+    The input should not be a test category goup, but a specific test category.
+    """
+    if not is_memory(test_category):
+        file_name = f"{VERSION_PREFIX}_{test_category}.json"
+        all_entries = load_file(PROMPT_PATH / file_name)
+    else:
+        # Memory categories
+        all_entries = []
+        for scenario in MEMORY_SCENARIO_NAME:
+            file_name = f"{VERSION_PREFIX}_memory_{scenario}.json"
+            entries = load_file(PROMPT_PATH / file_name)
+            all_entries += process_memory_test_case(entries, test_category, scenario)
+
+    all_entries = process_agentic_test_case(all_entries)
+    all_entries = populate_test_cases_with_predefined_functions(all_entries)
+    all_entries = process_func_doc(all_entries)
+
+    return all_entries
+
+
 def write_list_of_dicts_to_file(filename, data, subdir=None):
     if subdir:
         # Ensure the subdirectory exists
@@ -264,6 +313,9 @@ def sort_key(entry):
     return (priority, test_category, int(index))
 
 
+#### Helper functions to check the output format ####
+
+
 # TODO: Reorganize this function to be more readable
 def is_function_calling_format_output(decoded_output):
     """
@@ -313,20 +365,7 @@ def is_empty_output(decoded_output):
     return False
 
 
-def parse_test_category_argument(test_category_args: list[str]) -> list[str]:
-    test_name_total = set()
-
-    for test_category in test_category_args:
-        if test_category in TEST_COLLECTION_MAPPING:
-            for test_name in TEST_COLLECTION_MAPPING[test_category]:
-                test_name_total.add(test_name)
-        elif test_category in ALL_CATEGORIES:
-            test_name_total.add(test_category)
-        else:
-            # Invalid test category name
-            raise Exception(f"Invalid test category name provided: {test_category}")
-
-    return sorted(list(test_name_total))
+#### Helper functions to process the dataset entries ####
 
 
 def _get_language_specific_hint(test_category):
@@ -496,29 +535,6 @@ def populate_test_cases_with_predefined_functions(test_cases: list[dict]) -> lis
     return test_cases
 
 
-def load_dataset_entry(test_category: str) -> list[dict]:
-    """
-    This function retrieves the dataset entry for a given test category.
-    The input should not be a test category goup, but a specific test category.
-    """
-    if not is_memory(test_category):
-        file_name = f"{VERSION_PREFIX}_{test_category}.json"
-        all_entries = load_file(PROMPT_PATH / file_name)
-    else:
-        # Memory categories
-        all_entries = []
-        for scenario in MEMORY_SCENARIO_NAME:
-            file_name = f"{VERSION_PREFIX}_memory_{scenario}.json"
-            entries = load_file(PROMPT_PATH / file_name)
-            all_entries += process_memory_test_case(entries, test_category, scenario)
-
-    all_entries = process_agentic_test_case(all_entries)
-    all_entries = populate_test_cases_with_predefined_functions(all_entries)
-    all_entries = process_func_doc(all_entries)
-
-    return all_entries
-
-
 def clean_up_memory_prereq_entries(test_cases: list[dict]) -> list[dict]:
     """
     Remove memory-prerequisite test cases when their corresponding
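
The utils.py hunks only regroup existing helpers under section comments; extract_memory_backend_type and load_dataset_entry move earlier in the file without behavior changes. A brief usage sketch of the two, with the concrete category name assumed for illustration:

from bfcl.utils import extract_memory_backend_type, load_dataset_entry

# Grounded in the moved helpers: the backend type is whatever follows the "memory_" prefix.
print(extract_memory_backend_type("memory_kv"))               # -> "kv"
print(extract_memory_backend_type("memory_knowledge_graph"))  # -> "knowledge_graph"
# extract_memory_backend_type("simple") raises ValueError (not a memory category).

# Load all prompt entries for one concrete (non-collection) category; "simple" is an
# assumed category name, and the file read is {VERSION_PREFIX}_simple.json under PROMPT_PATH.
entries = load_dataset_entry("simple")
print(len(entries))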
