Commit a739bd0 (1 parent: 4c243b8)

3 files changed: +130 additions, -114 deletions

berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py

Lines changed: 1 addition & 1 deletion
@@ -177,7 +177,7 @@ def multi_threaded_inference(
             )
             break  # Success, exit the loop
         except Exception as e:
-            raise e
+            # raise e
             if retry_count < RETRY_LIMIT and (
                 "rate limit reached" in str(e).lower()
                 or (hasattr(e, "status_code") and (e.status_code in {429, 503, 500}))
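
With `raise e` commented out, an exception now falls through to the rate-limit check below it, so transient endpoint failures can be retried instead of killing the worker thread. A minimal standalone sketch of that retry pattern (the RETRY_LIMIT value, the backoff delay, and `call_endpoint` are illustrative assumptions, not the exact BFCL implementation):

import time

RETRY_LIMIT = 3  # assumed value; the real constant lives elsewhere in the BFCL codebase


def call_with_retry(call_endpoint, *args, **kwargs):
    """Retry an endpoint call on rate-limit style errors (illustrative sketch)."""
    retry_count = 0
    while True:
        try:
            return call_endpoint(*args, **kwargs)  # success, exit the loop
        except Exception as e:
            retriable = "rate limit reached" in str(e).lower() or (
                hasattr(e, "status_code") and e.status_code in {429, 503, 500}
            )
            if retry_count < RETRY_LIMIT and retriable:
                retry_count += 1
                time.sleep(2 ** retry_count)  # assumed exponential backoff
            else:
                raise  # give up: non-retriable error or retry budget exhausted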

berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py

Lines changed: 67 additions & 67 deletions
@@ -482,73 +482,6 @@ def ast_file_runner(
 
 
 #### Main runner function ####
-def runner(model_names, test_categories, result_dir, score_dir):
-
-    # State udpated by each eval subtask.
-    state = dict(
-        # A dictionary to store the evaluation scores.
-        # Key is model name, value is a dictionary with keys as test category
-        # and values as a dictionary with accuracy and total count.
-        leaderboard_table={},
-    )
-
-    # Get a list of all entries in the folder
-    entries = result_dir.iterdir()
-
-    # Filter out the subdirectories
-    subdirs = [entry for entry in entries if entry.is_dir()]
-
-    # Traverse each subdirectory
-    for subdir in tqdm(subdirs, desc="Number of models evaluated"):
-
-        model_name = subdir.relative_to(result_dir).name
-        if model_names is not None and model_name not in model_names:
-            continue
-
-        model_name_escaped = model_name.replace("_", "/")
-
-        print(f"🦍 Model: {model_name}")
-
-        # Find and process all JSON files in the subdirectory
-        for model_result_json in subdir.glob("*.json"):
-            test_category = extract_test_category(model_result_json)
-            if test_category not in test_categories:
-                continue
-
-            handler = get_handler(model_name_escaped)
-
-            # We don't evaluate chatable and SQL models in our current
-            # leaderboard.
-            if (
-                is_chatable(test_category)
-                or is_executable(test_category)
-                or is_memory_prereq(test_category)
-                or "conflict" in test_category
-            ):
-                continue
-
-            model_result = load_file(model_result_json, sort_by_id=True)
-
-            state = evaluate_task(
-                test_category,
-                result_dir,
-                score_dir,
-                model_result,
-                model_name,
-                handler,
-                state,
-            )
-
-    # This function reads all the score files from local folder and updates the
-    # leaderboard table. This is helpful when you only want to run the
-    # evaluation for a subset of models and test categories.
-    update_leaderboard_table_with_local_score_file(state["leaderboard_table"], score_dir)
-    # Write the leaderboard table to a file
-    generate_leaderboard_csv(
-        state["leaderboard_table"], score_dir, model_names, test_categories
-    )
-
-
 def evaluate_task(
     test_category,
     result_dir,
@@ -623,6 +556,73 @@ def evaluate_task(
     return state
 
 
+def runner(model_names, test_categories, result_dir, score_dir):
+
+    # State udpated by each eval subtask.
+    state = dict(
+        # A dictionary to store the evaluation scores.
+        # Key is model name, value is a dictionary with keys as test category
+        # and values as a dictionary with accuracy and total count.
+        leaderboard_table={},
+    )
+
+    # Get a list of all entries in the folder
+    entries = result_dir.iterdir()
+
+    # Filter out the subdirectories
+    subdirs = [entry for entry in entries if entry.is_dir()]
+
+    # Traverse each subdirectory
+    for subdir in tqdm(subdirs, desc="Number of models evaluated"):
+
+        model_name = subdir.relative_to(result_dir).name
+        if model_names is not None and model_name not in model_names:
+            continue
+
+        model_name_escaped = model_name.replace("_", "/")
+
+        print(f"🦍 Model: {model_name}")
+
+        # Find and process all JSON files in the subdirectory
+        for model_result_json in subdir.glob("*.json"):
+            test_category = extract_test_category(model_result_json)
+            if test_category not in test_categories:
+                continue
+
+            handler = get_handler(model_name_escaped)
+
+            # We don't evaluate chatable and SQL models in our current
+            # leaderboard.
+            if (
+                is_chatable(test_category)
+                or is_executable(test_category)
+                or is_memory_prereq(test_category)
+                or "conflict" in test_category
+            ):
+                continue
+
+            model_result = load_file(model_result_json, sort_by_id=True)
+
+            state = evaluate_task(
+                test_category,
+                result_dir,
+                score_dir,
+                model_result,
+                model_name,
+                handler,
+                state,
+            )
+
+    # This function reads all the score files from local folder and updates the
+    # leaderboard table. This is helpful when you only want to run the
+    # evaluation for a subset of models and test categories.
+    update_leaderboard_table_with_local_score_file(state["leaderboard_table"], score_dir)
+    # Write the leaderboard table to a file
+    generate_leaderboard_csv(
+        state["leaderboard_table"], score_dir, model_names, test_categories
+    )
+
+
 def main(model, test_categories, result_dir, score_dir):
     if result_dir is None:
         result_dir = RESULT_PATH
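
This hunk only relocates runner below evaluate_task; its body is unchanged. For orientation, a hedged sketch of how it is typically invoked (the folder layout, model name, and category names below are assumptions for illustration, not values taken from this diff):

from pathlib import Path

from bfcl.eval_checker.eval_runner import runner

model_names = ["gorilla-openfunctions-v2"]   # one subdirectory of result_dir per model (assumed name)
test_categories = ["simple", "multiple"]     # concrete category names, not collection names (assumed)
result_dir = Path("./result")                # assumed local result folder
score_dir = Path("./score")                  # assumed local score folder

# runner walks result_dir/<model>/*.json, skips categories it does not evaluate,
# calls evaluate_task(...) for each result file, then refreshes the leaderboard
# table from local score files and writes the CSV via generate_leaderboard_csv(...).
runner(model_names, test_categories, result_dir, score_dir)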

berkeley-function-call-leaderboard/bfcl/utils.py

Lines changed: 62 additions & 46 deletions
@@ -22,6 +22,9 @@
 from bfcl.constants.executable_backend_config import MULTI_TURN_FUNC_DOC_FILE_MAPPING
 
 
+#### Helper functions to extract/parse/complete test category from different formats ####
+
+
 def extract_test_category(input_string: Union[str, Path]) -> str:
     """
     Extract the test category from a given file name.
@@ -51,6 +54,18 @@ def extract_test_category_from_id(test_entry_id: str, remove_prereq: bool = Fals
     return test_entry_id.rsplit("_", 1)[0]
 
 
+def extract_memory_backend_type(test_category):
+    """
+    This function extracts the memory backend type from the test category.
+    The test category should be in the form of `memory_kv` or `memory_knowledge_graph`, etc.
+    """
+    if not is_memory(test_category):
+        raise ValueError(f"Test category {test_category} is not a memory category.")
+
+    # Split the test category by underscores and extract the backend type
+    return test_category[len("memory_") :]
+
+
 def find_file_by_category(
     test_category: str,
     folder_path: Path,
@@ -101,6 +116,25 @@ def get_file_name_by_category(
     return file_name
 
 
+def parse_test_category_argument(test_category_args: list[str]) -> list[str]:
+    test_name_total = set()
+
+    for test_category in test_category_args:
+        if test_category in TEST_COLLECTION_MAPPING:
+            for test_name in TEST_COLLECTION_MAPPING[test_category]:
+                test_name_total.add(test_name)
+        elif test_category in ALL_CATEGORIES:
+            test_name_total.add(test_category)
+        else:
+            # Invalid test category name
+            raise Exception(f"Invalid test category name provided: {test_category}")
+
+    return sorted(list(test_name_total))
+
+
+#### Predicate functions to check the test category ####
+
+
 def is_web_search(test_category):
     return "web_search" in test_category
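
The relocated parse_test_category_argument expands collection names through TEST_COLLECTION_MAPPING, passes concrete names in ALL_CATEGORIES through unchanged, and rejects anything else, returning a sorted, de-duplicated list. A small usage sketch (the collection and category names are assumptions, and the printed expansion is illustrative only):

from bfcl.utils import parse_test_category_argument

# "multi_turn" is assumed to be a collection key in TEST_COLLECTION_MAPPING and
# "simple" a concrete category in ALL_CATEGORIES; the exact expansion depends on
# the real mapping.
categories = parse_test_category_argument(["multi_turn", "simple"])
print(categories)  # e.g. ['multi_turn_base', ..., 'simple'] -- sorted, duplicates removed via the set

# An unknown name is rejected outright:
# parse_test_category_argument(["not_a_category"])  # raises Exception("Invalid test category name provided: ...")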

@@ -164,16 +198,8 @@ def is_sql(test_category):
 def contain_multi_turn_interaction(test_category):
     return is_multi_turn(test_category) or is_agentic(test_category)
 
-def extract_memory_backend_type(test_category):
-    """
-    This function extracts the memory backend type from the test category.
-    The test category should be in the form of `memory_kv` or `memory_knowledge_graph`, etc.
-    """
-    if not is_memory(test_category):
-        raise ValueError(f"Test category {test_category} is not a memory category.")
 
-    # Split the test category by underscores and extract the backend type
-    return test_category[len("memory_") :]
+#### Helper functions to load/write the dataset files ####
 
 
 def load_file(file_path, sort_by_id=False):
@@ -188,6 +214,29 @@ def load_file(file_path, sort_by_id=False):
     return result
 
 
+def load_dataset_entry(test_category: str) -> list[dict]:
+    """
+    This function retrieves the dataset entry for a given test category.
+    The input should not be a test category goup, but a specific test category.
+    """
+    if not is_memory(test_category):
+        file_name = f"{VERSION_PREFIX}_{test_category}.json"
+        all_entries = load_file(PROMPT_PATH / file_name)
+    else:
+        # Memory categories
+        all_entries = []
+        for scenario in MEMORY_SCENARIO_NAME:
+            file_name = f"{VERSION_PREFIX}_memory_{scenario}.json"
+            entries = load_file(PROMPT_PATH / file_name)
+            all_entries += process_memory_test_case(entries, test_category, scenario)
+
+    all_entries = process_agentic_test_case(all_entries)
+    all_entries = populate_test_cases_with_predefined_functions(all_entries)
+    all_entries = process_func_doc(all_entries)
+
+    return all_entries
+
+
 def write_list_of_dicts_to_file(filename, data, subdir=None):
     if subdir:
         # Ensure the subdirectory exists
@@ -264,6 +313,9 @@ def sort_key(entry):
     return (priority, test_category, int(index))
 
 
+#### Helper functions to check the output format ####
+
+
 # TODO: Reorganize this function to be more readable
 def is_function_calling_format_output(decoded_output):
     """
@@ -313,20 +365,7 @@ def is_empty_output(decoded_output):
     return False
 
 
-def parse_test_category_argument(test_category_args: list[str]) -> list[str]:
-    test_name_total = set()
-
-    for test_category in test_category_args:
-        if test_category in TEST_COLLECTION_MAPPING:
-            for test_name in TEST_COLLECTION_MAPPING[test_category]:
-                test_name_total.add(test_name)
-        elif test_category in ALL_CATEGORIES:
-            test_name_total.add(test_category)
-        else:
-            # Invalid test category name
-            raise Exception(f"Invalid test category name provided: {test_category}")
-
-    return sorted(list(test_name_total))
+#### Helper functions to process the dataset entries ####
 
 
 def _get_language_specific_hint(test_category):
@@ -496,29 +535,6 @@ def populate_test_cases_with_predefined_functions(test_cases: list[dict]) -> lis
     return test_cases
 
 
-def load_dataset_entry(test_category: str) -> list[dict]:
-    """
-    This function retrieves the dataset entry for a given test category.
-    The input should not be a test category goup, but a specific test category.
-    """
-    if not is_memory(test_category):
-        file_name = f"{VERSION_PREFIX}_{test_category}.json"
-        all_entries = load_file(PROMPT_PATH / file_name)
-    else:
-        # Memory categories
-        all_entries = []
-        for scenario in MEMORY_SCENARIO_NAME:
-            file_name = f"{VERSION_PREFIX}_memory_{scenario}.json"
-            entries = load_file(PROMPT_PATH / file_name)
-            all_entries += process_memory_test_case(entries, test_category, scenario)
-
-    all_entries = process_agentic_test_case(all_entries)
-    all_entries = populate_test_cases_with_predefined_functions(all_entries)
-    all_entries = process_func_doc(all_entries)
-
-    return all_entries
-
-
 def clean_up_memory_prereq_entries(test_cases: list[dict]) -> list[dict]:
     """
     Remove memory-prerequisite test cases when their corresponding
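
The utils.py hunks only regroup existing helpers under section comments; extract_memory_backend_type and load_dataset_entry move earlier in the file without behavior changes. A brief usage sketch of the two, with the concrete category name assumed for illustration:

from bfcl.utils import extract_memory_backend_type, load_dataset_entry

# Grounded in the moved helpers: the backend type is whatever follows the "memory_" prefix.
print(extract_memory_backend_type("memory_kv"))               # -> "kv"
print(extract_memory_backend_type("memory_knowledge_graph"))  # -> "knowledge_graph"
# extract_memory_backend_type("simple") raises ValueError (not a memory category).

# Load all prompt entries for one concrete (non-collection) category; "simple" is an
# assumed category name, and the file read is {VERSION_PREFIX}_simple.json under PROMPT_PATH.
entries = load_dataset_entry("simple")
print(len(entries))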
