Skip to content

Commit 3fcfab9

Browse files
authored
[BFCL] Chore: Various Improvements and Adjustments (#673)
This PR **will not** affect the leaderboard score. - Exclude `multi_turn_composite` from "all" and "multi_turn" categories as it's not currently taken into account when calculating score - Sort by rank for live and non-live CSV files - Add descriptive error handling for force-terminated entries - Improve debug log readability for `Gorilla File Systems` functions - Suppress print statements in the Salesforce handler - Set cost to "N/A" for all OSS models
1 parent a1d25f9 commit 3fcfab9

File tree

7 files changed

+63
-54
lines changed

7 files changed

+63
-54
lines changed

berkeley-function-call-leaderboard/bfcl/constant.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@
5454
"multi_turn_miss_func",
5555
"multi_turn_miss_param",
5656
"multi_turn_long_context",
57-
"multi_turn_composite",
57+
# "multi_turn_composite", # Composite is currently not included in the leaderboard
5858
],
5959
"multi_turn": [
6060
"multi_turn_base",
6161
"multi_turn_miss_func",
6262
"multi_turn_miss_param",
6363
"multi_turn_long_context",
64-
"multi_turn_composite",
64+
# "multi_turn_composite", # Composite is currently not included in the leaderboard
6565
],
6666
"single_turn": [
6767
"exec_simple",

berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ def multi_turn_runner(
3535
multi_turn_ground_truth_list: list[list[str]] = possible_answer[i]["ground_truth"]
3636
test_entry: dict = prompt[i]
3737

38+
if type(multi_turn_model_result_list) != list:
39+
result.append(
40+
{
41+
"id": index,
42+
"model_name": model_name,
43+
"test_category": test_category,
44+
"valid": False,
45+
"error": ["Error during inference phase. Model did not output a list of model responses."],
46+
"error_type": "multi_turn:inference_error",
47+
"prompt": test_entry,
48+
"model_result": multi_turn_model_result_list,
49+
"possible_answer": multi_turn_ground_truth_list,
50+
}
51+
)
3852
# Check if force-terminated during inference phase.
3953
# This happens when the model has retried too many times and still hasn't figured out the answer.
4054
# When force-terminated, no further evaluation is needed. This whole entry will be failed.
@@ -421,7 +435,7 @@ def runner(model_names, test_categories, api_sanity_check):
421435
subdirs = [entry.path for entry in entries if entry.is_dir()]
422436

423437
# Traverse each subdirectory
424-
for subdir in subdirs:
438+
for subdir in tqdm(subdirs, desc="Number of models evaluated"):
425439

426440
model_name = subdir.split(INPUT_PATH)[1]
427441
if model_names is not None and model_name not in model_names:

berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ def write_list_of_dicts_to_file(filename, data, subdir=None):
9090
# Write the list of dictionaries to the file in JSON format
9191
with open(filename, "w") as f:
9292
for i, entry in enumerate(data):
93+
# Go through each key-value pair in the dictionary to make sure the values are JSON serializable
94+
for key, value in entry.items():
95+
try:
96+
json.dumps(value)
97+
except:
98+
# If the value is not JSON serializable, convert it to its string representation
99+
entry[key] = str(value)
100+
93101
json_str = json.dumps(entry)
94102
f.write(json_str)
95103
if i < len(data) - 1:
@@ -338,15 +346,17 @@ def get_cost_letency_info(model_name, cost_data, latency_data):
338346
) / 1000
339347
cost = round(cost, 2)
340348

341-
if model_name in OSS_LATENCY:
342-
mean_latency, std_latency, percentile_95_latency = (
343-
OSS_LATENCY[model_name] / 1700,
344-
"N/A",
345-
"N/A",
346-
)
347-
mean_latency = round(mean_latency, 2)
348-
cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
349-
cost = round(cost, 2)
349+
# TODO: Have a formal way to calculate the cost and latency for OSS models
350+
# Currently, all OSS models will have no cost.
351+
# if model_name in OSS_LATENCY:
352+
# mean_latency, std_latency, percentile_95_latency = (
353+
# OSS_LATENCY[model_name] / 1700,
354+
# "N/A",
355+
# "N/A",
356+
# )
357+
# mean_latency = round(mean_latency, 2)
358+
# cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
359+
# cost = round(cost, 2)
350360

351361
elif len(latency_data["data"]) != 0:
352362
mean_latency = statistics.mean(latency_data["data"])
@@ -612,7 +622,7 @@ def generate_leaderboard_csv(
612622
)
613623

614624
# Write Non-Live Score File
615-
data_non_live.sort(key=lambda x: x[1], reverse=True)
625+
data_non_live.sort(key=lambda x: x[2], reverse=True)
616626
for i in range(len(data_non_live)):
617627
data_non_live[i][0] = str(i + 1)
618628
for j in range(2, len(data_non_live[i])):
@@ -629,7 +639,7 @@ def generate_leaderboard_csv(
629639
f.write(",".join(row))
630640

631641
# Write Live Score File
632-
data_live.sort(key=lambda x: x[1], reverse=True)
642+
data_live.sort(key=lambda x: x[2], reverse=True)
633643
for i in range(len(data_live)):
634644
data_live[i][0] = str(i + 1)
635645
for j in range(2, len(data_live[i])):

berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from bfcl.model_handler.handler_map import local_inference_handler_map
2+
13
MODEL_METADATA_MAPPING = {
24
"o1-preview-2024-09-12": [
35
"o1-preview-2024-09-12 (Prompt)",
@@ -203,12 +205,6 @@
203205
"MeetKai",
204206
"MIT",
205207
],
206-
"meetkai/functionary-small-v3.2-FC": [
207-
"Functionary-Small-v3.2 (FC)",
208-
"https://huggingface.co/meetkai/functionary-small-v3.2",
209-
"MeetKai",
210-
"MIT",
211-
],
212208
"meetkai/functionary-medium-v3.1-FC": [
213209
"Functionary-Medium-v3.1 (FC)",
214210
"https://huggingface.co/meetkai/functionary-medium-v3.1",
@@ -733,35 +729,17 @@
733729
# Because we batch the requests when generating the data, the latency from the result data is not accurate.
734730
# This is the latency for the whole batch of data, when using 8 V100 GPUs.
735731
OSS_LATENCY = {
736-
"deepseek-ai/deepseek-coder-6.7b-instruct": 909,
737-
"google/gemma-7b-it": 95,
738-
"NousResearch/Hermes-2-Pro-Mistral-7B": 135,
739-
"NousResearch/Hermes-2-Pro-Llama-3-8B": 77,
740-
"NousResearch/Hermes-2-Theta-Llama-3-8B": 73,
741-
"NousResearch/Hermes-2-Theta-Llama-3-70B": 716,
742-
"NousResearch/Hermes-2-Pro-Llama-3-70B": 674,
743-
"meta-llama/Meta-Llama-3-8B-Instruct": 73,
744-
"meta-llama/Meta-Llama-3-70B-Instruct": 307,
745-
"gorilla-openfunctions-v2": 83,
746-
"THUDM/glm-4-9b-chat": 223,
747732
}
748733

749-
750-
NO_COST_MODELS = [
734+
# All OSS models will have no cost shown on the leaderboard.
735+
NO_COST_MODELS = list(local_inference_handler_map.keys())
736+
# The following models will also have no cost, even though they are queried through the API.
737+
NO_COST_MODELS += [
751738
"Nexusflow-Raven-v2",
752739
"firefunction-v1-FC",
753740
"firefunction-v2-FC",
754741
"meetkai/functionary-small-v3.1-FC",
755-
"meetkai/functionary-small-v3.2-FC",
756742
"meetkai/functionary-medium-v3.1-FC",
757743
"snowflake/arctic",
758744
"nvidia/nemotron-4-340b-instruct",
759-
"ibm-granite/granite-20b-functioncalling",
760-
"THUDM/glm-4-9b-chat",
761-
"Salesforce/xLAM-1b-fc-r",
762-
"Salesforce/xLAM-7b-fc-r",
763-
"Salesforce/xLAM-7b-r",
764-
"Salesforce/xLAM-8x7b-r",
765-
"Salesforce/xLAM-8x22b-r",
766-
"Team-ACE/ToolACE-8B",
767745
]

berkeley-function-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/gorilla_file_system.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def _append(self, additional_content: str) -> None:
4949
self.last_modified = datetime.datetime.now()
5050

5151
def __repr__(self):
52-
return f"<File: {self.name}, Last Modified: {self.last_modified}>"
52+
return f"<<File: {self.name}, Last Modified: {self.last_modified}, Content: {self.content}>>"
5353

5454
def __eq__(self, other: object) -> bool:
5555
if not isinstance(other, File):
@@ -121,7 +121,7 @@ def _list_contents(self) -> List[str]:
121121
return list(self.contents.keys())
122122

123123
def __repr__(self):
124-
return f"<Directory: {self.name}, Contents: {list(self.contents.keys())}>"
124+
return f"<Directory: {self.name}, Parent: {self.parent.name if self.parent else None}, Contents: {self.contents}>"
125125

126126
def __eq__(self, other: object) -> bool:
127127
if not isinstance(other, Directory):

berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
from bfcl.model_handler.proprietary_model.yi import YiHandler
2525

2626
# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
27-
handler_map = {
28-
# Inference through API calls
27+
28+
# Inference through API calls
29+
api_inference_handler_map = {
2930
"gorilla-openfunctions-v2": GorillaHandler,
3031
"o1-preview-2024-09-12": OpenAIHandler,
3132
"o1-mini-2024-09-12": OpenAIHandler,
@@ -68,7 +69,7 @@
6869
"gemini-1.5-flash-001-FC": GeminiHandler,
6970
"gemini-1.0-pro-002": GeminiHandler,
7071
"gemini-1.0-pro-002-FC": GeminiHandler,
71-
"meetkai/functionary-small-v3.2-FC": FunctionaryHandler,
72+
"meetkai/functionary-small-v3.1-FC": FunctionaryHandler,
7273
"meetkai/functionary-medium-v3.1-FC": FunctionaryHandler,
7374
"databricks-dbrx-instruct": DatabricksHandler,
7475
"command-r-plus-FC": CohereHandler,
@@ -77,8 +78,11 @@
7778
"command-r-plus-optimized": CohereHandler,
7879
"snowflake/arctic": NvidiaHandler,
7980
"nvidia/nemotron-4-340b-instruct": NvidiaHandler,
80-
"yi-large-fc": YiHandler,
81-
# Inference through local hosting
81+
# "yi-large-fc": YiHandler, # Their API is under maintenance, and will not be back online in the near future
82+
}
83+
84+
# Inference through local hosting
85+
local_inference_handler_map = {
8286
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
8387
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
8488
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
@@ -114,8 +118,10 @@
114118
"Qwen/Qwen2.5-1.5B-Instruct": QwenHandler,
115119
"Qwen/Qwen2.5-7B-Instruct": QwenHandler,
116120
"Team-ACE/ToolACE-8B": LlamaHandler,
117-
118-
# Deprecated/outdated models, no longer on the leaderboard
121+
}
122+
123+
# Deprecated/outdated models, no longer on the leaderboard
124+
outdated_model_handler_map = {
119125
# "gorilla-openfunctions-v0": GorillaHandler,
120126
# "gpt-4o-2024-05-13": OpenAIHandler,
121127
# "gpt-4o-2024-05-13-FC": OpenAIHandler,
@@ -135,3 +141,5 @@
135141
# "google/gemma-7b-it": GemmaHandler,
136142
# "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
137143
}
144+
145+
handler_map = {**api_inference_handler_map, **local_inference_handler_map}

berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/salesforce.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def decode_ast(self, result, language="Python"):
2323
return decoded_output
2424

2525
def decode_execute(self, result):
26-
result = result
2726
if isinstance(result, list):
2827
tool_calls = result
2928
elif isinstance(result, dict):
@@ -70,8 +69,8 @@ def xlam_json_to_python_tool_calls(tool_calls):
7069
[f"{key}={repr(value)}" for key, value in arguments.items()]
7170
)
7271
python_format.append(f"{name}({args_str})")
73-
else:
74-
print(f"Invalid format: {tool_call}")
72+
# else:
73+
# print(f"Invalid format: {tool_call}")
7574

7675
return python_format
7776

0 commit comments

Comments
 (0)