Skip to content

Commit 3fcfab9

Browse files
authored
[BFCL] Chore: Various Improvements and Adjustments (#673)
This PR **will not** affect the leaderboard score. - Exclude `multi_turn_composite` from "all" and "multi_turn" categories as it's not currently taken into account when calculating score - Sort by rank for live and non-live CSV files - Add descriptive error handling for force-terminated entries - Improve debug log readability for `Gorilla File Systems` functions - Suppress print statements in the Salesforce handler - Set cost to "N/A" for all OSS models
1 parent a1d25f9 commit 3fcfab9

File tree

7 files changed

+63
-54
lines changed

7 files changed

+63
-54
lines changed

berkeley-function-call-leaderboard/bfcl/constant.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -54,14 +54,14 @@
5454
"multi_turn_miss_func",
5555
"multi_turn_miss_param",
5656
"multi_turn_long_context",
57-
"multi_turn_composite",
57+
# "multi_turn_composite", # Composite is currently not included in the leaderboard
5858
],
5959
"multi_turn": [
6060
"multi_turn_base",
6161
"multi_turn_miss_func",
6262
"multi_turn_miss_param",
6363
"multi_turn_long_context",
64-
"multi_turn_composite",
64+
# "multi_turn_composite", # Composite is currently not included in the leaderboard
6565
],
6666
"single_turn": [
6767
"exec_simple",

berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,20 @@ def multi_turn_runner(
3535
multi_turn_ground_truth_list: list[list[str]] = possible_answer[i]["ground_truth"]
3636
test_entry: dict = prompt[i]
3737

38+
if type(multi_turn_model_result_list) != list:
39+
result.append(
40+
{
41+
"id": index,
42+
"model_name": model_name,
43+
"test_category": test_category,
44+
"valid": False,
45+
"error": ["Error during inference phase. Model did not output a list of model responses."],
46+
"error_type": "multi_turn:inference_error",
47+
"prompt": test_entry,
48+
"model_result": multi_turn_model_result_list,
49+
"possible_answer": multi_turn_ground_truth_list,
50+
}
51+
)
3852
# Check if force-terminated during inference phase.
3953
# This happens when the model has retried too many times and still hasn't figured out the answer.
4054
# When force-terminated, no further evaluation is needed. This whole entry will be failed.
@@ -421,7 +435,7 @@ def runner(model_names, test_categories, api_sanity_check):
421435
subdirs = [entry.path for entry in entries if entry.is_dir()]
422436

423437
# Traverse each subdirectory
424-
for subdir in subdirs:
438+
for subdir in tqdm(subdirs, desc="Number of models evaluated"):
425439

426440
model_name = subdir.split(INPUT_PATH)[1]
427441
if model_names is not None and model_name not in model_names:

berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner_helper.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -90,6 +90,14 @@ def write_list_of_dicts_to_file(filename, data, subdir=None):
9090
# Write the list of dictionaries to the file in JSON format
9191
with open(filename, "w") as f:
9292
for i, entry in enumerate(data):
93+
# Go through each key-value pair in the dictionary to make sure the values are JSON serializable
94+
for key, value in entry.items():
95+
try:
96+
json.dumps(value)
97+
except:
98+
# If the value is not JSON serializable, convert it to its string representation
99+
entry[key] = str(value)
100+
93101
json_str = json.dumps(entry)
94102
f.write(json_str)
95103
if i < len(data) - 1:
@@ -338,15 +346,17 @@ def get_cost_letency_info(model_name, cost_data, latency_data):
338346
) / 1000
339347
cost = round(cost, 2)
340348

341-
if model_name in OSS_LATENCY:
342-
mean_latency, std_latency, percentile_95_latency = (
343-
OSS_LATENCY[model_name] / 1700,
344-
"N/A",
345-
"N/A",
346-
)
347-
mean_latency = round(mean_latency, 2)
348-
cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
349-
cost = round(cost, 2)
349+
# TODO: Have a formal way to calculate the cost and latency for OSS models
350+
# Currently, all OSS models will have no cost.
351+
# if model_name in OSS_LATENCY:
352+
# mean_latency, std_latency, percentile_95_latency = (
353+
# OSS_LATENCY[model_name] / 1700,
354+
# "N/A",
355+
# "N/A",
356+
# )
357+
# mean_latency = round(mean_latency, 2)
358+
# cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
359+
# cost = round(cost, 2)
350360

351361
elif len(latency_data["data"]) != 0:
352362
mean_latency = statistics.mean(latency_data["data"])
@@ -612,7 +622,7 @@ def generate_leaderboard_csv(
612622
)
613623

614624
# Write Non-Live Score File
615-
data_non_live.sort(key=lambda x: x[1], reverse=True)
625+
data_non_live.sort(key=lambda x: x[2], reverse=True)
616626
for i in range(len(data_non_live)):
617627
data_non_live[i][0] = str(i + 1)
618628
for j in range(2, len(data_non_live[i])):
@@ -629,7 +639,7 @@ def generate_leaderboard_csv(
629639
f.write(",".join(row))
630640

631641
# Write Live Score File
632-
data_live.sort(key=lambda x: x[1], reverse=True)
642+
data_live.sort(key=lambda x: x[2], reverse=True)
633643
for i in range(len(data_live)):
634644
data_live[i][0] = str(i + 1)
635645
for j in range(2, len(data_live[i])):

berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py

Lines changed: 6 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,5 @@
1+
from bfcl.model_handler.handler_map import local_inference_handler_map
2+
13
MODEL_METADATA_MAPPING = {
24
"o1-preview-2024-09-12": [
35
"o1-preview-2024-09-12 (Prompt)",
@@ -203,12 +205,6 @@
203205
"MeetKai",
204206
"MIT",
205207
],
206-
"meetkai/functionary-small-v3.2-FC": [
207-
"Functionary-Small-v3.2 (FC)",
208-
"https://huggingface.co/meetkai/functionary-small-v3.2",
209-
"MeetKai",
210-
"MIT",
211-
],
212208
"meetkai/functionary-medium-v3.1-FC": [
213209
"Functionary-Medium-v3.1 (FC)",
214210
"https://huggingface.co/meetkai/functionary-medium-v3.1",
@@ -733,35 +729,17 @@
733729
# Because we batch the requests when generating the data, the latency from the result data is not accurate.
734730
# This is the latency for the whole batch of data, when using 8 V100 GPUs.
735731
OSS_LATENCY = {
736-
"deepseek-ai/deepseek-coder-6.7b-instruct": 909,
737-
"google/gemma-7b-it": 95,
738-
"NousResearch/Hermes-2-Pro-Mistral-7B": 135,
739-
"NousResearch/Hermes-2-Pro-Llama-3-8B": 77,
740-
"NousResearch/Hermes-2-Theta-Llama-3-8B": 73,
741-
"NousResearch/Hermes-2-Theta-Llama-3-70B": 716,
742-
"NousResearch/Hermes-2-Pro-Llama-3-70B": 674,
743-
"meta-llama/Meta-Llama-3-8B-Instruct": 73,
744-
"meta-llama/Meta-Llama-3-70B-Instruct": 307,
745-
"gorilla-openfunctions-v2": 83,
746-
"THUDM/glm-4-9b-chat": 223,
747732
}
748733

749-
750-
NO_COST_MODELS = [
734+
# All OSS models will have no cost shown on the leaderboard.
735+
NO_COST_MODELS = list(local_inference_handler_map.keys())
736+
# The following models will also have no cost, even though they are queried through the API.
737+
NO_COST_MODELS += [
751738
"Nexusflow-Raven-v2",
752739
"firefunction-v1-FC",
753740
"firefunction-v2-FC",
754741
"meetkai/functionary-small-v3.1-FC",
755-
"meetkai/functionary-small-v3.2-FC",
756742
"meetkai/functionary-medium-v3.1-FC",
757743
"snowflake/arctic",
758744
"nvidia/nemotron-4-340b-instruct",
759-
"ibm-granite/granite-20b-functioncalling",
760-
"THUDM/glm-4-9b-chat",
761-
"Salesforce/xLAM-1b-fc-r",
762-
"Salesforce/xLAM-7b-fc-r",
763-
"Salesforce/xLAM-7b-r",
764-
"Salesforce/xLAM-8x7b-r",
765-
"Salesforce/xLAM-8x22b-r",
766-
"Team-ACE/ToolACE-8B",
767745
]

berkeley-function-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/gorilla_file_system.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ def _append(self, additional_content: str) -> None:
4949
self.last_modified = datetime.datetime.now()
5050

5151
def __repr__(self):
52-
return f"<File: {self.name}, Last Modified: {self.last_modified}>"
52+
return f"<<File: {self.name}, Last Modified: {self.last_modified}, Content: {self.content}>>"
5353

5454
def __eq__(self, other: object) -> bool:
5555
if not isinstance(other, File):
@@ -121,7 +121,7 @@ def _list_contents(self) -> List[str]:
121121
return list(self.contents.keys())
122122

123123
def __repr__(self):
124-
return f"<Directory: {self.name}, Contents: {list(self.contents.keys())}>"
124+
return f"<Directory: {self.name}, Parent: {self.parent.name if self.parent else None}, Contents: {self.contents}>"
125125

126126
def __eq__(self, other: object) -> bool:
127127
if not isinstance(other, Directory):

berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py

Lines changed: 15 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -24,8 +24,9 @@
2424
from bfcl.model_handler.proprietary_model.yi import YiHandler
2525

2626
# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
27-
handler_map = {
28-
# Inference through API calls
27+
28+
# Inference through API calls
29+
api_inference_handler_map = {
2930
"gorilla-openfunctions-v2": GorillaHandler,
3031
"o1-preview-2024-09-12": OpenAIHandler,
3132
"o1-mini-2024-09-12": OpenAIHandler,
@@ -68,7 +69,7 @@
6869
"gemini-1.5-flash-001-FC": GeminiHandler,
6970
"gemini-1.0-pro-002": GeminiHandler,
7071
"gemini-1.0-pro-002-FC": GeminiHandler,
71-
"meetkai/functionary-small-v3.2-FC": FunctionaryHandler,
72+
"meetkai/functionary-small-v3.1-FC": FunctionaryHandler,
7273
"meetkai/functionary-medium-v3.1-FC": FunctionaryHandler,
7374
"databricks-dbrx-instruct": DatabricksHandler,
7475
"command-r-plus-FC": CohereHandler,
@@ -77,8 +78,11 @@
7778
"command-r-plus-optimized": CohereHandler,
7879
"snowflake/arctic": NvidiaHandler,
7980
"nvidia/nemotron-4-340b-instruct": NvidiaHandler,
80-
"yi-large-fc": YiHandler,
81-
# Inference through local hosting
81+
# "yi-large-fc": YiHandler, # Their API is under maintenance, and will not be back online in the near future
82+
}
83+
84+
# Inference through local hosting
85+
local_inference_handler_map = {
8286
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
8387
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
8488
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
@@ -114,8 +118,10 @@
114118
"Qwen/Qwen2.5-1.5B-Instruct": QwenHandler,
115119
"Qwen/Qwen2.5-7B-Instruct": QwenHandler,
116120
"Team-ACE/ToolACE-8B": LlamaHandler,
117-
118-
# Deprecated/outdated models, no longer on the leaderboard
121+
}
122+
123+
# Deprecated/outdated models, no longer on the leaderboard
124+
outdated_model_handler_map = {
119125
# "gorilla-openfunctions-v0": GorillaHandler,
120126
# "gpt-4o-2024-05-13": OpenAIHandler,
121127
# "gpt-4o-2024-05-13-FC": OpenAIHandler,
@@ -135,3 +141,5 @@
135141
# "google/gemma-7b-it": GemmaHandler,
136142
# "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
137143
}
144+
145+
handler_map = {**api_inference_handler_map, **local_inference_handler_map}

berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/salesforce.py

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@ def decode_ast(self, result, language="Python"):
2323
return decoded_output
2424

2525
def decode_execute(self, result):
26-
result = result
2726
if isinstance(result, list):
2827
tool_calls = result
2928
elif isinstance(result, dict):
@@ -70,8 +69,8 @@ def xlam_json_to_python_tool_calls(tool_calls):
7069
[f"{key}={repr(value)}" for key, value in arguments.items()]
7170
)
7271
python_format.append(f"{name}({args_str})")
73-
else:
74-
print(f"Invalid format: {tool_call}")
72+
# else:
73+
# print(f"Invalid format: {tool_call}")
7574

7675
return python_format
7776

0 commit comments

Comments
 (0)