4 changes: 2 additions & 2 deletions berkeley-function-call-leaderboard/bfcl/constant.py
@@ -54,14 +54,14 @@
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
"multi_turn_composite",
# "multi_turn_composite",
],
"multi_turn": [
"multi_turn_base",
"multi_turn_miss_func",
"multi_turn_miss_param",
"multi_turn_long_context",
"multi_turn_composite",
# "multi_turn_composite",
],
"single_turn": [
"exec_simple",
(next file)
@@ -35,6 +35,20 @@ def multi_turn_runner(
multi_turn_ground_truth_list: list[list[str]] = possible_answer[i]["ground_truth"]
test_entry: dict = prompt[i]

if type(multi_turn_model_result_list) != list:
result.append(
{
"id": index,
"model_name": model_name,
"test_category": test_category,
"valid": False,
"error": ["Error during inference phase. Model did not output a list of model responses."],
"error_type": "multi_turn:inference_error",
"prompt": test_entry,
"model_result": multi_turn_model_result_list,
"possible_answer": multi_turn_ground_truth_list,
}
)
# Check if force-terminated during inference phase.
# This happens when the model has retried too many times and still hasn't figured out the answer.
# When force-terminated, no further evaluation is needed; the whole entry is marked as failed.
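For context, a minimal, made-up sketch of the shape the checker expects from the inference phase; only the nesting (one inner list of response strings per user turn, mirroring the list[list[str]] ground truth) is taken from the surrounding code:

# Illustrative only: a well-formed multi-turn result is a list with one entry per
# user turn, each entry holding the model's string responses for that turn.
example_model_result = [
    ["cd(folder='documents')", "ls()"],   # turn 1
    ["cat(file_name='notes.txt')"],       # turn 2
]
# Anything that is not a list (e.g. a plain error string) is now recorded as a
# failed entry with error_type "multi_turn:inference_error".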
@@ -421,7 +435,7 @@ def runner(model_names, test_categories, api_sanity_check):
subdirs = [entry.path for entry in entries if entry.is_dir()]

# Traverse each subdirectory
for subdir in subdirs:
for subdir in tqdm(subdirs, desc="Number of models evaluated"):

model_name = subdir.split(INPUT_PATH)[1]
if model_names is not None and model_name not in model_names:
(next file)
@@ -338,15 +338,15 @@ def get_cost_letency_info(model_name, cost_data, latency_data):
) / 1000
cost = round(cost, 2)

if model_name in OSS_LATENCY:
mean_latency, std_latency, percentile_95_latency = (
OSS_LATENCY[model_name] / 1700,
"N/A",
"N/A",
)
mean_latency = round(mean_latency, 2)
cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
cost = round(cost, 2)
# if model_name in OSS_LATENCY:
# mean_latency, std_latency, percentile_95_latency = (
# OSS_LATENCY[model_name] / 1700,
# "N/A",
# "N/A",
# )
# mean_latency = round(mean_latency, 2)
# cost = mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600
# cost = round(cost, 2)

elif len(latency_data["data"]) != 0:
mean_latency = statistics.mean(latency_data["data"])
@@ -612,7 +612,7 @@ def generate_leaderboard_csv(
)

# Write Non-Live Score File
data_non_live.sort(key=lambda x: x[1], reverse=True)
data_non_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_non_live)):
data_non_live[i][0] = str(i + 1)
for j in range(2, len(data_non_live[i])):
Expand All @@ -629,7 +629,7 @@ def generate_leaderboard_csv(
f.write(",".join(row))

# Write Live Score File
data_live.sort(key=lambda x: x[1], reverse=True)
data_live.sort(key=lambda x: x[2], reverse=True)
for i in range(len(data_live)):
data_live[i][0] = str(i + 1)
for j in range(2, len(data_live[i])):
(next file)
@@ -1,3 +1,5 @@
from bfcl.model_handler.handler_map import local_inference_handler_map

MODEL_METADATA_MAPPING = {
"o1-preview-2024-09-12": [
"o1-preview-2024-09-12 (Prompt)",
@@ -203,12 +205,6 @@
"MeetKai",
"MIT",
],
"meetkai/functionary-small-v3.2-FC": [
"Functionary-Small-v3.2 (FC)",
"https://huggingface.co/meetkai/functionary-small-v3.2",
"MeetKai",
"MIT",
],
"meetkai/functionary-medium-v3.1-FC": [
"Functionary-Medium-v3.1 (FC)",
"https://huggingface.co/meetkai/functionary-medium-v3.1",
@@ -733,35 +729,15 @@
# Because we batch the data generation, the per-entry latency in the result data is not accurate.
# These values are the latency for the whole batch of data, measured on 8 V100 GPUs.
OSS_LATENCY = {
"deepseek-ai/deepseek-coder-6.7b-instruct": 909,
"google/gemma-7b-it": 95,
"NousResearch/Hermes-2-Pro-Mistral-7B": 135,
"NousResearch/Hermes-2-Pro-Llama-3-8B": 77,
"NousResearch/Hermes-2-Theta-Llama-3-8B": 73,
"NousResearch/Hermes-2-Theta-Llama-3-70B": 716,
"NousResearch/Hermes-2-Pro-Llama-3-70B": 674,
"meta-llama/Meta-Llama-3-8B-Instruct": 73,
"meta-llama/Meta-Llama-3-70B-Instruct": 307,
"gorilla-openfunctions-v2": 83,
"THUDM/glm-4-9b-chat": 223,
}
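For reference, a minimal sketch of the cost estimate that the now-disabled branch in get_cost_letency_info performed; the price constant's value is made up here, and the 1700 divisor is assumed to be the number of test entries covered by one batch run:

# Sketch of the disabled OSS cost path (illustrative values, not from this PR).
V100_x8_PRICE_PER_HOUR = 22.0  # hypothetical hourly rate for the 8x V100 machine

def estimate_oss_cost(batch_latency_seconds: float) -> float:
    mean_latency = batch_latency_seconds / 1700  # assumed: 1700 entries per batch
    # cost of 1000 calls at the hourly machine rate
    return round(mean_latency * 1000 * V100_x8_PRICE_PER_HOUR / 3600, 2)

# e.g. estimate_oss_cost(909) -> (909 / 1700) * 1000 * 22.0 / 3600 ≈ 3.27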


NO_COST_MODELS = [
# All OSS models will have no cost shown on the leaderboard.
NO_COST_MODELS = list(local_inference_handler_map.keys()) + [
"Nexusflow-Raven-v2",
"firefunction-v1-FC",
"firefunction-v2-FC",
"meetkai/functionary-small-v3.1-FC",
"meetkai/functionary-small-v3.2-FC",
"meetkai/functionary-medium-v3.1-FC",
"snowflake/arctic",
"nvidia/nemotron-4-340b-instruct",
"ibm-granite/granite-20b-functioncalling",
"THUDM/glm-4-9b-chat",
"Salesforce/xLAM-1b-fc-r",
"Salesforce/xLAM-7b-fc-r",
"Salesforce/xLAM-7b-r",
"Salesforce/xLAM-8x7b-r",
"Salesforce/xLAM-8x22b-r",
"Team-ACE/ToolACE-8B",
]
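A quick illustration of the new derivation: any model served through a local handler is treated as zero-cost without having to be listed by hand.

# "meta-llama/Meta-Llama-3-70B-Instruct" is a key of local_inference_handler_map,
# so it is in NO_COST_MODELS even though it is not listed explicitly above.
assert "meta-llama/Meta-Llama-3-70B-Instruct" in NO_COST_MODELS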
(next file)
@@ -49,7 +49,7 @@ def _append(self, additional_content: str) -> None:
self.last_modified = datetime.datetime.now()

def __repr__(self):
return f"<File: {self.name}, Last Modified: {self.last_modified}>"
return f"<<File: {self.name}, Last Modified: {self.last_modified}, Content: {self.content}>>"

def __eq__(self, other: object) -> bool:
if not isinstance(other, File):
@@ -121,7 +121,7 @@ def _list_contents(self) -> List[str]:
return list(self.contents.keys())

def __repr__(self):
return f"<Directory: {self.name}, Contents: {list(self.contents.keys())}>"
return f"<Directory: {self.name}, Parent: {self.parent.name if self.parent else None}, Contents: {self.contents}>"

def __eq__(self, other: object) -> bool:
if not isinstance(other, Directory):
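For illustration, sample output of the expanded __repr__ formats; the names and contents below are made up:

# <<File: notes.txt, Last Modified: 2024-09-17 10:32:01.120254, Content: draft text>>
# <Directory: workspace, Parent: root, Contents: {'notes.txt': <<File: notes.txt, ...>>}>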
(next file)
@@ -24,8 +24,9 @@
from bfcl.model_handler.proprietary_model.yi import YiHandler

# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
handler_map = {
# Inference through API calls

# Inference through API calls
api_inference_handler_map = {
"gorilla-openfunctions-v2": GorillaHandler,
"o1-preview-2024-09-12": OpenAIHandler,
"o1-mini-2024-09-12": OpenAIHandler,
@@ -68,7 +69,7 @@
"gemini-1.5-flash-001-FC": GeminiHandler,
"gemini-1.0-pro-002": GeminiHandler,
"gemini-1.0-pro-002-FC": GeminiHandler,
"meetkai/functionary-small-v3.2-FC": FunctionaryHandler,
"meetkai/functionary-small-v3.1-FC": FunctionaryHandler,
"meetkai/functionary-medium-v3.1-FC": FunctionaryHandler,
"databricks-dbrx-instruct": DatabricksHandler,
"command-r-plus-FC": CohereHandler,
@@ -77,8 +78,11 @@
"command-r-plus-optimized": CohereHandler,
"snowflake/arctic": NvidiaHandler,
"nvidia/nemotron-4-340b-instruct": NvidiaHandler,
"yi-large-fc": YiHandler,
# Inference through local hosting
# "yi-large-fc": YiHandler, # Their API is under maintenance, and will not be back online in the near future
}

# Inference through local hosting
local_inference_handler_map = {
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
@@ -114,8 +118,10 @@
"Qwen/Qwen2.5-1.5B-Instruct": QwenHandler,
"Qwen/Qwen2.5-7B-Instruct": QwenHandler,
"Team-ACE/ToolACE-8B": LlamaHandler,

# Deprecated/outdated models, no longer on the leaderboard
}

# Deprecated/outdated models, no longer on the leaderboard
outdated_model_handler_map = {
# "gorilla-openfunctions-v0": GorillaHandler,
# "gpt-4o-2024-05-13": OpenAIHandler,
# "gpt-4o-2024-05-13-FC": OpenAIHandler,
@@ -135,3 +141,5 @@
# "google/gemma-7b-it": GemmaHandler,
# "deepseek-ai/deepseek-coder-6.7b-instruct": DeepseekHandler,
}

handler_map = {**api_inference_handler_map, **local_inference_handler_map}
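A minimal sketch of how the merged map can be consumed; get_handler_class and its error message are hypothetical, not part of this PR:

def get_handler_class(model_name: str):
    """Resolve the handler class for a model, whether API-hosted or locally hosted."""
    try:
        return handler_map[model_name]
    except KeyError:
        raise ValueError(f"Unsupported model: {model_name}") from None

get_handler_class("command-r-plus-FC")                    # -> CohereHandler (API inference)
get_handler_class("meta-llama/Meta-Llama-3-8B-Instruct")  # -> LlamaHandler (local inference)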
(next file)
@@ -23,7 +23,6 @@ def decode_ast(self, result, language="Python"):
return decoded_output

def decode_execute(self, result):
result = result
if isinstance(result, list):
tool_calls = result
elif isinstance(result, dict):
@@ -70,8 +69,8 @@ def xlam_json_to_python_tool_calls(tool_calls):
[f"{key}={repr(value)}" for key, value in arguments.items()]
)
python_format.append(f"{name}({args_str})")
else:
print(f"Invalid format: {tool_call}")
# else:
# print(f"Invalid format: {tool_call}")

return python_format
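For illustration, the conversion this helper performs, assuming each tool-call dict carries "name" and "arguments" keys (extracted in the elided lines above):

# Hypothetical input/output pair for xlam_json_to_python_tool_calls:
calls = [{"name": "get_weather", "arguments": {"city": "Berkeley", "unit": "celsius"}}]
# xlam_json_to_python_tool_calls(calls) -> ["get_weather(city='Berkeley', unit='celsius')"]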
