Skip to content

Commit bd14f1c

Browse files
HuanzhiMao and VishnuSuresh27
authored and committed
[BFCL] Add Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct (ShishirPatil#657)
This PR adds the following new models to the leaderboard: - `meta-llama/Llama-3.2-1B-Instruct` - `meta-llama/Llama-3.2-1B-Instruct-FC` - `meta-llama/Llama-3.2-3B-Instruct` - `meta-llama/Llama-3.2-3B-Instruct-FC` - `meta-llama/Llama-3.1-8B-Instruct` - `meta-llama/Llama-3.1-8B-Instruct-FC` - `meta-llama/Llama-3.1-70B-Instruct` - `meta-llama/Llama-3.1-70B-Instruct-FC`
1 parent a94a4ce commit bd14f1c

File tree

7 files changed

+311
-5
lines changed

7 files changed

+311
-5
lines changed

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
1212
- `microsoft/Phi-3-mini-128k-instruct`
1313
- `microsoft/Phi-3-mini-4k-instruct`
1414
- [Sept 25, 2024] [#660](https://github.com/ShishirPatil/gorilla/pull/660): Bug fix in `parse_nested_value` function to handle nested dictionary values properly.
15+
- [Sept 24, 2024] [#657](https://github.com/ShishirPatil/gorilla/pull/657): Add the following new models to the leaderboard:
16+
- `meta-llama/Llama-3.2-1B-Instruct`
17+
- `meta-llama/Llama-3.2-1B-Instruct-FC`
18+
- `meta-llama/Llama-3.2-3B-Instruct`
19+
- `meta-llama/Llama-3.2-3B-Instruct-FC`
20+
- `meta-llama/Llama-3.1-8B-Instruct`
21+
- `meta-llama/Llama-3.1-8B-Instruct-FC`
22+
- `meta-llama/Llama-3.1-70B-Instruct`
23+
- `meta-llama/Llama-3.1-70B-Instruct-FC`
1524
- [Sept 24, 2024] [#648](https://github.com/ShishirPatil/gorilla/pull/648): Add the following new models to the leaderboard:
1625
- `gemini-1.5-pro-002`
1726
- `gemini-1.5-pro-002-FC`

berkeley-function-call-leaderboard/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,11 @@ Below is _a table of models we support_ to run our leaderboard evaluation agains
152152
|google/gemma-7b-it 💻| Prompt|
153153
|meetkai/functionary-medium-v3.1-FC| Function Calling|
154154
|meetkai/functionary-small-{v3.1,v3.2}-FC| Function Calling|
155-
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct | Prompt|
155+
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct 💻| Prompt|
156+
|meta-llama/Llama-3.1-{8B,70B}-Instruct-FC 💻| Function Calling|
157+
|meta-llama/Llama-3.1-{8B,70B}-Instruct 💻| Prompt|
158+
|meta-llama/Llama-3.2-{1B,3B}-Instruct-FC 💻| Function Calling|
159+
|meta-llama/Llama-3.2-{1B,3B}-Instruct 💻| Prompt|
156160
|open-mixtral-{8x7b,8x22b} | Prompt|
157161
|open-mixtral-8x22b-FC | Function Calling|
158162
|open-mistral-nemo-2407 | Prompt|

berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,54 @@
401401
"Meta",
402402
"Meta Llama 3 Community",
403403
],
404+
"meta-llama/Llama-3.1-8B-Instruct": [
405+
"Llama-3.1-8B-Instruct (Prompt)",
406+
"https://llama.meta.com/llama3",
407+
"Meta",
408+
"Meta Llama 3 Community",
409+
],
410+
"meta-llama/Llama-3.1-70B-Instruct": [
411+
"Llama-3.1-70B-Instruct (Prompt)",
412+
"https://llama.meta.com/llama3",
413+
"Meta",
414+
"Meta Llama 3 Community",
415+
],
416+
"meta-llama/Llama-3.2-1B-Instruct": [
417+
"Llama-3.2-1B-Instruct (Prompt)",
418+
"https://llama.meta.com/llama3",
419+
"Meta",
420+
"Meta Llama 3 Community",
421+
],
422+
"meta-llama/Llama-3.2-3B-Instruct": [
423+
"Llama-3.2-3B-Instruct (Prompt)",
424+
"https://llama.meta.com/llama3",
425+
"Meta",
426+
"Meta Llama 3 Community",
427+
],
428+
"meta-llama/Llama-3.1-8B-Instruct-FC": [
429+
"Llama-3.1-8B-Instruct (FC)",
430+
"https://llama.meta.com/llama3",
431+
"Meta",
432+
"Meta Llama 3 Community",
433+
],
434+
"meta-llama/Llama-3.1-70B-Instruct-FC": [
435+
"Llama-3.1-70B-Instruct (FC)",
436+
"https://llama.meta.com/llama3",
437+
"Meta",
438+
"Meta Llama 3 Community",
439+
],
440+
"meta-llama/Llama-3.2-1B-Instruct-FC": [
441+
"Llama-3.2-1B-Instruct (FC)",
442+
"https://llama.meta.com/llama3",
443+
"Meta",
444+
"Meta Llama 3 Community",
445+
],
446+
"meta-llama/Llama-3.2-3B-Instruct-FC": [
447+
"Llama-3.2-3B-Instruct (FC)",
448+
"https://llama.meta.com/llama3",
449+
"Meta",
450+
"Meta Llama 3 Community",
451+
],
404452
"command-r-plus-FC": [
405453
"Command-R-Plus (FC) (Original)",
406454
"https://txt.cohere.com/command-r-plus-microsoft-azure",

berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from bfcl.model_handler.oss_model.hammer import HammerHandler
77
from bfcl.model_handler.oss_model.hermes import HermesHandler
88
from bfcl.model_handler.oss_model.llama import LlamaHandler
9+
from bfcl.model_handler.oss_model.llama_fc import LlamaFCHandler
910
from bfcl.model_handler.oss_model.phi import PhiHandler
1011
from bfcl.model_handler.oss_model.salesforce import SalesforceHandler
1112
from bfcl.model_handler.proprietary_model.claude import ClaudeHandler
@@ -21,7 +22,7 @@
2122
from bfcl.model_handler.proprietary_model.openai import OpenAIHandler
2223
from bfcl.model_handler.proprietary_model.yi import YiHandler
2324

24-
# TODO: Add Deepseek V2 and Gemma V2
25+
# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
2526
handler_map = {
2627
# Inference through API calls
2728
"gorilla-openfunctions-v2": GorillaHandler,
@@ -79,6 +80,14 @@
7980
# Inference through local hosting
8081
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
8182
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
83+
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
84+
"meta-llama/Llama-3.1-70B-Instruct-FC": LlamaFCHandler,
85+
"meta-llama/Llama-3.2-1B-Instruct-FC": LlamaFCHandler,
86+
"meta-llama/Llama-3.2-3B-Instruct-FC": LlamaFCHandler,
87+
"meta-llama/Llama-3.1-8B-Instruct": LlamaHandler,
88+
"meta-llama/Llama-3.1-70B-Instruct": LlamaHandler,
89+
"meta-llama/Llama-3.2-1B-Instruct": LlamaHandler,
90+
"meta-llama/Llama-3.2-3B-Instruct": LlamaHandler,
8291
"Salesforce/xLAM-1b-fc-r": SalesforceHandler,
8392
"Salesforce/xLAM-7b-fc-r": SalesforceHandler,
8493
"Salesforce/xLAM-7b-r": SalesforceHandler,

berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base_oss_handler.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
class OSSHandler(BaseHandler):
2222
def __init__(self, model_name, temperature, dtype="bfloat16") -> None:
2323
super().__init__(model_name, temperature)
24+
self.model_name_huggingface = model_name
2425
self.model_style = ModelStyle.OSSMODEL
2526
self.dtype = dtype
2627
self.client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
@@ -57,7 +58,7 @@ def batch_inference(
5758
[
5859
"vllm",
5960
"serve",
60-
str(self.model_name),
61+
str(self.model_name_huggingface),
6162
"--port",
6263
str(VLLM_PORT),
6364
"--dtype",
@@ -205,15 +206,15 @@ def _query_prompting(self, inference_data: dict):
205206

206207
if hasattr(self, "stop_token_ids"):
207208
api_response = self.client.completions.create(
208-
model=self.model_name,
209+
model=self.model_name_huggingface,
209210
temperature=self.temperature,
210211
prompt=formatted_prompt,
211212
stop_token_ids=self.stop_token_ids,
212213
max_tokens=4096, # TODO: Is there a better way to handle this?
213214
)
214215
else:
215216
api_response = self.client.completions.create(
216-
model=self.model_name,
217+
model=self.model_name_huggingface,
217218
temperature=self.temperature,
218219
prompt=formatted_prompt,
219220
max_tokens=4096,

berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
22

3+
4+
# Note: This is the handler for the Llama models in prompting mode.
5+
# For function call mode, use LlamaFCHandler instead.
6+
# The Llama 3 series is benchmarked in prompting mode only, while the Llama 3.1/3.2 series are benchmarked in both prompting mode (LlamaHandler) and function-calling mode (LlamaFCHandler).
37
class LlamaHandler(OSSHandler):
48
def __init__(self, model_name, temperature) -> None:
59
super().__init__(model_name, temperature)
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
import json
2+
3+
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
4+
from bfcl.model_handler.utils import func_doc_language_specific_pre_processing
5+
6+
# TODO: Merge with LlamaHandler
7+
8+
9+
class LlamaFCHandler(OSSHandler):
    """Handler for locally-hosted Llama 3.1 / 3.2 Instruct models in
    function-calling (FC) mode.

    The leaderboard registers these models with a ``-FC`` suffix that is not
    part of the real HuggingFace repository id, so the suffix is stripped
    before the name is handed to vLLM (see ``__init__``).
    """

    def __init__(self, model_name, temperature) -> None:
        super().__init__(model_name, temperature)
        # Strip the leaderboard-internal "-FC" suffix to recover the actual
        # HuggingFace repo id (e.g. "meta-llama/Llama-3.1-8B-Instruct").
        self.model_name_huggingface = model_name.replace("-FC", "")

    @staticmethod
    def _format_prompt(messages, function):
        """Render the Llama 3.1 tool-use chat template by hand.

        This mirrors the relevant branches of the upstream HuggingFace chat
        template, reproduced below for reference:

        "bos_token": "<|begin_of_text|>",
        "chat_template":
        {{- bos_token }}
        {%- if custom_tools is defined %}
            {%- set tools = custom_tools %}
        {%- endif %}
        {%- if not tools_in_user_message is defined %}
            {%- set tools_in_user_message = true %}
        {%- endif %}
        {%- if not date_string is defined %}
            {%- set date_string = "26 Jul 2024" %}
        {%- endif %}
        {%- if not tools is defined %}
            {%- set tools = none %}
        {%- endif %}

        {#- This block extracts the system message, so we can slot it into the right place. #}
        {%- if messages[0]['role'] == 'system' %}
            {%- set system_message = messages[0]['content']|trim %}
            {%- set messages = messages[1:] %}
        {%- else %}
            {%- set system_message = "" %}
        {%- endif %}

        {#- System message + builtin tools #}
        {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
        {%- if builtin_tools is defined or tools is not none %}
            {{- "Environment: ipython\n" }}
        {%- endif %}
        {%- if builtin_tools is defined %}
            {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
        {%- endif %}
        {{- "Cutting Knowledge Date: December 2023\n" }}
        {{- "Today Date: " + date_string + "\n\n" }}
        {%- if tools is not none and not tools_in_user_message %}
            {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
            {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
            {{- "Do not use variables.\n\n" }}
            {%- for t in tools %}
                {{- t | tojson(indent=4) }}
                {{- "\n\n" }}
            {%- endfor %}
        {%- endif %}
        {{- system_message }}
        {{- "<|eot_id|>" }}

        {#- Custom tools are passed in a user message with some extra guidance #}
        {%- if tools_in_user_message and not tools is none %}
            {#- Extract the first user message so we can plug it in here #}
            {%- if messages | length != 0 %}
                {%- set first_user_message = messages[0]['content']|trim %}
                {%- set messages = messages[1:] %}
            {%- else %}
                {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
            {%- endif %}
            {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
            {{- "Given the following functions, please respond with a JSON for a function call " }}
            {{- "with its proper arguments that best answers the given prompt.\n\n" }}
            {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
            {{- "Do not use variables.\n\n" }}
            {%- for t in tools %}
                {{- t | tojson(indent=4) }}
                {{- "\n\n" }}
            {%- endfor %}
            {{- first_user_message + "<|eot_id|>"}}
        {%- endif %}

        {%- for message in messages %}
            {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
                {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
            {%- elif 'tool_calls' in message %}
                {%- if not message.tool_calls|length == 1 %}
                    {{- raise_exception("This model only supports single tool-calls at once!") }}
                {%- endif %}
                {%- set tool_call = message.tool_calls[0].function %}
                {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
                    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
                    {{- "<|python_tag|>" + tool_call.name + ".call(" }}
                    {%- for arg_name, arg_val in tool_call.arguments | items %}
                        {{- arg_name + '="' + arg_val + '"' }}
                        {%- if not loop.last %}
                            {{- ", " }}
                        {%- endif %}
                    {%- endfor %}
                    {{- ")" }}
                {%- else %}
                    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
                    {{- '{"name": "' + tool_call.name + '", ' }}
                    {{- '"parameters": ' }}
                    {{- tool_call.arguments | tojson }}
                    {{- "}" }}
                {%- endif %}
                {%- if builtin_tools is defined %}
                    {#- This means we're in ipython mode #}
                    {{- "<|eom_id|>" }}
                {%- else %}
                    {{- "<|eot_id|>" }}
                {%- endif %}
            {%- elif message.role == "tool" or message.role == "ipython" %}
                {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
                {%- if message.content is mapping or message.content is iterable %}
                    {{- message.content | tojson }}
                {%- else %}
                    {{- message.content }}
                {%- endif %}
                {{- "<|eot_id|>" }}
            {%- endif %}
        {%- endfor %}
        {%- if add_generation_prompt %}
            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
        {%- endif %}
        """
        formatted_prompt = "<|begin_of_text|>"

        # Extract an optional leading system message; guard against an empty
        # message list (the original indexed messages[0] unconditionally).
        system_message = ""
        remaining_messages = messages
        if messages and messages[0]["role"] == "system":
            system_message = messages[0]["content"].strip()
            remaining_messages = messages[1:]

        formatted_prompt += "<|start_header_id|>system<|end_header_id|>\n\n"
        formatted_prompt += "Environment: ipython\n"
        formatted_prompt += "Cutting Knowledge Date: December 2023\n"
        formatted_prompt += "Today Date: 26 Jul 2024\n\n"
        formatted_prompt += system_message + "<|eot_id|>"

        # Llama expects the custom tool definitions inside the first user turn.
        is_first_user_message = True
        for message in remaining_messages:
            if message["role"] == "user" and is_first_user_message:
                is_first_user_message = False
                formatted_prompt += "<|start_header_id|>user<|end_header_id|>\n\n"
                formatted_prompt += "Given the following functions, please respond with a JSON for a function call "
                formatted_prompt += (
                    "with its proper arguments that best answers the given prompt.\n\n"
                )
                formatted_prompt += 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.'
                formatted_prompt += "Do not use variables.\n\n"
                for func in function:
                    formatted_prompt += json.dumps(func, indent=4) + "\n\n"
                formatted_prompt += f"{message['content'].strip()}<|eot_id|>"

            elif message["role"] == "tool":
                # Tool/ipython results go back under the "ipython" header.
                formatted_prompt += "<|start_header_id|>ipython<|end_header_id|>\n\n"
                if isinstance(message["content"], (dict, list)):
                    formatted_prompt += json.dumps(message["content"])
                else:
                    formatted_prompt += message["content"]
                formatted_prompt += "<|eot_id|>"

            else:
                formatted_prompt += f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n{message['content'].strip()}<|eot_id|>"

        # Generation prompt: the model continues as the assistant.
        formatted_prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

        return formatted_prompt

    @staticmethod
    def _parse_function_calls(result: str) -> list:
        """Parse a raw model completion into a list of call dicts.

        Shared by ``decode_ast`` and ``decode_execute`` (previously duplicated).
        Llama sometimes separates multiple calls with ``;``:
            '{"name": "f", "parameters": {...}}; {"name": "g", "parameters": {...}}'
        and sometimes emits a single list:
            '[{"name": "f", "parameters": {...}}, ...]'

        Raises on malformed output so the caller records a decode failure.
        NOTE: a literal ';' inside an argument string would break the split.
        """
        result = result.replace("<|python_tag|>", "")
        if ";" in result:
            function_calls = [json.loads(chunk) for chunk in result.split(";")]
        else:
            # SECURITY: eval() on untrusted model output can execute arbitrary
            # code. Kept for parity with the other handlers' decoding behavior;
            # prefer json.loads / ast.literal_eval when hardening this path.
            function_calls = eval(result)
            if isinstance(function_calls, dict):
                function_calls = [function_calls]
        return function_calls

    def decode_ast(self, result, language="Python"):
        """Decode model output into ``[{func_name: params_dict}, ...]``."""
        return [
            {func_call["name"]: func_call["parameters"]}
            for func_call in self._parse_function_calls(result)
        ]

    def decode_execute(self, result):
        """Decode model output into executable call strings ``name(k=v, ...)``."""
        execution_list = []
        for func_call in self._parse_function_calls(result):
            name = func_call["name"]
            params = func_call["parameters"]
            execution_list.append(
                f"{name}({','.join([f'{k}={repr(v)}' for k,v in params.items()])})"
            )
        return execution_list

    def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
        """Prepare the function docs for a prompting-style query.

        Returns an empty message list because Llama supplies its own fixed
        system prompt inside ``_format_prompt``.
        """
        functions: list = test_entry["function"]
        test_category: str = test_entry["id"].rsplit("_", 1)[0]

        functions = func_doc_language_specific_pre_processing(functions, test_category)

        return {"message": [], "function": functions}

0 commit comments

Comments
 (0)