Skip to content

Commit 1bb65c9

Browse files
Add support for llama-3.1-nemotron-ultra-253b-v1 to BFCL (#1032)
Add LLaMA-3.1 Nemotron Ultra 253B v1 (FC) to the leaderboard. Used Nvidia API inference handler --------- Co-authored-by: Huanzhi Mao <[email protected]>
1 parent b6e8dfd commit 1bb65c9

File tree

5 files changed

+123
-2
lines changed

5 files changed

+123
-2
lines changed

berkeley-function-call-leaderboard/SUPPORTED_MODELS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ For model names containing `{...}`, multiple versions are available. For example
7474
| Hammer2.1-{7b,3b,1.5b,0.5b} | Function Calling | Self-hosted 💻 | MadeAgents/Hammer2.1-{7b,3b,1.5b,0.5b} |
7575
| Llama-3.1-{8B,70B}-Instruct | Function Calling | Self-hosted 💻 | meta-llama/Llama-3.1-{8B,70B}-Instruct-FC |
7676
| Llama-3.1-{8B,70B}-Instruct | Prompt | Self-hosted 💻 | meta-llama/Llama-3.1-{8B,70B}-Instruct |
77+
| Llama-3.1-Nemotron-Ultra-253B-v1 | Function Calling | Nvidia | nvidia/llama-3.1-nemotron-ultra-253b-v1 |
7778
| Llama-3.2-{1B,3B}-Instruct | Function Calling | Self-hosted 💻 | meta-llama/Llama-3.2-{1B,3B}-Instruct-FC |
7879
| Llama-3.3-70B-Instruct | Function Calling | Self-hosted 💻 | meta-llama/Llama-3.3-70B-Instruct-FC |
7980
| Llama-4-Maverick-17B-128E-Instruct-FP8 | Prompt | Novita AI | meta-llama/llama-4-maverick-17b-128e-instruct-fp8-novita |
@@ -105,7 +106,7 @@ For model names containing `{...}`, multiple versions are available. For example
105106
| Phi-4-mini-instruct | Function Calling | Self-hosted 💻 | microsoft/Phi-4-mini-instruct-FC |
106107
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Prompt | Alibaba Cloud | qwen3-{0.6b,1.7b,4b,8b,14b,32b} |
107108
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Prompt | Self-hosted 💻 | Qwen/Qwen3-{0.6B,1.7B,4B,8B,14B,32B} |
108-
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Function Calling | Alibaba Cloud | qwen3-{0.6b,1.7b,4b,8b,14b,32b}-FC |
109+
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Function Calling | Alibaba Cloud | qwen3-{0.6b,1.7b,4b,8b,14b,32b}-FC |
109110
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Function Calling | Self-hosted 💻 | Qwen/Qwen3-{0.6B,1.7B,4B,8B,14B,32B}-FC |
110111
| Qwen3-{30B-A3B,235B-A22B} | Prompt | Alibaba Cloud | qwen3-{30b-a3b, 235b-a22b} |
111112
| Qwen3-{30B-A3B,235B-A22B} | Prompt | Self-hosted 💻 | Qwen/Qwen3-{30B-A3B,235B-A22B} |

berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from bfcl_eval.model_handler.api_inference.grok import GrokHandler
1515
from bfcl_eval.model_handler.api_inference.mining import MiningHandler
1616
from bfcl_eval.model_handler.api_inference.mistral import MistralHandler
17+
from bfcl_eval.model_handler.api_inference.nemotron import NemotronHandler
1718
from bfcl_eval.model_handler.api_inference.nexus import NexusHandler
1819
from bfcl_eval.model_handler.api_inference.nova import NovaHandler
1920
from bfcl_eval.model_handler.api_inference.novita import NovitaHandler
@@ -756,6 +757,18 @@ class ModelConfig:
756757
is_fc_model=False,
757758
underscore_to_dot=False,
758759
),
760+
"nvidia/llama-3.1-nemotron-ultra-253b-v1": ModelConfig(
761+
model_name="nvidia/llama-3.1-nemotron-ultra-253b-v1",
762+
display_name="Llama-3.1-Nemotron-Ultra-253B-v1 (FC)",
763+
url="https://huggingface.co/nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
764+
org="NVIDIA",
765+
license="nvidia-open-model-license",
766+
model_handler=NemotronHandler,
767+
input_price=None,
768+
output_price=None,
769+
is_fc_model=True,
770+
underscore_to_dot=False,
771+
),
759772
"nvidia/nemotron-4-340b-instruct": ModelConfig(
760773
model_name="nvidia/nemotron-4-340b-instruct",
761774
display_name="Nemotron-4-340b-instruct (Prompt)",

berkeley-function-call-leaderboard/bfcl_eval/constants/supported_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"command-r7b-12-2024-FC",
6969
"command-a-03-2025-FC",
7070
"snowflake/arctic",
71+
"nvidia/llama-3.1-nemotron-ultra-253b-v1",
7172
"nvidia/nemotron-4-340b-instruct",
7273
"BitAgent/GoGoAgent",
7374
"palmyra-x-004",

berkeley-function-call-leaderboard/bfcl_eval/eval_checker/eval_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ def evaluate_task(
462462
score_dir,
463463
)
464464

465-
record_result(state, model_name, test_category, accuracy, total_count)
465+
record_result(state["leaderboard_table"], model_name, test_category, accuracy, total_count)
466466
print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}")
467467

468468
return state
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import re
2+
3+
from bfcl_eval.model_handler.api_inference.nvidia import NvidiaHandler
4+
from bfcl_eval.model_handler.utils import (
5+
combine_consecutive_user_prompts,
6+
convert_system_prompt_into_user_prompt,
7+
default_decode_ast_prompting,
8+
default_decode_execute_prompting,
9+
func_doc_language_specific_pre_processing,
10+
)
11+
from overrides import override
12+
13+
14+
class NemotronHandler(NvidiaHandler):
    """Handler for the LLaMA 3.1 Nemotron Ultra 253B v1 model.

    This handler extends NvidiaHandler to support the Nemotron model's XML-based
    function calling format. The model expects:
    - <TOOLCALL>[function_calls]</TOOLCALL> for function calls
    - <AVAILABLE_TOOLS>{functions}</AVAILABLE_TOOLS> for function documentation
    """

    # Compiled once at class level so both decode paths share one pattern.
    # DOTALL lets the bracketed call list span multiple lines.
    _TOOLCALL_RE = re.compile(r"<TOOLCALL>(.*?)</TOOLCALL>", re.DOTALL)

    @classmethod
    def _extract_toolcall(cls, result):
        """Return the text inside the first <TOOLCALL>...</TOOLCALL> pair, or None.

        Shared by ``decode_ast`` and ``decode_execute`` so the extraction logic
        cannot drift between the two.
        """
        match = cls._TOOLCALL_RE.search(result)
        return match.group(1) if match else None

    def _format_system_prompt(self, prompts, function_docs, test_category):
        """Format the system prompt in the Nemotron-specific XML format.

        Folds the first user message and the function documentation into a
        single system prompt inserted at the front of ``prompts``.

        Args:
            prompts: Chat messages ({"role": ..., "content": ...}); mutated
                in place.
            function_docs: Function documentation embedded in the
                <AVAILABLE_TOOLS> section of the system prompt.
            test_category: Unused here; kept for signature parity with
                sibling handlers.

        Returns:
            The mutated ``prompts`` list, now starting with the system prompt.
        """
        system_prompt_template = """You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.

If you decide to invoke any of the function(s), you MUST put it in the format of <TOOLCALL>[func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]</TOOLCALL>

You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.

<AVAILABLE_TOOLS>{functions}</AVAILABLE_TOOLS>

{user_prompt}"""

        # Extract the first user message content (if any) and remove it from
        # the list - it will be folded into the system prompt. The break after
        # pop() makes the single in-loop mutation safe.
        user_prompt = ""
        for idx, msg in enumerate(prompts):
            if msg["role"] == "user":
                user_prompt = msg["content"]
                prompts.pop(idx)
                break

        system_prompt = system_prompt_template.format(
            functions=function_docs, user_prompt=user_prompt
        )

        # Insert the system prompt at the beginning of the list.
        prompts.insert(0, {"role": "system", "content": system_prompt})

        return prompts

    @override
    def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
        """Process the input query and format it for the Nemotron model."""
        functions: list = test_entry["function"]
        test_category: str = test_entry["id"].rsplit("_", 1)[0]

        # Pre-process functions based on language
        functions = func_doc_language_specific_pre_processing(functions, test_category)

        for round_idx in range(len(test_entry["question"])):
            # Nemotron takes one system prompt (built below); demote per-round
            # system messages to user turns, then merge adjacent user turns.
            test_entry["question"][round_idx] = convert_system_prompt_into_user_prompt(
                test_entry["question"][round_idx]
            )
            test_entry["question"][round_idx] = combine_consecutive_user_prompts(
                test_entry["question"][round_idx]
            )

        test_entry["question"][0] = self._format_system_prompt(
            test_entry["question"][0], functions, test_category
        )

        # Return empty message list - messages will be added incrementally
        return {"message": []}

    @override
    def decode_ast(self, result, language="Python"):
        """Extract function calls from the Nemotron XML format."""
        func_call_str = self._extract_toolcall(result)
        if func_call_str is None:
            # No <TOOLCALL> section means the model declined to call anything.
            return []
        return default_decode_ast_prompting(func_call_str, language)

    @override
    def decode_execute(self, result, language="Python"):
        """Convert Nemotron response to executable function calls."""
        func_call_str = self._extract_toolcall(result)
        if func_call_str is None:
            # No <TOOLCALL> section means the model declined to call anything.
            return []
        return default_decode_execute_prompting(func_call_str, language)

0 commit comments

Comments
 (0)