Skip to content

Commit 1bb65c9

Browse files
Add support for llama-3.1-nemotron-ultra-253b-v1 to BFCL (#1032)
Add LLaMA-3.1 Nemotron Ultra 253B v1 (FC) to the leaderboard. Used Nvidia API inference handler --------- Co-authored-by: Huanzhi Mao <[email protected]>
1 parent b6e8dfd commit 1bb65c9

File tree

5 files changed

+123
-2
lines changed

5 files changed

+123
-2
lines changed

berkeley-function-call-leaderboard/SUPPORTED_MODELS.md

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ For model names containing `{...}`, multiple versions are available. For example
7474
| Hammer2.1-{7b,3b,1.5b,0.5b} | Function Calling | Self-hosted 💻 | MadeAgents/Hammer2.1-{7b,3b,1.5b,0.5b} |
7575
| Llama-3.1-{8B,70B}-Instruct | Function Calling | Self-hosted 💻 | meta-llama/Llama-3.1-{8B,70B}-Instruct-FC |
7676
| Llama-3.1-{8B,70B}-Instruct | Prompt | Self-hosted 💻 | meta-llama/Llama-3.1-{8B,70B}-Instruct |
77+
| Llama-3.1-Nemotron-Ultra-253B-v1 | Function Calling | Nvidia | nvidia/llama-3.1-nemotron-ultra-253b-v1 |
7778
| Llama-3.2-{1B,3B}-Instruct | Function Calling | Self-hosted 💻 | meta-llama/Llama-3.2-{1B,3B}-Instruct-FC |
7879
| Llama-3.3-70B-Instruct | Function Calling | Self-hosted 💻 | meta-llama/Llama-3.3-70B-Instruct-FC |
7980
| Llama-4-Maverick-17B-128E-Instruct-FP8 | Prompt | Novita AI | meta-llama/llama-4-maverick-17b-128e-instruct-fp8-novita |
@@ -105,7 +106,7 @@ For model names containing `{...}`, multiple versions are available. For example
105106
| Phi-4-mini-instruct | Function Calling | Self-hosted 💻 | microsoft/Phi-4-mini-instruct-FC |
106107
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Prompt | Alibaba Cloud | qwen3-{0.6b,1.7b,4b,8b,14b,32b} |
107108
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Prompt | Self-hosted 💻 | Qwen/Qwen3-{0.6B,1.7B,4B,8B,14B,32B} |
108-
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Function Calling | Alibaba Cloud | qwen3-{0.6b,1.7b,4b,8b,14b,32b}-FC |
109+
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Function Calling | Alibaba Cloud | qwen3-{0.6b,1.7b,4b,8b,14b,32b}-FC |
109110
| Qwen3-{0.6B,1.7B,4B,8B,14B,32B} | Function Calling | Self-hosted 💻 | Qwen/Qwen3-{0.6B,1.7B,4B,8B,14B,32B}-FC |
110111
| Qwen3-{30B-A3B,235B-A22B} | Prompt | Alibaba Cloud | qwen3-{30b-a3b, 235b-a22b} |
111112
| Qwen3-{30B-A3B,235B-A22B} | Prompt | Self-hosted 💻 | Qwen/Qwen3-{30B-A3B,235B-A22B} |

berkeley-function-call-leaderboard/bfcl_eval/constants/model_config.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from bfcl_eval.model_handler.api_inference.grok import GrokHandler
1515
from bfcl_eval.model_handler.api_inference.mining import MiningHandler
1616
from bfcl_eval.model_handler.api_inference.mistral import MistralHandler
17+
from bfcl_eval.model_handler.api_inference.nemotron import NemotronHandler
1718
from bfcl_eval.model_handler.api_inference.nexus import NexusHandler
1819
from bfcl_eval.model_handler.api_inference.nova import NovaHandler
1920
from bfcl_eval.model_handler.api_inference.novita import NovitaHandler
@@ -756,6 +757,18 @@ class ModelConfig:
756757
is_fc_model=False,
757758
underscore_to_dot=False,
758759
),
760+
"nvidia/llama-3.1-nemotron-ultra-253b-v1": ModelConfig(
761+
model_name="nvidia/llama-3.1-nemotron-ultra-253b-v1",
762+
display_name="Llama-3.1-Nemotron-Ultra-253B-v1 (FC)",
763+
url="https://huggingface.co/nvidia/Llama-3_1-Nemotron-Ultra-253B-v1",
764+
org="NVIDIA",
765+
license="nvidia-open-model-license",
766+
model_handler=NemotronHandler,
767+
input_price=None,
768+
output_price=None,
769+
is_fc_model=True,
770+
underscore_to_dot=False,
771+
),
759772
"nvidia/nemotron-4-340b-instruct": ModelConfig(
760773
model_name="nvidia/nemotron-4-340b-instruct",
761774
display_name="Nemotron-4-340b-instruct (Prompt)",

berkeley-function-call-leaderboard/bfcl_eval/constants/supported_models.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@
6868
"command-r7b-12-2024-FC",
6969
"command-a-03-2025-FC",
7070
"snowflake/arctic",
71+
"nvidia/llama-3.1-nemotron-ultra-253b-v1",
7172
"nvidia/nemotron-4-340b-instruct",
7273
"BitAgent/GoGoAgent",
7374
"palmyra-x-004",

berkeley-function-call-leaderboard/bfcl_eval/eval_checker/eval_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -462,7 +462,7 @@ def evaluate_task(
462462
score_dir,
463463
)
464464

465-
record_result(state, model_name, test_category, accuracy, total_count)
465+
record_result(state["leaderboard_table"], model_name, test_category, accuracy, total_count)
466466
print(f"✅ Test completed: {test_category}. 🎯 Accuracy: {accuracy}")
467467

468468
return state
Lines changed: 106 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,106 @@
1+
import re
2+
3+
from bfcl_eval.model_handler.api_inference.nvidia import NvidiaHandler
4+
from bfcl_eval.model_handler.utils import (
5+
combine_consecutive_user_prompts,
6+
convert_system_prompt_into_user_prompt,
7+
default_decode_ast_prompting,
8+
default_decode_execute_prompting,
9+
func_doc_language_specific_pre_processing,
10+
)
11+
from overrides import override
12+
13+
14+
class NemotronHandler(NvidiaHandler):
    """Handler for the LLaMA 3.1 Nemotron Ultra 253B v1 model.

    This handler extends NvidiaHandler to support the Nemotron model's XML-based
    function calling format. The model expects:
    - <TOOLCALL>[function_calls]</TOOLCALL> for function calls
    - <AVAILABLE_TOOLS>{functions}</AVAILABLE_TOOLS> for function documentation
    """

    # Compiled once at class level so both decode paths share one pattern.
    # DOTALL lets the bracketed call list span multiple lines.
    _TOOLCALL_RE = re.compile(r"<TOOLCALL>(.*?)</TOOLCALL>", re.DOTALL)

    @classmethod
    def _extract_toolcall(cls, result):
        """Return the text inside the first <TOOLCALL>...</TOOLCALL> pair, or None.

        Shared by ``decode_ast`` and ``decode_execute`` so the extraction logic
        cannot drift between the two.
        """
        match = cls._TOOLCALL_RE.search(result)
        return match.group(1) if match else None

    def _format_system_prompt(self, prompts, function_docs, test_category):
        """Format the system prompt in the Nemotron-specific XML format.

        Folds the first user message and the function documentation into a
        single system prompt inserted at the front of ``prompts``.

        Args:
            prompts: Chat messages ({"role": ..., "content": ...}); mutated
                in place.
            function_docs: Function documentation embedded in the
                <AVAILABLE_TOOLS> section of the system prompt.
            test_category: Unused here; kept for signature parity with
                sibling handlers.

        Returns:
            The mutated ``prompts`` list, now starting with the system prompt.
        """
        system_prompt_template = """You are an expert in composing functions. You are given a question and a set of possible functions.
Based on the question, you will need to make one or more function/tool calls to achieve the purpose.
If none of the function can be used, point it out. If the given question lacks the parameters required by the function,
also point it out. You should only return the function call in tools call sections.

If you decide to invoke any of the function(s), you MUST put it in the format of <TOOLCALL>[func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]</TOOLCALL>

You SHOULD NOT include any other text in the response.
Here is a list of functions in JSON format that you can invoke.

<AVAILABLE_TOOLS>{functions}</AVAILABLE_TOOLS>

{user_prompt}"""

        # Extract the first user message content (if any) and remove it from
        # the list - it will be folded into the system prompt. The break after
        # pop() makes the single in-loop mutation safe.
        user_prompt = ""
        for idx, msg in enumerate(prompts):
            if msg["role"] == "user":
                user_prompt = msg["content"]
                prompts.pop(idx)
                break

        system_prompt = system_prompt_template.format(
            functions=function_docs, user_prompt=user_prompt
        )

        # Insert the system prompt at the beginning of the list.
        prompts.insert(0, {"role": "system", "content": system_prompt})

        return prompts

    @override
    def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
        """Process the input query and format it for the Nemotron model."""
        functions: list = test_entry["function"]
        test_category: str = test_entry["id"].rsplit("_", 1)[0]

        # Pre-process functions based on language
        functions = func_doc_language_specific_pre_processing(functions, test_category)

        for round_idx in range(len(test_entry["question"])):
            # Nemotron takes one system prompt (built below); demote per-round
            # system messages to user turns, then merge adjacent user turns.
            test_entry["question"][round_idx] = convert_system_prompt_into_user_prompt(
                test_entry["question"][round_idx]
            )
            test_entry["question"][round_idx] = combine_consecutive_user_prompts(
                test_entry["question"][round_idx]
            )

        test_entry["question"][0] = self._format_system_prompt(
            test_entry["question"][0], functions, test_category
        )

        # Return empty message list - messages will be added incrementally
        return {"message": []}

    @override
    def decode_ast(self, result, language="Python"):
        """Extract function calls from the Nemotron XML format."""
        func_call_str = self._extract_toolcall(result)
        if func_call_str is None:
            # No <TOOLCALL> section means the model declined to call anything.
            return []
        return default_decode_ast_prompting(func_call_str, language)

    @override
    def decode_execute(self, result, language="Python"):
        """Convert Nemotron response to executable function calls."""
        func_call_str = self._extract_toolcall(result)
        if func_call_str is None:
            # No <TOOLCALL> section means the model declined to call anything.
            return []
        return default_decode_execute_prompting(func_call_str, language)

0 commit comments

Comments
 (0)