Skip to content

Commit bd14f1c

Browse files
HuanzhiMao and VishnuSuresh27
authored and committed
[BFCL] Add Llama-3.2-1B-Instruct, Llama-3.2-3B-Instruct, Llama-3.1-8B-Instruct, Llama-3.1-70B-Instruct (ShishirPatil#657)
This PR adds the following new models to the leaderboard: - `meta-llama/Llama-3.2-1B-Instruct` - `meta-llama/Llama-3.2-1B-Instruct-FC` - `meta-llama/Llama-3.2-3B-Instruct` - `meta-llama/Llama-3.2-3B-Instruct-FC` - `meta-llama/Llama-3.1-8B-Instruct` - `meta-llama/Llama-3.1-8B-Instruct-FC` - `meta-llama/Llama-3.1-70B-Instruct` - `meta-llama/Llama-3.1-70B-Instruct-FC`
1 parent a94a4ce commit bd14f1c

File tree

7 files changed

+311
-5
lines changed

7 files changed

+311
-5
lines changed

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,15 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
1212
- `microsoft/Phi-3-mini-128k-instruct`
1313
- `microsoft/Phi-3-mini-4k-instruct`
1414
- [Sept 25, 2024] [#660](https://github.com/ShishirPatil/gorilla/pull/660): Bug fix in `parse_nested_value` function to handle nested dictionary values properly.
15+
- [Sept 24, 2024] [#657](https://github.com/ShishirPatil/gorilla/pull/657): Add the following new models to the leaderboard:
16+
- `meta-llama/Llama-3.2-1B-Instruct`
17+
- `meta-llama/Llama-3.2-1B-Instruct-FC`
18+
- `meta-llama/Llama-3.2-3B-Instruct`
19+
- `meta-llama/Llama-3.2-3B-Instruct-FC`
20+
- `meta-llama/Llama-3.1-8B-Instruct`
21+
- `meta-llama/Llama-3.1-8B-Instruct-FC`
22+
- `meta-llama/Llama-3.1-70B-Instruct`
23+
- `meta-llama/Llama-3.1-70B-Instruct-FC`
1524
- [Sept 24, 2024] [#648](https://github.com/ShishirPatil/gorilla/pull/648): Add the following new models to the leaderboard:
1625
- `gemini-1.5-pro-002`
1726
- `gemini-1.5-pro-002-FC`

berkeley-function-call-leaderboard/README.md

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -152,7 +152,11 @@ Below is _a table of models we support_ to run our leaderboard evaluation agains
152152
|google/gemma-7b-it 💻| Prompt|
153153
|meetkai/functionary-medium-v3.1-FC| Function Calling|
154154
|meetkai/functionary-small-{v3.1,v3.2}-FC| Function Calling|
155-
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct | Prompt|
155+
|meta-llama/Meta-Llama-3-{8B,70B}-Instruct 💻| Prompt|
156+
|meta-llama/Llama-3.1-{8B,70B}-Instruct-FC 💻| Function Calling|
157+
|meta-llama/Llama-3.1-{8B,70B}-Instruct 💻| Prompt|
158+
|meta-llama/Llama-3.2-{1B,3B}-Instruct-FC 💻| Function Calling|
159+
|meta-llama/Llama-3.2-{1B,3B}-Instruct 💻| Prompt|
156160
|open-mixtral-{8x7b,8x22b} | Prompt|
157161
|open-mixtral-8x22b-FC | Function Calling|
158162
|open-mistral-nemo-2407 | Prompt|

berkeley-function-call-leaderboard/bfcl/eval_checker/model_metadata.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -401,6 +401,54 @@
401401
"Meta",
402402
"Meta Llama 3 Community",
403403
],
404+
"meta-llama/Llama-3.1-8B-Instruct": [
405+
"Llama-3.1-8B-Instruct (Prompt)",
406+
"https://llama.meta.com/llama3",
407+
"Meta",
408+
"Meta Llama 3 Community",
409+
],
410+
"meta-llama/Llama-3.1-70B-Instruct": [
411+
"Llama-3.1-70B-Instruct (Prompt)",
412+
"https://llama.meta.com/llama3",
413+
"Meta",
414+
"Meta Llama 3 Community",
415+
],
416+
"meta-llama/Llama-3.2-1B-Instruct": [
417+
"Llama-3.2-1B-Instruct (Prompt)",
418+
"https://llama.meta.com/llama3",
419+
"Meta",
420+
"Meta Llama 3 Community",
421+
],
422+
"meta-llama/Llama-3.2-3B-Instruct": [
423+
"Llama-3.2-3B-Instruct (Prompt)",
424+
"https://llama.meta.com/llama3",
425+
"Meta",
426+
"Meta Llama 3 Community",
427+
],
428+
"meta-llama/Llama-3.1-8B-Instruct-FC": [
429+
"Llama-3.1-8B-Instruct (FC)",
430+
"https://llama.meta.com/llama3",
431+
"Meta",
432+
"Meta Llama 3 Community",
433+
],
434+
"meta-llama/Llama-3.1-70B-Instruct-FC": [
435+
"Llama-3.1-70B-Instruct (FC)",
436+
"https://llama.meta.com/llama3",
437+
"Meta",
438+
"Meta Llama 3 Community",
439+
],
440+
"meta-llama/Llama-3.2-1B-Instruct-FC": [
441+
"Llama-3.2-1B-Instruct (FC)",
442+
"https://llama.meta.com/llama3",
443+
"Meta",
444+
"Meta Llama 3 Community",
445+
],
446+
"meta-llama/Llama-3.2-3B-Instruct-FC": [
447+
"Llama-3.2-3B-Instruct (FC)",
448+
"https://llama.meta.com/llama3",
449+
"Meta",
450+
"Meta Llama 3 Community",
451+
],
404452
"command-r-plus-FC": [
405453
"Command-R-Plus (FC) (Original)",
406454
"https://txt.cohere.com/command-r-plus-microsoft-azure",

berkeley-function-call-leaderboard/bfcl/model_handler/handler_map.py

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,7 @@
66
from bfcl.model_handler.oss_model.hammer import HammerHandler
77
from bfcl.model_handler.oss_model.hermes import HermesHandler
88
from bfcl.model_handler.oss_model.llama import LlamaHandler
9+
from bfcl.model_handler.oss_model.llama_fc import LlamaFCHandler
910
from bfcl.model_handler.oss_model.phi import PhiHandler
1011
from bfcl.model_handler.oss_model.salesforce import SalesforceHandler
1112
from bfcl.model_handler.proprietary_model.claude import ClaudeHandler
@@ -21,7 +22,7 @@
2122
from bfcl.model_handler.proprietary_model.openai import OpenAIHandler
2223
from bfcl.model_handler.proprietary_model.yi import YiHandler
2324

24-
# TODO: Add Deepseek V2 and Gemma V2
25+
# TODO: Add Deepseek V2 and Gemma V2, meta-llama/Llama-3.1-405B-Instruct
2526
handler_map = {
2627
# Inference through API calls
2728
"gorilla-openfunctions-v2": GorillaHandler,
@@ -79,6 +80,14 @@
7980
# Inference through local hosting
8081
"meta-llama/Meta-Llama-3-8B-Instruct": LlamaHandler,
8182
"meta-llama/Meta-Llama-3-70B-Instruct": LlamaHandler,
83+
"meta-llama/Llama-3.1-8B-Instruct-FC": LlamaFCHandler,
84+
"meta-llama/Llama-3.1-70B-Instruct-FC": LlamaFCHandler,
85+
"meta-llama/Llama-3.2-1B-Instruct-FC": LlamaFCHandler,
86+
"meta-llama/Llama-3.2-3B-Instruct-FC": LlamaFCHandler,
87+
"meta-llama/Llama-3.1-8B-Instruct": LlamaHandler,
88+
"meta-llama/Llama-3.1-70B-Instruct": LlamaHandler,
89+
"meta-llama/Llama-3.2-1B-Instruct": LlamaHandler,
90+
"meta-llama/Llama-3.2-3B-Instruct": LlamaHandler,
8291
"Salesforce/xLAM-1b-fc-r": SalesforceHandler,
8392
"Salesforce/xLAM-7b-fc-r": SalesforceHandler,
8493
"Salesforce/xLAM-7b-r": SalesforceHandler,

berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/base_oss_handler.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@
2121
class OSSHandler(BaseHandler):
2222
def __init__(self, model_name, temperature, dtype="bfloat16") -> None:
2323
super().__init__(model_name, temperature)
24+
self.model_name_huggingface = model_name
2425
self.model_style = ModelStyle.OSSMODEL
2526
self.dtype = dtype
2627
self.client = OpenAI(base_url=f"http://localhost:{VLLM_PORT}/v1", api_key="EMPTY")
@@ -57,7 +58,7 @@ def batch_inference(
5758
[
5859
"vllm",
5960
"serve",
60-
str(self.model_name),
61+
str(self.model_name_huggingface),
6162
"--port",
6263
str(VLLM_PORT),
6364
"--dtype",
@@ -205,15 +206,15 @@ def _query_prompting(self, inference_data: dict):
205206

206207
if hasattr(self, "stop_token_ids"):
207208
api_response = self.client.completions.create(
208-
model=self.model_name,
209+
model=self.model_name_huggingface,
209210
temperature=self.temperature,
210211
prompt=formatted_prompt,
211212
stop_token_ids=self.stop_token_ids,
212213
max_tokens=4096, # TODO: Is there a better way to handle this?
213214
)
214215
else:
215216
api_response = self.client.completions.create(
216-
model=self.model_name,
217+
model=self.model_name_huggingface,
217218
temperature=self.temperature,
218219
prompt=formatted_prompt,
219220
max_tokens=4096,

berkeley-function-call-leaderboard/bfcl/model_handler/oss_model/llama.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,9 @@
11
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
22

3+
4+
# Note: This is the handler for the Llama models in prompting mode.
5+
# For function call mode, use LlamaFCHandler instead.
6+
# The Llama 3 series is benchmarked in prompting mode only, while the Llama 3.1/3.2 series are benchmarked in both prompting mode (LlamaHandler) and function-calling mode (LlamaFCHandler).
37
class LlamaHandler(OSSHandler):
48
def __init__(self, model_name, temperature) -> None:
59
super().__init__(model_name, temperature)
Lines changed: 231 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,231 @@
1+
import json
2+
3+
from bfcl.model_handler.oss_model.base_oss_handler import OSSHandler
4+
from bfcl.model_handler.utils import func_doc_language_specific_pre_processing
5+
6+
# TODO: Merge with LlamaHandler
7+
8+
9+
class LlamaFCHandler(OSSHandler):
    """Handler for locally-hosted Llama 3.1 / 3.2 Instruct models in
    function-calling (FC) mode.

    The leaderboard registers these models with a ``-FC`` suffix that is not
    part of the real HuggingFace repository id, so the suffix is stripped
    before the name is handed to vLLM (see ``__init__``).
    """

    def __init__(self, model_name, temperature) -> None:
        super().__init__(model_name, temperature)
        # Strip the leaderboard-internal "-FC" suffix to recover the actual
        # HuggingFace repo id (e.g. "meta-llama/Llama-3.1-8B-Instruct").
        self.model_name_huggingface = model_name.replace("-FC", "")

    @staticmethod
    def _format_prompt(messages, function):
        """Render the Llama 3.1 tool-use chat template by hand.

        This mirrors the relevant branches of the upstream HuggingFace chat
        template, reproduced below for reference:

        "bos_token": "<|begin_of_text|>",
        "chat_template":
        {{- bos_token }}
        {%- if custom_tools is defined %}
            {%- set tools = custom_tools %}
        {%- endif %}
        {%- if not tools_in_user_message is defined %}
            {%- set tools_in_user_message = true %}
        {%- endif %}
        {%- if not date_string is defined %}
            {%- set date_string = "26 Jul 2024" %}
        {%- endif %}
        {%- if not tools is defined %}
            {%- set tools = none %}
        {%- endif %}

        {#- This block extracts the system message, so we can slot it into the right place. #}
        {%- if messages[0]['role'] == 'system' %}
            {%- set system_message = messages[0]['content']|trim %}
            {%- set messages = messages[1:] %}
        {%- else %}
            {%- set system_message = "" %}
        {%- endif %}

        {#- System message + builtin tools #}
        {{- "<|start_header_id|>system<|end_header_id|>\n\n" }}
        {%- if builtin_tools is defined or tools is not none %}
            {{- "Environment: ipython\n" }}
        {%- endif %}
        {%- if builtin_tools is defined %}
            {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}}
        {%- endif %}
        {{- "Cutting Knowledge Date: December 2023\n" }}
        {{- "Today Date: " + date_string + "\n\n" }}
        {%- if tools is not none and not tools_in_user_message %}
            {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }}
            {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
            {{- "Do not use variables.\n\n" }}
            {%- for t in tools %}
                {{- t | tojson(indent=4) }}
                {{- "\n\n" }}
            {%- endfor %}
        {%- endif %}
        {{- system_message }}
        {{- "<|eot_id|>" }}

        {#- Custom tools are passed in a user message with some extra guidance #}
        {%- if tools_in_user_message and not tools is none %}
            {#- Extract the first user message so we can plug it in here #}
            {%- if messages | length != 0 %}
                {%- set first_user_message = messages[0]['content']|trim %}
                {%- set messages = messages[1:] %}
            {%- else %}
                {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }}
            {%- endif %}
            {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}}
            {{- "Given the following functions, please respond with a JSON for a function call " }}
            {{- "with its proper arguments that best answers the given prompt.\n\n" }}
            {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }}
            {{- "Do not use variables.\n\n" }}
            {%- for t in tools %}
                {{- t | tojson(indent=4) }}
                {{- "\n\n" }}
            {%- endfor %}
            {{- first_user_message + "<|eot_id|>"}}
        {%- endif %}

        {%- for message in messages %}
            {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %}
                {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }}
            {%- elif 'tool_calls' in message %}
                {%- if not message.tool_calls|length == 1 %}
                    {{- raise_exception("This model only supports single tool-calls at once!") }}
                {%- endif %}
                {%- set tool_call = message.tool_calls[0].function %}
                {%- if builtin_tools is defined and tool_call.name in builtin_tools %}
                    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
                    {{- "<|python_tag|>" + tool_call.name + ".call(" }}
                    {%- for arg_name, arg_val in tool_call.arguments | items %}
                        {{- arg_name + '="' + arg_val + '"' }}
                        {%- if not loop.last %}
                            {{- ", " }}
                        {%- endif %}
                    {%- endfor %}
                    {{- ")" }}
                {%- else %}
                    {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}}
                    {{- '{"name": "' + tool_call.name + '", ' }}
                    {{- '"parameters": ' }}
                    {{- tool_call.arguments | tojson }}
                    {{- "}" }}
                {%- endif %}
                {%- if builtin_tools is defined %}
                    {#- This means we're in ipython mode #}
                    {{- "<|eom_id|>" }}
                {%- else %}
                    {{- "<|eot_id|>" }}
                {%- endif %}
            {%- elif message.role == "tool" or message.role == "ipython" %}
                {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }}
                {%- if message.content is mapping or message.content is iterable %}
                    {{- message.content | tojson }}
                {%- else %}
                    {{- message.content }}
                {%- endif %}
                {{- "<|eot_id|>" }}
            {%- endif %}
        {%- endfor %}
        {%- if add_generation_prompt %}
            {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }}
        {%- endif %}
        """
        formatted_prompt = "<|begin_of_text|>"

        # Extract an optional leading system message; guard against an empty
        # message list (the original indexed messages[0] unconditionally).
        system_message = ""
        remaining_messages = messages
        if messages and messages[0]["role"] == "system":
            system_message = messages[0]["content"].strip()
            remaining_messages = messages[1:]

        formatted_prompt += "<|start_header_id|>system<|end_header_id|>\n\n"
        formatted_prompt += "Environment: ipython\n"
        formatted_prompt += "Cutting Knowledge Date: December 2023\n"
        formatted_prompt += "Today Date: 26 Jul 2024\n\n"
        formatted_prompt += system_message + "<|eot_id|>"

        # Llama expects the custom tool definitions inside the first user turn.
        is_first_user_message = True
        for message in remaining_messages:
            if message["role"] == "user" and is_first_user_message:
                is_first_user_message = False
                formatted_prompt += "<|start_header_id|>user<|end_header_id|>\n\n"
                formatted_prompt += "Given the following functions, please respond with a JSON for a function call "
                formatted_prompt += (
                    "with its proper arguments that best answers the given prompt.\n\n"
                )
                formatted_prompt += 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.'
                formatted_prompt += "Do not use variables.\n\n"
                for func in function:
                    formatted_prompt += json.dumps(func, indent=4) + "\n\n"
                formatted_prompt += f"{message['content'].strip()}<|eot_id|>"

            elif message["role"] == "tool":
                # Tool/ipython results go back under the "ipython" header.
                formatted_prompt += "<|start_header_id|>ipython<|end_header_id|>\n\n"
                if isinstance(message["content"], (dict, list)):
                    formatted_prompt += json.dumps(message["content"])
                else:
                    formatted_prompt += message["content"]
                formatted_prompt += "<|eot_id|>"

            else:
                formatted_prompt += f"<|start_header_id|>{message['role']}<|end_header_id|>\n\n{message['content'].strip()}<|eot_id|>"

        # Generation prompt: the model continues as the assistant.
        formatted_prompt += "<|start_header_id|>assistant<|end_header_id|>\n\n"

        return formatted_prompt

    @staticmethod
    def _parse_function_calls(result: str) -> list:
        """Parse a raw model completion into a list of call dicts.

        Shared by ``decode_ast`` and ``decode_execute`` (previously duplicated).
        Llama sometimes separates multiple calls with ``;``:
            '{"name": "f", "parameters": {...}}; {"name": "g", "parameters": {...}}'
        and sometimes emits a single list:
            '[{"name": "f", "parameters": {...}}, ...]'

        Raises on malformed output so the caller records a decode failure.
        NOTE: a literal ';' inside an argument string would break the split.
        """
        result = result.replace("<|python_tag|>", "")
        if ";" in result:
            function_calls = [json.loads(chunk) for chunk in result.split(";")]
        else:
            # SECURITY: eval() on untrusted model output can execute arbitrary
            # code. Kept for parity with the other handlers' decoding behavior;
            # prefer json.loads / ast.literal_eval when hardening this path.
            function_calls = eval(result)
            if isinstance(function_calls, dict):
                function_calls = [function_calls]
        return function_calls

    def decode_ast(self, result, language="Python"):
        """Decode model output into ``[{func_name: params_dict}, ...]``."""
        return [
            {func_call["name"]: func_call["parameters"]}
            for func_call in self._parse_function_calls(result)
        ]

    def decode_execute(self, result):
        """Decode model output into executable call strings ``name(k=v, ...)``."""
        execution_list = []
        for func_call in self._parse_function_calls(result):
            name = func_call["name"]
            params = func_call["parameters"]
            execution_list.append(
                f"{name}({','.join([f'{k}={repr(v)}' for k,v in params.items()])})"
            )
        return execution_list

    def _pre_query_processing_prompting(self, test_entry: dict) -> dict:
        """Prepare the function docs for a prompting-style query.

        Returns an empty message list because Llama supplies its own fixed
        system prompt inside ``_format_prompt``.
        """
        functions: list = test_entry["function"]
        test_category: str = test_entry["id"].rsplit("_", 1)[0]

        functions = func_doc_language_specific_pre_processing(functions, test_category)

        return {"message": [], "function": functions}

0 commit comments

Comments
 (0)