
Commit 1fab38f

Merge pull request #2 from ShishirPatil/main
2 parents 2f57b41 + 00f2c67 commit 1fab38f

10 files changed, +156 -33 lines changed

10 files changed

+156
-33
lines changed

berkeley-function-call-leaderboard/README.md

Lines changed: 8 additions & 8 deletions
@@ -80,7 +80,7 @@ To run the executable test categories, there are 4 API keys to fill out:
 The `apply_function_credential_config.py` script takes an input file and, optionally, an output file. If the output file is not given as an argument, it will overwrite your original file with the cleaned data.
 
 ```bash
-python apply_function_credential_config.py --input_file ./data/gorilla_openfunctions_v1_test_rest.json
+python apply_function_credential_config.py --input-file ./data/gorilla_openfunctions_v1_test_rest.json
 ```
 
 Then, use `eval_data_compilation.py` to compile all files by using
@@ -106,7 +106,7 @@ To generate leaderboard statistics, there are two steps:
 1. Run inference on the evaluation data and obtain the results from specific models
 
 ```bash
-python openfunctions_evaluation.py --model MODEL_NAME --test_category TEST_CATEGORY
+python openfunctions_evaluation.py --model MODEL_NAME --test-category TEST_CATEGORY
 ```
 For TEST_CATEGORY, we have `executable_simple`, `executable_parallel_function`, `executable_multiple_function`, `executable_parallel_multiple_function`, `simple`, `relevance`, `parallel_function`, `multiple_function`, `parallel_multiple_function`, `java`, `javascript`, `rest`, `sql`, `chatable`.
 
@@ -185,9 +185,9 @@ Below is *a table of models we support* to run our leaderboard evaluation against
 |gorilla-openfunctions-v2 | Function Calling|
 |gpt-3.5-turbo-0125-FC| Function Calling|
 |gpt-3.5-turbo-0125| Prompt|
-|gpt-4-{0613,1106-preview,0125-preview}-FC| Function Calling|
-|gpt-4-{0613,1106-preview,0125-preview}|Prompt|
-|glaiveai/glaive-function-calling-v1 💻| Function Calling|
+|gpt-4-{0613,1106-preview,0125-preview,turbo-2024-04-09}-FC| Function Calling|
+|gpt-4-{0613,1106-preview,0125-preview,turbo-2024-04-09}| Prompt|
+|glaiveai/glaive-function-calling-v1 💻| Function Calling|
 |Nexusflow-Raven-v2 | Function Calling|
 |fire-function-v1-FC | Function Calling|
 |mistral-large-2402-FC-{Any,Auto} | Function Calling|
@@ -196,9 +196,8 @@ Below is *a table of models we support* to run our leaderboard evaluation against
 |mistral-small-2402-FC-{Any,Auto} | Function Calling|
 |mistral-small-2402 | Prompt|
 |mistral-tiny-2312 | Prompt|
-|claude-3-{opus,sonnet}-20240229-FC | Function Calling |
-|claude-3-haiku-20240307-FC | Function Calling |
-|claude-3-{opus,sonnet}-20240229 | Prompt |
+|claude-3-{opus-20240229,sonnet-20240229,haiku-20240307}-FC | Function Calling |
+|claude-3-{opus-20240229,sonnet-20240229,haiku-20240307} | Prompt |
 |claude-{2.1,instant-1.2}| Prompt|
 |gemini-1.0-pro | Function Calling|
 |databrick-dbrx-instruct | Prompt|
@@ -222,6 +221,7 @@ For inferencing `Databrick-DBRX-instruct`, you need to create a Databrick Azure
 
 ## Changelog
 
+* [April 16, 2024] [#366](https://github.com/ShishirPatil/gorilla/pull/366): Switch to use Anthropic's new Tool Use Beta `tools-2024-04-04` when generating Claude 3 FC series data. `gpt-4-turbo-2024-04-09` and `gpt-4-turbo-2024-04-09-FC` are also added to the leaderboard.
 * [April 11, 2024] [#347](https://github.com/ShishirPatil/gorilla/pull/347): Add the 95th percentile latency to the leaderboard statistics. This metric is useful for understanding the latency distribution of the models, especially the worst-case scenario.
 * [April 10, 2024] [#339](https://github.com/ShishirPatil/gorilla/pull/339): Introduce REST API sanity check for the executable test category. It ensures that all the API endpoints involved during the execution evaluation process are working properly. If any of them are not behaving as expected, the evaluation process will be stopped by default as the result will be inaccurate. Users can choose to bypass this check by setting the `--skip-api-sanity-check` flag.
 * [April 9, 2024] [#338](https://github.com/ShishirPatil/gorilla/pull/338): Bug fix in the evaluation datasets (including both prompts and function docs). Bug fix for possible answers as well.

berkeley-function-call-leaderboard/apply_function_credential_config.py

Lines changed: 2 additions & 2 deletions
@@ -3,8 +3,8 @@
 
 
 parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.")
-parser.add_argument("--input_file", help="Path to the function credential config file.", required=True)
-parser.add_argument("--output_file", help="Path to the output file.", default="")
+parser.add_argument("--input-file", help="Path to the function credential config file.", required=True)
+parser.add_argument("--output-file", help="Path to the output file.", default="")
 args = parser.parse_args()
 
 # Load the configuration with actual API keys
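
A side note on the renamed flags: argparse maps dashed option names to underscore-separated attribute names, so code that reads `args.input_file` and `args.output_file` keeps working after the rename. A minimal sketch of that behavior (the sample path is illustrative):

```python
import argparse

parser = argparse.ArgumentParser(description="Replace placeholders in the function credential config file.")
parser.add_argument("--input-file", help="Path to the function credential config file.", required=True)
parser.add_argument("--output-file", help="Path to the output file.", default="")

# argparse converts dashes in long option names to underscores for attribute
# access, so the renamed "--input-file" flag is still read as args.input_file.
args = parser.parse_args(["--input-file", "./data/gorilla_openfunctions_v1_test_rest.json"])
print(args.input_file)         # ./data/gorilla_openfunctions_v1_test_rest.json
print(repr(args.output_file))  # '' (default: the input file is overwritten in place)
```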

berkeley-function-call-leaderboard/eval_checker/eval_runner_helper.py

Lines changed: 25 additions & 4 deletions
@@ -66,14 +66,26 @@
         "OpenAI",
         "Proprietary",
     ],
+    "gpt-4-turbo-2024-04-09-FC": [
+        "GPT-4-turbo-2024-04-09 (FC)",
+        "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
+        "OpenAI",
+        "Proprietary",
+    ],
+    "gpt-4-turbo-2024-04-09": [
+        "GPT-4-turbo-2024-04-09 (Prompt)",
+        "https://platform.openai.com/docs/models/gpt-4-and-gpt-4-turbo",
+        "OpenAI",
+        "Proprietary",
+    ],
     "gorilla-openfunctions-v2": [
         "Gorilla-OpenFunctions-v2 (FC)",
         "https://gorilla.cs.berkeley.edu/blogs/7_open_functions_v2.html",
         "Gorilla LLM",
         "Apache 2.0",
     ],
     "claude-3-opus-20240229-FC": [
-        "Claude-3-Opus-20240229 (FC)",
+        "Claude-3-Opus-20240229 (FC tools-2024-04-04)",
         "https://www.anthropic.com/news/claude-3-family",
         "Anthropic",
         "Proprietary",
@@ -103,7 +115,7 @@
         "Proprietary",
     ],
     "claude-3-sonnet-20240229-FC": [
-        "Claude-3-Sonnet-20240229 (FC)",
+        "Claude-3-Sonnet-20240229 (FC tools-2024-04-04)",
         "https://www.anthropic.com/news/claude-3-family",
         "Anthropic",
         "Proprietary",
@@ -115,7 +127,13 @@
         "Proprietary",
     ],
     "claude-3-haiku-20240307-FC": [
-        "Claude-3-Haiku-20240307 (FC)",
+        "Claude-3-Haiku-20240307 (FC tools-2024-04-04)",
+        "https://www.anthropic.com/news/claude-3-family",
+        "Anthropic",
+        "Proprietary",
+    ],
+    "claude-3-haiku-20240307": [
+        "Claude-3-Haiku-20240307 (Prompt)",
         "https://www.anthropic.com/news/claude-3-family",
         "Anthropic",
         "Proprietary",
@@ -279,6 +297,8 @@
     "gpt-4-1106-preview": 10,
     "gpt-4-0125-preview": 10,
     "gpt-4-0125-preview-FC": 10,
+    "gpt-4-turbo-2024-04-09-FC": 10,
+    "gpt-4-turbo-2024-04-09": 10,
     "gpt-4-0613": 30,
     "gpt-4-0613-FC": 30,
     "gpt-3.5-turbo-0125": 1.5,
@@ -302,6 +322,8 @@
     "mistral-small-2402-FC-Any": 6,
     "mistral-small-2402-FC-Auto": 6,
     "mistral-tiny-2312": 0.25,
+    "gpt-4-turbo-2024-04-09-FC": 30,
+    "gpt-4-turbo-2024-04-09": 30,
     "gpt-4-1106-preview": 30,
     "gpt-4-1106-preview-FC": 30,
     "gpt-4-0125-preview-FC": 30,
@@ -314,7 +336,6 @@
     "databricks-dbrx-instruct": 6.75,
 }
 
-
 # The latency of the open-source models are hardcoded here.
 # Because we do batching when generating the data, so the latency is not accurate from the result data.
 # This is the latency for the whole batch of data.

berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py

Lines changed: 85 additions & 0 deletions

@@ -0,0 +1,85 @@
+from model_handler.handler import BaseHandler
+from anthropic import Anthropic
+from anthropic.types import TextBlock
+from anthropic.types.beta.tools import ToolUseBlock
+from model_handler.model_style import ModelStyle
+from model_handler.claude_prompt_handler import ClaudePromptingHandler
+from model_handler.utils import (
+    convert_to_tool,
+    augment_prompt_by_languge,
+    language_specific_pre_processing,
+    ast_parse,
+    convert_to_function_call
+)
+from model_handler.constant import GORILLA_TO_OPENAPI
+import os, time, json
+
+
+class ClaudeFCHandler(BaseHandler):
+    def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
+        super().__init__(model_name, temperature, top_p, max_tokens)
+        self.model_style = ModelStyle.Anthropic_Prompt
+
+        self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
+
+    def inference(self, prompt, functions, test_category):
+        if "FC" not in self.model_name:
+            handler = ClaudePromptingHandler(self.model_name, self.temperature, self.top_p, self.max_tokens)
+            return handler.inference(prompt, functions, test_category)
+        else:
+            prompt = augment_prompt_by_languge(prompt, test_category)
+            functions = language_specific_pre_processing(functions, test_category, True)
+            if type(functions) is not list:
+                functions = [functions]
+            claude_tool = convert_to_tool(
+                functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True
+            )
+            message = [{"role": "user", "content": prompt}]
+            start_time = time.time()
+            response = self.client.beta.tools.messages.create(
+                model=self.model_name.strip("-FC"),
+                max_tokens=self.max_tokens,
+                tools=claude_tool,
+                messages=message,
+            )
+            latency = time.time() - start_time
+            text_outputs = []
+            tool_call_outputs = []
+            for content in response.content:
+                if isinstance(content, TextBlock):
+                    text_outputs.append(content.text)
+                elif isinstance(content, ToolUseBlock):
+                    tool_call_outputs.append({content.name: json.dumps(content.input)})
+            result = tool_call_outputs if tool_call_outputs else text_outputs[0]
+            return result, {"input_tokens": response.usage.input_tokens, "output_tokens": response.usage.output_tokens, "latency": latency}
+
+    def decode_ast(self,result,language="Python"):
+        if "FC" not in self.model_name:
+            decoded_output = ast_parse(result,language)
+        else:
+            decoded_output = []
+            for invoked_function in result:
+                name = list(invoked_function.keys())[0]
+                params = json.loads(invoked_function[name])
+                if language == "Python":
+                    pass
+                else:
+                    # all values of the json are casted to string for java and javascript
+                    for key in params:
+                        params[key] = str(params[key])
+                decoded_output.append({name: params})
+        return decoded_output
+
+    def decode_execute(self,result):
+        if "FC" not in self.model_name:
+            decoded_output = ast_parse(result)
+            execution_list = []
+            for function_call in decoded_output:
+                for key, value in function_call.items():
+                    execution_list.append(
+                        f"{key}({','.join([f'{k}={repr(v)}' for k, v in value.items()])})"
+                    )
+            return execution_list
+        else:
+            function_call = convert_to_function_call(result)
+            return function_call
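
For reference, the FC path of the new handler returns tool calls as a list of `{tool_name: json_encoded_arguments}` dicts, which `decode_ast` later parses back into plain dicts. A small sketch of that round trip, with a made-up tool name and arguments:

```python
import json

# Illustrative shape of the FC handler's raw result: one dict per tool call,
# mapping the tool name to its JSON-encoded arguments (values here are made up).
raw_result = [{"get_weather": json.dumps({"city": "Berkeley", "unit": "celsius"})}]

# Mirrors the non-Java/JavaScript branch of decode_ast: decode each tool call's
# JSON arguments back into a Python dict.
decoded_output = []
for invoked_function in raw_result:
    name = list(invoked_function.keys())[0]
    params = json.loads(invoked_function[name])
    decoded_output.append({name: params})

print(decoded_output)  # [{'get_weather': {'city': 'Berkeley', 'unit': 'celsius'}}]
```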

berkeley-function-call-leaderboard/model_handler/claude_handler.py renamed to berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py

Lines changed: 2 additions & 2 deletions
@@ -18,10 +18,10 @@
 from anthropic import Anthropic
 
 
-class ClaudeHandler(BaseHandler):
+class ClaudePromptingHandler(BaseHandler):
     def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> None:
         super().__init__(model_name, temperature, top_p, max_tokens)
-        self.model_style = ModelStyle.Anthropic
+        self.model_style = ModelStyle.Anthropic_Prompt
 
         self.client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY"))
 
berkeley-function-call-leaderboard/model_handler/constant.py

Lines changed: 6 additions & 1 deletion
@@ -112,11 +112,16 @@
     "any": str,
 }
 
+# If there is any underscore in folder name, you should change it to `/` in the following strings
 UNDERSCORE_TO_DOT = [
+    "gpt-4-turbo-2024-04-09-FC",
     "gpt-4-1106-preview-FC",
     "gpt-4-0125-preview-FC",
     "gpt-4-0613-FC",
     "gpt-3.5-turbo-0125-FC",
+    "claude-3-opus-20240229-FC",
+    "claude-3-sonnet-20240229-FC",
+    "claude-3-haiku-20240307-FC",
     "mistral-large-2402-FC",
     "mistral-large-2402-FC-Any",
     "mistral-large-2402-FC-Auto",
@@ -128,7 +133,7 @@
     "meetkai/functionary-medium-v2.2-FC",
     "meetkai/functionary-small-v2.4-FC",
     "meetkai/functionary-medium-v2.4-FC",
-    "NousResearch/Hermes-2-Pro-Mistral-7B"
+    "NousResearch/Hermes-2-Pro-Mistral-7B",
 ]
 
 TEST_CATEGORIES = {

berkeley-function-call-leaderboard/model_handler/handler_map.py

Lines changed: 12 additions & 8 deletions
@@ -1,6 +1,7 @@
 from model_handler.gorilla_handler import GorillaHandler
 from model_handler.gpt_handler import OpenAIHandler
-from model_handler.claude_handler import ClaudeHandler
+from model_handler.claude_fc_handler import ClaudeFCHandler
+from model_handler.claude_prompt_handler import ClaudePromptingHandler
 from model_handler.mistral_handler import MistralHandler
 from model_handler.firework_ai_handler import FireworkAIHandler
 from model_handler.nexus_handler import NexusHandler
@@ -16,6 +17,8 @@
 handler_map = {
     "gorilla-openfunctions-v0": GorillaHandler,
     "gorilla-openfunctions-v2": GorillaHandler,
+    "gpt-4-turbo-2024-04-09-FC": OpenAIHandler,
+    "gpt-4-turbo-2024-04-09": OpenAIHandler,
     "gpt-4-1106-preview-FC": OpenAIHandler,
     "gpt-4-1106-preview": OpenAIHandler,
     "gpt-4-0125-preview-FC": OpenAIHandler,
@@ -24,13 +27,14 @@
     "gpt-4-0613": OpenAIHandler,
     "gpt-3.5-turbo-0125-FC": OpenAIHandler,
     "gpt-3.5-turbo-0125": OpenAIHandler,
-    "claude-2.1": ClaudeHandler,
-    "claude-instant-1.2": ClaudeHandler,
-    "claude-3-opus-20240229": ClaudeHandler,
-    "claude-3-opus-20240229-FC": ClaudeHandler,
-    "claude-3-sonnet-20240229": ClaudeHandler,
-    "claude-3-sonnet-20240229-FC": ClaudeHandler,
-    "claude-3-haiku-20240307-FC": ClaudeHandler,
+    "claude-2.1": ClaudePromptingHandler,
+    "claude-instant-1.2": ClaudePromptingHandler,
+    "claude-3-opus-20240229": ClaudePromptingHandler,
+    "claude-3-opus-20240229-FC": ClaudeFCHandler,
+    "claude-3-sonnet-20240229": ClaudePromptingHandler,
+    "claude-3-sonnet-20240229-FC": ClaudeFCHandler,
+    "claude-3-haiku-20240307": ClaudePromptingHandler,
+    "claude-3-haiku-20240307-FC": ClaudeFCHandler,
     "mistral-large-2402": MistralHandler,
     "mistral-large-2402-FC-Any": MistralHandler,
     "mistral-large-2402-FC-Auto": MistralHandler,

berkeley-function-call-leaderboard/model_handler/model_style.py

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,8 @@
 class ModelStyle(Enum):
     Gorilla = "gorilla"
     OpenAI = "gpt"
-    Anthropic = "claude"
+    Anthropic_FC = "claude"
+    Anthropic_Prompt = "claude"
     Mistral = "mistral"
     Google = "google"
     FIREWORK_AI = "firework_ai"
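
One Python detail behind this diff: two Enum members that share a value collapse into a single member, with the later name acting as an alias for the first. A minimal sketch of that behavior using the two values above:

```python
from enum import Enum

# Minimal sketch: both names carry the value "claude", so Anthropic_Prompt
# becomes an alias of Anthropic_FC rather than a distinct member.
class ModelStyle(Enum):
    Anthropic_FC = "claude"
    Anthropic_Prompt = "claude"

print(ModelStyle.Anthropic_FC is ModelStyle.Anthropic_Prompt)  # True
print(ModelStyle.Anthropic_Prompt)                             # ModelStyle.Anthropic_FC
```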

berkeley-function-call-leaderboard/model_handler/utils.py

Lines changed: 11 additions & 4 deletions
@@ -63,6 +63,7 @@ def convert_to_tool(
         or model_style == ModelStyle.Mistral
         or model_style == ModelStyle.Google
         or model_style == ModelStyle.OSSMODEL
+        or model_style == ModelStyle.Anthropic_FC
     ):
         # OAI does not support "." in the function name so we replace it with "_". ^[a-zA-Z0-9_-]{1,64}$ is the regex for the name.
         item["name"] = re.sub(r"\.", "_", item["name"])
@@ -77,7 +78,8 @@
         ModelStyle.OpenAI,
         ModelStyle.Mistral,
         ModelStyle.Google,
-        ModelStyle.Anthropic,
+        ModelStyle.Anthropic_Prompt,
+        ModelStyle.Anthropic_FC,
         ModelStyle.FIREWORK_AI,
         ModelStyle.OSSMODEL,
     ]
@@ -92,21 +94,24 @@
         for key, value in properties.items():
             if value["type"] in JS_TYPE_CONVERSION:
                 properties[key]["type"] = "string"
+        if model_style == ModelStyle.Anthropic_FC:
+            item["input_schema"] = item["parameters"]
+            del item["parameters"]
         if model_style == ModelStyle.Google:
             # Remove fields that are not supported by Gemini today.
             for params in item["parameters"]["properties"].values():
                 if "default" in params:
-                    params["description"] += str(params["default"])
+                    params["description"] += "The Default is:" + str(params["default"])
                     del params["default"]
                 if "optional" in params:
                     del params["optional"]
                 if "maximum" in params:
                     del params["maximum"]
                 if "additionalProperties" in params:
-                    params["description"] += str(params["additionalProperties"])
+                    params["description"] += "The additional properties:" +str(params["additionalProperties"])
                     del params["additionalProperties"]
         if model_style in [
-            ModelStyle.Anthropic,
+            ModelStyle.Anthropic_Prompt,
             ModelStyle.Google,
             ModelStyle.OSSMODEL,
         ]:
@@ -261,6 +266,8 @@ def augment_prompt_by_languge(prompt, test_category)
 def language_specific_pre_processing(function, test_category, string_param):
     if type(function) is dict:
         function = [function]
+    if len(function) == 0:
+        return function
     for item in function:
         properties = item["parameters"]["properties"]
         if test_category == "java":
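
The net effect of the new `Anthropic_FC` branch in `convert_to_tool` is to move the OpenAI-style `parameters` schema under Anthropic's `input_schema` key, after the usual dot-to-underscore name cleanup. A minimal sketch with a made-up function doc:

```python
import re

# Made-up function doc in the OpenAI-style shape used elsewhere in the repo.
item = {
    "name": "weather.get_current",
    "description": "Get the current weather for a city.",
    "parameters": {"type": "object", "properties": {"city": {"type": "string"}}},
}

# Function names may not contain ".", so dots are replaced with underscores.
item["name"] = re.sub(r"\.", "_", item["name"])

# Anthropic's tool-use API expects the JSON schema under "input_schema",
# so the "parameters" key is renamed.
item["input_schema"] = item["parameters"]
del item["parameters"]

print(item["name"])         # weather_get_current
print(sorted(item.keys()))  # ['description', 'input_schema', 'name']
```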

berkeley-function-call-leaderboard/openfunctions_evaluation.py

Lines changed: 3 additions & 3 deletions
@@ -8,12 +8,12 @@ def get_args():
     # Refer to model_choice for supported models.
     parser.add_argument("--model", type=str, default="gorilla-openfunctions-v2")
     # Refer to test_categories for supported categories.
-    parser.add_argument("--test_category", type=str, default="all")
+    parser.add_argument("--test-category", type=str, default="all")
 
     # Parameters for the model that you want to test.
     parser.add_argument("--temperature", type=float, default=0.7)
-    parser.add_argument("--top_p", type=float, default=1)
-    parser.add_argument("--max_tokens", type=int, default=1200)
+    parser.add_argument("--top-p", type=float, default=1)
+    parser.add_argument("--max-tokens", type=int, default=1200)
     parser.add_argument("--num-gpus", default=1, type=int)
     parser.add_argument("--timeout", default=60, type=int)
