Skip to content

Commit f1417a8

Browse files
authored
[BFCL] Fix Dataset Pre-Processing for Java and JavaScript Test Category, Part 2 (#545)
This PR fixes the function doc pre-processing issues for the Java and JavaScript test categories, following up on PR #538. - Some unnecessary steps in the `convert_to_tool` function are removed. These steps should not exist as not every model handler calls the `convert_to_tool` function (for example, the OSS models) and would unfairly benefit the models that use it. To make sure that every model gets the same pre-processed function doc, the pre-processing phase needs to be in the `language_specific_pre_processing` function (which is used by every handler). - Properly handle the inner element type for nested types.
1 parent b0e3289 commit f1417a8

File tree

12 files changed

+36
-41
lines changed

12 files changed

+36
-41
lines changed

berkeley-function-call-leaderboard/README.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ Some companies have proposed some optimization strategies in their models' handl
209209
## Changelog
210210

211211
* [July 22, 2024] [#540](https://github.com/ShishirPatil/gorilla/pull/540): Chore: Improve handling of vLLM's cleanup phase error by combining all selected test categories into one single task to submit to the vLLM server.
212-
* [July 21, 2024] [#538](https://github.com/ShishirPatil/gorilla/pull/538): Fix `language_specific_pre_processing` function to properly handle pre-processing for prompts and function docs in Java and JavaScript test categories. All entries in these categories are affected.
212+
* [July 21, 2024] [#538](https://github.com/ShishirPatil/gorilla/pull/538), [#545](https://github.com/ShishirPatil/gorilla/pull/545): Fix the `language_specific_pre_processing` and `convert_to_tool` functions to properly handle pre-processing for prompts and function docs in Java and JavaScript test categories. All entries in these categories are affected.
213213
* [July 20, 2024] [#537](https://github.com/ShishirPatil/gorilla/pull/537): Update generation script for locally-hosted OSS model to use single-node multi-GPU inference method (tensor parallel). Ray is not used anymore.
214214
* [July 16, 2024] [#525](https://github.com/ShishirPatil/gorilla/pull/525), [#536](https://github.com/ShishirPatil/gorilla/pull/536): Add new model `ibm-granite/granite-20b-functioncalling` to the leaderboard.
215215
* [July 10, 2024] [#522](https://github.com/ShishirPatil/gorilla/pull/522): Bug fix in the evaluation dataset for Executable Parallel Multiple category. This includes updates to both prompts and function docs. 2 entries are affected.

berkeley-function-call-leaderboard/model_handler/claude_fc_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ def inference(self, prompt, functions, test_category):
3434
if type(functions) is not list:
3535
functions = [functions]
3636
claude_tool = convert_to_tool(
37-
functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True
37+
functions, GORILLA_TO_OPENAPI, self.model_style, test_category
3838
)
3939
message = [{"role": "user", "content": prompt}]
4040
start_time = time.time()

berkeley-function-call-leaderboard/model_handler/claude_prompt_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> Non
2727

2828
def _get_claude_function_calling_response(self, prompt, functions, test_category):
2929
input_tool = convert_to_tool(
30-
functions, GORILLA_TO_PYTHON, self.model_style, test_category, True
30+
functions, GORILLA_TO_PYTHON, self.model_style, test_category
3131
)
3232
system_prompt = construct_tool_use_system_prompt(input_tool)
3333
start = time.time()

berkeley-function-call-leaderboard/model_handler/cohere_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def inference(self, prompt, functions, test_category):
7575
message = prompt
7676
# Convert JSON schema into R+ compatible function calls.
7777
cohere_tool = convert_to_tool(
78-
functions, GORILLA_TO_PYTHON, self.model_style, test_category, True
78+
functions, GORILLA_TO_PYTHON, self.model_style, test_category
7979
)
8080
start_time = time.time()
8181
if len(cohere_tool) > 0:

berkeley-function-call-leaderboard/model_handler/firework_ai_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,7 @@ def inference(self, prompt, functions, test_category):
3939
functions = [functions]
4040
message = [{"role": "user", "content": prompt}]
4141
oai_tool = convert_to_tool(
42-
functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True
42+
functions, GORILLA_TO_OPENAPI, self.model_style, test_category
4343
)
4444
start_time = time.time()
4545
model_name = self.model_name.replace("-FC", "")

berkeley-function-call-leaderboard/model_handler/gemini_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -97,7 +97,7 @@ def inference(self, prompt, functions, test_category):
9797
prompt = augment_prompt_by_languge(prompt, test_category)
9898
functions = language_specific_pre_processing(functions, test_category)
9999
gemini_tool = convert_to_tool(
100-
functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True
100+
functions, GORILLA_TO_OPENAPI, self.model_style, test_category
101101
)
102102
result, metadata = self._query_gemini(prompt, gemini_tool)
103103
return result, metadata

berkeley-function-call-leaderboard/model_handler/glm_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> Non
2323

2424
def apply_chat_template(self, prompt, function, test_category):
2525
oai_tool = convert_to_tool(
26-
function, GORILLA_TO_OPENAPI, ModelStyle.OpenAI, test_category, True
26+
function, GORILLA_TO_OPENAPI, ModelStyle.OpenAI, test_category
2727
)
2828
conversation = [{"role": "user", "content": prompt, "tools": oai_tool}]
2929
return self.tokenizer.apply_chat_template(

berkeley-function-call-leaderboard/model_handler/gpt_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -56,7 +56,7 @@ def inference(self, prompt,functions,test_category):
5656
functions = [functions]
5757
message = [{"role": "user", "content": prompt}]
5858
oai_tool = convert_to_tool(
59-
functions, GORILLA_TO_OPENAPI, self.model_style, test_category, True
59+
functions, GORILLA_TO_OPENAPI, self.model_style, test_category
6060
)
6161
start_time = time.time()
6262
if len(oai_tool) > 0:

berkeley-function-call-leaderboard/model_handler/granite_handler.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,6 @@ def _format_prompt(prompt, function, test_category):
3838
GORILLA_TO_OPENAPI,
3939
model_style=ModelStyle.OSSMODEL,
4040
test_category=test_category,
41-
stringify_parameters=True,
4241
)
4342

4443
functions_str = "\n".join([json.dumps(func) for func in function])

berkeley-function-call-leaderboard/model_handler/hermes_handler.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ def __init__(self, model_name, temperature=0.7, top_p=1, max_tokens=1000) -> Non
1212
def _format_prompt(prompt, function, test_category):
1313
# Hermes uses the Langchain-to-OpenAI conversion. It does not use tool calls but function calls.
1414
function = convert_to_tool(
15-
function, GORILLA_TO_OPENAPI, ModelStyle.OSSMODEL, test_category, True
15+
function, GORILLA_TO_OPENAPI, ModelStyle.OSSMODEL, test_category
1616
)
1717
pydantic_format = """{"properties": {"arguments": {"title": "Arguments", "type": "object"}, "name": {"title": "Name", "type": "string"}}, "required": ["arguments", "name"], "title": "FunctionCall", "type": "object"}"""
1818
tool_call_format = """{"arguments": <args-dict>, "name": <function-name>}"""

0 commit comments

Comments (0)