Commit 0d1c478

Merge branch 'main' into main

2 parents: 32c1ad4 + 7b230df

File tree

3 files changed: +4 additions, −6 deletions

berkeley-function-call-leaderboard/README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -208,6 +208,7 @@ Some companies have proposed some optimization strategies in their models' handl
 
 ## Changelog
 
+* [July 16, 2024] [#525](https://github.com/ShishirPatil/gorilla/pull/525), [#536](https://github.com/ShishirPatil/gorilla/pull/536): Add new model `ibm-granite/granite-20b-functioncalling` to the leaderboard.
 * [July 10, 2024] [#522](https://github.com/ShishirPatil/gorilla/pull/522): Bug fix in the evaluation dataset for Executable Parallel Multiple category. This includes updates to both prompts and function docs. 2 entries are affected.
 * [July 8, 2024] [#516](https://github.com/ShishirPatil/gorilla/pull/516): Fix double-casting issue in `model_handler` for Java and JavaScript test categories.
 * [July 7, 2024] [#504](https://github.com/ShishirPatil/gorilla/pull/504), [#505](https://github.com/ShishirPatil/gorilla/pull/505), [#506](https://github.com/ShishirPatil/gorilla/pull/506), [#508](https://github.com/ShishirPatil/gorilla/pull/508), [#510](https://github.com/ShishirPatil/gorilla/pull/510), [#512](https://github.com/ShishirPatil/gorilla/pull/512), [#517](https://github.com/ShishirPatil/gorilla/pull/517): Make BFCL user-friendly and easy to extend.
```

berkeley-function-call-leaderboard/model_handler/granite_handler.py

Lines changed: 2 additions & 5 deletions

```diff
@@ -49,10 +49,10 @@ def _format_prompt(prompt, function, test_category):
         return prompt
 
     def inference(
-        self, question_file, test_category, num_gpus, format_prompt_func=_format_prompt
+        self, test_question, test_category, num_gpus, format_prompt_func=_format_prompt
     ):
         return super().inference(
-            question_file, test_category, num_gpus, format_prompt_func
+            test_question, test_category, num_gpus, format_prompt_func
         )
 
     def decode_ast(self, result, language="Python"):
@@ -76,9 +76,6 @@ def decode_ast(self, result, language="Python"):
                 decoded_outputs.append("No function is called")
                 continue
 
-            if language != "Python":
-                args = {k: str(v) for k, v in args.items()}
-
             decoded_outputs.append({fnname: args})
 
         return decoded_outputs
```
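The deleted branch in `decode_ast` coerced every decoded argument value to a string whenever the test category's language was not Python. A minimal standalone sketch of the before/after behavior — the `args` values below are hypothetical, not data from the handler:

```python
# Hypothetical decoded arguments for a non-Python (e.g. Java) function call.
args = {"count": 3, "flag": True, "name": "x"}

# Old behavior (removed in this commit): cast every value to str.
old_args = {k: str(v) for k, v in args.items()}

# New behavior: keep the decoded values as-is.
new_args = dict(args)

print(old_args)  # {'count': '3', 'flag': 'True', 'name': 'x'}
print(new_args)  # {'count': 3, 'flag': True, 'name': 'x'}
```

Dropping the cast here plausibly pairs with the separate double-casting fix in `model_handler` noted in the changelog (#516), so that values are not stringified twice; that connection is an inference from the changelog, not stated in this diff.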

raft/format.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -104,7 +104,7 @@ class OpenAiCompletionDatasetFormatter(DatasetFormatter):
     https://platform.openai.com/docs/guides/fine-tuning/preparing-your-dataset
     """
     def format(self, ds: Dataset, params: Dict[str, str]) -> Dataset:
-        newds = ds.rename_columns({'question': 'prompt', 'cot_answer': 'completion'})
+        newds = ds.rename_columns({'instruction': 'prompt', 'cot_answer': 'completion'})
         return _remove_all_columns_but(newds, ['prompt', 'completion'])
 
 class OpenAiChatDatasetFormatter(OpenAiCompletionDatasetFormatter):
```
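The fix reads the dataset's `instruction` column instead of `question` when building the OpenAI completion format. A library-free sketch of the same rename-then-project step, using a plain dict row in place of a `datasets.Dataset` (the row contents are illustrative only):

```python
# One hypothetical RAFT-style row; the real formatter operates on a
# datasets.Dataset and a _remove_all_columns_but helper.
row = {"instruction": "What is RAFT?", "cot_answer": "...", "context": "..."}

# Mirror rename_columns({'instruction': 'prompt', 'cot_answer': 'completion'}).
rename = {"instruction": "prompt", "cot_answer": "completion"}
renamed = {rename.get(k, k): v for k, v in row.items()}

# Mirror _remove_all_columns_but(newds, ['prompt', 'completion']).
kept = {k: v for k, v in renamed.items() if k in ("prompt", "completion")}

print(kept)  # {'prompt': 'What is RAFT?', 'completion': '...'}
```

With the old `question` key, the rename would have been a no-op on rows keyed by `instruction`, and the final projection would have silently dropped the prompt text entirely.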
