ShishirPatil · HuanzhiMao · Nov 8, 2024 · Nov 4, 2024 · Nov 4, 2024 · Nov 4, 2024
diff --git a/berkeley-function-call-leaderboard/CHANGELOG.md b/berkeley-function-call-leaderboard/CHANGELOG.md
@@ -2,12 +2,14 @@
 
 All notable changes to the Berkeley Function Calling Leaderboard will be documented in this file.
 
-- [Oct 30, 2024] [#725](https://github.com/ShishirPatil/gorilla/pull/725): Update evaluation metric for irrelevance detection in multi-turn scenarios:
-  - Added a dummy `flag_task_unachievable` function for all multi-turn entries.
-  - If the model identifies a task as unachievable, it should invoke this function explicitly.
-  - During evaluation, if `flag_task_unachievable` is called in a turn, that turn will be marked correct for irrelevance detection, even if other functions were also called in that turn.
-    - This also means that if the model calls `flag_task_unachievable` in a normal (non-irrelevant) turn, that turn will be marked incorrect.
-  - This ensures the model can attempt reasonable exploratory actions before concluding a task is unachievable, aligning multi-turn behavior with single-turn irrelevance standards.
+- [Oct 30, 2024] [#725](https://github.com/ShishirPatil/gorilla/pull/725), [#733](https://github.com/ShishirPatil/gorilla/pull/733): Update evaluation metric for multi-turn categories:
+  - Introduce a new response-based checker, which works alongside the existing state-based checker.
+    - The new checker compares the model’s execution result against the ground truth execution result, ensuring that the model’s result encompasses the ground truth (i.e., the ground truth must be a subset of the model result).
+    - It complements the state-based checker, which doesn't work well when the functions don't directly alter the state. For example, it's unclear whether the model actually invoked `get_zipcode_by_city` or `estimate_distance` by just using the state-based checker.
+    - Any multi-turn entry will now only be marked correct if it passes both the state and response checkers.
+  - Remove the irrelevance detection for multi-turn categories.
+    - Instead of checking if the model produces no output in a turn with missing function/parameter information, we now assess whether the model can perform correctly once the missing information is provided.
+  - A few dataset entries have been modified to align with these changes.
 - [Oct 30, 2024] [#719](https://github.com/ShishirPatil/gorilla/pull/719), [#722](https://github.com/ShishirPatil/gorilla/pull/722), [#723](https://github.com/ShishirPatil/gorilla/pull/723), [#728](https://github.com/ShishirPatil/gorilla/pull/728), [#732](https://github.com/ShishirPatil/gorilla/pull/732): Bug fix in the dataset and ground truth for the multi-turn categories.
 - [Oct 17, 2024] [#683](https://github.com/ShishirPatil/gorilla/pull/683): Bug fix for the multi turn categories for ambiguity in action intention and function parameters.
 - [Oct 17, 2024] [#709](https://github.com/ShishirPatil/gorilla/pull/709): Rephrase question prompt for Java and JavaScript categories to improve clarity and action intent.

diff --git a/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py b/berkeley-function-call-leaderboard/bfcl/_llm_response_generation.py
@@ -38,7 +38,7 @@ def get_args():
 
     # Parameters for the model that you want to test.
     parser.add_argument("--temperature", type=float, default=0.001)
-    parser.add_argument("--include-debugging-log", action="store_true", default=False)
+    parser.add_argument("--include-debugging-log", "-d", action="store_true", default=False)
     parser.add_argument("--num-threads", default=1, type=int)
     parser.add_argument("--num-gpus", default=1, type=int)
     parser.add_argument("--backend", default="vllm", type=str, choices=["vllm", "sglang"])

diff --git a/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py b/berkeley-function-call-leaderboard/bfcl/eval_checker/eval_runner.py
@@ -59,8 +59,12 @@ def multi_turn_runner(
                     "model_name": model_name,
                     "test_category": test_category,
                     "valid": False,
-                    "error": ["Error during inference phase. Model did not output a list of model responses."],
-                    "error_type": "multi_turn:inference_error",
+                    "error": {
+                        "error_message": [
+                            "Error during inference phase. Model did not output a list of model responses."
+                        ],
+                        "error_type": "multi_turn:inference_error",
+                    },
                     "prompt": test_entry,
                     "model_result": multi_turn_model_result_list,
                     "possible_answer": multi_turn_ground_truth_list,
@@ -76,10 +80,12 @@ def multi_turn_runner(
                     "model_name": model_name,
                     "test_category": test_category,
                     "valid": False,
-                    "error": [
-                        f"Model was force-terminated during inference phase. The length of the model result turns ({len(multi_turn_model_result_list)}) does not match the length of the ground truth turns ({len(multi_turn_ground_truth_list)})."
-                    ],
-                    "error_type": "multi_turn:force_terminated",
+                    "error": {
+                        "error_message": [
+                            f"Model was force-terminated during inference phase. The length of the model result turns ({len(multi_turn_model_result_list)}) does not match the length of the ground truth turns ({len(multi_turn_ground_truth_list)})."
+                        ],
+                        "error_type": "multi_turn:force_terminated",
+                    },
                     "prompt": test_entry,
                     "model_result": multi_turn_model_result_list,
                     "possible_answer": multi_turn_ground_truth_list,
@@ -122,18 +128,18 @@ def multi_turn_runner(
 
         # Perform additional check for multi-turn irrelevance
         # This happens when the model is expected to not output any function calls in a certain turn due to miss parameters or miss functions
-        irrelevance_checker_result = multi_turn_irrelevance_checker(
-            multi_turn_model_result_list_decoded,
-            multi_turn_ground_truth_list,
-        )
+        # irrelevance_checker_result = multi_turn_irrelevance_checker(
+        #     multi_turn_model_result_list_decoded,
+        #     multi_turn_ground_truth_list,
+        # )
 
-        if not irrelevance_checker_result["valid"] or not accuracy_checker_result["valid"]:
+        if not accuracy_checker_result["valid"]:
             temp = {}
             temp["id"] = index
             temp["model_name"] = model_name
             temp["test_category"] = test_category
-            # We display the irrelevance checker result first, then the accuracy checker result if irrelevance is passed
-            temp.update(irrelevance_checker_result if not irrelevance_checker_result["valid"] else accuracy_checker_result)
+            temp["valid"] = accuracy_checker_result.pop("valid")
+            temp["error"] = accuracy_checker_result
             temp["prompt"] = test_entry
             temp["model_result_raw"] = multi_turn_model_result_list
             temp["model_result_decoded"] = multi_turn_model_result_list_decoded
@@ -594,10 +600,10 @@ def runner(model_names, test_categories, api_sanity_check):
     )
 
     print(
-        f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V2."
+        f"🏁 Evaluation completed. See {SCORE_PATH / 'data_overall.csv'} for evaluation results on BFCL V3."
     )
     print(
-        f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V2 Live and Non-Live categories respectively."
+        f"See {SCORE_PATH / 'data_live.csv'} and {SCORE_PATH / 'data_non_live.csv'} for evaluation results on BFCL V3 Live and Non-Live categories respectively."
     )
 
 

diff --git a/...all-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/gorilla_file_system.py b/...all-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/gorilla_file_system.py
@@ -3,7 +3,7 @@
 from copy import deepcopy
 from typing import Dict, List, Optional, Union
 
-from .long_context import FILE_CONTENT_EXTENSION, FILES_TAIL_USED
+from .long_context import FILE_CONTENT_EXTENSION, FILES_TAIL_USED, POPULATE_FILE_EXTENSION
 
 
 class File:
@@ -141,6 +141,7 @@ def __init__(self) -> None:
         """
         self.root: Directory
         self._current_dir: Directory
+        self._api_description = "This tool belongs to the Gorilla file system. It is a simple file system that allows users to perform basic file operations such as navigating directories, creating files and directories, reading and writing to files, etc."
 
     def __eq__(self, other: object) -> bool:
         if not isinstance(other, GorillaFileSystem):
@@ -227,23 +228,22 @@ def _load_directory(
                 parent.contents[dir_name] = new_file
 
         if is_bottommost and self.long_context:
-            self._populate_directory(parent, 30)
+            self._populate_directory(parent)
 
         return parent
 
     def _populate_directory(
-        self, directory: Directory, file_count: int = 200
+        self, directory: Directory
     ) -> None:  # Used only for long context
         """
         Populate an innermost directory with multiple empty files.
 
         Args:
             directory (Directory): The innermost directory to populate.
-            file_count (int): The number of empty files to create. Defaults to 5.
         """
-        for i in range(file_count):
-            name = str(abs(hash(str(i + 1) + "gorilla")))
-            file_name = f"image_{name}.jpg"
+        for i in range(len(POPULATE_FILE_EXTENSION)):
+            name = POPULATE_FILE_EXTENSION[i]
+            file_name = f"{name}"
             directory._add_file(file_name)
 
     def pwd(self):

diff --git a/...ction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/long_context.py b/...ction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/long_context.py
diff --git a/...-function-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/math_api.py b/...-function-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/math_api.py
@@ -6,6 +6,9 @@
 
 
 class MathAPI:
+    def __init__(self):
+        self._api_description = "This tool belongs to the Math API, which provides various mathematical operations."
+
     def logarithm(
         self, value: float, base: float, precision: int
     ) -> Dict[str, float]:

diff --git a/...nction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/message_api.py b/...nction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/message_api.py
@@ -64,6 +64,7 @@ def __init__(self):
         self.inbox: List[Dict[str, str]]
         self.message_count: int
         self.current_user: Optional[str]
+        self._api_description = "This tool belongs to the Message API, which is used to manage user interactions in a workspace."
 
     def _load_scenario(self, scenario: dict, long_context=False) -> None:
         """

diff --git a/...nction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/posting_api.py b/...nction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/posting_api.py
@@ -24,6 +24,7 @@ def __init__(self):
         self.following_list: List[str]
         # tweet_counter is used to assign unique IDs to tweets, it might not be the same as the length of the tweets list for different scenarios
         self.tweet_counter: int
+        self._api_description = "This tool belongs to the TwitterAPI, which provides core functionality for posting tweets, retweeting, commenting, and following users on Twitter."
 
     def _load_scenario(self, scenario: dict, long_context=False) -> None:
         """

diff --git a/...unction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/ticket_api.py b/...unction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/ticket_api.py
@@ -30,6 +30,7 @@ def __init__(self):
         self.ticket_queue: List[Dict[str, Union[int, str]]]
         self.ticket_counter: int
         self.current_user: Optional[str]
+        self._api_description = "This tool belongs to the ticketing system that is part of a company, which allows users to create, view, and manage support business tickets."
 
     def _load_scenario(self, scenario: dict, long_context=False) -> None:
         """

diff --git a/...nction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/trading_bot.py b/...nction-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/trading_bot.py
@@ -154,6 +154,7 @@ def __init__(self):
         self.stocks: Dict[str, Dict[str, Union[float, int]]]
         self.watch_list: List[str]
         self.transaction_history: List[Dict[str, Union[str, float, int]]]
+        self._api_description = "This tool belongs to the trading system, which allows users to trade stocks, manage their account, and view stock information."
 
     def _load_scenario(self, scenario: dict, long_context=False) -> None:
         """
@@ -300,9 +301,11 @@ def get_order_details(self, order_id: int) -> Dict[str, Union[str, float, int]]:
             order_id (int): ID of the order.
 
         Returns:
+            id (int): ID of the order.
+            order_type (str): Type of the order.
             symbol (str): Symbol of the stock in the order.
             price (float): Price at which the order was placed.
-            num_shares (int): Number of shares in the order.
+            amount (int): Number of shares in the order.
             status (str): Current status of the order. [Enum]: ["Open", "Pending", "Completed", "Cancelled"]
         """
         if order_id not in self.orders:

diff --git a/...ion-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/travel_booking.py b/...ion-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/travel_booking.py
@@ -32,6 +32,7 @@ def __init__(self):
         self.user_first_name: Optional[str]
         self.user_last_name: Optional[str]
         self.budget_limit: Optional[float]
+        self._api_description = "This tool belongs to the travel system, which allows users to book flights, manage credit cards, and view budget information."
 
     def _load_scenario(
         self,
@@ -817,7 +818,7 @@ def purchase_insurance(
 
     def contact_customer_support(self, booking_id: str, message: str) -> Dict[str, str]:
         """
-        Contact customer support
+        Contact travel booking customer support to get immediate help with an issue via an online call.
 
         Args:
             booking_id (str): The ID of the booking

diff --git a/...on-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/vehicle_control.py b/...on-call-leaderboard/bfcl/eval_checker/multi_turn_eval/func_source_code/vehicle_control.py
@@ -76,6 +76,7 @@ def __init__(self):
         self.frontRightTirePressure: float
         self.rearLeftTirePressure: float
         self.rearRightTirePressure: float
+        self._api_description = "This tool belongs to the vehicle control system, which allows users to control various aspects of the car such as engine, doors, climate control, lights, and more."
 
     def _load_scenario(self, scenario: dict, long_context=False) -> None:
         """
@@ -447,7 +448,7 @@ def pressBrakePedal(self, pedalPosition: float) -> Dict[str, Union[str, float]]:
         # Update the brake pedal status and force
         self.brakePedalStatus = "pressed"
         self._brakePedalForce = force
-        return {"brakePedalStatus": "pressed", "brakePedalForce": force}
+        return {"brakePedalStatus": "pressed", "brakePedalForce": float(force)}
 
     def releaseBrakePedal(self) -> Dict[str, Union[str, float]]:
         """