Skip to content

Commit 1312aa5

Browse files
committed
update changelog
1 parent d949302 commit 1312aa5

File tree

5 files changed

+72
-7
lines changed

5 files changed

+72
-7
lines changed

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
2222
3. **Leaderboard / model cleanup**
2323
- Retires several deprecated models from the leaderboard.
2424
- Removes unused model handlers to improve maintainability.
25-
- Adds support for new model `claude-opus-4-1-20250805`
2625
4. **Address #602**
2726
- `Non-Live Acc` and `Live Acc` score calculation now excludes the Irrelevance/Relevance category scores.
2827
5. **Resolve #1094.**
@@ -52,6 +51,15 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
5251
```
5352
5453
Migrate existing outputs to this structure before upgrading; otherwise the evaluation pipeline will fail to locate files.
54+
9. **New model support**
55+
Adds support for the following models:
56+
- `claude-opus-4-1-20250805`
57+
- `gpt-5-2025-08-07`
58+
- `gpt-5-mini-2025-08-07`
59+
- `gpt-5-nano-2025-08-07`
60+
- `Qwen/Qwen3-30B-A3B-Instruct-2507`
61+
- `Qwen/Qwen3-235B-A22B-Instruct-2507`
62+
- `Qwen/Qwen3-4B-Instruct-2507`
5563
5664
- [Jul 8, 2025] [#1098](https://github.com/ShishirPatil/gorilla/pull/1098):
5765
- Re-introduce latency statistics for locally hosted models

berkeley-function-call-leaderboard/SUPPORTED_MODELS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ For model names containing `{...}`, multiple versions are available. For example
6262
| GPT-4o-2024-11-20 | Prompt | OpenAI | gpt-4o-2024-11-20 |
6363
| GPT-4o-mini-2024-07-18 | Function Calling | OpenAI | gpt-4o-mini-2024-07-18-FC |
6464
| GPT-4o-mini-2024-07-18 | Prompt | OpenAI | gpt-4o-mini-2024-07-18 |
65+
| GPT-5-2025-08-07 | Function Calling | OpenAI | gpt-5-2025-08-07-FC |
66+
| GPT-5-2025-08-07 | Prompt | OpenAI | gpt-5-2025-08-07 |
67+
| GPT-5-mini-2025-08-07 | Function Calling | OpenAI | gpt-5-mini-2025-08-07-FC |
68+
| GPT-5-mini-2025-08-07 | Prompt | OpenAI | gpt-5-mini-2025-08-07 |
69+
| GPT-5-nano-2025-08-07 | Function Calling | OpenAI | gpt-5-nano-2025-08-07-FC |
70+
| GPT-5-nano-2025-08-07 | Prompt | OpenAI | gpt-5-nano-2025-08-07 |
6571
| Granite-20b-FunctionCalling | Function Calling | Self-hosted 💻 | ibm-granite/granite-20b-functioncalling |
6672
| Granite-3.1-8B-Instruct | Function Calling | Self-hosted 💻 | ibm-granite/granite-3.1-8b-instruct |
6773
| Granite-3.2-8B-Instruct | Function Calling | Self-hosted 💻 | ibm-granite/granite-3.2-8b-instruct |

berkeley-function-call-leaderboard/bfcl_eval/constants/supported_models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
"DeepSeek-R1-0528",
1818
"DeepSeek-R1-0528-FC",
1919
"DeepSeek-V3-0324-FC",
20+
"gpt-5-2025-08-07-FC",
21+
"gpt-5-2025-08-07",
22+
"gpt-5-mini-2025-08-07-FC",
23+
"gpt-5-mini-2025-08-07",
24+
"gpt-5-nano-2025-08-07-FC",
25+
"gpt-5-nano-2025-08-07",
2026
"gpt-4.1-2025-04-14-FC",
2127
"gpt-4.1-2025-04-14",
2228
"gpt-4.1-mini-2025-04-14-FC",

berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/memory_api_metaclass.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,16 @@ def _prepare_snapshot(self, initial_config: dict) -> Optional[dict]:
6161
return None
6262

6363
# For non-first entries we MUST have a snapshot to load from.
64-
assert (
65-
self.latest_snapshot_file.exists()
66-
), f"Not first memory entry, but no snapshot file found in this path: {self.latest_snapshot_file}"
64+
# But if the first entry got an error during inference, then there will be no snapshot file
65+
if not self.latest_snapshot_file.exists():
66+
msg = (
67+
"⚠️" * 100
68+
+ f"\nWarning: Not first memory entry, but no snapshot file found in this path: {self.latest_snapshot_file}. The memory will start empty for {initial_config['test_id']}.\n"
69+
+ "⚠️" * 100
70+
)
71+
print(msg)
72+
73+
return None
6774

6875
with open(self.latest_snapshot_file, "r") as f:
6976
return json.load(f)

berkeley-function-call-leaderboard/bfcl_eval/utils.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -331,12 +331,50 @@ def get_directory_structure_by_category(test_category: str) -> str:
331331
#### Helper functions to load/write the dataset files ####
332332

333333

334-
def load_file(file_path, sort_by_id=False):
334+
def load_file(file_path, sort_by_id=False, allow_concatenated_json=False):
335335
result = []
336336
with open(file_path) as f:
337337
file = f.readlines()
338338
for line in file:
339-
result.append(json.loads(line))
339+
try:
340+
content = json.loads(line)
341+
result.append(content)
342+
except Exception as e:
343+
if not allow_concatenated_json:
344+
raise e
345+
346+
# Although this really shouldn't happen, sometimes a result file might have more than one JSON object concatenated on a single line instead of one per line (e.g. '{"id": 1, xxx}{"id": 2, xxx}').
347+
# We can parse them incrementally by using `json.JSONDecoder.raw_decode`, which returns both the parsed object and the index where it stopped parsing.
348+
line_jsons = []
349+
decoder = json.JSONDecoder()
350+
idx = 0
351+
while idx < len(line):
352+
# Skip whitespace between objects (if any)
353+
while idx < len(line) and line[idx].isspace():
354+
idx += 1
355+
356+
if idx >= len(line):
357+
break
358+
359+
try:
360+
json_obj, idx = decoder.raw_decode(line, idx)
361+
line_jsons.append(json_obj)
362+
except json.JSONDecodeError:
363+
# If decoding fails at any point, the entire line is invalid.
364+
raise e
365+
366+
# After parsing, we must ensure the entire line has been consumed.
367+
# If `idx` is not at the end of the line, it means there's trailing
368+
# garbage, which is an error.
369+
if idx < len(line):
370+
raise e
371+
372+
if not line_jsons:
373+
# If the line was non-empty but contained no JSON objects (e.g., only whitespace),
374+
# it's an error.
375+
raise e
376+
377+
result.extend(line_jsons)
340378

341379
if sort_by_id:
342380
result.sort(key=sort_key)
@@ -349,7 +387,7 @@ def sort_file_content_by_id(file_path: Path) -> None:
349387
when the ordering actually changes to avoid unnecessary disk writes.
350388
"""
351389
# Load the current content preserving original order (and potential duplicates)
352-
original_entries = load_file(file_path)
390+
original_entries = load_file(file_path, allow_concatenated_json=True)
353391

354392
# Desired final ordering (sorted, unique)
355393
sorted_entries = sorted(original_entries, key=sort_key)

0 commit comments

Comments
 (0)