Skip to content

Commit 1312aa5

Browse files
committed
update changelog
1 parent d949302 commit 1312aa5

File tree

5 files changed

+72
-7
lines changed

5 files changed

+72
-7
lines changed

berkeley-function-call-leaderboard/CHANGELOG.md

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,6 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
2222
3. **Leaderboard / model cleanup**
2323
- Retires several deprecated models from the leaderboard.
2424
- Removes unused model handlers to improve maintainability.
25-
- Adds support for new model `claude-opus-4-1-20250805`
2625
4. **Address #602**
2726
- `Non-Live Acc` and `Live Acc` score calculation now excludes the Irrelevance/Relevance category scores.
2827
5. **Resolve #1094.**
@@ -52,6 +51,15 @@ All notable changes to the Berkeley Function Calling Leaderboard will be documen
5251
```
5352
5453
Migrate existing outputs to this structure before upgrading; otherwise the evaluation pipeline will fail to locate files.
54+
9. **New model support**
55+
Adds support for the following models:
56+
- `claude-opus-4-1-20250805`
57+
- `gpt-5-2025-08-07`
58+
- `gpt-5-mini-2025-08-07`
59+
- `gpt-5-nano-2025-08-07`
60+
- `Qwen/Qwen3-30B-A3B-Instruct-2507`
61+
- `Qwen/Qwen3-235B-A22B-Instruct-2507`
62+
- `Qwen/Qwen3-4B-Instruct-2507`
5563
5664
- [Jul 8, 2025] [#1098](https://github.com/ShishirPatil/gorilla/pull/1098):
5765
- Re-introduce latency statistics for locally hosted models

berkeley-function-call-leaderboard/SUPPORTED_MODELS.md

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,6 +62,12 @@ For model names containing `{...}`, multiple versions are available. For example
6262
| GPT-4o-2024-11-20 | Prompt | OpenAI | gpt-4o-2024-11-20 |
6363
| GPT-4o-mini-2024-07-18 | Function Calling | OpenAI | gpt-4o-mini-2024-07-18-FC |
6464
| GPT-4o-mini-2024-07-18 | Prompt | OpenAI | gpt-4o-mini-2024-07-18 |
65+
| GPT-5-2025-08-07 | Function Calling | OpenAI | gpt-5-2025-08-07-FC |
66+
| GPT-5-2025-08-07 | Prompt | OpenAI | gpt-5-2025-08-07 |
67+
| GPT-5-mini-2025-08-07 | Function Calling | OpenAI | gpt-5-mini-2025-08-07-FC |
68+
| GPT-5-mini-2025-08-07 | Prompt | OpenAI | gpt-5-mini-2025-08-07 |
69+
| GPT-5-nano-2025-08-07 | Function Calling | OpenAI | gpt-5-nano-2025-08-07-FC |
70+
| GPT-5-nano-2025-08-07 | Prompt | OpenAI | gpt-5-nano-2025-08-07 |
6571
| Granite-20b-FunctionCalling | Function Calling | Self-hosted 💻 | ibm-granite/granite-20b-functioncalling |
6672
| Granite-3.1-8B-Instruct | Function Calling | Self-hosted 💻 | ibm-granite/granite-3.1-8b-instruct |
6773
| Granite-3.2-8B-Instruct | Function Calling | Self-hosted 💻 | ibm-granite/granite-3.2-8b-instruct |

berkeley-function-call-leaderboard/bfcl_eval/constants/supported_models.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,12 @@
1717
"DeepSeek-R1-0528",
1818
"DeepSeek-R1-0528-FC",
1919
"DeepSeek-V3-0324-FC",
20+
"gpt-5-2025-08-07-FC",
21+
"gpt-5-2025-08-07",
22+
"gpt-5-mini-2025-08-07-FC",
23+
"gpt-5-mini-2025-08-07",
24+
"gpt-5-nano-2025-08-07-FC",
25+
"gpt-5-nano-2025-08-07",
2026
"gpt-4.1-2025-04-14-FC",
2127
"gpt-4.1-2025-04-14",
2228
"gpt-4.1-mini-2025-04-14-FC",

berkeley-function-call-leaderboard/bfcl_eval/eval_checker/multi_turn_eval/func_source_code/memory_api_metaclass.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -61,9 +61,16 @@ def _prepare_snapshot(self, initial_config: dict) -> Optional[dict]:
6161
return None
6262

6363
# For non-first entries we MUST have a snapshot to load from.
64-
assert (
65-
self.latest_snapshot_file.exists()
66-
), f"Not first memory entry, but no snapshot file found in this path: {self.latest_snapshot_file}"
64+
# But if the first entry got an error during inference, then there will be no snapshot file
65+
if not self.latest_snapshot_file.exists():
66+
msg = (
67+
"⚠️" * 100
68+
+ f"\nWarning: Not first memory entry, but no snapshot file found in this path: {self.latest_snapshot_file}. The memory will start empty for {initial_config['test_id']}.\n"
69+
+ "⚠️" * 100
70+
)
71+
print(msg)
72+
73+
return None
6774

6875
with open(self.latest_snapshot_file, "r") as f:
6976
return json.load(f)

berkeley-function-call-leaderboard/bfcl_eval/utils.py

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -331,12 +331,50 @@ def get_directory_structure_by_category(test_category: str) -> str:
331331
#### Helper functions to load/write the dataset files ####
332332

333333

334-
def load_file(file_path, sort_by_id=False):
334+
def load_file(file_path, sort_by_id=False, allow_concatenated_json=False):
335335
result = []
336336
with open(file_path) as f:
337337
file = f.readlines()
338338
for line in file:
339-
result.append(json.loads(line))
339+
try:
340+
content = json.loads(line)
341+
result.append(content)
342+
except Exception as e:
343+
if not allow_concatenated_json:
344+
raise e
345+
346+
# Although this really shouldn't happen, sometimes a result file might have more than one JSON object concatenated on a single line instead of one per line (e.g. '{"id": 1, xxx}{"id": 2, xxx}').
347+
# We can parse them incrementally by using `json.JSONDecoder.raw_decode`, which returns both the parsed object and the index where it stopped parsing.
348+
line_jsons = []
349+
decoder = json.JSONDecoder()
350+
idx = 0
351+
while idx < len(line):
352+
# Skip whitespace between objects (if any)
353+
while idx < len(line) and line[idx].isspace():
354+
idx += 1
355+
356+
if idx >= len(line):
357+
break
358+
359+
try:
360+
json_obj, idx = decoder.raw_decode(line, idx)
361+
line_jsons.append(json_obj)
362+
except json.JSONDecodeError:
363+
# If decoding fails at any point, the entire line is invalid.
364+
raise e
365+
366+
# After parsing, we must ensure the entire line has been consumed.
367+
# If `idx` is not at the end of the line, it means there's trailing
368+
# garbage, which is an error.
369+
if idx < len(line):
370+
raise e
371+
372+
if not line_jsons:
373+
# If the line was non-empty but contained no JSON objects (e.g., only whitespace),
374+
# it's an error.
375+
raise e
376+
377+
result.extend(line_jsons)
340378

341379
if sort_by_id:
342380
result.sort(key=sort_key)
@@ -349,7 +387,7 @@ def sort_file_content_by_id(file_path: Path) -> None:
349387
when the ordering actually changes to avoid unnecessary disk writes.
350388
"""
351389
# Load the current content preserving original order (and potential duplicates)
352-
original_entries = load_file(file_path)
390+
original_entries = load_file(file_path, allow_concatenated_json=True)
353391

354392
# Desired final ordering (sorted, unique)
355393
sorted_entries = sorted(original_entries, key=sort_key)

0 commit comments

Comments
 (0)