1
1
Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Python Parallel AST,Python Parallel Multiple AST,Irrelevance Detection,Relevance Detection
2
2
1,GPT-4.5-Preview-2025-02-27 (Prompt),82.99%,82.90%,87.98%,81.86%,93.75%,66.67%,83.45%,66.67%
3
- 2,Gemini-2.5-Pro-Exp-03-25 (Prompt),81.47%,82.01%,87.21%,80.53%,93.75 %,83.33%,80.95%,66.67 %
4
- 3,Gemini-2.0-Flash-001 (Prompt ),81.39%,73.87%,75.58%,73.12 %,81.25%,83.33%,93.42%,55.56 %
5
- 4,Grok-3-mini-beta (FC),80.72%,76.09%,82.17%,77.49%,0 .00%,0.00%,88.32%,55.56 %
6
- 5,o1-2024-12-17 (Prompt),80.63%,77.79%,82.95%,76.54 %,81.25%,75.00%,85.15%,72.22 %
7
- 6,Command A (FC),80.45%,78.76%,84.50 %,77.68%,81.25%,62.50%,83.22%,72.22 %
8
- 7,Mistral-small-2503 (FC ),80.14%,73.87%,65.89 %,76.45%,56 .25%,58.33%,89.91 %,72.22%
9
- 8,ToolACE-2-8B (FC),80.05%,77.05%,70.93%,79.01 %,81.25%,54.17%,84.81 %,72.22%
10
- 9,GPT-4.1-2025-04-14 (FC),79.92%,78.39%,80.23%,78.35%,68.75%,66.67%,82.31%,77.78 %
11
- 10,GPT-4o-2024-11-20 (Prompt),79.88%,80.24%,84.50 %,79.30%,87.50%,70.83%,79.48 %,72.22%
12
- 11,Qwen2.5-32B-Instruct (FC),79.69%,79.35%,80.23 %,80.06%,43 .75%,62.50%,80.50%,64.71 %
13
- 12,Gemini-2.0-Flash-Lite-001 (Prompt ),79.61%,75.28%,79.07%,74.74%,75.00%,58.33%,86.51%,66.67 %
14
- 13,GPT-4.5-Preview-2025-02-27 (FC ),79.34%,78.68%,80.23%,78.63%,68.75 %,70.83%,80.61%,66.67 %
15
- 14,Gemini-2.0-Flash-001 (FC ),79.12%,69.58%,74.42%,68.28%,81.25%,66.67%,94.33%,50.00 %
16
- 15,o3-mini- 2025-01-31 (Prompt ),79.08 %,78.24%,82.95%,77.11%,87.50 %,70.83%,80.50%,72.22 %
17
- 16,Qwen2.5-72B-Instruct (FC),78.98%,78.02%,79.84%,78.25%,56 .25%,62.50%,80.61%,70.59 %
18
- 17,Claude-3.5-Sonnet-20241022 (FC),78.94%,80.61%,84.11%,81.96%,25.00%,20 .83%,76.42%,77.78 %
19
- 18,GPT-4o-2024-11-20 (FC),78.85%,79.05%,81.01%,78.54%,81.25%,79.17%,78.46%,83.33 %
20
- 19,Claude-3.7-Sonnet-20250219 (FC),78.41%,75.35%,82.56%,76.45%,0.00%,0.00%,83.22%,72.22 %
21
- 20,GPT-4.1-2025-04-14 (Prompt ),78.32%,78.46%,85.66%,76.54%,93.75%,75.00%,77.89%,88.89 %
22
- 21,GPT-4.1-mini-2025-04-14 (FC),78.05%,78.09%,80.62%,77.78%,75.00%,66.67%,78.12%,72.22 %
23
- 22,Claude-3-Opus-20240229 (FC),78.05 %,75.20%,79.07%,75.78%,31.25%,37.50%,82.77%,61.11 %
24
- 23,o1-2024-12-17 (FC ),78.01%,77.20%,81.78%,79.01%,0.00%,0 .00%,79.37%,72.22 %
25
- 24,watt-tool-70B (FC),77.74%,83.57%,86.05%,83.48%,81.25%,62.50%,68.48%,94.44 %
26
- 25,Gemini-2.5-Pro-Exp-03-25 (FC),77.70%,68.17 %,79.46%,64.96%,81 .25%,79.17%,92.97%,44.44 %
27
- 26,o3-mini-2025-01-31 (FC),77.30%,76.83 %,81.40%,78.63 %,0.00%,0.00%,78.00%,77.78 %
28
- 27,Gemini-2.0-Flash-Lite-001 (FC),77.25%,68.10%,71.32%,67.05%,81.25%,70.83%,91.84%,50.00 %
29
- 28,palmyra-x-004 (FC),77.21%,69.58%,76.36%,68.38%,50.00 %,62.50%,89.00%,72.22 %
30
- 29,Qwen2.5-14B-Instruct (FC),76.68%,75.35%,77.13%,75.02 %,75.00%,70.83%,79.14%,55.56 %
31
- 30,Functionary-Medium-v3.1 (FC),76.63%,82.61 %,81.78%,83.29%,68.75%,70.83%,67.57%,72.22 %
32
- 31,watt-tool-8B (FC),76.50%,77.35%,76.74%,77.49 %,87.50%,70.83%,75.06%,83.33 %
33
- 32,GPT-4o-mini-2024-07-18 (Prompt),76.50 %,77.87%,81.40%,76.73%,93.75%,79.17%,74.26%,83.33 %
34
- 33,Gemma-3-27b-it (Prompt),76.37%,80.16%,86.43%,78.73%,93.75%,66.67%,70.41%,83.33 %
35
- 34,BitAgent-8B,76.14%,77.50%,77.91%,77.40%,87.50%,70.83%,73.92%,83.33 %
36
- 35,mistral-large-2411 (FC),76.06 %,80.53 %,87.21%,79.01 %,81.25%,75.00%,69.05 %,83.33%
37
- 36,Gemini-2.0-Flash-Thinking-Exp-01-21 (Prompt),75.97 %,82.24%,86.43%,81.01%,87.50%,87.50%,66.33%,77.78 %
38
- 37,Qwen/QwQ-32B (FC) (Novita),75.48%,80.01%,81.01%,79.68 %,87.50%,79.17%,68.71%,66.67 %
39
- 38,Qwen2.5-72B-Instruct (Prompt),75.30%,82.38%,85.27%,82.15%,62.50%,75.00%,63.95%,100.00 %
40
- 39,Hammer2.1-7b (FC),75.11%,77.20%,76.74%,77.40%,81.25%,70.83%,71.77%,82.35 %
41
- 40,xiaoming-14B (Prompt),74.77%,76.54%,81.78%,75.88%,62 .50%,58.33%,72.00%,77.78 %
42
- 41,CoALM-405B,74.50%,83.20%,84.88%,83.00%,87.50%,70.83%,60.66%,100.00 %
43
- 42,GPT-4o-mini-2024-07-18 (FC),74.41%,76.68%,78.68%,76.16 %,87.50%,70.83%,70.75%,83.33 %
44
- 43,DeepSeek-R1 (Prompt),74.41%,80.61%,84.11 %,79.87%,87.50%,70.83%,65.08%,66.67 %
45
- 44,Amazon-Nova-Pro-v1:0 (FC),74.32 %,77.72%,80.23 %,77.49 %,81.25%,58.33%,69.05%,77.78 %
46
- 45,xLAM-2-32b-fc-r (FC ),74.23 %,76.17%,82.95 %,75.21%,56.25 %,58.33%,70.98%,88.89 %
47
- 46,Qwen2 .5-32B-Instruct (Prompt ),74.23%,78.83%,82.95%,78.54%,62.50%,58.33%,66.67%,100.00 %
48
- 47,Qwen2.5-7B-Instruct (FC) ,74.19%,75.35%,75.58%,75.59%,68.75%,66.67%,72.34%,77.78 %
49
- 48,Haha-7B ,74.19%,77.57 %,78.29%,77.59%,75.00 %,70.83%,68.82 %,83.33%
50
- 49,Qwen2.5-14B-Instruct (Prompt ),74.14%,75.20%,74.42%,75.78%,62.50%,66.67%,72.45 %,77.78%
51
- 50,Hammer2.1-3b (FC),74.04%,73.06%,73.26%,73.31%,62.50%,66.67%,75.40%,82.35 %
52
- 51,GoGoAgent ,74.01%,74.69%,72.87%,75.40%,68.75 %,66.67%,72.90%,77.78 %
53
- 52,Functionary-Small-v3.1 (FC),73.75%,78.24%,79.84%,78.16%,81.25%,62.50%,66.78 %,77.78%
54
- 53,xLAM-2-70b-fc-r (FC),72.95%,72.02%,77.13%,71.13%,68.75%,58.33%,74.49%,66.67 %
55
- 54,CoALM-8B ,72.95%,67.28%,71.71%,66.67%,56.25%,54.17%,81.41%,83.33 %
56
- 55,claude-3.5-haiku-20241022 (FC),72.37%,77.13%,82.95%,78.35%,18.75%,0 .00%,64.85%,83.33 %
57
- 56,Claude-3.5-Sonnet-20241022 (Prompt),71.97%,80.75%,86.82%,80.06%,81 .25%,45.83%,58.39%,77.78 %
58
- 57,CoALM-70B,71.92%,66.17%,70.16%,65.34%,68 .75%,58.33%,80.84%,66.67 %
59
- 58,Amazon-Nova-Lite-v1:0 (FC ),71.61%,70.61%,72.87%,70.09%,75.00%,66.67%,73.24%,66.67 %
60
- 59,Grok-3-beta (FC) ,71.21%,75.35%,83.72%,76.16%,0.00%,0.00%,64.85%,72.22 %
61
- 60,GPT-4.1-mini-2025-04-14 (Prompt ),71.17%,72 .61%,79.46 %,70.94 %,75.00%,70.83%,68.71%,83.33 %
62
- 61,claude-3.5-haiku-20241022 (Prompt),70.77%,76.68%,84.88%,75.02%,87.50%,54.17%,61.56%,77.78 %
63
- 62,Hammer2 .1-1.5b (FC),70.64%,69.73%,71.32%,69.80%,50 .00%,62.50%,71.88%,77.78 %
64
- 63,MiniCPM3-4B-FC (FC ),70.01%,65.73%,74.81%,63.91%,43.75%,62.50%,76.53%,72.22 %
65
- 64,Grok-3-beta (Prompt),69.61%,81.87%,86.05%,81.20%,75 .00%,70.83%,50.23%,100.00 %
66
- 65,Qwen2.5-3B-Instruct (FC),69.39%,71.87 %,74.03%,72.08%,62.50%,45.83%,65.19%,88.89 %
67
- 66,Command R7B (FC ),69.17%,59.59%,63.18%,58.69%,56.25%,62.50%,84.13%,55.56 %
68
- 67,DeepSeek-V3 (FC),68.41%,82.09%,83.72%,82.15%,81 .25%,62.50%,47.05%,88.89 %
3
+ 2,Gemini-2.0-Flash-001 (Prompt),81.39%,73.87%,75.58%,73.12%,81.25 %,83.33%,93.42%,55.56 %
4
+ 3,Qwen3-30B-A3B (FC ),81.34%,82.38%,86.82%,81.20 %,81.25%,87.50%,79.93%,72.22 %
5
+ 4,Llama-3.1-Nemotron-Ultra-253B-v1 (FC),80.76%,82.75%,86.43%,82.24%,75 .00%,70.83%,77.66%,83.33 %
6
+ 5,DM-Cito-8B (Prompt),80.72%,79.27%,84.50%,78.25 %,81.25%,66.67%,83.22%,66.67 %
7
+ 6,Grok-3-mini-beta (FC),80.72%,76.09%,82.17 %,77.49%,0.00%,0.00%,88.32%,55.56 %
8
+ 7,o1-2024-12-17 (Prompt ),80.63%,77.79%,82.95 %,76.54%,81 .25%,75.00%,85.15 %,72.22%
9
+ 8,Command A (FC),80.45%,78.76%,84.50%,77.68 %,81.25%,62.50%,83.22 %,72.22%
10
+ 9,Mistral-small-2503 (FC),80.14%,73.87%,65.89%,76.45%,56.25%,58.33%,89.91%,72.22 %
11
+ 10,ToolACE-2-8B (FC),80.05%,77.05%,70.93 %,79.01%,81.25%,54.17%,84.81 %,72.22%
12
+ 11,Qwen3-14B (FC),79.96%,81.72%,86.05 %,80.82%,68 .75%,83.33%,77.44%,72.22 %
13
+ 12,GPT-4.1-2025-04-14 (FC ),79.92%,78.39%,80.23%,78.35%,68.75%,66.67%,82.31%,77.78 %
14
+ 13,GPT-4o-2024-11-20 (Prompt ),79.88%,80.24%,84.50%,79.30%,87.50 %,70.83%,79.48%,72.22 %
15
+ 14,Gemini-2.0-Flash-Lite- 001 (Prompt ),79.61%,75.28%,79.07%,74.74%,75.00%,58.33%,86.51%,66.67 %
16
+ 15,GPT-4.5-Preview- 2025-02-27 (FC ),79.34 %,78.68%,80.23%,78.63%,68.75 %,70.83%,80.61%,66.67 %
17
+ 16,Gemini-2.0-Flash-001 (FC),79.12%,69.58%,74.42%,68.28%,81 .25%,66.67%,94.33%,50.00 %
18
+ 17,o3-mini-2025-01-31 (Prompt),79.08%,78.24%,82.95%,77.11%,87.50%,70 .83%,80.50%,72.22 %
19
+ 18,Claude-3.5-Sonnet-20241022 (FC),78.94%,80.61%,84.11%,81.96%,25.00%,20.83%,76.42%,77.78 %
20
+ 19,GPT-4o-2024-11-20 (FC),78.85%,79.05%,81.01%,78.54%,81.25%,79.17%,78.46%,83.33 %
21
+ 20,Qwen3-4B (FC ),78.72%,81.42%,87.60%,79.96%,75.00%,83.33%,74.72%,72.22 %
22
+ 21,Qwen3-8B (FC),78.54%,80.16%,84.88%,79.39%,62.50%,75.00%,76.08%,77.78 %
23
+ 22,Claude-3.7-Sonnet-20250219 (FC),78.41 %,75.35%,82.56%,76.45%,0.00%,0.00%,83.22%,72.22 %
24
+ 23,GPT-4.1-2025-04-14 (Prompt ),78.32%,78.46%,85.66%,76.54%,93.75%,75 .00%,77.89%,88.89 %
25
+ 24,GPT-4.1-mini-2025-04-14 (FC),78.05%,78.09%,80.62%,77.78%,75.00%,66.67%,78.12%,72.22 %
26
+ 25,Claude-3-Opus-20240229 (FC),78.05%,75.20 %,79.07%,75.78%,31 .25%,37.50%,82.77%,61.11 %
27
+ 26,o1-2024-12-17 (FC),78.01%,77.20 %,81.78%,79.01 %,0.00%,0.00%,79.37%,72.22 %
28
+ 27,Qwen3-32B (FC),77.83%,80.90%,84.11%,80.72%,68.75%,62.50%,73.24%,72.22 %
29
+ 28,watt-tool-70B (FC),77.74%,83.57%,86.05%,83.48%,81.25 %,62.50%,68.48%,94.44 %
30
+ 29,DeepSeek-V3-0324 (FC),77.34%,80.38%,86.05%,79.49 %,75.00%,62.50%,72.56%,83.33 %
31
+ 30,o3-mini-2025-01-31 (FC),77.30%,76.83 %,81.40%,78.63%,0.00%,0.00%,78.00%,77.78 %
32
+ 31,DeepSeek-R1-0528 (FC),77.30%,80.46%,85.66%,79.68 %,87.50%,54.17%,72.45%,77.78 %
33
+ 32,Grok-3-beta (FC),77.25 %,77.72%,80.62%,77.40%,56.25%,75.00%,76.76%,66.67 %
34
+ 33,Gemini-2.0-Flash-Lite-001 (FC),77.25%,68.10%,71.32%,67.05%,81.25%,70.83%,91.84%,50.00 %
35
+ 34,palmyra-x-004 (FC),77.21%,69.58%,76.36%,68.38%,50.00%,62.50%,89.00%,72.22 %
36
+ 35,Qwen3-235B-A22B (FC),77.03 %,80.90 %,87.21%,79.30 %,81.25%,83.33%,70.98 %,83.33%
37
+ 36,Functionary-Medium-v3.1 (FC),76.63 %,82.61%,81.78%,83.29%,68.75%,70.83%,67.57%,72.22 %
38
+ 37,watt-tool-8B (FC),76.50%,77.35%,76.74%,77.49 %,87.50%,70.83%,75.06%,83.33 %
39
+ 38,GPT-4o-mini-2024-07-18 (Prompt),76.50%,77.87%,81.40%,76.73%,93.75%,79.17%,74.26%,83.33 %
40
+ 39,Gemma-3-27b-it (Prompt),76.37%,80.16%,86.43%,78.73%,93.75%,66.67%,70.41%,83.33 %
41
+ 40,BitAgent-8B,76.14%,77.50%,77.91%,77.40%,87 .50%,70.83%,73.92%,83.33 %
42
+ 41,mistral-large-2411 (FC),76.06%,80.53%,87.21%,79.01%,81.25%,75.00%,69.05%,83.33 %
43
+ 42,Gemini-2.0-Flash-Thinking-Exp-01-21 (Prompt),75.97%,82.24%,86.43%,81.01 %,87.50%,87.50%,66.33%,77.78 %
44
+ 43,QwQ-32B (FC),75.61%,79.57%,80.62 %,79.20%,81.25%,83.33%,69.39%,83.33 %
45
+ 44,Hammer2.1-7b (FC),75.11 %,77.20%,76.74 %,77.40 %,81.25%,70.83%,71.77%,82.35 %
46
+ 45,xiaoming-14B (Prompt ),74.77 %,76.54%,81.78 %,75.88%,62.50 %,58.33%,72.00%,77.78 %
47
+ 46,Gemini-2 .5-Pro-Preview-05-06 (FC ),74.59%,65.28%,77.91%,62.20%,68.75%,62.50%,89.68%,33.33 %
48
+ 47,CoALM-405B ,74.50%,83.20%,84.88%,83.00%,87.50%,70.83%,60.66%,100.00 %
49
+ 48,GPT-4o-mini-2024-07-18 (FC) ,74.41%,76.68 %,78.68%,76.16%,87.50 %,70.83%,70.75 %,83.33%
50
+ 49,Amazon-Nova-Pro-v1:0 (FC ),74.32%,77.72%,80.23%,77.49%,81.25%,58.33%,69.05 %,77.78%
51
+ 50,xLAM-2-32b-fc-r (FC),74.23%,76.17%,82.95%,75.21%,56.25%,58.33%,70.98%,88.89 %
52
+ 51,Hammer2.1-3b (FC) ,74.04%,73.06%,73.26%,73.31%,62.50 %,66.67%,75.40%,82.35 %
53
+ 52,GoGoAgent,74.01%,74.69%,72.87%,75.40%,68.75%,66.67%,72.90 %,77.78%
54
+ 53,Functionary-Small-v3.1 (FC),73.75%,78.24%,79.84%,78.16%,81.25%,62.50%,66.78%,77.78 %
55
+ 54,xLAM-2-70b-fc-r (FC) ,72.95%,72.02%,77.13%,71.13%,68.75%,58.33%,74.49%,66.67 %
56
+ 55,Qwen3-1.7B (FC),72.95%,73.28%,75.58%,72.65%,75.00%,75 .00%,72.68%,61.11 %
57
+ 56,CoALM-8B,72.95%,67.28%,71.71%,66.67%,56 .25%,54.17%,81.41%,83.33 %
58
+ 57,claude-3.5-haiku-20241022 (FC),72.37%,77.13%,82.95%,78.35%,18 .75%,0.00%,64.85%,83.33 %
59
+ 58,Claude-3.5-Sonnet-20241022 (Prompt ),71.97%,80.75%,86.82%,80.06%,81.25%,45.83%,58.39%,77.78 %
60
+ 59,CoALM-70B ,71.92%,66.17%,70.16%,65.34%,68.75%,58.33%,80.84%,66.67 %
61
+ 60,Amazon-Nova-Lite-v1:0 (FC ),71.61%,70 .61%,72.87 %,70.09 %,75.00%,66.67%,73.24%,66.67 %
62
+ 61,ling-lite-v1.5 (Prompt),71.26%,71.80%,74.42%,71.13%,75.00%,70.83%,70.07%,88.89 %
63
+ 62,GPT-4 .1-mini-2025-04-14 (Prompt),71.17%,72.61%,79.46%,70.94%,75 .00%,70.83%,68.71%,83.33 %
64
+ 63,claude-3.5-haiku-20241022 (Prompt ),70.77%,76.68%,84.88%,75.02%,87.50%,54.17%,61.56%,77.78 %
65
+ 64,Hammer2.1-1.5b (FC),70.64%,69.73%,71.32%,69.80%,50 .00%,62.50%,71.88%,77.78 %
66
+ 65,MiniCPM3-4B-FC (FC),70.01%,65.73 %,74.81%,63.91%,43.75%,62.50%,76.53%,72.22 %
67
+ 66,Grok-3-beta (Prompt ),69.61%,81.87%,86.05%,81.20%,75.00%,70.83%,50.23%,100.00 %
68
+ 67,Command R7B (FC),69.17%,59.59%,63.18%,58.69%,56 .25%,62.50%,84.13%,55.56 %
69
69
68,Sky-T1-32B-Preview (Prompt),68.00%,76.76%,77.52%,77.11%,62.50%,62.50%,54.08%,94.12%
70
- 69,Qwen2.5-7B-Instruct (Prompt),67.44%,75.06%,76.74%,74.93%,62 .50%,70.83%,55.33 %,88.89%
71
- 70,Gemma-3-12b-it (Prompt ),67.13%,73.58%,84.88%,70.85%,87 .50%,62.50%,56.80%,88.89 %
72
- 71,Amazon-Nova-Micro-v1:0 (FC),67.04%,64.17%,65.89%,64.20%,62.50%,45.83%,71.32%,72.22 %
73
- 72,Claude-3-Opus-20240229 (Prompt ),66.99%,79.72%,85.27%,79.11%,68.75%,54.17%,47.17%,83.33 %
74
- 73,xLAM-2-8b-fc-r (FC),66.90%,67.51%,74.81%,66.29%,56.25%,50 .00%,65.76%,77.78 %
75
- 74,GLM-4-9b-Chat (FC ),66.81%,64.03%,72.48%,64.39%,0 .00%,0.00%,71.09 %,66.67%
76
- 75,Phi-4 (Prompt),66.28%,56.11%,62.79%,53.94 %,75.00%,66.67%,81.86 %,66.67%
77
- 76,Open-Mistral-Nemo-2407 (FC ),65.97%,71.13%,77.13%,69.61%,75.00%,70.83%,58.05%,66.67 %
78
- 77,Claude-3.7-Sonnet-20250219 (Prompt ),65.79%,83.64%,87.98%,83.57%,68.75%,50.00%,37.76%,100.00 %
79
- 78,FireFunction-v2 (FC),65.66%,78.09%,79.07%,78.35%,56.25%,70.83%,46.03%,94.44 %
70
+ 69,Gemma-3-12b-it (Prompt),67.13%,73.58%,84.88%,70.85%,87 .50%,62.50%,56.80 %,88.89%
71
+ 70,Amazon-Nova-Micro-v1:0 (FC ),67.04%,64.17%,65.89%,64.20%,62 .50%,45.83%,71.32%,72.22 %
72
+ 71,Claude-3-Opus-20240229 (Prompt),66.99%,79.72%,85.27%,79.11%,68.75%,54.17%,47.17%,83.33 %
73
+ 72,xLAM-2-8b-fc-r (FC ),66.90%,67.51%,74.81%,66.29%,56.25%,50.00%,65.76%,77.78 %
74
+ 73,GLM-4-9b-Chat (FC),66.81%,64.03%,72.48%,64.39%,0.00%,0 .00%,71.09%,66.67 %
75
+ 74,Phi-4 (Prompt ),66.28%,56.11%,62.79%,53.94%,75 .00%,66.67%,81.86 %,66.67%
76
+ 75,Open-Mistral-Nemo-2407 (FC),65.97%,71.13%,77.13%,69.61 %,75.00%,70.83%,58.05 %,66.67%
77
+ 76,Claude-3.7-Sonnet-20250219 (Prompt ),65.79%,83.64%,87.98%,83.57%,68.75%,50.00%,37.76%,100.00 %
78
+ 77,FireFunction-v2 (FC ),65.66%,78.09%,79.07%,78.35%,56.25%,70.83%,46.03%,94.44 %
79
+ 78,Qwen3-0.6B (FC),65.66%,56.40%,65.89%,54.42%,37.50%,54.17%,79.82%,66.67 %
80
80
79,GPT-4.1-nano-2025-04-14 (Prompt),65.35%,60.77%,65.12%,58.97%,81.25%,79.17%,72.22%,72.22%
81
81
80,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.58%,72.27%,62.50%,62.50%,53.06%,70.59%
82
- 81,Qwen2.5-1.5B-Instruct (FC),64.82%,67.06%,74.03%,66.10%,50.00%,45.83%,60.77%,94.44%
83
- 82,GPT-4.1-nano-2025-04-14 (FC),64.33%,72.17%,65.50%,73.41%,87.50%,79.17%,51.70%,94.44%
84
- 83,Hammer2.1-0.5b (FC),62.91%,58.11%,60.08%,58.02%,50.00%,45.83%,69.95%,77.78%
85
- 84,Llama-3.3-70B-Instruct (FC),62.67%,78.02%,81.78%,77.11%,93.75%,66.67%,38.44%,100.00%
86
- 85,Llama-3.1-70B-Instruct (Prompt),62.24%,76.54%,78.29%,76.16%,87.50%,66.67%,39.57%,100.00%
87
- 86,Phi-4-mini-instruct (FC),61.47%,47.45%,40.31%,50.33%,0.00%,29.17%,82.88%,64.71%
88
- 87,Qwen2.5-1.5B-Instruct (Prompt),61.08%,61.07%,70.54%,59.26%,56.25%,41.67%,60.66%,83.33%
89
- 88,Llama-3.1-8B-Instruct (Prompt),61.08%,72.91%,74.03%,73.31%,56.25%,54.17%,42.63%,77.78%
90
- 89,Llama-4-Maverick-17B-128E-Instruct-FP8 (Prompt) (Novita),60.77%,77.57%,84.11%,75.97%,81.25%,75.00%,34.24%,100.00%
91
- 90,DBRX-Instruct (Prompt),60.28%,73.50%,78.29%,73.03%,75.00%,41.67%,39.34%,94.44%
92
- 91,Llama-4-Maverick-17B-128E-Instruct-FP8 (FC) (Novita),60.02%,64.69%,74.42%,64.77%,0.00%,0.00%,52.49%,77.78%
93
- 92,Granite-20b-FunctionCalling (FC),59.66%,58.48%,68.22%,56.32%,43.75%,58.33%,60.88%,88.89%
94
- 93,Gemma-3-4b-it (Prompt),59.17%,63.80%,72.87%,62.77%,37.50%,29.17%,51.70%,77.78%
95
- 94,xLAM-2-1b-fc-r (FC),59.17%,62.62%,74.03%,60.68%,50.00%,33.33%,53.17%,94.44%
96
- 95,Command-R-Plus (FC),59.00%,60.84%,70.54%,58.78%,62.50%,45.83%,55.90%,72.22%
97
- 96,xLAM-2-3b-fc-r (FC),58.91%,64.32%,74.42%,62.11%,68.75%,50.00%,50.00%,88.89%
98
- 97,Bielik-11B-v2.3-Instruct (Prompt),58.91%,69.43%,72.87%,69.33%,43.75%,54.17%,42.40%,77.78%
99
- 98,Qwen2.5-3B-Instruct (Prompt),58.69%,66.91%,69.77%,66.48%,56.25%,62.50%,45.46%,88.89%
100
- 99,Llama-4-Maverick-17B-128E-Instruct-FP8 (FC),58.55%,76.17%,83.33%,74.64%,81.25%,62.50%,30.73%,100.00%
101
- 100,Llama-4-Scout-17B-16E-Instruct (FC),57.97%,74.76%,77.91%,74.36%,68.75%,62.50%,31.41%,100.00%
102
- 101,Llama-3.2-3B-Instruct (FC),55.80%,63.73%,63.95%,64.86%,12.50%,45.83%,42.97%,88.89%
103
- 102,Falcon3-7B-Instruct (FC),54.86%,67.95%,74.03%,66.48%,75.00%,62.50%,34.13%,88.89%
104
- 103,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
105
- 104,Nexusflow-Raven-v2 (FC),54.20%,39.45%,41.47%,38.65%,56.25%,41.67%,76.64%,61.11%
106
- 105,Falcon3-10B-Instruct (FC),54.11%,75.28%,76.36%,76.16%,50.00%,41.67%,20.86%,94.44%
107
- 106,Llama-4-Scout-17B-16E-Instruct (Prompt) (Novita),52.11%,70.32%,75.97%,69.52%,62.50%,50.00%,23.24%,100.00%
108
- 107,Phi-4-mini-instruct (Prompt),50.62%,58.85%,55.04%,59.45%,68.75%,66.67%,37.41%,82.35%
109
- 108,Open-Mistral-Nemo-2407 (Prompt),49.04%,75.13%,77.91%,74.45%,87.50%,66.67%,8.28%,88.89%
110
- 109,Qwen2.5-0.5B-Instruct (FC),47.98%,43.97%,56.20%,41.31%,56.25%,20.83%,53.29%,88.89%
111
- 110,Falcon3-3B-Instruct (FC),47.40%,55.51%,55.43%,56.32%,31.25%,37.50%,34.35%,77.78%
112
- 111,Llama-3.1-70B-Instruct (FC),45.00%,51.81%,52.33%,52.61%,31.25%,25.00%,33.45%,100.00%
113
- 112,Llama-4-Scout-17B-16E-Instruct (FC) (Novita),41.09%,46.71%,59.69%,45.20%,0.00%,4.17%,31.52%,88.89%
114
- 113,QwQ-32B-Preview (Prompt),40.78%,3.55%,7.36%,2.75%,0.00%,0.00%,98.64%,0.00%
115
- 114,Qwen/QwQ-32B (Prompt) (Novita),39.18%,0.00%,0.00%,0.00%,0.00%,0.00%,100.00%,0.00%
116
- 115,Llama-3.1-8B-Instruct (FC),33.50%,49.30%,51.94%,49.00%,37.50%,41.67%,8.05%,94.44%
117
- 116,Falcon3-1B-Instruct (FC),32.70%,2.96%,4.65%,2.37%,0.00%,12.50%,78.91%,0.00%
118
- 117,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44%
119
- 118,Llama-3.2-1B-Instruct (FC),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89%
120
- 119,Gemma-3-1b-it (Prompt),30.34%,14.14%,31.01%,10.54%,0.00%,0.00%,54.76%,50.00%
121
- 120,ThinkAgent-1B (FC),25.37%,17.62%,24.42%,16.43%,6.25%,4.17%,36.85%,44.44%
122
- 121,Grok-3-mini-beta (Prompt),0.00%,N/A,N/A,N/A,N/A,N/A,N/A,N/A
82
+ 81,GPT-4.1-nano-2025-04-14 (FC),64.33%,72.17%,65.50%,73.41%,87.50%,79.17%,51.70%,94.44%
83
+ 82,Hammer2.1-0.5b (FC),62.91%,58.11%,60.08%,58.02%,50.00%,45.83%,69.95%,77.78%
84
+ 83,Llama-3.3-70B-Instruct (FC),62.67%,78.02%,81.78%,77.11%,93.75%,66.67%,38.44%,100.00%
85
+ 84,Llama-3.1-70B-Instruct (Prompt),62.24%,76.54%,78.29%,76.16%,87.50%,66.67%,39.57%,100.00%
86
+ 85,Phi-4-mini-instruct (FC),61.47%,47.45%,40.31%,50.33%,0.00%,29.17%,82.88%,64.71%
87
+ 86,Llama-3.1-8B-Instruct (Prompt),61.08%,72.91%,74.03%,73.31%,56.25%,54.17%,42.63%,77.78%
88
+ 87,DBRX-Instruct (Prompt),60.28%,73.50%,78.29%,73.03%,75.00%,41.67%,39.34%,94.44%
89
+ 88,Granite-20b-FunctionCalling (FC),59.66%,58.48%,68.22%,56.32%,43.75%,58.33%,60.88%,88.89%
90
+ 89,Gemma-3-4b-it (Prompt),59.17%,63.80%,72.87%,62.77%,37.50%,29.17%,51.70%,77.78%
91
+ 90,xLAM-2-1b-fc-r (FC),59.17%,62.62%,74.03%,60.68%,50.00%,33.33%,53.17%,94.44%
92
+ 91,Command-R-Plus (FC),59.00%,60.84%,70.54%,58.78%,62.50%,45.83%,55.90%,72.22%
93
+ 92,xLAM-2-3b-fc-r (FC),58.91%,64.32%,74.42%,62.11%,68.75%,50.00%,50.00%,88.89%
94
+ 93,Bielik-11B-v2.3-Instruct (Prompt),58.91%,69.43%,72.87%,69.33%,43.75%,54.17%,42.40%,77.78%
95
+ 94,Llama-4-Maverick-17B-128E-Instruct-FP8 (FC),58.55%,76.17%,83.33%,74.64%,81.25%,62.50%,30.73%,100.00%
96
+ 95,Llama-4-Scout-17B-16E-Instruct (FC),57.97%,74.76%,77.91%,74.36%,68.75%,62.50%,31.41%,100.00%
97
+ 96,Llama-3.2-3B-Instruct (FC),55.80%,63.73%,63.95%,64.86%,12.50%,45.83%,42.97%,88.89%
98
+ 97,Falcon3-7B-Instruct (FC),54.86%,67.95%,74.03%,66.48%,75.00%,62.50%,34.13%,88.89%
99
+ 98,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
100
+ 99,Nexusflow-Raven-v2 (FC),54.20%,39.45%,41.47%,38.65%,56.25%,41.67%,76.64%,61.11%
101
+ 100,Falcon3-10B-Instruct (FC),54.11%,75.28%,76.36%,76.16%,50.00%,41.67%,20.86%,94.44%
102
+ 101,Phi-4-mini-instruct (Prompt),50.62%,58.85%,55.04%,59.45%,68.75%,66.67%,37.41%,82.35%
103
+ 102,Open-Mistral-Nemo-2407 (Prompt),49.04%,75.13%,77.91%,74.45%,87.50%,66.67%,8.28%,88.89%
104
+ 103,Falcon3-3B-Instruct (FC),47.40%,55.51%,55.43%,56.32%,31.25%,37.50%,34.35%,77.78%
105
+ 104,Llama-3.1-70B-Instruct (FC),45.00%,51.81%,52.33%,52.61%,31.25%,25.00%,33.45%,100.00%
106
+ 105,Llama-3.1-8B-Instruct (FC),33.50%,49.30%,51.94%,49.00%,37.50%,41.67%,8.05%,94.44%
107
+ 106,Falcon3-1B-Instruct (FC),32.70%,2.96%,4.65%,2.37%,0.00%,12.50%,78.91%,0.00%
108
+ 107,Llama-3.2-1B-Instruct (FC),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89%
109
+ 108,Gemma-3-1b-it (Prompt),30.34%,14.14%,31.01%,10.54%,0.00%,0.00%,54.76%,50.00%
110
+ 109,ThinkAgent-1B (FC),25.37%,17.62%,24.42%,16.43%,6.25%,4.17%,36.85%,44.44%
0 commit comments