1
1
Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Python Parallel AST,Python Parallel Multiple AST,Irrelevance Detection,Relevance Detection
2
- 1,Gemini-1.5-Flash-002 (Prompt ),76.28%,78.20 %,77.91%,78.30%,93.75 %,66.67%,72.91%,85.37 %
3
- 2,GPT-4-turbo- 2024-04-09 (FC),76.23%,77.45%,77.52%,77.63 %,81.25%,66.67%,74.51%,73.17 %
4
- 3,GPT-4o -2024-08-06 (FC ),75.43%,74.98%,74.42%,75.12%,81.25%,70.83%,76.69%,63.41 %
5
- 4,o1-mini-2024-09-12 (Prompt ),75.39 %,71.39%,73.26%,71.07%,75.00%,62.50%,82.74%,48.78 %
2
+ 1,GPT-4-turbo-2024-04-09 (FC ),76.23%,77.45 %,77.52%,77.63%,81.25 %,66.67%,74.51%,73.17 %
3
+ 2,GPT-4o- 2024-08-06 (FC),75.43%,74.98%,74.42%,75.12 %,81.25%,70.83%,76.69%,63.41 %
4
+ 3,o1-mini -2024-09-12 (Prompt ),75.39%,71.39%,73.26%,71.07%,75.00%,62.50%,82.74%,48.78 %
5
+ 4,Gemini-1.5-Flash-002 (FC ),75.12 %,71.24%,71.32%,70.97%,81.25%,75.00%,81.71%,60.98 %
6
6
5,ToolACE-8B (FC),74.99%,73.33%,66.67%,74.93%,81.25%,70.83%,77.26%,80.49%
7
7
6,Claude-3.5-Sonnet-20240620 (FC),74.68%,76.85%,80.23%,76.76%,56.25%,58.33%,71.66%,68.29%
8
8
7,GPT-4o-mini-2024-07-18 (Prompt),74.63%,75.51%,79.46%,74.35%,93.75%,70.83%,73.26%,75.61%
9
- 8,Gemini-1.5-Pro-002 (Prompt),74.41%,77.00%,77.52%,76.76 %,87.50%,75.00%,70.86%,65.85 %
9
+ 8,Gemini-1.5-Pro-002 (Prompt),74.28%,78.28%,79.84%,77.72 %,87.50%,79.17%,68.11%,75.61 %
10
10
9,Claude-3-Opus-20240229 (FC tools-2024-04-04),74.10%,74.53%,74.81%,75.60%,50.00%,41.67%,73.94%,63.41%
11
- 10,Functionary-Medium-v3.1 (FC ),73.48%,81.05%,79.46%,81.87%,68 .75%,70.83%,62.06%,70.73 %
12
- 11,Gemini-1.5-Pro-001 (Prompt ),73.12%,69.14%,67.44%,69.24%,93 .75%,66.67%,80.00%,56.10 %
13
- 12,Mistral-Medium-2312 (Prompt),73.10%,71.84%,68.60%,73.00%,81.25%,50.00%,100.00%,60.98 %
14
- 13,o1-preview-2024-09-12 (Prompt),73.08%,77.53%,80.62%,76.76%,75.00%,79.17%,66.29%,73.17 %
15
- 14,xLAM-8x22b-r (FC),71.97%,79.40%,78.29%,80.14 %,75.00%,62.50%,60.00%,85.37 %
16
- 15,Functionary-Small-v3.1 (FC),70.41%,75.58%,75.19%,75.89%,81.25%,62.50%,61.83%,85.37 %
17
- 16,Mistral-small-2402 (FC),70.19%,68 .16%,63.57%,71.46%,12.50%,12 .50%,72.69%,82.93 %
18
- 17,GPT-4o-mini-2024-07-18 (FC),70.19%,74.23%,72.87%,74.45 %,87.50%,70.83%,63.54%,80.49 %
19
- 18,Hammer2.0-7b (FC),69.79%,76.63 %,74.42%,77.15%,81.25%,75.00%,58.17%,95.12 %
20
- 19,Command-R-Plus (Prompt) (Original),69.75%,69.59%,66.67%,70.30%,68.75%,70.83%,69.83%,73.17 %
21
- 20,Gemma-2-27b-it (Prompt),69.48%,77.30%,79.46%,77.24%,68.75 %,62.50%,56.69%,87.80 %
22
- 21,Gemma-2-9b-it (Prompt),69.21%,73.11%,73.64%,73.58%,56.25%,58.33%,62.40%,87.80 %
23
- 22,Gemini-1.5-Flash-001 (Prompt),69.21%,75.21%,74.42%,75.12%,93.75%,75.00%,59.43%,82.93 %
24
- 23,xLAM-8x7b-r (FC),69.12%,74.53%,68.22%,76.76%,62.50%,54.17%,60.00%,87.80 %
25
- 24,GPT-4-turbo-2024-04-09 (Prompt),69.04%,84.64%,85.66%,84.57%,87.50%,75.00%,44.57%,82.93 %
26
- 25,Open-Mixtral-8x22b (Prompt),68.46%,63.90%,72.87%,61.33%,81.25%,66.67%,75.54%,65.85 %
27
- 26,mistral-large-2407 (FC),68.37%,79.55%,81.78%,79.27%,68.75%,75.00%,50.97%,75.61 %
28
- 27,xLAM-7b -r (FC),67.88%,72.28%,71.32%,73.48%,31.25%,58.33%,59.77%,97.56 %
29
- 28,GPT-3.5-Turbo-0125 (Prompt),67.48%,64.27%,63.57%,64.61%,68.75%,54.17%,71.77%,80.49 %
30
- 29,Gorilla-OpenFunctions-v2 (FC),67.44%,61.42%,73.64%,58.73%,68.75%,41 .67%,76.34%,73.17 %
31
- 30,Gemini-1.5-Flash-002 (FC),67.35%,57.98%,58.14%,57.96 %,68.75%,50 .00%,81.94%,60.98 %
32
- 31,Open-Mixtral-8x22b (FC),66.86%,71.16%,73.26%,72.32%,6.25%,41.67%,59.54%,82.93 %
33
- 32,Meta-Llama-3-70B-Instruct (Prompt),66.15%,79.10%,78.68%,79.65%,68.75%,66.67%,45.14%,92.68 %
34
- 33,Qwen2 .5-7B-Instruct (Prompt),65.97%,72.13%,72.48%,72.32%,62.50%,66.67%,55.31%,92.68 %
35
- 34,Gemini-1.5-Pro-001 (FC),65.53%,58.05%,57.75 %,58.24%,75.00 %,41.67%,77.03%,63.41 %
36
- 35,Claude-3-Haiku-20240307 (Prompt),65.04%,74.53%,77.13%,74.64%,68.75%,45.83%,49.71 %,82.93%
37
- 36,Open-Mixtral-8x7b (Prompt),64.95%,63.30%,57.36%,65.00 %,68.75%,50.00%,67.31%,68.29 %
38
- 37,Gemini-1.5-Flash-001 (FC),64.90%,59.48%,58.14%,60.46%,43.75%,41.67%,73.49%,58.54 %
39
- 38,Gemini-1 .5-Pro-002 (FC),64.59%,61.05%,58.91%,61.33%,81.25%,58.33%,69.71%,70.73 %
40
- 39,Hammer2.0-1.5b (FC),63.22%,68.76%,70.54%,68.56%,56.25%,66.67%,53.37%,92.68 %
41
- 40,Open-Mistral-Nemo-2407 (FC),62.37 %,68.46%,71.71%,67.79%,62.50 %,66.67%,53.14%,60.98 %
42
- 41,DBRX-Instruct (Prompt ),62.33%,72.06%,74.81%,71.65%,75.00%,58.33%,46.29%,87.80 %
43
- 42,GPT-4o-2024-08-06 (Prompt),62.19%,42.55%,42.64%,42.82%,25 .00%,41.67%,93.37%,36.59 %
44
- 43,Hermes-2-Pro-Llama-3-8B (FC),61.79%,64.57%,67.44%,64.42%,56.25%,45.83%,57.83%,56.10 %
45
- 44,Qwen2.5-1.5B-Instruct (Prompt ),61.71%,60.37%,64.73%,59.88%,50.00%,41.67%,63.09%,75.61 %
46
- 45,GPT-3 .5-Turbo-0125 (FC ),61.22%,76.25%,74.42%,77.82%,43.75%,50.00%,36.57%,97.56 %
47
- 46,Llama -3.1-70B-Instruct (Prompt ),61.13%,72.58%,77.13%,71.46%,87.50%,62.50%,42.17%,92.68 %
48
- 47,Hermes-2-Pro- Llama-3-70B (FC),60.51%,55.28%,63.18%,53.04%,56.25%,66.67%,68.46%,60.98 %
49
- 48,MiniCPM3-4B (FC),59.88%,50.71%,56.98%,49.47 %,56.25%,33.33%,73.94%,58.54 %
50
- 49,Gemini-1.0-Pro-002 (FC),58.91%,55.81%,58.91%,56.12%,37.50%,20.83%,63.20%,68.29 %
11
+ 10,Gemini-1.5-Pro-001 (Prompt ),73.83%,72.96%,74.03%,72.32%,93 .75%,75.00%,75.66%,63.41 %
12
+ 11,Functionary-Medium-v3.1 (FC ),73.48%,81.05%,79.46%,81.87%,68 .75%,70.83%,62.06%,70.73 %
13
+ 12,Gemini-1.5-Flash-002 (Prompt),73.21%,75.13%,77.52%,74.73%,87.50%,58.33%,70.06%,78.05 %
14
+ 13,Mistral-Medium-2312 (Prompt),73.10%,71.84%,68.60%,73.00%,81.25%,50.00%,100.00%,60.98 %
15
+ 14,o1-preview-2024-09-12 (Prompt),73.08%,77.53%,80.62%,76.76 %,75.00%,79.17%,66.29%,73.17 %
16
+ 15,Gemini-1.5-Flash-001 (FC),72.81%,73.03%,72.48%,73.67%,62.50%,58.33%,72.91%,63.41 %
17
+ 16,Gemini-1.5-Pro-001 (FC),72.81%,71 .16%,73.64%,70.59%,81.25%,62 .50%,75.77%,63.41 %
18
+ 17,GoGoAgent,72.46%,72.21%,71.32%,72.42 %,87.50%,62.50%,72.11%,87.80 %
19
+ 18,Gemini-1.5-Pro-002 (FC),72.41%,74.76 %,74.81%,74.64%,87.50%,70.83%,68.80%,73.17 %
20
+ 19,xLAM-8x22b-r (FC),71.97%,79.40%,78.29%,80.14%,75.00%,62.50%,60.00%,85.37 %
21
+ 20,Functionary-Small-v3.1 (FC),70.41%,75.58%,75.19%,75.89%,81.25 %,62.50%,61.83%,85.37 %
22
+ 21,Mistral-small-2402 (FC),70.19%,68.16%,63.57%,71.46%,12.50%,12.50%,72.69%,82.93 %
23
+ 22,GPT-4o-mini-2024-07-18 (FC),70.19%,74.23%,72.87%,74.45%,87.50%,70.83%,63.54%,80.49 %
24
+ 23,Hammer2.0-7b (FC),69.79%,76.63%,74.42%,77.15%,81.25%,75.00%,58.17%,95.12 %
25
+ 24,Command-R-Plus (Prompt) (Original) ,69.75%,69.59%,66.67%,70.30%,68.75%,70.83%,69.83%,73.17 %
26
+ 25,Gemma-2-27b-it (Prompt),69.48%,77.30%,79.46%,77.24%,68.75%,62.50%,56.69%,87.80 %
27
+ 26,Gemma-2-9b-it (Prompt),69.21%,73.11%,73.64%,73.58%,56.25%,58.33%,62.40%,87.80 %
28
+ 27,xLAM-8x7b -r (FC),69.12%,74.53%,68.22%,76.76%,62.50%,54.17%,60.00%,87.80 %
29
+ 28,GPT-4-turbo-2024-04-09 (Prompt),69.04%,84.64%,85.66%,84.57%,87.50%,75.00%,44.57%,82.93 %
30
+ 29,Open-Mixtral-8x22b (Prompt),68.46%,63.90%,72.87%,61.33%,81.25%,66 .67%,75.54%,65.85 %
31
+ 30,mistral-large-2407 (FC),68.37%,79.55%,81.78%,79.27 %,68.75%,75 .00%,50.97%,75.61 %
32
+ 31,Gemini-1.5-Flash-001 (Prompt),68.24%,76.18%,74.81%,76.18%,93.75%,79.17%,55.20%,87.80 %
33
+ 32,xLAM-7b-r (FC),67.88%,72.28%,71.32%,73.48%,31.25%,58.33%,59.77%,97.56 %
34
+ 33,GPT-3 .5-Turbo-0125 (Prompt),67.48%,64.27%,63.57%,64.61%,68.75%,54.17%,71.77%,80.49 %
35
+ 34,Gorilla-OpenFunctions-v2 (FC),67.44%,61.42%,73.64 %,58.73%,68.75 %,41.67%,76.34%,73.17 %
36
+ 35,Open-Mixtral-8x22b (FC),66.86%,71.16%,73.26%,72.32%,6.25%,41.67%,59.54 %,82.93%
37
+ 36,Meta-Llama-3-70B-Instruct (Prompt),66.15%,79.10%,78.68%,79.65 %,68.75%,66.67%,45.14%,92.68 %
38
+ 37,Gemini-1.0-Pro-002 (FC),66.10%,67.04%,75.19%,65.96%,50.00%,37.50%,64.57%,68.29 %
39
+ 38,Qwen2 .5-7B-Instruct (Prompt),65.97%,72.13%,72.48%,72.32%,62.50%,66.67%,55.31%,92.68 %
40
+ 39,Open-Mixtral-8x7b (Prompt),64.95%,63.30%,57.36%,65.00%,68.75%,50.00%,67.31%,68.29 %
41
+ 40,Hammer2.0-1.5b (FC),63.22 %,68.76%,70.54%,68.56%,56.25 %,66.67%,53.37%,92.68 %
42
+ 41,Open-Mistral-Nemo-2407 (FC ),62.37%,68.46%,71.71%,67.79%,62.50%,66.67%,53.14%,60.98 %
43
+ 42,DBRX-Instruct (Prompt),62.33%,72.06%,74.81%,71.65%,75 .00%,58.33%,46.29%,87.80 %
44
+ 43,GPT-4o-2024-08-06 (Prompt),62.19%,42.55%,42.64%,42.82%,25.00%,41.67%,93.37%,36.59 %
45
+ 44,Hermes-2-Pro-Llama-3-8B (FC ),61.79%,64.57%,67.44%,64.42%,56.25%,45.83%,57.83%,56.10 %
46
+ 45,Qwen2 .5-1.5B-Instruct (Prompt ),61.71%,60.37%,64.73%,59.88%,50.00%,41.67%,63.09%,75.61 %
47
+ 46,GPT -3.5-Turbo-0125 (FC ),61.22%,76.25%,74.42%,77.82%,43.75%,50.00%,36.57%,97.56 %
48
+ 47,Llama-3.1 -70B-Instruct (Prompt),61.13%,72.58%,77.13%,71.46%,87.50%,62.50%,42.17%,92.68 %
49
+ 48,Hermes-2-Pro-Llama-3-70B (FC),60.51%,55.28%,63.18%,53.04 %,56.25%,66.67%,68.46%,60.98 %
50
+ 49,MiniCPM3-4B (FC),59.88%,50.71%,56.98%,49.47%,56.25%,33.33%,73.94%,58.54 %
51
51
50,Llama-3.1-8B-Instruct (Prompt),57.93%,71.31%,71.32%,72.23%,50.00%,45.83%,36.57%,78.05%
52
52
51,Claude-3-Haiku-20240307 (FC tools-2024-04-04),57.66%,74.31%,74.03%,77.15%,0.00%,4.17%,30.40%,97.56%
53
53
52,Granite-20b-FunctionCalling (FC),57.49%,57.08%,65.12%,55.35%,43.75%,54.17%,56.34%,95.12%
54
54
53,Command-R-Plus (FC) (Original),57.26%,61.50%,66.67%,60.56%,56.25%,50.00%,49.14%,92.68%
55
55
54,Hermes-2-Pro-Mistral-7B (FC),56.46%,59.85%,64.73%,59.40%,43.75%,37.50%,50.40%,75.61%
56
- 55,Claude-3.5-Sonnet-20240620 (Prompt),54.24%,31.24%,65.12%,22.66%,37.50%,33.33%,90.97%,19.51%
57
- 56,Qwen2-7B-Instruct (Prompt),54.24%,61.57%,59.30%,62.20%,50.00%,66.67%,41.49%,87.80%
58
- 57,Mistral-Small-2402 (Prompt),53.98%,39.48%,18.22%,45.90%,12.50%,8.33%,76.69%,41.46%
59
- 58,Nexusflow-Raven-v2 (FC),53.49%,39.03%,39.92%,38.48%,56.25%,41.67%,74.97%,65.85%
60
- 59,xLAM-7b-fc-r (FC),53.44%,60.07%,75.58%,57.28%,43.75%,25.00%,42.51%,70.73%
61
- 60,mistral-large-2407 (Prompt),53.35%,67.42%,45.74%,73.10%,68.75%,54.17%,30.17%,90.24%
62
- 61,Hammer2.0-0.5b (FC),52.42%,45.17%,48.84%,44.07%,62.50%,41.67%,61.94%,85.37%
63
- 62,Llama-3.2-3B-Instruct (Prompt),50.91%,44.49%,47.67%,44.74%,0.00%,29.17%,60.11%,63.41%
64
- 63,Meta-Llama-3-8B-Instruct (Prompt),50.51%,59.78%,60.85%,60.75%,37.50%,20.83%,35.20%,75.61%
65
- 64,Open-Mistral-Nemo-2407 (Prompt),50.33%,75.06%,78.29%,74.54%,75.00%,62.50%,10.74%,90.24%
66
- 65,Gemini-1.0-Pro-002 (Prompt),45.67%,38.13%,41.47%,36.93%,68.75%,33.33%,55.54%,80.49%
67
- 66,Llama-3.1-70B-Instruct (FC),44.47%,51.01%,48.45%,52.56%,31.25%,25.00%,31.89%,100.00%
68
- 67,Gemma-2-2b-it (Prompt),41.63%,11.46%,11.24%,11.96%,0.00%,0.00%,89.03%,12.20%
69
- 68,Qwen2-1.5B-Instruct (Prompt),39.00%,41.87%,50.39%,40.50%,25.00%,20.83%,32.91%,75.61%
70
- 69,xLAM-1b-fc-r (FC),38.34%,54.31%,63.18%,54.19%,0.00%,0.00%,11.20%,97.56%
71
- 70,Llama-3.1-8B-Instruct (FC),33.23%,47.34%,48.06%,47.64%,31.25%,37.50%,8.91%,92.68%
72
- 71,Llama-3.2-1B-Instruct (Prompt),29.85%,8.91%,25.97%,4.82%,6.25%,4.17%,60.91%,48.78%
56
+ 55,Qwen2-7B-Instruct (Prompt),54.24%,61.57%,59.30%,62.20%,50.00%,66.67%,41.49%,87.80%
57
+ 56,Mistral-Small-2402 (Prompt),53.98%,39.48%,18.22%,45.90%,12.50%,8.33%,76.69%,41.46%
58
+ 57,Nexusflow-Raven-v2 (FC),53.49%,39.03%,39.92%,38.48%,56.25%,41.67%,74.97%,65.85%
59
+ 58,xLAM-7b-fc-r (FC),53.44%,60.07%,75.58%,57.28%,43.75%,25.00%,42.51%,70.73%
60
+ 59,mistral-large-2407 (Prompt),53.35%,67.42%,45.74%,73.10%,68.75%,54.17%,30.17%,90.24%
61
+ 60,Hammer2.0-0.5b (FC),52.42%,45.17%,48.84%,44.07%,62.50%,41.67%,61.94%,85.37%
62
+ 61,Llama-3.2-3B-Instruct (Prompt),50.91%,44.49%,47.67%,44.74%,0.00%,29.17%,60.11%,63.41%
63
+ 62,Meta-Llama-3-8B-Instruct (Prompt),50.51%,59.78%,60.85%,60.75%,37.50%,20.83%,35.20%,75.61%
64
+ 63,Open-Mistral-Nemo-2407 (Prompt),50.33%,75.06%,78.29%,74.54%,75.00%,62.50%,10.74%,90.24%
65
+ 64,Gemini-1.0-Pro-002 (Prompt),48.38%,48.61%,50.00%,48.41%,56.25%,37.50%,46.29%,85.37%
66
+ 65,Llama-3.1-70B-Instruct (FC),44.47%,51.01%,48.45%,52.56%,31.25%,25.00%,31.89%,100.00%
67
+ 66,Gemma-2-2b-it (Prompt),41.63%,11.46%,11.24%,11.96%,0.00%,0.00%,89.03%,12.20%
68
+ 67,Qwen2-1.5B-Instruct (Prompt),39.00%,41.87%,50.39%,40.50%,25.00%,20.83%,32.91%,75.61%
69
+ 68,xLAM-1b-fc-r (FC),38.34%,54.31%,63.18%,54.19%,0.00%,0.00%,11.20%,97.56%
70
+ 69,Llama-3.1-8B-Instruct (FC),33.23%,47.34%,48.06%,47.64%,31.25%,37.50%,8.91%,92.68%
71
+ 70,Llama-3.2-1B-Instruct (Prompt),29.85%,8.91%,25.97%,4.82%,6.25%,4.17%,60.91%,48.78%
0 commit comments