@@ -27,85 +27,86 @@ Rank,Model,Live Overall Acc,AST Summary,Python Simple AST,Python Multiple AST,Py
27
27
26,Gemini-1.5-Flash-001 (FC),76.37%,74.17%,75.97%,74.26%,62.50%,58.33%,80.27%,50.00%
28
28
27,Gemini-1.5-Pro-002 (FC),76.28%,76.31%,80.23%,75.21%,87.50%,75.00%,76.30%,72.22%
29
29
28,Gemini-1.5-Pro-001 (FC),76.23%,71.65%,75.58%,70.75%,81.25%,62.50%,83.79%,50.00%
30
- 29,Qwen2.5-72B-Instruct (Prompt),75.30%,82.38%,85.27%,82.15%,62.50%,75.00%,63.95%,100.00%
31
- 30,xLAM-7b-r (FC),75.22%,73.87%,72.09%,74.93%,50.00%,62.50%,86.72%,94.44%
32
- 31,Hammer2.1-7b (FC),75.11%,77.20%,76.74%,77.40%,81.25%,70.83%,71.77%,82.35%
33
- 32,CALM-405B,74.50%,83.20%,84.88%,83.00%,87.50%,70.83%,60.66%,100.00%
34
- 33,GPT-4o-mini-2024-07-18 (FC),74.41%,76.68%,78.68%,76.16%,87.50%,70.83%,70.75%,83.33%
35
- 34,Amazon-Nova-Pro-v1:0 (FC),74.32%,77.72%,80.23%,77.49%,81.25%,58.33%,69.05%,77.78%
36
- 35,Qwen2.5-32B-Instruct (Prompt),74.23%,78.83%,82.95%,78.54%,62.50%,58.33%,66.67%,100.00%
37
- 36,Haha-7B,74.19%,77.57%,78.29%,77.59%,75.00%,70.83%,68.82%,83.33%
38
- 37,Qwen2.5-14B-Instruct (Prompt),74.14%,75.20%,74.42%,75.78%,62.50%,66.67%,72.45%,77.78%
39
- 38,Hammer2.1-3b (FC),74.04%,73.06%,73.26%,73.31%,62.50%,66.67%,75.40%,82.35%
40
- 39,GoGoAgent,74.01%,74.69%,72.87%,75.40%,68.75%,66.67%,72.90%,77.78%
41
- 40,Functionary-Small-v3.1 (FC),73.75%,78.24%,79.84%,78.16%,81.25%,62.50%,66.78%,77.78%
42
- 41,DeepSeek-Coder-V2 (FC),73.48%,77.20%,80.62%,77.02%,43.75%,70.83%,67.46%,88.89%
43
- 42,CALM-8B,72.95%,67.28%,71.71%,66.67%,56.25%,54.17%,81.41%,83.33%
44
- 43,xLAM-8x22b-r (FC),72.59%,79.64%,80.23%,79.68%,81.25%,70.83%,61.45%,88.89%
45
- 44,claude-3.5-haiku-20241022 (FC),72.37%,77.13%,82.95%,78.35%,18.75%,0.00%,64.85%,83.33%
46
- 45,Mistral-small-2402 (FC),72.19%,68.62%,65.50%,71.51%,12.50%,12.50%,77.55%,77.78%
47
- 46,Claude-3.5-Sonnet-20241022 (Prompt),71.97%,80.75%,86.82%,80.06%,81.25%,45.83%,58.39%,77.78%
48
- 47,CALM-70B,71.92%,66.17%,70.16%,65.34%,68.75%,58.33%,80.84%,66.67%
49
- 48,Amazon-Nova-Lite-v1:0 (FC),71.61%,70.61%,72.87%,70.09%,75.00%,66.67%,73.24%,66.67%
50
- 49,xLAM-8x7b-r (FC),71.08%,77.65%,74.81%,79.30%,43.75%,58.33%,60.54%,94.44%
51
- 50,claude-3.5-haiku-20241022 (Prompt),70.77%,76.68%,84.88%,75.02%,87.50%,54.17%,61.56%,77.78%
52
- 51,Hammer2.1-1.5b (FC),70.64%,69.73%,71.32%,69.80%,50.00%,62.50%,71.88%,77.78%
53
- 52,FireFunction-v1 (FC),70.46%,70.54%,71.71%,72.93%,0.00%,0.00%,69.84%,94.44%
54
- 53,MiniCPM3-4B-FC (FC),70.01%,65.73%,74.81%,63.91%,43.75%,62.50%,76.53%,72.22%
55
- 54,mistral-large-2407 (FC),69.88%,79.64%,85.27%,78.54%,62.50%,79.17%,54.88%,72.22%
56
- 55,Gemini-1.0-Pro-002 (FC),69.70%,68.91%,78.29%,67.62%,43.75%,41.67%,70.98%,66.67%
57
- 56,Command R7B (FC),69.17%,59.59%,63.18%,58.69%,56.25%,62.50%,84.13%,55.56%
58
- 57,Gemini-1.5-Flash-001 (Prompt),68.90%,76.61%,77.13%,76.16%,93.75%,79.17%,56.80%,83.33%
59
- 58,Open-Mixtral-8x22b (FC),68.64%,72.61%,77.13%,73.12%,6.25%,45.83%,62.24%,83.33%
60
- 59,GPT-3.5-Turbo-0125 (Prompt),68.55%,78.61%,80.62%,78.63%,75.00%,58.33%,52.61%,94.44%
61
- 60,DeepSeek-V3 (FC),68.41%,82.09%,83.72%,82.15%,81.25%,62.50%,47.05%,88.89%
62
- 61,Sky-T1-32B-Preview (Prompt),68.00%,76.76%,77.52%,77.11%,62.50%,62.50%,54.08%,94.12%
63
- 62,Gemma-2-9b-it (Prompt),67.97%,74.54%,77.13%,74.26%,62.50%,66.67%,57.60%,83.33%
64
- 63,Qwen2.5-7B-Instruct (Prompt),67.44%,75.06%,76.74%,74.93%,62.50%,70.83%,55.33%,88.89%
65
- 64,Gemma-2-27b-it (Prompt),67.17%,80.16%,85.27%,79.39%,68.75%,66.67%,46.71%,94.44%
66
- 65,Amazon-Nova-Micro-v1:0 (FC),67.04%,64.17%,65.89%,64.20%,62.50%,45.83%,71.32%,72.22%
67
- 66,Claude-3-Opus-20240229 (Prompt),66.99%,79.72%,85.27%,79.11%,68.75%,54.17%,47.17%,83.33%
68
- 67,GLM-4-9b-Chat (FC),66.81%,64.03%,72.48%,64.39%,0.00%,0.00%,71.09%,66.67%
69
- 68,Open-Mixtral-8x22b (Prompt),66.02%,74.76%,83.33%,72.65%,81.25%,70.83%,52.27%,83.33%
70
- 69,Open-Mistral-Nemo-2407 (FC),65.97%,71.13%,77.13%,69.61%,75.00%,70.83%,58.05%,66.67%
71
- 70,FireFunction-v2 (FC),65.66%,78.09%,79.07%,78.35%,56.25%,70.83%,46.03%,94.44%
72
- 71,Hermes-2-Pro-Llama-3-8B (FC),64.95%,66.62%,72.09%,65.81%,56.25%,50.00%,62.81%,44.44%
73
- 72,Meta-Llama-3-70B-Instruct (Prompt),64.95%,78.53%,81.01%,78.25%,75.00%,66.67%,43.42%,100.00%
74
- 73,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.58%,72.27%,62.50%,62.50%,53.06%,70.59%
75
- 74,GPT-3.5-Turbo-0125 (FC),64.02%,79.20%,81.40%,79.68%,43.75%,58.33%,40.14%,94.44%
76
- 75,GPT-4-turbo-2024-04-09 (Prompt),63.84%,84.97%,87.98%,84.14%,100.00%,79.17%,30.73%,100.00%
77
- 76,Hammer2.1-0.5b (FC),62.91%,58.11%,60.08%,58.02%,50.00%,45.83%,69.95%,77.78%
78
- 77,Llama-3.3-70B-Instruct (Prompt),62.77%,78.02%,81.78%,77.11%,93.75%,66.67%,38.66%,100.00%
79
- 78,Llama-3.1-70B-Instruct (Prompt),62.24%,76.54%,78.29%,76.16%,87.50%,66.67%,39.57%,100.00%
80
- 79,Open-Mixtral-8x7b (Prompt),61.44%,65.36%,63.57%,66.10%,68.75%,50.00%,54.88%,88.89%
81
- 80,Qwen2.5-1.5B-Instruct (Prompt),61.08%,61.07%,70.54%,59.26%,56.25%,41.67%,60.66%,83.33%
82
- 81,Llama-3.1-8B-Instruct (Prompt),61.08%,72.91%,74.03%,73.31%,56.25%,54.17%,42.63%,77.78%
83
- 82,DBRX-Instruct (Prompt),60.28%,73.50%,78.29%,73.03%,75.00%,41.67%,39.34%,94.44%
84
- 83,Granite-20b-FunctionCalling (FC),59.66%,58.48%,68.22%,56.32%,43.75%,58.33%,60.88%,88.89%
85
- 84,Command-R-Plus (FC),59.00%,60.84%,70.54%,58.78%,62.50%,45.83%,55.90%,72.22%
86
- 85,Bielik-11B-v2.3-Instruct (Prompt),58.91%,69.43%,72.87%,69.33%,43.75%,54.17%,42.40%,77.78%
87
- 86,Mistral-Small-2402 (Prompt),58.77%,57.96%,36.43%,65.24%,0.00%,8.33%,60.32%,44.44%
88
- 87,Qwen2.5-3B-Instruct (Prompt),58.69%,66.91%,69.77%,66.48%,56.25%,62.50%,45.46%,88.89%
89
- 88,Hermes-2-Pro-Mistral-7B (FC),57.71%,61.36%,69.77%,60.02%,43.75%,41.67%,51.93%,66.67%
90
- 89,Llama-3.2-3B-Instruct (Prompt),55.80%,63.73%,63.95%,64.86%,12.50%,45.83%,42.97%,88.89%
91
- 90,Falcon3-7B-Instruct (FC),54.86%,67.95%,74.03%,66.48%,75.00%,62.50%,34.13%,88.89%
92
- 91,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
93
- 92,Nexusflow-Raven-v2 (FC),54.20%,39.45%,41.47%,38.65%,56.25%,41.67%,76.64%,61.11%
94
- 93,Falcon3-10B-Instruct (FC),54.11%,75.28%,76.36%,76.16%,50.00%,41.67%,20.86%,94.44%
95
- 94,xLAM-7b-fc-r (FC),53.40%,61.07%,78.68%,58.02%,31.25%,25.00%,41.16%,77.78%
96
- 95,mistral-large-2407 (Prompt),52.82%,82.90%,86.05%,81.96%,93.75%,83.33%,5.78%,100.00%
97
- 96,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89%
98
- 97,Gemini-1.0-Pro-002 (Prompt),49.13%,47.59%,50.78%,47.01%,62.50%,29.17%,50.91%,77.78%
99
- 98,Open-Mistral-Nemo-2407 (Prompt),49.04%,75.13%,77.91%,74.45%,87.50%,66.67%,8.28%,88.89%
100
- 99,Meta-Llama-3-8B-Instruct (Prompt),47.98%,60.62%,61.24%,61.44%,37.50%,33.33%,28.00%,77.78%
101
- 100,Falcon3-3B-Instruct (FC),47.40%,55.51%,55.43%,56.32%,31.25%,37.50%,34.35%,77.78%
102
- 101,Llama-3.1-70B-Instruct (FC),45.00%,51.81%,52.33%,52.61%,31.25%,25.00%,33.45%,100.00%
103
- 102,Gemma-2-2b-it (Prompt),43.80%,19.54%,26.74%,18.52%,0.00%,0.00%,81.07%,38.89%
104
- 103,QwQ-32B-Preview (Prompt),40.78%,3.55%,7.36%,2.75%,0.00%,0.00%,98.64%,0.00%
105
- 104,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00%
106
- 105,Qwen2-1.5B-Instruct (Prompt),39.05%,41.30%,48.84%,40.27%,12.50%,25.00%,34.47%,94.44%
107
- 106,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00%
108
- 107,Llama-3.1-8B-Instruct (FC),33.50%,49.30%,51.94%,49.00%,37.50%,41.67%,8.05%,94.44%
109
- 108,Falcon3-1B-Instruct (FC),32.70%,2.96%,4.65%,2.37%,0.00%,12.50%,78.91%,0.00%
110
- 109,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44%
111
- 110,Llama-3.2-1B-Instruct (Prompt),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89%
30
+ 29,BitAgent-8B,76.14%,77.50%,77.91%,77.40%,87.50%,70.83%,73.92%,83.33%
31
+ 30,Qwen2.5-72B-Instruct (Prompt),75.30%,82.38%,85.27%,82.15%,62.50%,75.00%,63.95%,100.00%
32
+ 31,xLAM-7b-r (FC),75.22%,73.87%,72.09%,74.93%,50.00%,62.50%,86.72%,94.44%
33
+ 32,Hammer2.1-7b (FC),75.11%,77.20%,76.74%,77.40%,81.25%,70.83%,71.77%,82.35%
34
+ 33,CoALM-405B,74.50%,83.20%,84.88%,83.00%,87.50%,70.83%,60.66%,100.00%
35
+ 34,GPT-4o-mini-2024-07-18 (FC),74.41%,76.68%,78.68%,76.16%,87.50%,70.83%,70.75%,83.33%
36
+ 35,Amazon-Nova-Pro-v1:0 (FC),74.32%,77.72%,80.23%,77.49%,81.25%,58.33%,69.05%,77.78%
37
+ 36,Qwen2.5-32B-Instruct (Prompt),74.23%,78.83%,82.95%,78.54%,62.50%,58.33%,66.67%,100.00%
38
+ 37,Haha-7B,74.19%,77.57%,78.29%,77.59%,75.00%,70.83%,68.82%,83.33%
39
+ 38,Qwen2.5-14B-Instruct (Prompt),74.14%,75.20%,74.42%,75.78%,62.50%,66.67%,72.45%,77.78%
40
+ 39,Hammer2.1-3b (FC),74.04%,73.06%,73.26%,73.31%,62.50%,66.67%,75.40%,82.35%
41
+ 40,GoGoAgent,74.01%,74.69%,72.87%,75.40%,68.75%,66.67%,72.90%,77.78%
42
+ 41,Functionary-Small-v3.1 (FC),73.75%,78.24%,79.84%,78.16%,81.25%,62.50%,66.78%,77.78%
43
+ 42,DeepSeek-Coder-V2 (FC),73.48%,77.20%,80.62%,77.02%,43.75%,70.83%,67.46%,88.89%
44
+ 43,CoALM-8B,72.95%,67.28%,71.71%,66.67%,56.25%,54.17%,81.41%,83.33%
45
+ 44,xLAM-8x22b-r (FC),72.59%,79.64%,80.23%,79.68%,81.25%,70.83%,61.45%,88.89%
46
+ 45,claude-3.5-haiku-20241022 (FC),72.37%,77.13%,82.95%,78.35%,18.75%,0.00%,64.85%,83.33%
47
+ 46,Mistral-small-2402 (FC),72.19%,68.62%,65.50%,71.51%,12.50%,12.50%,77.55%,77.78%
48
+ 47,Claude-3.5-Sonnet-20241022 (Prompt),71.97%,80.75%,86.82%,80.06%,81.25%,45.83%,58.39%,77.78%
49
+ 48,CoALM-70B,71.92%,66.17%,70.16%,65.34%,68.75%,58.33%,80.84%,66.67%
50
+ 49,Amazon-Nova-Lite-v1:0 (FC),71.61%,70.61%,72.87%,70.09%,75.00%,66.67%,73.24%,66.67%
51
+ 50,xLAM-8x7b-r (FC),71.08%,77.65%,74.81%,79.30%,43.75%,58.33%,60.54%,94.44%
52
+ 51,claude-3.5-haiku-20241022 (Prompt),70.77%,76.68%,84.88%,75.02%,87.50%,54.17%,61.56%,77.78%
53
+ 52,Hammer2.1-1.5b (FC),70.64%,69.73%,71.32%,69.80%,50.00%,62.50%,71.88%,77.78%
54
+ 53,FireFunction-v1 (FC),70.46%,70.54%,71.71%,72.93%,0.00%,0.00%,69.84%,94.44%
55
+ 54,MiniCPM3-4B-FC (FC),70.01%,65.73%,74.81%,63.91%,43.75%,62.50%,76.53%,72.22%
56
+ 55,mistral-large-2407 (FC),69.88%,79.64%,85.27%,78.54%,62.50%,79.17%,54.88%,72.22%
57
+ 56,Gemini-1.0-Pro-002 (FC),69.70%,68.91%,78.29%,67.62%,43.75%,41.67%,70.98%,66.67%
58
+ 57,Command R7B (FC),69.17%,59.59%,63.18%,58.69%,56.25%,62.50%,84.13%,55.56%
59
+ 58,Gemini-1.5-Flash-001 (Prompt),68.90%,76.61%,77.13%,76.16%,93.75%,79.17%,56.80%,83.33%
60
+ 59,Open-Mixtral-8x22b (FC),68.64%,72.61%,77.13%,73.12%,6.25%,45.83%,62.24%,83.33%
61
+ 60,GPT-3.5-Turbo-0125 (Prompt),68.55%,78.61%,80.62%,78.63%,75.00%,58.33%,52.61%,94.44%
62
+ 61,DeepSeek-V3 (FC),68.41%,82.09%,83.72%,82.15%,81.25%,62.50%,47.05%,88.89%
63
+ 62,Sky-T1-32B-Preview (Prompt),68.00%,76.76%,77.52%,77.11%,62.50%,62.50%,54.08%,94.12%
64
+ 63,Gemma-2-9b-it (Prompt),67.97%,74.54%,77.13%,74.26%,62.50%,66.67%,57.60%,83.33%
65
+ 64,Qwen2.5-7B-Instruct (Prompt),67.44%,75.06%,76.74%,74.93%,62.50%,70.83%,55.33%,88.89%
66
+ 65,Gemma-2-27b-it (Prompt),67.17%,80.16%,85.27%,79.39%,68.75%,66.67%,46.71%,94.44%
67
+ 66,Amazon-Nova-Micro-v1:0 (FC),67.04%,64.17%,65.89%,64.20%,62.50%,45.83%,71.32%,72.22%
68
+ 67,Claude-3-Opus-20240229 (Prompt),66.99%,79.72%,85.27%,79.11%,68.75%,54.17%,47.17%,83.33%
69
+ 68,GLM-4-9b-Chat (FC),66.81%,64.03%,72.48%,64.39%,0.00%,0.00%,71.09%,66.67%
70
+ 69,Open-Mixtral-8x22b (Prompt),66.02%,74.76%,83.33%,72.65%,81.25%,70.83%,52.27%,83.33%
71
+ 70,Open-Mistral-Nemo-2407 (FC),65.97%,71.13%,77.13%,69.61%,75.00%,70.83%,58.05%,66.67%
72
+ 71,FireFunction-v2 (FC),65.66%,78.09%,79.07%,78.35%,56.25%,70.83%,46.03%,94.44%
73
+ 72,Hermes-2-Pro-Llama-3-8B (FC),64.95%,66.62%,72.09%,65.81%,56.25%,50.00%,62.81%,44.44%
74
+ 73,Meta-Llama-3-70B-Instruct (Prompt),64.95%,78.53%,81.01%,78.25%,75.00%,66.67%,43.42%,100.00%
75
+ 74,Ministral-8B-Instruct-2410 (FC),64.93%,72.61%,75.58%,72.27%,62.50%,62.50%,53.06%,70.59%
76
+ 75,GPT-3.5-Turbo-0125 (FC),64.02%,79.20%,81.40%,79.68%,43.75%,58.33%,40.14%,94.44%
77
+ 76,GPT-4-turbo-2024-04-09 (Prompt),63.84%,84.97%,87.98%,84.14%,100.00%,79.17%,30.73%,100.00%
78
+ 77,Hammer2.1-0.5b (FC),62.91%,58.11%,60.08%,58.02%,50.00%,45.83%,69.95%,77.78%
79
+ 78,Llama-3.3-70B-Instruct (Prompt),62.77%,78.02%,81.78%,77.11%,93.75%,66.67%,38.66%,100.00%
80
+ 79,Llama-3.1-70B-Instruct (Prompt),62.24%,76.54%,78.29%,76.16%,87.50%,66.67%,39.57%,100.00%
81
+ 80,Open-Mixtral-8x7b (Prompt),61.44%,65.36%,63.57%,66.10%,68.75%,50.00%,54.88%,88.89%
82
+ 81,Qwen2.5-1.5B-Instruct (Prompt),61.08%,61.07%,70.54%,59.26%,56.25%,41.67%,60.66%,83.33%
83
+ 82,Llama-3.1-8B-Instruct (Prompt),61.08%,72.91%,74.03%,73.31%,56.25%,54.17%,42.63%,77.78%
84
+ 83,DBRX-Instruct (Prompt),60.28%,73.50%,78.29%,73.03%,75.00%,41.67%,39.34%,94.44%
85
+ 84,Granite-20b-FunctionCalling (FC),59.66%,58.48%,68.22%,56.32%,43.75%,58.33%,60.88%,88.89%
86
+ 85,Command-R-Plus (FC),59.00%,60.84%,70.54%,58.78%,62.50%,45.83%,55.90%,72.22%
87
+ 86,Bielik-11B-v2.3-Instruct (Prompt),58.91%,69.43%,72.87%,69.33%,43.75%,54.17%,42.40%,77.78%
88
+ 87,Mistral-Small-2402 (Prompt),58.77%,57.96%,36.43%,65.24%,0.00%,8.33%,60.32%,44.44%
89
+ 88,Qwen2.5-3B-Instruct (Prompt),58.69%,66.91%,69.77%,66.48%,56.25%,62.50%,45.46%,88.89%
90
+ 89,Hermes-2-Pro-Mistral-7B (FC),57.71%,61.36%,69.77%,60.02%,43.75%,41.67%,51.93%,66.67%
91
+ 90,Llama-3.2-3B-Instruct (Prompt),55.80%,63.73%,63.95%,64.86%,12.50%,45.83%,42.97%,88.89%
92
+ 91,Falcon3-7B-Instruct (FC),54.86%,67.95%,74.03%,66.48%,75.00%,62.50%,34.13%,88.89%
93
+ 92,MiniCPM3-4B (Prompt),54.46%,37.23%,46.51%,34.76%,43.75%,41.67%,80.95%,50.00%
94
+ 93,Nexusflow-Raven-v2 (FC),54.20%,39.45%,41.47%,38.65%,56.25%,41.67%,76.64%,61.11%
95
+ 94,Falcon3-10B-Instruct (FC),54.11%,75.28%,76.36%,76.16%,50.00%,41.67%,20.86%,94.44%
96
+ 95,xLAM-7b-fc-r (FC),53.40%,61.07%,78.68%,58.02%,31.25%,25.00%,41.16%,77.78%
97
+ 96,mistral-large-2407 (Prompt),52.82%,82.90%,86.05%,81.96%,93.75%,83.33%,5.78%,100.00%
98
+ 97,Qwen2-7B-Instruct (Prompt),50.60%,60.77%,56.59%,62.01%,37.50%,66.67%,34.24%,88.89%
99
+ 98,Gemini-1.0-Pro-002 (Prompt),49.13%,47.59%,50.78%,47.01%,62.50%,29.17%,50.91%,77.78%
100
+ 99,Open-Mistral-Nemo-2407 (Prompt),49.04%,75.13%,77.91%,74.45%,87.50%,66.67%,8.28%,88.89%
101
+ 100,Meta-Llama-3-8B-Instruct (Prompt),47.98%,60.62%,61.24%,61.44%,37.50%,33.33%,28.00%,77.78%
102
+ 101,Falcon3-3B-Instruct (FC),47.40%,55.51%,55.43%,56.32%,31.25%,37.50%,34.35%,77.78%
103
+ 102,Llama-3.1-70B-Instruct (FC),45.00%,51.81%,52.33%,52.61%,31.25%,25.00%,33.45%,100.00%
104
+ 103,Gemma-2-2b-it (Prompt),43.80%,19.54%,26.74%,18.52%,0.00%,0.00%,81.07%,38.89%
105
+ 104,QwQ-32B-Preview (Prompt),40.78%,3.55%,7.36%,2.75%,0.00%,0.00%,98.64%,0.00%
106
+ 105,DeepSeek-Coder-V2-Lite-Instruct (FC),39.40%,3.55%,2.33%,3.80%,0.00%,8.33%,95.12%,0.00%
107
+ 106,Qwen2-1.5B-Instruct (Prompt),39.05%,41.30%,48.84%,40.27%,12.50%,25.00%,34.47%,94.44%
108
+ 107,xLAM-1b-fc-r (FC),36.92%,53.89%,63.95%,53.37%,6.25%,0.00%,9.64%,100.00%
109
+ 108,Llama-3.1-8B-Instruct (FC),33.50%,49.30%,51.94%,49.00%,37.50%,41.67%,8.05%,94.44%
110
+ 109,Falcon3-1B-Instruct (FC),32.70%,2.96%,4.65%,2.37%,0.00%,12.50%,78.91%,0.00%
111
+ 110,Qwen2.5-0.5B-Instruct (Prompt),31.59%,38.34%,53.88%,34.76%,56.25%,16.67%,19.95%,94.44%
112
+ 111,Llama-3.2-1B-Instruct (Prompt),31.36%,12.14%,31.40%,7.60%,12.50%,4.17%,60.66%,38.89%
0 commit comments