
Commit 59e49ae

Authored by kangreen0210, mkj3085003, and MaiziXiao
[Feature] Support SuperGPQA (#1924)
* support supergpqa
* remove unnecessary code
* remove unnecessary code
* Add Readme
* Add Readme
* fix lint
* fix lint
* update
* update

Co-authored-by: mkj3085003 <[email protected]>
Co-authored-by: MaiziXiao <[email protected]>
1 parent e403fd2 commit 59e49ae

File tree

17 files changed: +1317 −8 lines


README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
+- **\[2025.03.11\]** We now support evaluation for `SuperGPQA`, a benchmark for measuring the knowledge ability of LLMs. 🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; please check [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
 - **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which has enhanced performance on reasoning and knowledge-intensive tasks.
```

README_zh-CN.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -57,6 +57,7 @@
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
+- **\[2025.03.11\]** We now support `SuperGPQA`, a knowledge evaluation covering 285 graduate-level disciplines. Welcome to try it! 🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
 - **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning assessment. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) docs for more details! 🔥🔥🔥
 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves the best performance among models of its size on reasoning and knowledge tasks. Welcome to try it.
```

dataset-index.yml

Lines changed: 5 additions & 3 deletions

```diff
@@ -734,6 +734,8 @@
     category: Understanding
     paper: https://arxiv.org/pdf/1808.08745
     configpath: opencompass/configs/datasets/Xsum
-
-
-
+- supergpqa:
+    name: SuperGPQA
+    category: Knowledge
+    paper: https://arxiv.org/pdf/2502.14739
+    configpath: opencompass/configs/datasets/supergpqa
```
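
For a quick look at the raw benchmark itself, here is a minimal sketch of loading it from the Hugging Face Hub. The `m-a-p/SuperGPQA` path and the `train` split appear in the loader added by this commit; the column names follow its reader configuration.

```python
# Minimal sketch: pull the raw SuperGPQA data from the Hugging Face Hub.
from datasets import load_dataset

ds = load_dataset('m-a-p/SuperGPQA', split='train')
sample = ds[0]
print(sample['question'])       # question text
print(sample['options'])        # candidate answers, labelled A-J downstream
print(sample['answer_letter'])  # gold option letter
```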
New dataset config file (under `opencompass/configs/datasets/supergpqa`, per the configpath registered in dataset-index.yml)

Lines changed: 57 additions & 0 deletions

```python
from opencompass.datasets.supergpqa.supergpqa import (
    SuperGPQADataset,
    SuperGPQAEvaluator,
)
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

# Reader configuration: columns fed to the prompt, plus the gold column.
reader_cfg = dict(
    input_columns=[
        'question',
        'options',
        'discipline',
        'field',
        'subfield',
        'difficulty',
        'infer_prompt',
        'prompt_mode',
    ],
    output_column='answer_letter',
)

# Inference configuration: the pre-rendered infer_prompt is passed through
# verbatim as a single human turn.
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{infer_prompt}',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation configuration
eval_cfg = dict(
    evaluator=dict(type=SuperGPQAEvaluator),
    pred_role='BOT',
)

supergpqa_dataset = dict(
    type=SuperGPQADataset,
    abbr='supergpqa',
    path='m-a-p/SuperGPQA',
    prompt_mode='zero-shot',
    reader_cfg=reader_cfg,
    infer_cfg=infer_cfg,
    eval_cfg=eval_cfg,
)

supergpqa_datasets = [supergpqa_dataset]
```
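
To run the benchmark, an OpenCompass entry config can pull in the exported `supergpqa_datasets` list via `read_base()`. A minimal sketch follows, assuming the config module above is exposed as `supergpqa_gen` and pairing it with a hypothetical model config; neither name is confirmed by this diff.

```python
# Sketch of an entry config; the dataset module name `supergpqa_gen` and
# the model config path are assumptions, not confirmed by this commit.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.supergpqa.supergpqa_gen import \
        supergpqa_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
        models

datasets = supergpqa_datasets
```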

opencompass/datasets/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -127,6 +127,7 @@
 from .subjective import *  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
+from .supergpqa import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
 from .taco import *  # noqa: F401, F403
```

opencompass/datasets/supergpqa/__init__.py

Whitespace-only changes.
opencompass/datasets/supergpqa/supergpqa.py

Lines changed: 184 additions & 0 deletions

```python
import os

from datasets import Dataset, load_dataset

from opencompass.datasets.supergpqa.supergpqa_eval import (
    extract_option_content, extract_option_labels)
from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


def _parse(item, template, prompt_mode):
    # Render the question plus lettered options (A) .. J)) into the prompt
    # template, and record which prompt mode produced it.
    prompt_format = [
        item['question'] + '\n' + '\n'.join([
            f'{chr(65+i)}) {option}'
            for i, option in enumerate(item['options'])
        ])
    ]
    item['infer_prompt'] = template['prompt_format'][0].format(*prompt_format)
    item['prompt_mode'] = prompt_mode
    return item


@LOAD_DATASET.register_module()
class SuperGPQADataset(BaseDataset):

    @staticmethod
    def load(path: str, prompt_mode: str, **kwargs):
        path = get_data_path(path, local_mode=True)
        dataset = load_dataset(path, split='train')

        # Pick the prompt template that matches the requested mode.
        template_path = None
        if prompt_mode == 'zero-shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'supergpqa_dataset_config/prompt/zero-shot.yaml',
            )
        elif prompt_mode == 'five-shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'supergpqa_dataset_config/prompt/five-shot.yaml',
            )
        try:
            template = load_yaml(template_path)
        except FileNotFoundError:
            print(f'[ERROR] Missing prompt template: {template_path}')
            return Dataset.from_list([])

        dataset = dataset.map(lambda item: _parse(item, template, prompt_mode))
        return dataset


@ICL_EVALUATORS.register_module()
class SuperGPQAEvaluator(BaseEvaluator):

    def __init__(self):
        super().__init__()

    def score(self, predictions, references, test_set):
        mode = test_set[0]['prompt_mode']
        acc = 0
        count = 0
        err = 0
        miss = 0
        acc_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
        count_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
        stats = {'discipline': {}, 'field': {}, 'subfield': {}}
        details = []
        for i, sample in enumerate(test_set):
            sample['pred'] = prediction = predictions[i]
            gold = references[i]
            if mode == 'zero-shot':
                # First try to extract an option letter, then fall back to
                # matching option content and mapping it back to a letter.
                predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
                if predict is None:
                    predict = extract_option_content(prediction,
                                                     sample['options'])
                    predict = (chr(sample['options'].index(predict) +
                                   65) if predict else None)
                sample['extracted_answer'] = predict
            elif mode == 'five-shot':
                # In five-shot mode, only the text before the next
                # 'Question:' belongs to the current answer.
                response = prediction.split('Question:')[0]
                predict = extract_option_labels(response, 'ABCDEFGHIJ')
                if predict is None:
                    predict = extract_option_content(response,
                                                     sample['options'])
                    predict = (chr(sample['options'].index(predict) +
                                   65) if predict else None)
                if predict is None:
                    # Fall back to scanning the full prediction.
                    predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
                    if predict is None:
                        predict = extract_option_content(
                            prediction, sample['options'])
                        predict = (chr(sample['options'].index(predict) +
                                       65) if predict else None)
                sample['extracted_answer'] = predict

            discipline = sample.get('discipline', 'unknown')
            field = sample.get('field', 'unknown')
            subfield = sample.get('subfield', 'unknown')
            difficulty = sample.get('difficulty', 'unknown')

            for level, key in [
                ('discipline', discipline),
                # ('field', f"{discipline}/{field}"),
                # ('subfield', f"{discipline}/{field}/{subfield}"),
            ]:
                if key not in stats[level]:
                    stats[level][key] = {
                        'correct': 0,
                        'total': 0,
                        'miss': 0,
                        'error': 0,
                        'discipline': discipline,
                        'field': field,
                        'subfield': subfield,
                        'difficulty': {
                            'easy': {'correct': 0, 'total': 0},
                            'middle': {'correct': 0, 'total': 0},
                            'hard': {'correct': 0, 'total': 0},
                        },
                    }

                stats[level][key]['total'] += 1
                stats[level][key]['difficulty'][difficulty]['total'] += 1

                answer_letter = sample['answer_letter']
                assert answer_letter == gold
                if predict and answer_letter == predict:
                    acc += 1
                    acc_difficulty[difficulty] += 1
                    sample['status'] = 'correct'
                    stats[level][key]['correct'] += 1
                    stats[level][key]['difficulty'][difficulty]['correct'] += 1
                elif predict is None or predict == '':
                    miss += 1
                    sample['status'] = 'miss'
                    stats[level][key]['miss'] += 1
                elif predict == 'error':
                    err += 1
                    sample['status'] = 'error'
                    stats[level][key]['error'] += 1
                else:
                    sample['status'] = 'incorrect'
                count += 1
                count_difficulty[difficulty] += 1
            details.append({
                'pred': sample['pred'],
                'answer': sample['answer'],
                'parsed_answer': sample['extracted_answer'],
                # Only a 'correct' status counts as correct.
                'correct': sample['status'] == 'correct',
            })

        return {
            'accuracy': acc / count if count > 0 else 0,
            'error_rate': err / count if count > 0 else 0,
            'miss_rate': miss / count if count > 0 else 0,
            'hard_accuracy':
            (acc_difficulty['hard'] / count_difficulty['hard']
             if count_difficulty['hard'] > 0 else 0),
            'middle_accuracy':
            (acc_difficulty['middle'] / count_difficulty['middle']
             if count_difficulty['middle'] > 0 else 0),
            'easy_accuracy':
            (acc_difficulty['easy'] / count_difficulty['easy']
             if count_difficulty['easy'] > 0 else 0),
            'details': details,
        }
```
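
As an aside, the option formatting that `_parse` applies is easy to see in isolation: options are mapped to letters via `chr(65 + i)`, i.e. A through J for up to ten options. A standalone sketch with hypothetical data:

```python
# Standalone illustration of the A)-J) option formatting in _parse
# (the question and options here are hypothetical).
item = {
    'question': 'Which particle mediates the electromagnetic force?',
    'options': ['Gluon', 'Photon', 'W boson', 'Graviton'],
}
formatted = item['question'] + '\n' + '\n'.join(
    f'{chr(65 + i)}) {option}' for i, option in enumerate(item['options'])
)
print(formatted)
# Which particle mediates the electromagnetic force?
# A) Gluon
# B) Photon
# C) W boson
# D) Graviton
```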
Lines changed: 17 additions & 0 deletions

```yaml
response_key: 'response'
error_key: 'error'
id_key:
- 'uuid'
prompt_key: 'prompt'

history_key: 'history'
status_key: 'status'

save_prompt: True
max_tokens: 4096
temperature: 0.0

max_rounds: 30
BoN: 32
```
Lines changed: 17 additions & 0 deletions

```yaml
response_key: 'response'
error_key: 'error'
id_key:
- 'uuid'
prompt_key: 'prompt'

history_key: 'history'
status_key: 'status'

save_prompt: True
max_tokens: 32768
temperature: 0.0

max_rounds: 30
BoN: 32
```
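
Neither YAML file's path is shown in this view, and the diff does not include the `load_yaml` helper from `supergpqa_utils`. As a minimal sketch, plain PyYAML can read such a file; the filename below is hypothetical.

```python
# Minimal sketch: read one of the YAML configs above with plain PyYAML.
# The filename is hypothetical, and load_yaml's actual implementation is
# not part of this diff.
import yaml

with open('supergpqa_eval_config.yaml', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

print(cfg['max_tokens'])   # 4096 in one variant, 32768 in the other
print(cfg['temperature'])  # 0.0
```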
