
Commit 59e49ae

Authored by kangreen0210, mkj3085003, and MaiziXiao
[Feature] Support SuperGPQA (#1924)
* support supergpqa
* remove unnecessary code
* remove unnecessary code
* Add Readme
* Add Readme
* fix lint
* fix lint
* update
* update

Co-authored-by: mkj3085003 <[email protected]>
Co-authored-by: MaiziXiao <[email protected]>
1 parent e403fd2 commit 59e49ae

File tree

17 files changed: +1317 −8 lines


README.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -57,6 +57,7 @@ Just like a compass guides us on our journey, OpenCompass will guide you through
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
+- **\[2025.03.11\]** We now support evaluation for `SuperGPQA`, a benchmark for measuring the knowledge ability of LLMs. 🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; please check [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
 - **\[2025.02.15\]** We have added two powerful evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluations and `MATHEvaluator` for mathematical reasoning assessments. Check out the documentation for [LLM Judge](docs/en/advanced_guides/llm_judge.md) and [Math Evaluation](docs/en/advanced_guides/general_math.md) for more details! 🔥🔥🔥
 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which has enhanced performance on reasoning and knowledge-intensive tasks.
```

README_zh-CN.md

Lines changed: 1 addition & 0 deletions

```diff
@@ -57,6 +57,7 @@
 ## 🚀 What's New <a><img width="35" height="20" src="https://user-images.githubusercontent.com/12782558/212848161-5e783dd6-11e8-4fe0-bbba-39ffb77730be.png"></a>
 
+- **\[2025.03.11\]** We now support `SuperGPQA`, a knowledge evaluation covering 285 graduate-level disciplines. Welcome to try it! 🔥🔥🔥
 - **\[2025.02.28\]** We have added a tutorial for the `DeepSeek-R1` series of models; see [Evaluating Reasoning Models](docs/en/user_guides/deepseek_r1.md) for more details! 🔥🔥🔥
 - **\[2025.02.15\]** We have added two practical evaluation tools: `GenericLLMEvaluator` for LLM-as-judge evaluation and `MATHEvaluator` for mathematical reasoning assessment. See the [LLM Judge](docs/zh_cn/advanced_guides/llm_judge.md) and [Math Evaluation](docs/zh_cn/advanced_guides/general_math.md) docs for more details! 🔥🔥🔥
 - **\[2025.01.16\]** We now support the [InternLM3-8B-Instruct](https://huggingface.co/internlm/internlm3-8b-instruct) model, which achieves the best performance among models of its size on reasoning and knowledge tasks. Welcome to try it.
```

dataset-index.yml

Lines changed: 5 additions & 3 deletions

```diff
@@ -734,6 +734,8 @@
     category: Understanding
     paper: https://arxiv.org/pdf/1808.08745
     configpath: opencompass/configs/datasets/Xsum
-
-
-
+- supergpqa:
+    name: SuperGPQA
+    category: Knowledge
+    paper: https://arxiv.org/pdf/2502.14739
+    configpath: opencompass/configs/datasets/supergpqa
```
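
For a quick look at the raw benchmark itself, here is a minimal sketch of loading it from the Hugging Face Hub. The `m-a-p/SuperGPQA` path and the `train` split appear in the loader added by this commit; the column names follow its reader configuration.

```python
# Minimal sketch: pull the raw SuperGPQA data from the Hugging Face Hub.
from datasets import load_dataset

ds = load_dataset('m-a-p/SuperGPQA', split='train')
sample = ds[0]
print(sample['question'])       # question text
print(sample['options'])        # candidate answers, labelled A-J downstream
print(sample['answer_letter'])  # gold option letter
```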
New dataset config file (under `opencompass/configs/datasets/supergpqa`, per the configpath registered in dataset-index.yml)

Lines changed: 57 additions & 0 deletions

```python
from opencompass.datasets.supergpqa.supergpqa import (
    SuperGPQADataset,
    SuperGPQAEvaluator,
)
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever

# Reader configuration: columns fed to the prompt, plus the gold column.
reader_cfg = dict(
    input_columns=[
        'question',
        'options',
        'discipline',
        'field',
        'subfield',
        'difficulty',
        'infer_prompt',
        'prompt_mode',
    ],
    output_column='answer_letter',
)

# Inference configuration: the pre-rendered infer_prompt is passed through
# verbatim as a single human turn.
infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(
                    role='HUMAN',
                    prompt='{infer_prompt}',
                ),
            ],
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)

# Evaluation configuration
eval_cfg = dict(
    evaluator=dict(type=SuperGPQAEvaluator),
    pred_role='BOT',
)

supergpqa_dataset = dict(
    type=SuperGPQADataset,
    abbr='supergpqa',
    path='m-a-p/SuperGPQA',
    prompt_mode='zero-shot',
    reader_cfg=reader_cfg,
    infer_cfg=infer_cfg,
    eval_cfg=eval_cfg,
)

supergpqa_datasets = [supergpqa_dataset]
```
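
To run the benchmark, an OpenCompass entry config can pull in the exported `supergpqa_datasets` list via `read_base()`. A minimal sketch follows, assuming the config module above is exposed as `supergpqa_gen` and pairing it with a hypothetical model config; neither name is confirmed by this diff.

```python
# Sketch of an entry config; the dataset module name `supergpqa_gen` and
# the model config path are assumptions, not confirmed by this commit.
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.supergpqa.supergpqa_gen import \
        supergpqa_datasets
    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
        models

datasets = supergpqa_datasets
```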

opencompass/datasets/__init__.py

Lines changed: 1 addition & 0 deletions

```diff
@@ -127,6 +127,7 @@
 from .subjective import *  # noqa: F401, F403
 from .summedits import *  # noqa: F401, F403
 from .summscreen import *  # noqa: F401, F403
+from .supergpqa import *  # noqa: F401, F403
 from .svamp import *  # noqa: F401, F403
 from .tabmwp import *  # noqa: F401, F403
 from .taco import *  # noqa: F401, F403
```

opencompass/datasets/supergpqa/__init__.py

Whitespace-only changes.
opencompass/datasets/supergpqa/supergpqa.py

Lines changed: 184 additions & 0 deletions

```python
import os

from datasets import Dataset, load_dataset

from opencompass.datasets.supergpqa.supergpqa_eval import (
    extract_option_content, extract_option_labels)
from opencompass.datasets.supergpqa.supergpqa_utils import load_yaml
from opencompass.openicl.icl_evaluator import BaseEvaluator
from opencompass.registry import ICL_EVALUATORS, LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


def _parse(item, template, prompt_mode):
    # Render the question plus lettered options (A) .. J)) into the prompt
    # template, and record which prompt mode produced it.
    prompt_format = [
        item['question'] + '\n' + '\n'.join([
            f'{chr(65+i)}) {option}'
            for i, option in enumerate(item['options'])
        ])
    ]
    item['infer_prompt'] = template['prompt_format'][0].format(*prompt_format)
    item['prompt_mode'] = prompt_mode
    return item


@LOAD_DATASET.register_module()
class SuperGPQADataset(BaseDataset):

    @staticmethod
    def load(path: str, prompt_mode: str, **kwargs):
        path = get_data_path(path, local_mode=True)
        dataset = load_dataset(path, split='train')

        # Pick the prompt template that matches the requested mode.
        template_path = None
        if prompt_mode == 'zero-shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'supergpqa_dataset_config/prompt/zero-shot.yaml',
            )
        elif prompt_mode == 'five-shot':
            template_path = os.path.join(
                os.path.dirname(__file__),
                'supergpqa_dataset_config/prompt/five-shot.yaml',
            )
        try:
            template = load_yaml(template_path)
        except FileNotFoundError:
            print(f'[ERROR] Missing prompt template: {template_path}')
            return Dataset.from_list([])

        dataset = dataset.map(lambda item: _parse(item, template, prompt_mode))
        return dataset


@ICL_EVALUATORS.register_module()
class SuperGPQAEvaluator(BaseEvaluator):

    def __init__(self):
        super().__init__()

    def score(self, predictions, references, test_set):
        mode = test_set[0]['prompt_mode']
        acc = 0
        count = 0
        err = 0
        miss = 0
        acc_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
        count_difficulty = {'hard': 0, 'middle': 0, 'easy': 0}
        stats = {'discipline': {}, 'field': {}, 'subfield': {}}
        details = []
        for i, sample in enumerate(test_set):
            sample['pred'] = prediction = predictions[i]
            gold = references[i]
            if mode == 'zero-shot':
                # First try to extract an option letter, then fall back to
                # matching option content and mapping it back to a letter.
                predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
                if predict is None:
                    predict = extract_option_content(prediction,
                                                     sample['options'])
                    predict = (chr(sample['options'].index(predict) +
                                   65) if predict else None)
                sample['extracted_answer'] = predict
            elif mode == 'five-shot':
                # In five-shot mode, only the text before the next
                # 'Question:' belongs to the current answer.
                response = prediction.split('Question:')[0]
                predict = extract_option_labels(response, 'ABCDEFGHIJ')
                if predict is None:
                    predict = extract_option_content(response,
                                                     sample['options'])
                    predict = (chr(sample['options'].index(predict) +
                                   65) if predict else None)
                if predict is None:
                    # Fall back to scanning the full prediction.
                    predict = extract_option_labels(prediction, 'ABCDEFGHIJ')
                    if predict is None:
                        predict = extract_option_content(
                            prediction, sample['options'])
                        predict = (chr(sample['options'].index(predict) +
                                       65) if predict else None)
                sample['extracted_answer'] = predict

            discipline = sample.get('discipline', 'unknown')
            field = sample.get('field', 'unknown')
            subfield = sample.get('subfield', 'unknown')
            difficulty = sample.get('difficulty', 'unknown')

            for level, key in [
                ('discipline', discipline),
                # ('field', f"{discipline}/{field}"),
                # ('subfield', f"{discipline}/{field}/{subfield}"),
            ]:
                if key not in stats[level]:
                    stats[level][key] = {
                        'correct': 0,
                        'total': 0,
                        'miss': 0,
                        'error': 0,
                        'discipline': discipline,
                        'field': field,
                        'subfield': subfield,
                        'difficulty': {
                            'easy': {'correct': 0, 'total': 0},
                            'middle': {'correct': 0, 'total': 0},
                            'hard': {'correct': 0, 'total': 0},
                        },
                    }

                stats[level][key]['total'] += 1
                stats[level][key]['difficulty'][difficulty]['total'] += 1

                answer_letter = sample['answer_letter']
                assert answer_letter == gold
                if predict and answer_letter == predict:
                    acc += 1
                    acc_difficulty[difficulty] += 1
                    sample['status'] = 'correct'
                    stats[level][key]['correct'] += 1
                    stats[level][key]['difficulty'][difficulty]['correct'] += 1
                elif predict is None or predict == '':
                    miss += 1
                    sample['status'] = 'miss'
                    stats[level][key]['miss'] += 1
                elif predict == 'error':
                    err += 1
                    sample['status'] = 'error'
                    stats[level][key]['error'] += 1
                else:
                    sample['status'] = 'incorrect'
                count += 1
                count_difficulty[difficulty] += 1
            details.append({
                'pred': sample['pred'],
                'answer': sample['answer'],
                'parsed_answer': sample['extracted_answer'],
                # Only a 'correct' status counts as correct.
                'correct': sample['status'] == 'correct',
            })

        return {
            'accuracy': acc / count if count > 0 else 0,
            'error_rate': err / count if count > 0 else 0,
            'miss_rate': miss / count if count > 0 else 0,
            'hard_accuracy':
            (acc_difficulty['hard'] / count_difficulty['hard']
             if count_difficulty['hard'] > 0 else 0),
            'middle_accuracy':
            (acc_difficulty['middle'] / count_difficulty['middle']
             if count_difficulty['middle'] > 0 else 0),
            'easy_accuracy':
            (acc_difficulty['easy'] / count_difficulty['easy']
             if count_difficulty['easy'] > 0 else 0),
            'details': details,
        }
```
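
As an aside, the option formatting that `_parse` applies is easy to see in isolation: options are mapped to letters via `chr(65 + i)`, i.e. A through J for up to ten options. A standalone sketch with hypothetical data:

```python
# Standalone illustration of the A)-J) option formatting in _parse
# (the question and options here are hypothetical).
item = {
    'question': 'Which particle mediates the electromagnetic force?',
    'options': ['Gluon', 'Photon', 'W boson', 'Graviton'],
}
formatted = item['question'] + '\n' + '\n'.join(
    f'{chr(65 + i)}) {option}' for i, option in enumerate(item['options'])
)
print(formatted)
# Which particle mediates the electromagnetic force?
# A) Gluon
# B) Photon
# C) W boson
# D) Graviton
```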
Lines changed: 17 additions & 0 deletions

```yaml
response_key: 'response'
error_key: 'error'
id_key:
- 'uuid'
prompt_key: 'prompt'

history_key: 'history'
status_key: 'status'

save_prompt: True
max_tokens: 4096
temperature: 0.0

max_rounds: 30
BoN: 32
```
Lines changed: 17 additions & 0 deletions

```yaml
response_key: 'response'
error_key: 'error'
id_key:
- 'uuid'
prompt_key: 'prompt'

history_key: 'history'
status_key: 'status'

save_prompt: True
max_tokens: 32768
temperature: 0.0

max_rounds: 30
BoN: 32
```
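
Neither YAML file's path is shown in this view, and the diff does not include the `load_yaml` helper from `supergpqa_utils`. As a minimal sketch, plain PyYAML can read such a file; the filename below is hypothetical.

```python
# Minimal sketch: read one of the YAML configs above with plain PyYAML.
# The filename is hypothetical, and load_yaml's actual implementation is
# not part of this diff.
import yaml

with open('supergpqa_eval_config.yaml', encoding='utf-8') as f:
    cfg = yaml.safe_load(f)

print(cfg['max_tokens'])   # 4096 in one variant, 32768 in the other
print(cfg['temperature'])  # 0.0
```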
