8 changes: 7 additions & 1 deletion dataset-index.yml
@@ -1130,4 +1130,10 @@
    category: Science /Physics
    paper: https://arxiv.org/abs/2504.16074
    configpath: opencompass/configs/datasets/PHYBench/phybench_gen.py
    configpath_llmjudge: ''
    configpath_llmjudge: ''
- eese:
    name: EESE
    category: Science
    paper: https://arxiv.org/abs/2507.16514
    configpath: opencompass/configs/datasets/eese/eese_llm_judge_gen.py
    configpath_llmjudge: opencompass/configs/datasets/eese/eese_llm_judge_gen.py
47 changes: 47 additions & 0 deletions examples/eval_eese_api_judge.py
@@ -0,0 +1,47 @@

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.eese.eese_llm_judge_gen import \
        eese_datasets
    # Select the model to evaluate
    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
        models as gpt4

from opencompass.models import OpenAISDK

# Configure the judge model
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    abbr='model-judge',
    type=OpenAISDK,
    path='model-name',
    key='your-api-key',
    openai_api_base=['openai-url'],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1,
    temperature=0.001,
    tokenizer_path='gpt-4o',
    verbose=True,
    max_out_len=16384,
    max_seq_len=49152,
)

datasets = eese_datasets
models = gpt4

# Add judge_cfg information to each dataset instead of overwriting it
for dataset in datasets:
    if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
        # Get the existing judge_cfg, or an empty dict if it does not exist
        existing_judge_cfg = dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
        # Merge the new judge settings into the existing judge_cfg
        existing_judge_cfg.update(judge_cfg)
        # Write the merged config back
        dataset['eval_cfg']['evaluator']['judge_cfg'] = existing_judge_cfg

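A minimal sketch of how the merge loop above behaves; the toy dataset dict and its temperature key are invented for illustration and are not part of this PR:

# Toy illustration of the judge_cfg merge above; dataset structure and keys are hypothetical.
toy_dataset = dict(eval_cfg=dict(evaluator=dict(judge_cfg=dict(temperature=0.5))))
new_judge_cfg = dict(path='model-name', key='your-api-key')

existing = toy_dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
existing.update(new_judge_cfg)  # adds path/key; overlapping keys would take the new values
toy_dataset['eval_cfg']['evaluator']['judge_cfg'] = existing
print(toy_dataset['eval_cfg']['evaluator']['judge_cfg'])
# {'temperature': 0.5, 'path': 'model-name', 'key': 'your-api-key'}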
104 changes: 104 additions & 0 deletions opencompass/configs/datasets/eese/eese_llm_judge_gen.py
@@ -0,0 +1,104 @@
from opencompass.datasets import EESEDataset
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.eese.eese_postprocessors import eese_score_postprocess_dict

# ----------------------------- Detailed Config -----------------------------


# Construct the prompts for tested models

ANSWER_TEMPLATE = """
Question: {problem}\n

Question Type: {question_type}
If the question type is closed-ended, please answer the question directly (for single/multiple-choice questions, only provide the letters of your chosen options). Do not provide any analysis process.
If the question type is open-ended, please provide the problem-solving process.
""".strip()

eese_reader_cfg = dict(input_columns=['problem', 'question_type'], output_column='final_answer')

eese_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=ANSWER_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


GRADER_TEMPLATE = """
As a grading expert, please score the candidates' answers based on the standard answers to the questions.

The following are some evaluation criteria:
1. Please refer to the standard answer given. You don't need to regenerate the answer to the question because the standard answer has already been given. You only need to determine whether the candidate's answer is consistent with the standard answer based on the form of the question. Don't try to answer the initial question. You can assume that the standard answer is definitely correct.
2. As the candidates' answers may differ in expression form from the standard answers, please understand the question and the standard answer before making a judgment, and then score the candidates' answers. However, be careful not to attempt to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is sufficient. For multiple-choice questions and fill-in-the-blank questions, candidates must correctly answer all the corresponding options or blanks to be considered correct.
4. Some answers can be expressed in different ways. For instance, some answers might be mathematical expressions and some might be textual descriptions, as long as the meaning expressed is the same. Some formulas are expressed in different ways, but they are equivalent and correct.

If this question is a closed-ended one, please directly determine whether the candidate's answer is correct or not. If it is correct, please give 10 points; if it is incorrect, please give 0 points. Please give the score directly without any other explanations.
If this question is an open-ended one, please refer to the standard answers to score the candidates' answers. The scoring range is 0 to 10 points. Please directly give the final score without any explanation.

This is your task. Just answer the corresponding score. If there are mistakes, don't apologize or correct yourself. We just want to rate the answers.


< Original problem Begins >:\n{problem}\n< Original problem ends >\n\n
< Golden Goal Begins >:\n{final_answer}\n< Golden Goal Ends >\n\n
< question_type Begins >:\n{question_type}\n< question_type Ends >\n\n
< Prediction answer Begins >:\n{prediction}\n< Prediction answer Ends >\n\n

Determine the correctness of the candidate's answer and give the corresponding score.
""".strip()

# Evaluation configuration
eese_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE
                    ),
                ]),
        ),
        dataset_cfg=dict(
            type=EESEDataset,
            path='opencompass/eese',
            file_name='EESE.jsonl',
            reader_cfg=eese_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=eese_score_postprocess_dict),
    ),
    pred_role='BOT',
)


eese_datasets = [
    dict(
        type=EESEDataset,
        abbr='eese-llmjudge',
        path='opencompass/eese',
        file_name='EESE.jsonl',
        reader_cfg=eese_reader_cfg,
        infer_cfg=eese_infer_cfg,
        eval_cfg=eese_eval_cfg,
        mode='singlescore',
    )
]
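Based on the reader_cfg and templates above, each EESE.jsonl record is expected to carry at least problem, question_type, and final_answer fields. A hedged sketch of what one record might look like, with all values invented:

# Hypothetical EESE.jsonl record matching the reader_cfg above; values are illustrative only.
import json

record = {
    'problem': 'Which planet is known as the Red Planet? A. Venus  B. Mars  C. Jupiter  D. Saturn',
    'question_type': 'closed-ended',
    'final_answer': 'B',
}
with open('EESE.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')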
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -50,6 +50,7 @@
from .ds1000 import * # noqa: F401, F403
from .ds1000_interpreter import * # noqa: F401, F403
from .Earth_Silver import * # noqa: F401, F403
from .eese.eese import * # noqa: F401, F403
from .eprstmt import * # noqa: F401, F403
from .FinanceIQ import * # noqa: F401, F403
from .flores import * # noqa: F401, F403
45 changes: 45 additions & 0 deletions opencompass/datasets/eese/eese.py
@@ -0,0 +1,45 @@
import json
import os.path as osp
from os import environ

from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


@LOAD_DATASET.register_module()
class EESEDataset(BaseDataset):

    @staticmethod
    def load(path: str, file_name: str = 'EESE.jsonl', **kwargs):
        """Load the EESE dataset from HuggingFace or a local file.

        Args:
            path (str): Path to the dataset.
            file_name (str): Name of the JSONL file.
            **kwargs: Additional arguments.

        Returns:
            datasets.Dataset: The loaded EESE dataset.
        """
        path = get_data_path(path)

        if environ.get('DATASET_SOURCE') == 'HF':
            from datasets import load_dataset
            dataset = load_dataset('AIBench/EESE', 'default', split='test')
        else:
            from datasets import Dataset, DatasetDict
            dataset = DatasetDict()
            for split in ['test']:
                raw_data = []
                filename = osp.join(path, split, file_name)
                with open(filename, 'r', encoding='utf-8') as f:
                    for line in f:
                        if line.strip():
                            raw_data.append(json.loads(line))
                dataset['test'] = Dataset.from_list(raw_data)
                dataset['train'] = Dataset.from_list(raw_data)

        return dataset
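A possible local sanity check for the loader above, assuming the data is already placed at ./data/eese/test/EESE.jsonl (per the datasets_info.py mapping below) and DATASET_SOURCE is not set to 'HF'; field access follows the reader_cfg in the config:

# Hypothetical local sanity check; assumes ./data/eese/test/EESE.jsonl exists.
from opencompass.datasets import EESEDataset

ds = EESEDataset.load(path='opencompass/eese', file_name='EESE.jsonl')
print(len(ds['test']), ds['test'][0]['problem'])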
35 changes: 35 additions & 0 deletions opencompass/datasets/eese/eese_postprocessors.py
@@ -0,0 +1,35 @@
# flake8: noqa

from opencompass.datasets.eese.utils import extract_first_numeric_score
from opencompass.registry import DICT_POSTPROCESSORS, TEXT_POSTPROCESSORS


@DICT_POSTPROCESSORS.register_module('eese_score_postprocess_dict')
def eese_score_postprocess_dict(output: dict, output_path: str) -> dict:
    """Post-process EESE judge results for the LLM judge (dict version)."""
    # Process each prediction
    for key, value in output.items():
        if 'prediction' in value:
            prediction = value['prediction']
            # Extract the numeric score from the judge output
            score = extract_first_numeric_score(prediction)
            if score is not None:
                value['score'] = score
            else:
                # No numeric score found; fall back to verdict keywords.
                # Check 'incorrect' before 'correct', since the latter is a substring of the former.
                prediction_lower = prediction.strip().lower()
                if 'incorrect' in prediction_lower or 'wrong' in prediction_lower:
                    value['score'] = 0
                elif 'correct' in prediction_lower or 'right' in prediction_lower or '10' in prediction_lower:
                    value['score'] = 10
                elif '0' in prediction_lower:
                    value['score'] = 0
                else:
                    value['score'] = 0  # Default to 0 points

    # Compute the overall (normalized) score
    scores = [value.get('score', 0) for value in output.values()]
    if scores:
        overall_score = sum(scores) / (10 * len(scores))
    else:
        overall_score = 0

    return {'overall_score': overall_score, 'details': output}
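A toy walk-through of the normalization: with judge outputs of 10 and 5 on two items, overall_score = (10 + 5) / (10 * 2) = 0.75. The item keys and judge strings below are invented, and output_path is a dummy value (the function does not use it as written):

# Toy check of the score normalization; inputs are illustrative only.
from opencompass.datasets.eese.eese_postprocessors import eese_score_postprocess_dict

toy_output = {
    '0': {'prediction': '10'},        # judge awarded full marks
    '1': {'prediction': 'Score: 5'},  # judge awarded 5 points
}
result = eese_score_postprocess_dict(toy_output, output_path='/tmp/eese_judge.json')
print(result['overall_score'])  # (10 + 5) / (10 * 2) = 0.75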
46 changes: 46 additions & 0 deletions opencompass/datasets/eese/utils.py
@@ -0,0 +1,46 @@
# flake8: noqa

import re


def extract_first_numeric_score(score_text):
    """Extract the first numeric score from the judge output.

    Args:
        score_text (str): The text containing the score.

    Returns:
        int or None: The first integer found in the text; 0 if the text
        contains no digits, or None if the input is not a non-empty string.
    """
    try:
        return int(score_text)
    except (ValueError, TypeError):
        # Validate input
        if not isinstance(score_text, str) or not score_text.strip():
            return None

        # Match a standalone run of digits
        match = re.search(r'(?<!\d)\d+(?!\d)', score_text)

        if match:
            return int(match.group())
        return 0


def process_results(results, overall_avg):
    """Format the per-discipline results into a summary string."""
    results_str = '\n' + '=' * 60 + '\n' + 'Summary of evaluation results' + '\n' + '=' * 60 + '\n'
    results_str += '\nPerformance of each discipline:\n' + '-' * 40 + '\n'

    for discipline, stats in results.items():
        results_str += f'{discipline}:\n'
        results_str += f"  Average score: {stats['average_score']}\n"
        results_str += f"  Number of questions: {stats['count']}\n"
        results_str += f"  Score distribution: {stats['scores'][:5]}{'...' if len(stats['scores']) > 5 else ''}\n"

    results_str += '-' * 40 + '\n'
    results_str += f'Overall average score: {overall_avg}\n'
    results_str += f"Total number of questions: {sum(stats['count'] for stats in results.values())}\n"
    results_str += '=' * 60 + '\n'
    return results_str
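Illustrative calls showing the intended behaviour of extract_first_numeric_score (inputs invented):

# Illustrative calls only.
from opencompass.datasets.eese.utils import extract_first_numeric_score

assert extract_first_numeric_score('10') == 10           # plain integer string
assert extract_first_numeric_score('Score: 7/10') == 7   # first standalone digit run
assert extract_first_numeric_score('no digits here') == 0
assert extract_first_numeric_score('   ') is None        # blank input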
6 changes: 6 additions & 0 deletions opencompass/utils/datasets_info.py
@@ -111,6 +111,12 @@
"hf_id": None,
"local": "./data/clozeTest-maxmin/python/answers.txt",
},
# eese
"opencompass/eese": {
"ms_id": None,
"hf_id": "AIBench/EESE",
"local": "./data/eese/",
},
# Flores
"opencompass/flores": {
"ms_id": "opencompass/flores",