8 changes: 7 additions & 1 deletion dataset-index.yml
@@ -1130,4 +1130,10 @@
    category: Science /Physics
    paper: https://arxiv.org/abs/2504.16074
    configpath: opencompass/configs/datasets/PHYBench/phybench_gen.py
    configpath_llmjudge: ''
    configpath_llmjudge: ''
- eese:
    name: EESE
    category: Science
    paper: https://arxiv.org/abs/2507.16514
    configpath: opencompass/configs/datasets/eese/eese_llm_judge_gen.py
    configpath_llmjudge: opencompass/configs/datasets/eese/eese_llm_judge_gen.py
47 changes: 47 additions & 0 deletions examples/eval_eese_api_judge.py
@@ -0,0 +1,47 @@

from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.eese.eese_llm_judge_gen import \
        eese_datasets
    # Select the model to evaluate
    from opencompass.configs.models.openai.gpt_4o_2024_05_13 import \
        models as gpt4

from opencompass.models import OpenAISDK

# Configure the judge model
api_meta_template = dict(round=[
    dict(role='HUMAN', api_role='HUMAN'),
    dict(role='BOT', api_role='BOT', generate=True),
], )

judge_cfg = dict(
    abbr='model-judge',
    type=OpenAISDK,
    path='model-name',
    key='your-api-key',
    openai_api_base=['openai-url'],
    meta_template=api_meta_template,
    query_per_second=16,
    batch_size=1,
    temperature=0.001,
    tokenizer_path='gpt-4o',
    verbose=True,
    max_out_len=16384,
    max_seq_len=49152,
)

datasets = eese_datasets
models = gpt4

# Add judge_cfg information to each dataset instead of overwriting it
for dataset in datasets:
    if 'eval_cfg' in dataset and 'evaluator' in dataset['eval_cfg']:
        # Get the existing judge_cfg, or an empty dict if it does not exist
        existing_judge_cfg = dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
        # Merge the new judge settings into the existing judge_cfg
        existing_judge_cfg.update(judge_cfg)
        # Write the merged config back
        dataset['eval_cfg']['evaluator']['judge_cfg'] = existing_judge_cfg

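A minimal sketch of how the merge loop above behaves; the toy dataset dict and its temperature key are invented for illustration and are not part of this PR:

# Toy illustration of the judge_cfg merge above; dataset structure and keys are hypothetical.
toy_dataset = dict(eval_cfg=dict(evaluator=dict(judge_cfg=dict(temperature=0.5))))
new_judge_cfg = dict(path='model-name', key='your-api-key')

existing = toy_dataset['eval_cfg']['evaluator'].get('judge_cfg', {})
existing.update(new_judge_cfg)  # adds path/key; overlapping keys would take the new values
toy_dataset['eval_cfg']['evaluator']['judge_cfg'] = existing
print(toy_dataset['eval_cfg']['evaluator']['judge_cfg'])
# {'temperature': 0.5, 'path': 'model-name', 'key': 'your-api-key'}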
104 changes: 104 additions & 0 deletions opencompass/configs/datasets/eese/eese_llm_judge_gen.py
@@ -0,0 +1,104 @@
from opencompass.datasets import EESEDataset
from opencompass.openicl.icl_prompt_template import PromptTemplate
from opencompass.openicl.icl_retriever import ZeroRetriever
from opencompass.openicl.icl_inferencer import GenInferencer
from opencompass.evaluator import GenericLLMEvaluator
from opencompass.datasets.eese.eese_postprocessors import eese_score_postprocess_dict

# ----------------------------- Detailed Config -----------------------------


# Construct the prompts for tested models

ANSWER_TEMPLATE = """
Question: {problem}\n

Question Type: {question_type}
If the question type is closed-ended, please answer the question directly (for single/multiple-choice questions, only provide the letters of your chosen options). Do not provide any analysis process.
If the question type is open-ended, please provide the problem-solving process.
""".strip()

eese_reader_cfg = dict(input_columns=['problem', 'question_type'], output_column='final_answer')

eese_infer_cfg = dict(
    prompt_template=dict(
        type=PromptTemplate,
        template=dict(
            round=[
                dict(role='HUMAN', prompt=ANSWER_TEMPLATE),
            ]
        ),
    ),
    retriever=dict(type=ZeroRetriever),
    inferencer=dict(type=GenInferencer),
)


GRADER_TEMPLATE = """
As a grading expert, please score the candidates' answers based on the standard answers to the questions.

The following are some evaluation criteria:
1. Please refer to the standard answer given. You don't need to regenerate the answer to the question because the standard answer has already been given. You only need to determine whether the candidate's answer is consistent with the standard answer based on the form of the question. Don't try to answer the initial question. You can assume that the standard answer is definitely correct.
2. As the candidates' answers may differ in expression form from the standard answers, please understand the question and the standard answer before making a judgment, and then score the candidates' answers. However, be careful not to attempt to answer the original question.
3. Some answers may contain multiple items, such as multiple-choice questions, fill-in-the-blank questions, etc. As long as the answer is the same as the standard answer, it is sufficient. For multiple-choice questions and fill-in-the-blank questions, candidates must correctly answer all the corresponding options or blanks to be considered correct.
4. Some answers can be expressed in different ways. For instance, some answers might be mathematical expressions and some might be textual descriptions, as long as the meaning expressed is the same. Some formulas are expressed in different ways, but they are equivalent and correct.

If this question is a closed-ended one, please directly determine whether the candidate's answer is correct or not. If it is correct, please give 10 points; if it is incorrect, please give 0 points. Please give the score directly without any other explanations.
If this question is an open-ended one, please refer to the standard answers to score the candidates' answers. The scoring range is 0 to 10 points. Please directly give the final score without any explanation.

This is your task. Just answer the corresponding score. If there are mistakes, don't apologize or correct yourself. We just want to rate the answers.


< Original problem Begins >:\n{problem}\n< Original problem ends >\n\n
< Golden Goal Begins >:\n{final_answer}\n< Golden Goal Ends >\n\n
< question_type Begins >:\n{question_type}\n< question_type Ends >\n\n
< Prediction answer Begins >:\n{prediction}\n< Prediction answer Ends >\n\n

Determine the correctness of the candidate's answer and give the corresponding score.
""".strip()

# Evaluation configuration
eese_eval_cfg = dict(
    evaluator=dict(
        type=GenericLLMEvaluator,
        prompt_template=dict(
            type=PromptTemplate,
            template=dict(
                begin=[
                    dict(
                        role='SYSTEM',
                        fallback_role='HUMAN',
                        prompt="You are a helpful assistant who evaluates the correctness and quality of models' outputs.")
                ],
                round=[
                    dict(
                        role='HUMAN',
                        prompt=GRADER_TEMPLATE
                    ),
                ]),
        ),
        dataset_cfg=dict(
            type=EESEDataset,
            path='opencompass/eese',
            file_name='EESE.jsonl',
            reader_cfg=eese_reader_cfg,
        ),
        judge_cfg=dict(),
        dict_postprocessor=dict(type=eese_score_postprocess_dict),
    ),
    pred_role='BOT',
)


eese_datasets = [
    dict(
        type=EESEDataset,
        abbr='eese-llmjudge',
        path='opencompass/eese',
        file_name='EESE.jsonl',
        reader_cfg=eese_reader_cfg,
        infer_cfg=eese_infer_cfg,
        eval_cfg=eese_eval_cfg,
        mode='singlescore',
    )
]
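Based on the reader_cfg and templates above, each EESE.jsonl record is expected to carry at least problem, question_type, and final_answer fields. A hedged sketch of what one record might look like, with all values invented:

# Hypothetical EESE.jsonl record matching the reader_cfg above; values are illustrative only.
import json

record = {
    'problem': 'Which planet is known as the Red Planet? A. Venus  B. Mars  C. Jupiter  D. Saturn',
    'question_type': 'closed-ended',
    'final_answer': 'B',
}
with open('EESE.jsonl', 'a', encoding='utf-8') as f:
    f.write(json.dumps(record, ensure_ascii=False) + '\n')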
1 change: 1 addition & 0 deletions opencompass/datasets/__init__.py
@@ -50,6 +50,7 @@
from .ds1000 import * # noqa: F401, F403
from .ds1000_interpreter import * # noqa: F401, F403
from .Earth_Silver import * # noqa: F401, F403
from .eese.eese import * # noqa: F401, F403
from .eprstmt import * # noqa: F401, F403
from .FinanceIQ import * # noqa: F401, F403
from .flores import * # noqa: F401, F403
45 changes: 45 additions & 0 deletions opencompass/datasets/eese/eese.py
@@ -0,0 +1,45 @@
import json
import os.path as osp
from os import environ

from opencompass.registry import LOAD_DATASET
from opencompass.utils import get_data_path

from ..base import BaseDataset


@LOAD_DATASET.register_module()
class EESEDataset(BaseDataset):

    @staticmethod
    def load(path: str, file_name: str = 'EESE.jsonl', **kwargs):
        """Load the EESE dataset from HuggingFace or a local file.

        Args:
            path (str): Path to the dataset.
            file_name (str): Name of the JSONL file.
            **kwargs: Additional arguments.

        Returns:
            datasets.Dataset: The loaded EESE dataset.
        """
        path = get_data_path(path)

        if environ.get('DATASET_SOURCE') == 'HF':
            from datasets import load_dataset
            dataset = load_dataset('AIBench/EESE', 'default', split='test')
        else:
            from datasets import Dataset, DatasetDict
            dataset = DatasetDict()
            for split in ['test']:
                raw_data = []
                filename = osp.join(path, split, file_name)
                with open(filename, 'r', encoding='utf-8') as f:
                    for line in f:
                        if line.strip():
                            raw_data.append(json.loads(line))
                dataset['test'] = Dataset.from_list(raw_data)
                dataset['train'] = Dataset.from_list(raw_data)

        return dataset
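A possible local sanity check for the loader above, assuming the data is already placed at ./data/eese/test/EESE.jsonl (per the datasets_info.py mapping below) and DATASET_SOURCE is not set to 'HF'; field access follows the reader_cfg in the config:

# Hypothetical local sanity check; assumes ./data/eese/test/EESE.jsonl exists.
from opencompass.datasets import EESEDataset

ds = EESEDataset.load(path='opencompass/eese', file_name='EESE.jsonl')
print(len(ds['test']), ds['test'][0]['problem'])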
35 changes: 35 additions & 0 deletions opencompass/datasets/eese/eese_postprocessors.py
@@ -0,0 +1,35 @@
# flake8: noqa

from opencompass.datasets.eese.utils import extract_first_numeric_score
from opencompass.registry import DICT_POSTPROCESSORS, TEXT_POSTPROCESSORS


@DICT_POSTPROCESSORS.register_module('eese_score_postprocess_dict')
def eese_score_postprocess_dict(output: dict, output_path: str) -> dict:
    """Post-process EESE judge results for the LLM judge (dict version)."""
    # Process each prediction
    for key, value in output.items():
        if 'prediction' in value:
            prediction = value['prediction']
            # Extract the numeric score from the judge output
            score = extract_first_numeric_score(prediction)
            if score is not None:
                value['score'] = score
            else:
                # No numeric score found; fall back to verdict keywords.
                # Check 'incorrect' before 'correct', since the latter is a substring of the former.
                prediction_lower = prediction.strip().lower()
                if 'incorrect' in prediction_lower or 'wrong' in prediction_lower:
                    value['score'] = 0
                elif 'correct' in prediction_lower or 'right' in prediction_lower or '10' in prediction_lower:
                    value['score'] = 10
                elif '0' in prediction_lower:
                    value['score'] = 0
                else:
                    value['score'] = 0  # Default to 0 points

    # Compute the overall (normalized) score
    scores = [value.get('score', 0) for value in output.values()]
    if scores:
        overall_score = sum(scores) / (10 * len(scores))
    else:
        overall_score = 0

    return {'overall_score': overall_score, 'details': output}
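A toy walk-through of the normalization: with judge outputs of 10 and 5 on two items, overall_score = (10 + 5) / (10 * 2) = 0.75. The item keys and judge strings below are invented, and output_path is a dummy value (the function does not use it as written):

# Toy check of the score normalization; inputs are illustrative only.
from opencompass.datasets.eese.eese_postprocessors import eese_score_postprocess_dict

toy_output = {
    '0': {'prediction': '10'},        # judge awarded full marks
    '1': {'prediction': 'Score: 5'},  # judge awarded 5 points
}
result = eese_score_postprocess_dict(toy_output, output_path='/tmp/eese_judge.json')
print(result['overall_score'])  # (10 + 5) / (10 * 2) = 0.75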
46 changes: 46 additions & 0 deletions opencompass/datasets/eese/utils.py
@@ -0,0 +1,46 @@
# flake8: noqa

import re


def extract_first_numeric_score(score_text):
    """Extract the first numeric score from the judge output.

    Args:
        score_text (str): The text containing the score.

    Returns:
        int or None: The first integer found in the text; 0 if the text
        contains no digits, or None if the input is not a non-empty string.
    """
    try:
        return int(score_text)
    except (ValueError, TypeError):
        # Validate input
        if not isinstance(score_text, str) or not score_text.strip():
            return None

        # Match a standalone run of digits
        match = re.search(r'(?<!\d)\d+(?!\d)', score_text)

        if match:
            return int(match.group())
        return 0


def process_results(results, overall_avg):
    """Format the per-discipline results into a summary string."""
    results_str = '\n' + '=' * 60 + '\n' + 'Summary of evaluation results' + '\n' + '=' * 60 + '\n'
    results_str += '\nPerformance of each discipline:\n' + '-' * 40 + '\n'

    for discipline, stats in results.items():
        results_str += f'{discipline}:\n'
        results_str += f"  Average score: {stats['average_score']}\n"
        results_str += f"  Number of questions: {stats['count']}\n"
        results_str += f"  Score distribution: {stats['scores'][:5]}{'...' if len(stats['scores']) > 5 else ''}\n"

    results_str += '-' * 40 + '\n'
    results_str += f'Overall average score: {overall_avg}\n'
    results_str += f"Total number of questions: {sum(stats['count'] for stats in results.values())}\n"
    results_str += '=' * 60 + '\n'
    return results_str
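Illustrative calls showing the intended behaviour of extract_first_numeric_score (inputs invented):

# Illustrative calls only.
from opencompass.datasets.eese.utils import extract_first_numeric_score

assert extract_first_numeric_score('10') == 10           # plain integer string
assert extract_first_numeric_score('Score: 7/10') == 7   # first standalone digit run
assert extract_first_numeric_score('no digits here') == 0
assert extract_first_numeric_score('   ') is None        # blank input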
6 changes: 6 additions & 0 deletions opencompass/utils/datasets_info.py
@@ -111,6 +111,12 @@
"hf_id": None,
"local": "./data/clozeTest-maxmin/python/answers.txt",
},
# eese
"opencompass/eese": {
"ms_id": None,
"hf_id": "AIBench/EESE",
"local": "./data/eese/",
},
# Flores
"opencompass/flores": {
"ms_id": "opencompass/flores",