Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
54 commits
Select commit Hold shift + click to select a range
287f0e3
update
zhulinJulia24 Jun 13, 2025
e5e87ba
Merge branch 'open-compass:main' into update_fullbench
zhulinJulia24 Jun 13, 2025
79016bd
Update daily-run-test.yml
zhulinJulia24 Jun 17, 2025
e1d84e5
Update daily-run-test.yml
zhulinJulia24 Jun 18, 2025
b6e3194
Merge branch 'open-compass:main' into update_fullbench
zhulinJulia24 Jun 18, 2025
8b7ad61
update
zhulinJulia24 Jun 18, 2025
63934b0
Update daily-run-test.yml
zhulinJulia24 Jun 18, 2025
778cc63
Update daily-run-test.yml
zhulinJulia24 Jun 18, 2025
f3f53c4
Update daily-run-test.yml
zhulinJulia24 Jun 18, 2025
504a37e
Update daily-run-test.yml
zhulinJulia24 Jun 18, 2025
64823bb
update
zhulinJulia24 Jun 19, 2025
3c716d4
update
zhulinJulia24 Jun 19, 2025
d35e00a
update
zhulinJulia24 Jun 24, 2025
3214e86
update
zhulinJulia24 Jun 24, 2025
0c29df7
update
zhulinJulia24 Jun 24, 2025
5d32adc
Merge branch 'open-compass:main' into update_fullbench
zhulinJulia24 Jun 24, 2025
8f18c36
Update daily-run-test.yml
zhulinJulia24 Jun 25, 2025
9290200
update
zhulinJulia24 Jun 25, 2025
4ada396
update
zhulinJulia24 Jun 25, 2025
4c5f22d
Update eval_regression_chat_obj_fullbench_v5.py
zhulinJulia24 Jun 25, 2025
e6758f7
upfate
zhulinJulia24 Jun 25, 2025
52a47e3
update
zhulinJulia24 Jun 25, 2025
c583131
update
zhulinJulia24 Jun 25, 2025
15b6611
update
zhulinJulia24 Jun 26, 2025
140a9a5
update
zhulinJulia24 Jun 26, 2025
dd8bbca
update
zhulinJulia24 Jun 26, 2025
097e7a6
update
zhulinJulia24 Jun 26, 2025
8348be2
update
zhulinJulia24 Jun 27, 2025
881ceb9
update
zhulinJulia24 Jun 27, 2025
bc35cb3
Update daily-run-test.yml
zhulinJulia24 Jun 27, 2025
f945dcd
Update daily-run-test.yml
zhulinJulia24 Jun 27, 2025
e016195
Update daily-run-test.yml
zhulinJulia24 Jun 28, 2025
e50e872
Update daily-run-test.yml
zhulinJulia24 Jun 28, 2025
d5bd780
Update daily-run-test.yml
zhulinJulia24 Jun 28, 2025
9cc2e65
update
zhulinJulia24 Jul 1, 2025
6b567ce
update
zhulinJulia24 Jul 1, 2025
df146bc
update
zhulinJulia24 Jul 1, 2025
20fb3f0
update
zhulinJulia24 Jul 1, 2025
fb39d2d
update
zhulinJulia24 Jul 1, 2025
b619a78
Merge branch 'open-compass:main' into update_fullbench
zhulinJulia24 Jul 1, 2025
27a0704
Merge branch 'open-compass:main' into update_fullbench
zhulinJulia24 Jul 1, 2025
28ce282
update
zhulinJulia24 Jul 2, 2025
fdbd6fc
Update eval_regression_chat_obj_fullbench_v7.py
zhulinJulia24 Jul 3, 2025
9151ebd
Update eval_regression_chat_obj_fullbench_v6.py
zhulinJulia24 Jul 3, 2025
a10cfe2
update
zhulinJulia24 Jul 3, 2025
a3c0195
update
zhulinJulia24 Jul 4, 2025
73a7ec9
update
zhulinJulia24 Jul 4, 2025
cad0dd2
update
zhulinJulia24 Jul 4, 2025
22e9753
update
zhulinJulia24 Jul 4, 2025
5250fcf
update
zhulinJulia24 Jul 4, 2025
0aae1d8
update
zhulinJulia24 Jul 4, 2025
fd58ea9
update
zhulinJulia24 Jul 4, 2025
4b8e665
update
zhulinJulia24 Jul 4, 2025
30eb1bf
update
zhulinJulia24 Jul 4, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/scripts/eval_regression_base_fullbench.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@
from opencompass.configs.summarizers.groups.mmlu_pro import \
mmlu_pro_summary_groups # noqa: F401, E501

from ...volc import infer as volc_infer # noqa: F401, E501
from ...volc import infer # noqa: F401, E501

race_datasets = [race_datasets[1]] # Only take RACE-High
humaneval_v2_datasets[0]['abbr'] = 'openai_humaneval_v2'
Expand Down
35 changes: 35 additions & 0 deletions .github/scripts/eval_regression_base_longtext_fullbench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
from mmengine.config import read_base

with read_base():
    # Long-context datasets.
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.needlebench.needlebench_base.needlebench_base_gen import \
        needlebench_datasets  # noqa: F401, E501
    # Model under test.
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
        models as lmdeploy_internlm2_5_7b_chat_1m_model  # noqa: F401, E501
    # Summarizers / summary groups.
    from opencompass.configs.summarizers.groups.longbench import \
        longbench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.needlebench import (  # noqa: F401, E501
        needlebench_internal_32k_summarizer,
        needlebench_internal_100k_summarizer,
        needlebench_internal_200k_summarizer)

    from ...volc import infer  # noqa: F401, E501

# Re-expose each needlebench summarizer's summary groups under a
# `*_summary_groups` name so downstream summarizer assembly can find them.
needlebench_internal_32k_summary_groups = (
    needlebench_internal_32k_summarizer['summary_groups'])
needlebench_internal_100k_summary_groups = (
    needlebench_internal_100k_summarizer['summary_groups'])
needlebench_internal_200k_summary_groups = (
    needlebench_internal_200k_summarizer['summary_groups'])

# Flatten every imported `*_model` list into one list of model configs, and
# take the first entry of every non-empty `*_datasets` list.
models = sum((v for k, v in locals().items() if k.endswith('_model')), [])
datasets = [
    v[0] for k, v in locals().items()
    if k.endswith('_datasets') and isinstance(v, list) and len(v) > 0
]

# Regression runs evaluate only the first 16 samples of each dataset.
for dataset_cfg in datasets:
    dataset_cfg['reader_cfg']['test_range'] = '[0:16]'
2 changes: 1 addition & 1 deletion .github/scripts/eval_regression_base_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@
from opencompass.configs.models.yi.lmdeploy_yi_1_5_9b import \
models as lmdeploy_yi_1_5_9b_model # noqa: F401, E501

from ...volc import infer as volc_infer # noqa: F401, E501
from ...volc import infer # noqa: F401, E501

race_datasets = [race_datasets[1]]
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
Expand Down
33 changes: 33 additions & 0 deletions .github/scripts/eval_regression_chat_longtext_fullbench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
from mmengine.config import read_base

with read_base():
    # Long-context benchmark datasets.
    from opencompass.configs.datasets.babilong.babilong_256k_gen import \
        babiLong_256k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.longbench.longbench import \
        longbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.needlebench.needlebench_128k.needlebench_128k import \
        needlebench_datasets as needlebench_128k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ruler.ruler_128k_gen import \
        ruler_datasets as ruler_128k_datasets  # noqa: F401, E501
    # Model under test.
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm2_5_7b_chat_1m import \
        models as lmdeploy_internlm2_5_7b_chat_1m_model  # noqa: F401, E501
    # Summary groups / summarizer.
    from opencompass.configs.summarizers.groups.babilong import \
        babilong_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.longbench import \
        longbench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.ruler import \
        ruler_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.needlebench import \
        needlebench_128k_summarizer  # noqa: F401, E501

    from ...volc import infer  # noqa: F401, E501

# Flatten every imported `*_model` list into one list of model configs, and
# take the first entry of every non-empty `*_datasets` list.
models = sum((v for k, v in locals().items() if k.endswith('_model')), [])
datasets = [
    v[0] for k, v in locals().items()
    if k.endswith('_datasets') and isinstance(v, list) and len(v) > 0
]

# Regression runs evaluate only the first 16 samples of each dataset.
for dataset_cfg in datasets:
    dataset_cfg['reader_cfg']['test_range'] = '[0:16]'
4 changes: 1 addition & 3 deletions .github/scripts/eval_regression_chat_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,8 +112,6 @@
models as vllm_mistral_7b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mistral_7b_instruct_v0_2 import \
models as vllm_mistral_7b_instruct_v0_2_model # noqa: F401, E501
from opencompass.configs.models.mistral.vllm_mixtral_8x22b_instruct_v0_1 import \
models as vllm_mixtral_8x22b_instruct_v0_1_model # noqa: F401, E501
from opencompass.configs.models.nvidia.lmdeploy_nemotron_70b_instruct_hf import \
models as lmdeploy_nemotron_70b_instruct_hf_model # noqa: F401, E501
from opencompass.configs.models.phi.hf_phi_4 import \
Expand Down Expand Up @@ -155,7 +153,7 @@
from opencompass.configs.models.yi.lmdeploy_yi_1_5_34b_chat import \
models as lmdeploy_yi_1_5_34b_chat_model # noqa: F401, E501

from ...volc import infer as volc_infer # noqa: F401, E501
from ...volc import infer # noqa: F401, E501

hf_glm4_9b_chat_model[0]['path'] = 'THUDM/glm-4-9b-chat-hf'

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@
from opencompass.configs.summarizers.mmmlu_lite import \
mmmlu_summary_groups # noqa: F401, E501

from ...volc import infer as volc_infer # noqa: F401, E501
from ...volc import infer # noqa: F401, E501

# For HumanEval-X Evaluation
# Apply the evaluator ip_address and port
Expand Down
123 changes: 123 additions & 0 deletions .github/scripts/eval_regression_chat_obj_fullbench_v6.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
from mmengine.config import read_base

with read_base():
    from opencompass.configs.datasets.aime2024.aime2024_llmjudge_gen_5e9f4f import \
        aime2024_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.aime2025.aime2025_llmjudge_gen_5e9f4f import \
        aime2025_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.ARC_Prize_Public_Evaluation.arc_prize_public_evaluation_gen_fedd04 import \
        arc_prize_public_evaluation_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.bbh.bbh_llmjudge_gen_b5bdf1 import \
        bbh_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.cmo_fib.cmo_fib_gen_2783e5 import \
        cmo_fib_datasets  # noqa: F401, E501
    # dingo
    from opencompass.configs.datasets.dingo.dingo_gen import \
        datasets as dingo_datasets  # noqa: F401, E501
    # General Reasoning
    from opencompass.configs.datasets.drop.drop_llmjudge_gen_3857b0 import \
        drop_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.GaokaoBench.GaokaoBench_no_subjective_gen_d16acb import \
        GaokaoBench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.gpqa.gpqa_0shot_nocot_genericllmeval_gen_772ea0 import \
        gpqa_datasets  # noqa: F401, E501
    # Math Calculation
    from opencompass.configs.datasets.gsm8k.gsm8k_0shot_v2_gen_17d799 import \
        gsm8k_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.hellaswag.hellaswag_llmjudge_gen_809ef1 import \
        hellaswag_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.korbench.korbench_llmjudge_gen_56cf43 import \
        korbench_0shot_single_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.math.math_500_llmjudge_gen_6ff468 import \
        math_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.MathBench.mathbench_2024_gen_4b8f28 import \
        mathbench_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.musr.musr_llmjudge_gen_b47fd3 import \
        musr_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.supergpqa.supergpqa_llmjudge_gen_12b8bc import \
        supergpqa_datasets  # noqa: F401, E501
    from opencompass.configs.datasets.triviaqa.triviaqa_wiki_1shot_gen_c87d61 import \
        triviaqa_datasets  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.hf_internlm3_8b_instruct import \
        models as hf_internlm3_8b_instruct_model  # noqa: F401, E501
    from opencompass.configs.models.hf_internlm.lmdeploy_internlm3_8b_instruct import \
        models as lmdeploy_internlm3_8b_instruct_model  # noqa: F401, E501
    # FIX: alias must end with `_model` — the collection below filters on
    # `k.endswith('_model')`, so the previous alias
    # `lmdeploy_qwen2_5_32b_instruct` was silently dropped from `models`.
    from opencompass.configs.models.qwen2_5.lmdeploy_qwen2_5_32b_instruct import \
        models as lmdeploy_qwen2_5_32b_instruct_model  # noqa: F401, E501
    # Summary Groups
    from opencompass.configs.summarizers.groups.bbeh import \
        bbeh_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.bbh import \
        bbh_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.cmmlu import \
        cmmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.GaokaoBench import \
        GaokaoBench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.korbench import \
        korbench_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mathbench_v1_2024 import \
        mathbench_2024_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu import \
        mmlu_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.mmlu_pro import \
        mmlu_pro_summary_groups  # noqa: F401, E501
    from opencompass.configs.summarizers.groups.musr_average import \
        summarizer as musr_summarizer
    from opencompass.configs.summarizers.mmmlu_lite import \
        mmmlu_summary_groups  # noqa: F401, E501

    from ...volc import infer  # noqa: F401, E501

# Collect the first entry of every imported `*_datasets` list, excluding
# scicode/dingo/arc_prize which need special handling (re-added below).
datasets = [
    v[0] for k, v in locals().items() if k.endswith('_datasets')
    and 'scicode' not in k.lower() and 'dingo' not in k.lower()
    and 'arc_prize' not in k.lower() and isinstance(v, list) and len(v) > 0
]

# dingo is re-added with a Chinese QA config overriding abbr/path.
dingo_datasets[0]['abbr'] = 'qa_dingo_cn'
dingo_datasets[0]['path'] = 'data/qabench/history_prompt_case_cn.csv'
datasets.append(dingo_datasets[0])
# ARC-Prize keeps its full dataset list rather than only the first entry.
datasets += arc_prize_public_evaluation_datasets

# Expose musr's summary groups under a `*_summary_groups` name so the
# generic collection below picks them up alongside the imported groups.
musr_summary_groups = musr_summarizer['summary_groups']
summary_groups = sum(
    [v for k, v in locals().items() if k.endswith('_summary_groups')], [])

# Aggregate entry combining both MathBench halves.
summary_groups.append({
    'name': 'Mathbench',
    'subsets': ['mathbench-a (average)', 'mathbench-t (average)'],
})

# Regression runs evaluate only the first 16 samples; the same truncation is
# pushed into any LLM-judge evaluator's nested dataset config so the judge
# sees the identical subset.
for d in datasets:
    d['reader_cfg']['test_range'] = '[0:16]'
    if 'dataset_cfg' in d['eval_cfg']['evaluator'] and 'reader_cfg' in d[
            'eval_cfg']['evaluator']['dataset_cfg']:
        d['eval_cfg']['evaluator']['dataset_cfg']['reader_cfg'][
            'test_range'] = '[0:16]'
    if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'dataset_cfg' in d[
            'eval_cfg']['evaluator']['llm_evaluator']:
        d['eval_cfg']['evaluator']['llm_evaluator']['dataset_cfg'][
            'reader_cfg']['test_range'] = '[0:16]'

# Flatten every imported `*_model` list, tag abbrs for this run, and force
# batch size 1 on lmdeploy/turbomind backends.
models = sum([v for k, v in locals().items() if k.endswith('_model')], [])
for m in models:
    m['abbr'] = m['abbr'] + '_fullbench'
    if 'turbomind' in m['abbr'] or 'lmdeploy' in m['abbr']:
        m['engine_config']['max_batch_size'] = 1
        m['batch_size'] = 1

# Schedule smaller models first.
models = sorted(models, key=lambda x: x['run_cfg']['num_gpus'])

# Objective-judge model: internlm3-8b via lmdeploy, constrained batch/cache.
# NOTE(review): this aliases (not copies) the entry already in `models`, so
# the mutations below also apply to that candidate model — presumed intended.
obj_judge_model = lmdeploy_internlm3_8b_instruct_model[0]
obj_judge_model['engine_config']['max_batch_size'] = 1
obj_judge_model['engine_config']['cache_max_entry_count'] = 0.6
obj_judge_model['batch_size'] = 1

# Wire the judge model into every dataset evaluator that expects one.
for d in datasets:
    if 'judge_cfg' in d['eval_cfg']['evaluator']:
        d['eval_cfg']['evaluator']['judge_cfg'] = obj_judge_model
    if 'llm_evaluator' in d['eval_cfg']['evaluator'] and 'judge_cfg' in d[
            'eval_cfg']['evaluator']['llm_evaluator']:
        d['eval_cfg']['evaluator']['llm_evaluator'][
            'judge_cfg'] = obj_judge_model
Loading