
Commit b4cec83

Yunnglin and Copilot authored
[Refactor] visualization (#661)
* split file
* split ui
* update utils
* add model compare
* add model compare ui
* add model compare name
* update generate dataset
* Update evalscope/app/utils/visualization.py (Co-authored-by: Copilot <[email protected]>)
* Update evalscope/app/utils/data_utils.py (Co-authored-by: Copilot <[email protected]>)
* Update evalscope/app/utils/visualization.py (Co-authored-by: Copilot <[email protected]>)
* fix app
* fix app

Co-authored-by: Copilot <[email protected]>
1 parent 43fb50c commit b4cec83

File tree

16 files changed: +1416 additions, -899 deletions


docs/en/get_started/supported_dataset/llm.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -3,7 +3,7 @@
 Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.
 
 | Benchmark Name | Pretty Name | Task Categories |
-|----------------|-------------|----------------|
+|------------|----------|----------|
 | `aime24` | [AIME-2024](#aime-2024) | `Mathematics` |
 | `aime25` | [AIME-2025](#aime-2025) | `Mathematics` |
 | `alpaca_eval` | [AlpacaEval2.0](#alpacaeval20) | `Instruction-Following`, `Reasoning` |
```

docs/generate_dataset_md.py

Lines changed: 137 additions & 131 deletions
```diff
@@ -1,8 +1,107 @@
 import json
 from tqdm import tqdm
+from typing import Any, Dict
 
 from evalscope.benchmarks import DataAdapter
 
+# Language dictionaries for dataset markdown generation
+DATASET_DETAIL_LOCALE = {
+    'back_to_top': {
+        'zh': '返回目录',
+        'en': 'Back to Top'
+    },
+    'toc_title': {
+        'zh': 'LLM评测集',
+        'en': 'LLM Benchmarks'
+    },
+    'dataset_name': {
+        'zh': '数据集名称',
+        'en': 'Dataset Name'
+    },
+    'dataset_id': {
+        'zh': '数据集ID',
+        'en': 'Dataset ID'
+    },
+    'description': {
+        'zh': '数据集描述',
+        'en': 'Description'
+    },
+    'task_categories': {
+        'zh': '任务类别',
+        'en': 'Task Categories'
+    },
+    'evaluation_metrics': {
+        'zh': '评估指标',
+        'en': 'Evaluation Metrics'
+    },
+    'requires_llm_judge': {
+        'zh': '需要LLM Judge',
+        'en': 'Requires LLM Judge'
+    },
+    'default_shots': {
+        'zh': '默认提示方式',
+        'en': 'Default Shots'
+    },
+    'subsets': {
+        'zh': '数据集子集',
+        'en': 'Subsets'
+    },
+    'supported_output_formats': {
+        'zh': '支持输出格式',
+        'en': 'Supported Output Formats'
+    },
+    'extra_parameters': {
+        'zh': '额外参数',
+        'en': 'Extra Parameters'
+    },
+    'system_prompt': {
+        'zh': '系统提示词',
+        'en': 'System Prompt'
+    },
+    'prompt_template': {
+        'zh': '提示模板',
+        'en': 'Prompt Template'
+    },
+    'yes': {
+        'zh': '是',
+        'en': 'Yes'
+    },
+    'no': {
+        'zh': '否',
+        'en': 'No'
+    },
+    'no_description': {
+        'zh': '暂无详细描述',
+        'en': 'No detailed description available'
+    }
+}
+
+DOCUMENT_LOCALE = {
+    'title': {
+        'zh': 'LLM评测集',
+        'en': 'LLM Benchmarks'
+    },
+    'intro': {
+        'zh': '以下是支持的LLM评测集列表,点击数据集标准名称可跳转详细信息。',
+        'en': 'Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.'
+    },
+    'dataset_name': {
+        'zh': '数据集名称',
+        'en': 'Benchmark Name'
+    },
+    'pretty_name': {
+        'zh': '标准名称',
+        'en': 'Pretty Name'
+    },
+    'task_categories': {
+        'zh': '任务类别',
+        'en': 'Task Categories'
+    },
+    'details_title': {
+        'zh': '数据集详情',
+        'en': 'Benchmark Details'
+    }
+}
 
 def wrap_key_words(keywords: list[str]) -> str:
     """
```
```diff
@@ -26,129 +125,33 @@ def process_dictionary(data: dict) -> str:
     """
     return json.dumps(data, ensure_ascii=False, indent=4)
 
+def get_dataset_detail_locale(lang: str) -> Dict[str, str]:
+    """Get localized strings for dataset details"""
+    return {k: v[lang] for k, v in DATASET_DETAIL_LOCALE.items()}
 
-def generate_dataset_markdown(data_adapter: DataAdapter) -> str:
-    """
-    根据DataAdapter实例生成美观的Markdown数据集介绍
-
-    Args:
-        data_adapter (DataAdapter): 数据集适配器实例
-
-    Returns:
-        str: 格式化的Markdown字符串
-    """
-    # 获取基础信息
-    name = data_adapter.name
-    pretty_name = data_adapter.pretty_name or name
-    dataset_id = data_adapter.dataset_id
-    description = data_adapter.description or '暂无详细描述'
-
-
-    # 处理数据集ID的链接格式
-    if dataset_id.startswith(('http://', 'https://')):
-        dataset_id_md = f'[{dataset_id}]({dataset_id})'
-    elif '/' in dataset_id:  # ModelScope格式的ID
-        dataset_id_md = f'[{dataset_id}](https://modelscope.cn/datasets/{dataset_id}/summary)'
-    else:
-        dataset_id_md = dataset_id
-
-    # 构建详情部分
-    details = [
-        f'### {pretty_name}',
-        '',
-        f'[返回目录](#llm评测集)',
-        f'- **数据集名称**: `{name}`',
-        f'- **数据集ID**: {dataset_id_md}',
-        f'- **数据集描述**: \n > {description}',
-        f'- **任务类别**: {wrap_key_words(data_adapter.tags)}',
-        f'- **评估指标**: {wrap_key_words(data_adapter.metric_list)}',
-        f"- **需要LLM Judge**: {'是' if data_adapter.llm_as_a_judge else '否'}",
-        f'- **默认提示方式**: {data_adapter.few_shot_num}-shot'
-    ]
-
-    # 添加数据集子集信息
-    if data_adapter.subset_list:
-        details.append(f'- **数据集子集**: {wrap_key_words(data_adapter.subset_list)}')
-
-    # 添加其他技术信息
-    technical_info = [
-        f'- **支持输出格式**: {wrap_key_words(data_adapter.output_types)}',
-    ]
-
-    # 添加额外参数信息
-    extra_params = data_adapter.config_kwargs.get('extra_params', {})
-    if extra_params:
-        technical_info.append(f'- **额外参数**: \n```json\n{process_dictionary(extra_params)}\n```')
-
-    # 添加提示模板
-    if data_adapter.system_prompt:
-        technical_info.append(f'- **系统提示词**: \n```text\n{data_adapter.system_prompt}\n```')
-    if data_adapter.prompt_template:
-        technical_info.append(f'- **提示模板**: \n```text\n{data_adapter.prompt_template}\n```')
+def get_document_locale(lang: str) -> Dict[str, str]:
+    """Get localized strings for document structure"""
+    return {k: v[lang] for k, v in DOCUMENT_LOCALE.items()}
 
-    return '\n'.join(details + [''] + technical_info + [''])
-
-
-def generate_full_documentation(adapters: list[DataAdapter]) -> str:
-    """
-    生成完整的Markdown文档,包含索引和所有数据集详情
-
-    Args:
-        adapters (list[DataAdapter]): DataAdapter实例列表
-
-    Returns:
-        str: 完整的Markdown文档
-    """
-    # 生成索引
-    index = [
-        '# LLM评测集',
-        '',
-        '以下是支持的LLM评测集列表,点击数据集标准名称可跳转详细信息。',
-        '',
-        '| 数据集名称 | 标准名称 | 任务类别 |',
-        '|------------|----------|----------|',
-    ]
-
-    for adapter in adapters:
-        name = adapter.name
-        pretty_name = adapter.pretty_name or name
-        link_name = pretty_name.lower().replace(' ', '-').replace('.', '')
-        tags = wrap_key_words(adapter.tags)
-        index.append(f'| `{name}` | [{pretty_name}](#{link_name}) | {tags} |')
-
-    # 生成详情部分
-    details = [
-        '',
-        '---',
-        '',
-        '## 数据集详情',
-        ''
-    ]
-
-    for i, adapter in enumerate(adapters):
-        details.append(generate_dataset_markdown(adapter))
-        if i < len(adapters) - 1:
-            details.append('---')
-        details.append('')
-
-    return '\n'.join(index + details)
-
-
-def generate_dataset_markdown_en(data_adapter: DataAdapter) -> str:
+def generate_dataset_markdown(data_adapter: DataAdapter, lang: str = 'zh') -> str:
     """
     Generate a well-formatted Markdown benchmark introduction based on a DataAdapter instance
 
     Args:
         data_adapter (DataAdapter): Dataset adapter instance
+        lang (str): Language code ('zh' for Chinese, 'en' for English)
 
     Returns:
         str: Formatted Markdown string
     """
+    # Get localized text
+    text = get_dataset_detail_locale(lang)
+
     # Get basic information
     name = data_adapter.name
     pretty_name = data_adapter.pretty_name or name
     dataset_id = data_adapter.dataset_id
-    description = data_adapter.description or 'No detailed description available'
+    description = data_adapter.description or text['no_description']
 
     # Format dataset ID links
     if dataset_id.startswith(('http://', 'https://')):
```
```diff
@@ -162,57 +165,60 @@ def generate_dataset_markdown_en(data_adapter: DataAdapter) -> str:
     details = [
         f'### {pretty_name}',
         '',
-        f'[Back to Top](#llm-benchmarks)',
-        f'- **Dataset Name**: `{name}`',
-        f'- **Dataset ID**: {dataset_id_md}',
-        f'- **Description**: \n > {description}',
-        f'- **Task Categories**: {wrap_key_words(data_adapter.tags)}',
-        f'- **Evaluation Metrics**: {wrap_key_words(data_adapter.metric_list)}',
-        f"- **Requires LLM Judge**: {'Yes' if data_adapter.llm_as_a_judge else 'No'}",
-        f'- **Default Shots**: {data_adapter.few_shot_num}-shot'
+        f'[{text["back_to_top"]}](#{text["toc_title"].lower().replace(" ", "-")})',
+        f'- **{text["dataset_name"]}**: `{name}`',
+        f'- **{text["dataset_id"]}**: {dataset_id_md}',
+        f'- **{text["description"]}**: \n > {description}',
+        f'- **{text["task_categories"]}**: {wrap_key_words(data_adapter.tags)}',
+        f'- **{text["evaluation_metrics"]}**: {wrap_key_words(data_adapter.metric_list)}',
+        f'- **{text["requires_llm_judge"]}**: {text["yes"] if data_adapter.llm_as_a_judge else text["no"]}',
+        f'- **{text["default_shots"]}**: {data_adapter.few_shot_num}-shot'
     ]
 
     # Add dataset subsets
     if data_adapter.subset_list:
-        details.append(f'- **Subsets**: {wrap_key_words(data_adapter.subset_list)}')
+        details.append(f'- **{text["subsets"]}**: {wrap_key_words(data_adapter.subset_list)}')
 
     # Add technical information
     technical_info = [
-        f'- **Supported Output Formats**: {wrap_key_words(data_adapter.output_types)}',
+        f'- **{text["supported_output_formats"]}**: {wrap_key_words(data_adapter.output_types)}',
     ]
 
     # Add extra parameters
     extra_params = data_adapter.config_kwargs.get('extra_params', {})
     if extra_params:
-        technical_info.append(f'- **Extra Parameters**: \n```json\n{process_dictionary(extra_params)}\n```')
+        technical_info.append(f'- **{text["extra_parameters"]}**: \n```json\n{process_dictionary(extra_params)}\n```')
 
     # Add prompt templates
     if data_adapter.system_prompt:
-        technical_info.append(f'- **System Prompt**: \n```text\n{data_adapter.system_prompt}\n```')
+        technical_info.append(f'- **{text["system_prompt"]}**: \n```text\n{data_adapter.system_prompt}\n```')
     if data_adapter.prompt_template:
-        technical_info.append(f'- **Prompt Template**: \n```text\n{data_adapter.prompt_template}\n```')
+        technical_info.append(f'- **{text["prompt_template"]}**: \n```text\n{data_adapter.prompt_template}\n```')
 
     return '\n'.join(details + [''] + technical_info + [''])
 
-
-def generate_full_documentation_en(adapters: list[DataAdapter]) -> str:
+def generate_full_documentation(adapters: list[DataAdapter], lang: str = 'zh') -> str:
     """
     Generate complete Markdown documentation with index and all benchmark details
 
     Args:
         adapters (list[DataAdapter]): List of DataAdapter instances
+        lang (str): Language code ('zh' for Chinese, 'en' for English)
 
     Returns:
         str: Complete Markdown document
     """
+    # Get localized text
+    text = get_document_locale(lang)
+
     # Generate index
     index = [
-        '# LLM Benchmarks',
+        f'# {text["title"]}',
         '',
-        'Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.',
+        f'{text["intro"]}',
         '',
-        '| Benchmark Name | Pretty Name | Task Categories |',
-        '|----------------|-------------|----------------|',
+        f'| {text["dataset_name"]} | {text["pretty_name"]} | {text["task_categories"]} |',
+        '|------------|----------|----------|',
     ]
 
     for adapter in adapters:
```
```diff
@@ -227,12 +233,12 @@ def generate_full_documentation_en(adapters: list[DataAdapter]) -> str:
         '',
         '---',
         '',
-        '## Benchmark Details',
+        f'## {text["details_title"]}',
         ''
     ]
 
     for i, adapter in enumerate(adapters):
-        details.append(generate_dataset_markdown_en(adapter))
+        details.append(generate_dataset_markdown(adapter, lang))
         if i < len(adapters) - 1:
             details.append('---')
         details.append('')
```
```diff
@@ -253,8 +259,8 @@ def generate_full_documentation_en(adapters: list[DataAdapter]) -> str:
     adapters.sort(key=lambda x: x.name)  # 按名称排序
 
     # 生成完整文档
-    markdown_doc = generate_full_documentation(adapters)
-    markdown_doc_en = generate_full_documentation_en(adapters)
+    markdown_doc = generate_full_documentation(adapters, 'zh')
+    markdown_doc_en = generate_full_documentation(adapters, 'en')
 
     # 输出到文件
     with open('docs/zh/get_started/supported_dataset/llm.md', 'w', encoding='utf-8') as f:
```
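For readers skimming the diff: the refactor in generate_dataset_md.py collapses the paired Chinese/English generators into single functions that take a `lang` argument and read their strings from the locale tables. Below is a minimal, self-contained sketch of that lookup pattern; the table is abbreviated, and `get_locale`/`render_header` are illustrative stand-ins, not evalscope APIs.

```python
from typing import Dict

# Abbreviated locale table, shaped like DATASET_DETAIL_LOCALE / DOCUMENT_LOCALE in the diff above.
LOCALE = {
    'title': {'zh': 'LLM评测集', 'en': 'LLM Benchmarks'},
    'details_title': {'zh': '数据集详情', 'en': 'Benchmark Details'},
}


def get_locale(lang: str) -> Dict[str, str]:
    """Flatten the two-level table into {key: localized string} for one language."""
    return {k: v[lang] for k, v in LOCALE.items()}


def render_header(lang: str = 'zh') -> str:
    """Build the document header in the requested language."""
    text = get_locale(lang)
    return '\n'.join([f'# {text["title"]}', '', f'## {text["details_title"]}'])


if __name__ == '__main__':
    print(render_header('zh'))  # Chinese headings
    print(render_header('en'))  # English headings
```

The same single code path then serves both output files, which is why the script's main block now calls `generate_full_documentation(adapters, 'zh')` and `generate_full_documentation(adapters, 'en')` instead of two separate functions.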
