Skip to content
2 changes: 1 addition & 1 deletion docs/en/get_started/supported_dataset/llm.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.

| Benchmark Name | Pretty Name | Task Categories |
|----------------|-------------|----------------|
|------------|----------|----------|
| `aime24` | [AIME-2024](#aime-2024) | `Mathematics` |
| `aime25` | [AIME-2025](#aime-2025) | `Mathematics` |
| `alpaca_eval` | [AlpacaEval2.0](#alpacaeval20) | `Instruction-Following`, `Reasoning` |
Expand Down
268 changes: 137 additions & 131 deletions docs/generate_dataset_md.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,107 @@
import json
from tqdm import tqdm
from typing import Any, Dict

from evalscope.benchmarks import DataAdapter

# Language dictionaries for dataset markdown generation
# UI strings for the per-dataset detail sections, keyed by string id and then
# by language code ('zh' / 'en'). One entry per line for easy scanning/diffing.
DATASET_DETAIL_LOCALE = {
    'back_to_top': {'zh': '返回目录', 'en': 'Back to Top'},
    'toc_title': {'zh': 'LLM评测集', 'en': 'LLM Benchmarks'},
    'dataset_name': {'zh': '数据集名称', 'en': 'Dataset Name'},
    'dataset_id': {'zh': '数据集ID', 'en': 'Dataset ID'},
    'description': {'zh': '数据集描述', 'en': 'Description'},
    'task_categories': {'zh': '任务类别', 'en': 'Task Categories'},
    'evaluation_metrics': {'zh': '评估指标', 'en': 'Evaluation Metrics'},
    'requires_llm_judge': {'zh': '需要LLM Judge', 'en': 'Requires LLM Judge'},
    'default_shots': {'zh': '默认提示方式', 'en': 'Default Shots'},
    'subsets': {'zh': '数据集子集', 'en': 'Subsets'},
    'supported_output_formats': {'zh': '支持输出格式', 'en': 'Supported Output Formats'},
    'extra_parameters': {'zh': '额外参数', 'en': 'Extra Parameters'},
    'system_prompt': {'zh': '系统提示词', 'en': 'System Prompt'},
    'prompt_template': {'zh': '提示模板', 'en': 'Prompt Template'},
    'yes': {'zh': '是', 'en': 'Yes'},
    'no': {'zh': '否', 'en': 'No'},
    'no_description': {'zh': '暂无详细描述', 'en': 'No detailed description available'}
}

# UI strings for the top-level document structure (title, intro paragraph and
# summary-table column headers), keyed by string id and then by language code.
DOCUMENT_LOCALE = {
    'title': {'zh': 'LLM评测集', 'en': 'LLM Benchmarks'},
    'intro': {
        'zh': '以下是支持的LLM评测集列表,点击数据集标准名称可跳转详细信息。',
        'en': 'Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.'
    },
    'dataset_name': {'zh': '数据集名称', 'en': 'Benchmark Name'},
    'pretty_name': {'zh': '标准名称', 'en': 'Pretty Name'},
    'task_categories': {'zh': '任务类别', 'en': 'Task Categories'},
    'details_title': {'zh': '数据集详情', 'en': 'Benchmark Details'}
}

def wrap_key_words(keywords: list[str]) -> str:
"""
Expand All @@ -26,129 +125,33 @@ def process_dictionary(data: dict) -> str:
"""
return json.dumps(data, ensure_ascii=False, indent=4)

def get_dataset_detail_locale(lang: str) -> Dict[str, str]:
    """Resolve the dataset-detail locale table to a flat mapping for one language.

    Args:
        lang (str): Language code present in the table ('zh' or 'en').

    Returns:
        Dict[str, str]: Mapping of string id -> translated text for *lang*.
    """
    resolved: Dict[str, str] = {}
    for string_id, translations in DATASET_DETAIL_LOCALE.items():
        resolved[string_id] = translations[lang]
    return resolved

def generate_dataset_markdown(data_adapter: DataAdapter) -> str:
"""
根据DataAdapter实例生成美观的Markdown数据集介绍

Args:
data_adapter (DataAdapter): 数据集适配器实例

Returns:
str: 格式化的Markdown字符串
"""
# 获取基础信息
name = data_adapter.name
pretty_name = data_adapter.pretty_name or name
dataset_id = data_adapter.dataset_id
description = data_adapter.description or '暂无详细描述'


# 处理数据集ID的链接格式
if dataset_id.startswith(('http://', 'https://')):
dataset_id_md = f'[{dataset_id}]({dataset_id})'
elif '/' in dataset_id: # ModelScope格式的ID
dataset_id_md = f'[{dataset_id}](https://modelscope.cn/datasets/{dataset_id}/summary)'
else:
dataset_id_md = dataset_id

# 构建详情部分
details = [
f'### {pretty_name}',
'',
f'[返回目录](#llm评测集)',
f'- **数据集名称**: `{name}`',
f'- **数据集ID**: {dataset_id_md}',
f'- **数据集描述**: \n > {description}',
f'- **任务类别**: {wrap_key_words(data_adapter.tags)}',
f'- **评估指标**: {wrap_key_words(data_adapter.metric_list)}',
f"- **需要LLM Judge**: {'是' if data_adapter.llm_as_a_judge else '否'}",
f'- **默认提示方式**: {data_adapter.few_shot_num}-shot'
]

# 添加数据集子集信息
if data_adapter.subset_list:
details.append(f'- **数据集子集**: {wrap_key_words(data_adapter.subset_list)}')

# 添加其他技术信息
technical_info = [
f'- **支持输出格式**: {wrap_key_words(data_adapter.output_types)}',
]

# 添加额外参数信息
extra_params = data_adapter.config_kwargs.get('extra_params', {})
if extra_params:
technical_info.append(f'- **额外参数**: \n```json\n{process_dictionary(extra_params)}\n```')

# 添加提示模板
if data_adapter.system_prompt:
technical_info.append(f'- **系统提示词**: \n```text\n{data_adapter.system_prompt}\n```')
if data_adapter.prompt_template:
technical_info.append(f'- **提示模板**: \n```text\n{data_adapter.prompt_template}\n```')
def get_document_locale(lang: str) -> Dict[str, str]:
    """Resolve the document-structure locale table to a flat mapping for one language.

    Args:
        lang (str): Language code present in the table ('zh' or 'en').

    Returns:
        Dict[str, str]: Mapping of string id -> translated text for *lang*.
    """
    resolved: Dict[str, str] = {}
    for string_id, translations in DOCUMENT_LOCALE.items():
        resolved[string_id] = translations[lang]
    return resolved

return '\n'.join(details + [''] + technical_info + [''])


def generate_full_documentation(adapters: list[DataAdapter]) -> str:
    """Render the complete Chinese Markdown document for all benchmarks.

    The document is a summary table (name, pretty name, task categories) whose
    pretty-name cells link to per-benchmark anchors, followed by a detail
    section for each adapter separated by horizontal rules.

    Args:
        adapters (list[DataAdapter]): DataAdapter instances to document.

    Returns:
        str: The full Markdown document.
    """
    # Document header and summary-table header (Chinese column titles).
    lines = [
        '# LLM评测集',
        '',
        '以下是支持的LLM评测集列表,点击数据集标准名称可跳转详细信息。',
        '',
        '| 数据集名称 | 标准名称 | 任务类别 |',
        '|------------|----------|----------|',
    ]

    # One table row per benchmark; the anchor mirrors how Markdown renderers
    # slug headings (lowercase, spaces -> '-', dots stripped).
    for adapter in adapters:
        row_name = adapter.name
        row_pretty = adapter.pretty_name or row_name
        anchor = row_pretty.lower().replace(' ', '-').replace('.', '')
        row_tags = wrap_key_words(adapter.tags)
        lines.append(f'| `{row_name}` | [{row_pretty}](#{anchor}) | {row_tags} |')

    # Detail section, with a horizontal rule between consecutive benchmarks
    # (but not after the last one).
    lines.extend(['', '---', '', '## 数据集详情', ''])
    last_index = len(adapters) - 1
    for position, adapter in enumerate(adapters):
        lines.append(generate_dataset_markdown(adapter))
        if position != last_index:
            lines.extend(['---', ''])

    return '\n'.join(lines)


def generate_dataset_markdown_en(data_adapter: DataAdapter) -> str:
def generate_dataset_markdown(data_adapter: DataAdapter, lang: str = 'zh') -> str:
"""
Generate a well-formatted Markdown benchmark introduction based on a DataAdapter instance

Args:
data_adapter (DataAdapter): Dataset adapter instance
lang (str): Language code ('zh' for Chinese, 'en' for English)

Returns:
str: Formatted Markdown string
"""
# Get localized text
text = get_dataset_detail_locale(lang)

# Get basic information
name = data_adapter.name
pretty_name = data_adapter.pretty_name or name
dataset_id = data_adapter.dataset_id
description = data_adapter.description or 'No detailed description available'
description = data_adapter.description or text['no_description']

# Format dataset ID links
if dataset_id.startswith(('http://', 'https://')):
Expand All @@ -162,57 +165,60 @@ def generate_dataset_markdown_en(data_adapter: DataAdapter) -> str:
details = [
f'### {pretty_name}',
'',
f'[Back to Top](#llm-benchmarks)',
f'- **Dataset Name**: `{name}`',
f'- **Dataset ID**: {dataset_id_md}',
f'- **Description**: \n > {description}',
f'- **Task Categories**: {wrap_key_words(data_adapter.tags)}',
f'- **Evaluation Metrics**: {wrap_key_words(data_adapter.metric_list)}',
f"- **Requires LLM Judge**: {'Yes' if data_adapter.llm_as_a_judge else 'No'}",
f'- **Default Shots**: {data_adapter.few_shot_num}-shot'
f'[{text["back_to_top"]}](#{text["toc_title"].lower().replace(" ", "-")})',
f'- **{text["dataset_name"]}**: `{name}`',
f'- **{text["dataset_id"]}**: {dataset_id_md}',
f'- **{text["description"]}**: \n > {description}',
f'- **{text["task_categories"]}**: {wrap_key_words(data_adapter.tags)}',
f'- **{text["evaluation_metrics"]}**: {wrap_key_words(data_adapter.metric_list)}',
f'- **{text["requires_llm_judge"]}**: {text["yes"] if data_adapter.llm_as_a_judge else text["no"]}',
f'- **{text["default_shots"]}**: {data_adapter.few_shot_num}-shot'
]

# Add dataset subsets
if data_adapter.subset_list:
details.append(f'- **Subsets**: {wrap_key_words(data_adapter.subset_list)}')
details.append(f'- **{text["subsets"]}**: {wrap_key_words(data_adapter.subset_list)}')

# Add technical information
technical_info = [
f'- **Supported Output Formats**: {wrap_key_words(data_adapter.output_types)}',
f'- **{text["supported_output_formats"]}**: {wrap_key_words(data_adapter.output_types)}',
]

# Add extra parameters
extra_params = data_adapter.config_kwargs.get('extra_params', {})
if extra_params:
technical_info.append(f'- **Extra Parameters**: \n```json\n{process_dictionary(extra_params)}\n```')
technical_info.append(f'- **{text["extra_parameters"]}**: \n```json\n{process_dictionary(extra_params)}\n```')

# Add prompt templates
if data_adapter.system_prompt:
technical_info.append(f'- **System Prompt**: \n```text\n{data_adapter.system_prompt}\n```')
technical_info.append(f'- **{text["system_prompt"]}**: \n```text\n{data_adapter.system_prompt}\n```')
if data_adapter.prompt_template:
technical_info.append(f'- **Prompt Template**: \n```text\n{data_adapter.prompt_template}\n```')
technical_info.append(f'- **{text["prompt_template"]}**: \n```text\n{data_adapter.prompt_template}\n```')

return '\n'.join(details + [''] + technical_info + [''])


def generate_full_documentation_en(adapters: list[DataAdapter]) -> str:
def generate_full_documentation(adapters: list[DataAdapter], lang: str = 'zh') -> str:
"""
Generate complete Markdown documentation with index and all benchmark details

Args:
adapters (list[DataAdapter]): List of DataAdapter instances
lang (str): Language code ('zh' for Chinese, 'en' for English)

Returns:
str: Complete Markdown document
"""
# Get localized text
text = get_document_locale(lang)

# Generate index
index = [
'# LLM Benchmarks',
f'# {text["title"]}',
'',
'Below is the list of supported LLM benchmarks. Click on a benchmark name to jump to details.',
f'{text["intro"]}',
'',
'| Benchmark Name | Pretty Name | Task Categories |',
'|----------------|-------------|----------------|',
f'| {text["dataset_name"]} | {text["pretty_name"]} | {text["task_categories"]} |',
'|------------|----------|----------|',
]

for adapter in adapters:
Expand All @@ -227,12 +233,12 @@ def generate_full_documentation_en(adapters: list[DataAdapter]) -> str:
'',
'---',
'',
'## Benchmark Details',
f'## {text["details_title"]}',
''
]

for i, adapter in enumerate(adapters):
details.append(generate_dataset_markdown_en(adapter))
details.append(generate_dataset_markdown(adapter, lang))
if i < len(adapters) - 1:
details.append('---')
details.append('')
Expand All @@ -253,8 +259,8 @@ def generate_full_documentation_en(adapters: list[DataAdapter]) -> str:
adapters.sort(key=lambda x: x.name) # 按名称排序

# 生成完整文档
markdown_doc = generate_full_documentation(adapters)
markdown_doc_en = generate_full_documentation_en(adapters)
markdown_doc = generate_full_documentation(adapters, 'zh')
markdown_doc_en = generate_full_documentation(adapters, 'en')

# 输出到文件
with open('docs/zh/get_started/supported_dataset/llm.md', 'w', encoding='utf-8') as f:
Expand Down
Loading