Commit 76d977a

Support SA1B series datasets (#1542)
1 parent 98ec557 · commit 76d977a

3 files changed (+66, −2 lines)

docs/source/LLM/支持的模型和数据集.md

Lines changed: 3 additions & 1 deletion
@@ -485,6 +485,7 @@
 |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
 |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
 |ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
+|rlaif-v|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)|default|83132|119.8±52.6, min=28, max=556|rlhf, dpo, multi-modal, en|-|
 |pileval|[swift/pile-val-backup](https://modelscope.cn/datasets/swift/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
 |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
 |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
@@ -510,7 +511,8 @@
 |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
 |llava-pretrain|[AI-ModelScope/LLaVA-Pretrain](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Pretrain/summary)|blip_laion_cc_sbu_558k|-|Dataset is too huge, please click the original link to view the dataset stat.|vqa, multi-modal, quality|[liuhaotian/LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)|
-|RLAIF-v-dataset|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)||83132|113.7±49.7, min=30, max=540|multi-modal, rlhf, quality|[openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)|
+|sa1b-dense-caption|[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
+|sa1b-paired-caption|[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
 |alpaca-cleaned|[AI-ModelScope/alpaca-cleaned](https://modelscope.cn/datasets/AI-ModelScope/alpaca-cleaned/summary)||51760|177.9±126.4, min=26, max=1044|chat, general, bench, quality|[yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)|
 |aya-collection|[swift/aya_collection](https://modelscope.cn/datasets/swift/aya_collection/summary)|aya_dataset|202364|494.0±6911.3, min=21, max=3044268|multi-lingual, qa|[CohereForAI/aya_collection](https://huggingface.co/datasets/CohereForAI/aya_collection)|
 |belle-generated-chat-0.4M|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)||396004|273.3±52.0, min=32, max=873|common, zh|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|

docs/source_en/LLM/Supported-models-datasets.md

Lines changed: 3 additions & 1 deletion
@@ -485,6 +485,7 @@ The table below introduces the datasets supported by SWIFT:
 |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
 |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
 |ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
+|rlaif-v|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)|default|83132|119.8±52.6, min=28, max=556|rlhf, dpo, multi-modal, en|-|
 |pileval|[swift/pile-val-backup](https://modelscope.cn/datasets/swift/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
 |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
 |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
@@ -510,7 +511,8 @@ The table below introduces the datasets supported by SWIFT:
 |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
 |llava-pretrain|[AI-ModelScope/LLaVA-Pretrain](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Pretrain/summary)|blip_laion_cc_sbu_558k|-|Dataset is too huge, please click the original link to view the dataset stat.|vqa, multi-modal, quality|[liuhaotian/LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)|
-|RLAIF-v-dataset|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)||83132|113.7±49.7, min=30, max=540|multi-modal, rlhf, quality|[openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)|
+|sa1b-dense-caption|[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
+|sa1b-paired-caption|[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
 |alpaca-cleaned|[AI-ModelScope/alpaca-cleaned](https://modelscope.cn/datasets/AI-ModelScope/alpaca-cleaned/summary)||51760|177.9±126.4, min=26, max=1044|chat, general, bench, quality|[yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)|
 |aya-collection|[swift/aya_collection](https://modelscope.cn/datasets/swift/aya_collection/summary)|aya_dataset|202364|494.0±6911.3, min=21, max=3044268|multi-lingual, qa|[CohereForAI/aya_collection](https://huggingface.co/datasets/CohereForAI/aya_collection)|
 |belle-generated-chat-0.4M|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)||396004|273.3±52.0, min=32, max=873|common, zh|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|

swift/llm/utils/dataset.py

Lines changed: 60 additions & 0 deletions
@@ -195,6 +195,9 @@ class DatasetName:
     llava_instruct_150k = 'llava-instruct-150k'
     llava_pretrain = 'llava-pretrain'
 
+    sa1b_dense_caption = 'sa1b-dense-caption'
+    sa1b_paired_caption = 'sa1b-paired-caption'
+
     @classmethod
     def get_dataset_name_list(cls) -> List[str]:
         res = []
@@ -466,6 +469,63 @@ def preprocess_row(row):
     hf_dataset_id='OpenGVLab/ShareGPT-4o')
 
 
+def preprocess_sa1b_paired_caption(dataset: HfDataset):
+
+    prompt = ['图片中展示了什么', '讲述一下图片中内容', '告诉我里面有什么', '图片内容是啥']
+
+    def preprocess_row(row):
+        response = row['global_caption']
+        query = np.random.choice(prompt)
+        return {
+            'query': query,
+            'response': response,
+        }
+
+    return dataset.map(
+        preprocess_row, load_from_cache_file=dataset_enable_cache).rename_column('opensource_url', 'images')
+
+
+register_dataset(
+    DatasetName.sa1b_paired_caption,
+    'Tongyi-DataEngine/SA1B-Paired-Captions-Images',
+    None,
+    preprocess_sa1b_paired_caption,
+    get_dataset_from_repo,
+    split=['train'],
+    huge_dataset=True,
+    tags=['zh', 'multi-modal', 'vqa'])
+
+
+def preprocess_sa1b_dense_caption(dataset: HfDataset):
+
+    prompt = ['图片中展示了什么', '讲述一下图片中内容', '告诉我里面有什么', '图片内容是啥']
+
+    def preprocess_row(row):
+        response = ast.literal_eval(row['cap_seg'])
+        response = response.get('global_caption')
+        query = np.random.choice(prompt)
+        return {
+            'query': query,
+            'response': response,
+        }
+
+    return dataset.map(
+        preprocess_row,
+        load_from_cache_file=dataset_enable_cache).filter(lambda row: row.get('response')).rename_column(
+            'url', 'images')
+
+
+register_dataset(
+    DatasetName.sa1b_dense_caption,
+    'Tongyi-DataEngine/SA1B-Dense-Caption',
+    None,
+    preprocess_sa1b_dense_caption,
+    get_dataset_from_repo,
+    split=['train'],
+    huge_dataset=True,
+    tags=['zh', 'multi-modal', 'vqa'])
+
+
 def _preprocess_vision_dataset(dataset: HfDataset) -> HfDataset:
     prompt = 'please describe the image.'
     image_key = 'image'
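
Both preprocessors pair each image with one of four Chinese caption queries, roughly "What does the image show?", "Describe the image content", "Tell me what is in it", and "What is the image about?". The dense-caption variant additionally parses the cap_seg column, a stringified dict, and the later .filter() call drops rows without a usable global_caption. A minimal sketch of that per-row transformation, using a made-up row (the field values below are hypothetical, not real SA1B data):

import ast

import numpy as np

# Hypothetical row mimicking the SA1B-Dense-Caption schema the preprocessor
# assumes: 'cap_seg' holds a Python-literal dict, 'url' points at the image.
row = {
    'cap_seg': "{'global_caption': '一张街道的照片', 'region_captions': []}",
    'url': 'https://example.com/sa1b/000001.jpg',
}

prompt = ['图片中展示了什么', '讲述一下图片中内容', '告诉我里面有什么', '图片内容是啥']

# Same steps as preprocess_row in preprocess_sa1b_dense_caption: parse the
# dict literal, take the global caption, pair it with a random query.
cap_seg = ast.literal_eval(row['cap_seg'])
sample = {'query': np.random.choice(prompt), 'response': cap_seg.get('global_caption')}
# Rows whose 'response' ends up missing or empty would then be filtered out.
print(sample)  # e.g. {'query': '图片内容是啥', 'response': '一张街道的照片'}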

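After this commit, the two datasets should be loadable by name like any other registered dataset. A hedged sketch, assuming the get_dataset and DatasetName exports of swift.llm at the time of this commit (the exact signature and return values may differ across releases):

# Sketch only: verify get_dataset against your installed swift version.
# Both datasets are registered with huge_dataset=True, so a full download
# of the train split is substantial.
from swift.llm import DatasetName, get_dataset

train_dataset, val_dataset = get_dataset([DatasetName.sa1b_paired_caption])
print(train_dataset[0])  # expected fields after preprocessing: 'query', 'response', 'images'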