Commit 76d977a

Support SA1B series datasets (#1542)
1 parent 98ec557 · commit 76d977a

3 files changed (+66, −2 lines)

docs/source/LLM/支持的模型和数据集.md

Lines changed: 3 additions & 1 deletion
@@ -485,6 +485,7 @@
 |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
 |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
 |ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
+|rlaif-v|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)|default|83132|119.8±52.6, min=28, max=556|rlhf, dpo, multi-modal, en|-|
 |pileval|[swift/pile-val-backup](https://modelscope.cn/datasets/swift/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
 |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
 |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
@@ -510,7 +511,8 @@
 |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
 |llava-pretrain|[AI-ModelScope/LLaVA-Pretrain](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Pretrain/summary)|blip_laion_cc_sbu_558k|-|Dataset is too huge, please click the original link to view the dataset stat.|vqa, multi-modal, quality|[liuhaotian/LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)|
-|RLAIF-v-dataset|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)||83132|113.7±49.7, min=30, max=540|multi-modal, rlhf, quality|[openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)|
+|sa1b-dense-caption|[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
+|sa1b-paired-caption|[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
 |alpaca-cleaned|[AI-ModelScope/alpaca-cleaned](https://modelscope.cn/datasets/AI-ModelScope/alpaca-cleaned/summary)||51760|177.9±126.4, min=26, max=1044|chat, general, bench, quality|[yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)|
 |aya-collection|[swift/aya_collection](https://modelscope.cn/datasets/swift/aya_collection/summary)|aya_dataset|202364|494.0±6911.3, min=21, max=3044268|multi-lingual, qa|[CohereForAI/aya_collection](https://huggingface.co/datasets/CohereForAI/aya_collection)|
 |belle-generated-chat-0.4M|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)||396004|273.3±52.0, min=32, max=873|common, zh|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|

docs/source_en/LLM/Supported-models-datasets.md

Lines changed: 3 additions & 1 deletion
@@ -485,6 +485,7 @@ The table below introduces the datasets supported by SWIFT:
 |stack-exchange-paired|[AI-ModelScope/stack-exchange-paired](https://modelscope.cn/datasets/AI-ModelScope/stack-exchange-paired/summary)||4483004|534.5±594.6, min=31, max=56588|hfrl, dpo, pairwise|[lvwerra/stack-exchange-paired](https://huggingface.co/datasets/lvwerra/stack-exchange-paired)|
 |shareai-llama3-dpo-zh-en-emoji|[hjh0119/shareAI-Llama3-DPO-zh-en-emoji](https://modelscope.cn/datasets/hjh0119/shareAI-Llama3-DPO-zh-en-emoji/summary)|default|2449|334.0±162.8, min=36, max=1801|rlhf, dpo, pairwise|-|
 |ultrafeedback-kto|[AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto](https://modelscope.cn/datasets/AI-ModelScope/ultrafeedback-binarized-preferences-cleaned-kto/summary)|default|230720|11.0±0.0, min=11, max=11|rlhf, kto|-|
+|rlaif-v|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)|default|83132|119.8±52.6, min=28, max=556|rlhf, dpo, multi-modal, en|-|
 |pileval|[swift/pile-val-backup](https://modelscope.cn/datasets/swift/pile-val-backup/summary)||214670|1612.3±8856.2, min=11, max=1208955|text-generation, awq|[mit-han-lab/pile-val-backup](https://huggingface.co/datasets/mit-han-lab/pile-val-backup)|
 |mantis-instruct|[swift/Mantis-Instruct](https://modelscope.cn/datasets/swift/Mantis-Instruct/summary)|birds-to-words<br>chartqa<br>coinstruct<br>contrastive_caption<br>docvqa<br>dreamsim<br>dvqa<br>iconqa<br>imagecode<br>llava_665k_multi<br>lrv_multi<br>multi_vqa<br>nextqa<br>nlvr2<br>spot-the-diff<br>star<br>visual_story_telling|655351|825.7±812.5, min=284, max=13563|chat, multi-modal, vision, quality|[TIGER-Lab/Mantis-Instruct](https://huggingface.co/datasets/TIGER-Lab/Mantis-Instruct)|
 |llava-data-instruct|[swift/llava-data](https://modelscope.cn/datasets/swift/llava-data/summary)|llava_instruct|364100|189.0±142.1, min=33, max=5183|sft, multi-modal, quality|[TIGER-Lab/llava-data](https://huggingface.co/datasets/TIGER-Lab/llava-data)|
@@ -510,7 +511,8 @@ The table below introduces the datasets supported by SWIFT:
 |sharegpt4v|[AI-ModelScope/ShareGPT4V](https://modelscope.cn/datasets/AI-ModelScope/ShareGPT4V/summary)|ShareGPT4V<br>ShareGPT4V-PT|-|Dataset is too huge, please click the original link to view the dataset stat.|chat, multi-modal, vision|-|
 |llava-instruct-150k|[AI-ModelScope/LLaVA-Instruct-150K](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Instruct-150K/summary)||624610|490.4±180.2, min=288, max=5438|chat, multi-modal, vision|-|
 |llava-pretrain|[AI-ModelScope/LLaVA-Pretrain](https://modelscope.cn/datasets/AI-ModelScope/LLaVA-Pretrain/summary)|blip_laion_cc_sbu_558k|-|Dataset is too huge, please click the original link to view the dataset stat.|vqa, multi-modal, quality|[liuhaotian/LLaVA-Pretrain](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)|
-|RLAIF-v-dataset|[swift/RLAIF-V-Dataset](https://modelscope.cn/datasets/swift/RLAIF-V-Dataset/summary)||83132|113.7±49.7, min=30, max=540|multi-modal, rlhf, quality|[openbmb/RLAIF-V-Dataset](https://huggingface.co/datasets/openbmb/RLAIF-V-Dataset)|
+|sa1b-dense-caption|[Tongyi-DataEngine/SA1B-Dense-Caption](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Dense-Caption/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
+|sa1b-paired-caption|[Tongyi-DataEngine/SA1B-Paired-Captions-Images](https://modelscope.cn/datasets/Tongyi-DataEngine/SA1B-Paired-Captions-Images/summary)||-|Dataset is too huge, please click the original link to view the dataset stat.|zh, multi-modal, vqa|-|
 |alpaca-cleaned|[AI-ModelScope/alpaca-cleaned](https://modelscope.cn/datasets/AI-ModelScope/alpaca-cleaned/summary)||51760|177.9±126.4, min=26, max=1044|chat, general, bench, quality|[yahma/alpaca-cleaned](https://huggingface.co/datasets/yahma/alpaca-cleaned)|
 |aya-collection|[swift/aya_collection](https://modelscope.cn/datasets/swift/aya_collection/summary)|aya_dataset|202364|494.0±6911.3, min=21, max=3044268|multi-lingual, qa|[CohereForAI/aya_collection](https://huggingface.co/datasets/CohereForAI/aya_collection)|
 |belle-generated-chat-0.4M|[AI-ModelScope/generated_chat_0.4M](https://modelscope.cn/datasets/AI-ModelScope/generated_chat_0.4M/summary)||396004|273.3±52.0, min=32, max=873|common, zh|[BelleGroup/generated_chat_0.4M](https://huggingface.co/datasets/BelleGroup/generated_chat_0.4M)|

swift/llm/utils/dataset.py

Lines changed: 60 additions & 0 deletions
@@ -195,6 +195,9 @@ class DatasetName:
     llava_instruct_150k = 'llava-instruct-150k'
     llava_pretrain = 'llava-pretrain'
 
+    sa1b_dense_caption = 'sa1b-dense-caption'
+    sa1b_paired_caption = 'sa1b-paired-caption'
+
     @classmethod
     def get_dataset_name_list(cls) -> List[str]:
         res = []
@@ -466,6 +469,63 @@ def preprocess_row(row):
     hf_dataset_id='OpenGVLab/ShareGPT-4o')
 
 
+def preprocess_sa1b_paired_caption(dataset: HfDataset):
+
+    prompt = ['图片中展示了什么', '讲述一下图片中内容', '告诉我里面有什么', '图片内容是啥']
+
+    def preprocess_row(row):
+        response = row['global_caption']
+        query = np.random.choice(prompt)
+        return {
+            'query': query,
+            'response': response,
+        }
+
+    return dataset.map(
+        preprocess_row, load_from_cache_file=dataset_enable_cache).rename_column('opensource_url', 'images')
+
+
+register_dataset(
+    DatasetName.sa1b_paired_caption,
+    'Tongyi-DataEngine/SA1B-Paired-Captions-Images',
+    None,
+    preprocess_sa1b_paired_caption,
+    get_dataset_from_repo,
+    split=['train'],
+    huge_dataset=True,
+    tags=['zh', 'multi-modal', 'vqa'])
+
+
+def preprocess_sa1b_dense_caption(dataset: HfDataset):
+
+    prompt = ['图片中展示了什么', '讲述一下图片中内容', '告诉我里面有什么', '图片内容是啥']
+
+    def preprocess_row(row):
+        response = ast.literal_eval(row['cap_seg'])
+        response = response.get('global_caption')
+        query = np.random.choice(prompt)
+        return {
+            'query': query,
+            'response': response,
+        }
+
+    return dataset.map(
+        preprocess_row,
+        load_from_cache_file=dataset_enable_cache).filter(lambda row: row.get('response')).rename_column(
+            'url', 'images')
+
+
+register_dataset(
+    DatasetName.sa1b_dense_caption,
+    'Tongyi-DataEngine/SA1B-Dense-Caption',
+    None,
+    preprocess_sa1b_dense_caption,
+    get_dataset_from_repo,
+    split=['train'],
+    huge_dataset=True,
+    tags=['zh', 'multi-modal', 'vqa'])
+
+
 def _preprocess_vision_dataset(dataset: HfDataset) -> HfDataset:
     prompt = 'please describe the image.'
     image_key = 'image'
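
Both preprocessors pair each image with one of four Chinese caption queries, roughly "What does the image show?", "Describe the image content", "Tell me what is in it", and "What is the image about?". The dense-caption variant additionally parses the cap_seg column, a stringified dict, and the later .filter() call drops rows without a usable global_caption. A minimal sketch of that per-row transformation, using a made-up row (the field values below are hypothetical, not real SA1B data):

import ast

import numpy as np

# Hypothetical row mimicking the SA1B-Dense-Caption schema the preprocessor
# assumes: 'cap_seg' holds a Python-literal dict, 'url' points at the image.
row = {
    'cap_seg': "{'global_caption': '一张街道的照片', 'region_captions': []}",
    'url': 'https://example.com/sa1b/000001.jpg',
}

prompt = ['图片中展示了什么', '讲述一下图片中内容', '告诉我里面有什么', '图片内容是啥']

# Same steps as preprocess_row in preprocess_sa1b_dense_caption: parse the
# dict literal, take the global caption, pair it with a random query.
cap_seg = ast.literal_eval(row['cap_seg'])
sample = {'query': np.random.choice(prompt), 'response': cap_seg.get('global_caption')}
# Rows whose 'response' ends up missing or empty would then be filtered out.
print(sample)  # e.g. {'query': '图片内容是啥', 'response': '一张街道的照片'}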

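After this commit, the two datasets should be loadable by name like any other registered dataset. A hedged sketch, assuming the get_dataset and DatasetName exports of swift.llm at the time of this commit (the exact signature and return values may differ across releases):

# Sketch only: verify get_dataset against your installed swift version.
# Both datasets are registered with huge_dataset=True, so a full download
# of the train split is substantial.
from swift.llm import DatasetName, get_dataset

train_dataset, val_dataset = get_dataset([DatasetName.sa1b_paired_caption])
print(train_dataset[0])  # expected fields after preprocessing: 'query', 'response', 'images'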