
Commit dd9410e

support qwen1.5-moe model (#627)
1 parent 8812886 commit dd9410e

File tree

10 files changed (+164, -1 lines)


README.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -39,6 +39,7 @@ To facilitate use by users unfamiliar with deep learning, we provide a Gradio we
 Additionally, we are expanding capabilities for other modalities. Currently, we support full-parameter training and LoRA training for AnimateDiff.
 
 ## 🎉 News
+- 🔥2024.03.29: Support **Qwen1.5-MoE** series: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
 - 🔥2024.03.29: Support the fine-tuning and inference of **Grok-1** 300B MoE, please view details [here](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM/Grok-1-best-practice.md).
 - 🔥2024.03.25: Supports inference and fine-tuning of TeleChat-12b model, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh) to start training!
 - 🔥2024.03.20: Supports inference and fine-tuning for the **llava** series. For best practice, you can refer to [here](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md).
```
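The new news entry is backed by the example scripts and model registrations below. For orientation, a minimal Python sketch of fine-tuning the newly supported chat model, assuming the `SftArguments`/`sft_main` entry points documented in this repo's README (the dataset mirrors the chat SFT script added in this commit):

```python
# Minimal sketch, assuming swift's documented Python entry points
# (SftArguments / sft_main); hyperparameters mirror the shell scripts
# added in this commit.
from swift.llm import SftArguments, sft_main

sft_args = SftArguments(
    model_type='qwen1half-moe-a2_7b-chat',  # model type registered by this commit
    sft_type='lora',
    dataset=['blossom-math-zh'],
    output_dir='output',
    use_flash_attn=True)
result = sft_main(sft_args)
# Assumption: sft_main returns a dict containing the best checkpoint path.
print(result['best_model_checkpoint'])
```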

README_CN.md

Lines changed: 1 addition & 0 deletions
```diff
@@ -40,6 +40,7 @@ SWIFT supports training, inference, evaluation, and deployment of nearly **200 LLMs and MLLMs** (multimodal large models)
 In addition, we are expanding capabilities for other modalities; currently we support full-parameter training and LoRA training for AnimateDiff.
 
 ## 🎉 News
+- 🔥2024.03.29: Support the **Qwen1.5-MoE** series: Qwen1.5-MoE-A2.7B, Qwen1.5-MoE-A2.7B-Chat, Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4.
 - 🔥2024.03.29: Support inference and fine-tuning of the **Grok-1** 300B MoE model; see the best practice [here](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md).
 - 🔥2024.03.25: Support training and inference of the TeleChat-12b model; use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh) to start training!
 - 🔥2024.03.20: Support inference and fine-tuning of the **llava** series; see the best practice [here](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md).
```

docs/source/LLM/支持的模型和数据集.md

Lines changed: 4 additions & 1 deletion
```diff
@@ -36,12 +36,14 @@
 |qwen1half-7b|[qwen/Qwen1.5-7B](https://modelscope.cn/models/qwen/Qwen1.5-7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|
 |qwen1half-14b|[qwen/Qwen1.5-14B](https://modelscope.cn/models/qwen/Qwen1.5-14B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|
 |qwen1half-72b|[qwen/Qwen1.5-72B](https://modelscope.cn/models/qwen/Qwen1.5-72B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|
+|qwen1half-moe-a2_7b|[qwen/Qwen1.5-MoE-A2.7B](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔|transformers>=4.37|-|
 |qwen1half-0_5b-chat|[qwen/Qwen1.5-0.5B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
 |qwen1half-1_8b-chat|[qwen/Qwen1.5-1.8B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
 |qwen1half-4b-chat|[qwen/Qwen1.5-4B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
 |qwen1half-7b-chat|[qwen/Qwen1.5-7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
 |qwen1half-14b-chat|[qwen/Qwen1.5-14B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
 |qwen1half-72b-chat|[qwen/Qwen1.5-72B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
+|qwen1half-moe-a2_7b-chat|[qwen/Qwen1.5-MoE-A2.7B-Chat](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37|-|
 |qwen1half-0_5b-chat-int4|[qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|
 |qwen1half-1_8b-chat-int4|[qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|
 |qwen1half-4b-chat-int4|[qwen/Qwen1.5-4B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|auto_gptq>=0.5, transformers>=4.37|-|
@@ -54,6 +56,7 @@
 |qwen1half-7b-chat-int8|[qwen/Qwen1.5-7B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-7B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|
 |qwen1half-14b-chat-int8|[qwen/Qwen1.5-14B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-14B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|
 |qwen1half-72b-chat-int8|[qwen/Qwen1.5-72B-Chat-GPTQ-Int8](https://modelscope.cn/models/qwen/Qwen1.5-72B-Chat-GPTQ-Int8/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|
+|qwen1half-moe-a2_7b-chat-int4|[qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4](https://modelscope.cn/models/qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4/summary)|q_proj, k_proj, v_proj|qwen|✔|✘|auto_gptq>=0.5, transformers>=4.37|-|
 |qwen1half-0_5b-chat-awq|[qwen/Qwen1.5-0.5B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-0.5B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|
 |qwen1half-1_8b-chat-awq|[qwen/Qwen1.5-1.8B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-1.8B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|
 |qwen1half-4b-chat-awq|[qwen/Qwen1.5-4B-Chat-AWQ](https://modelscope.cn/models/qwen/Qwen1.5-4B-Chat-AWQ/summary)|q_proj, k_proj, v_proj|qwen|✔|✔|transformers>=4.37, autoawq|-|
@@ -198,7 +201,7 @@
 |mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|
 |mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|
 |mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|
-|telechat-12b|[TeleAI/telechat-12B](https://modelscope.cn/models/TeleAI/telechat-12B/summary)|self_attention.key_value, self_attention.query|telechat|✔|✘||-|
+|telechat-12b|[TeleAI/TeleChat-12B](https://modelscope.cn/models/TeleAI/TeleChat-12B/summary)|self_attention.key_value, self_attention.query|telechat|✔|✘||-|
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|
```
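Per the rows added above, the bf16 MoE checkpoints support both flash attention and vLLM (✔/✔), while the GPTQ-Int4 build supports flash attention but not vLLM (✔/✘) and additionally requires auto_gptq>=0.5. A minimal inference sketch for the quantized variant, assuming the `InferArguments`/`infer_main` entry points documented in this repo's README:

```python
# Minimal sketch, assuming swift's documented InferArguments / infer_main.
# Requires auto_gptq>=0.5 and transformers>=4.37 per the table above.
from swift.llm import InferArguments, infer_main

infer_args = InferArguments(
    model_type='qwen1half-moe-a2_7b-chat-int4',  # GPTQ build added by this commit
    max_new_tokens=2048,
    temperature=0.1)
infer_main(infer_args)  # with no ckpt_dir, starts an interactive chat session
```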

Lines changed: 14 additions & 0 deletions
```sh
# Experimental environment: A100
# 36GB GPU memory
PYTHONPATH=../../.. \
CUDA_VISIBLE_DEVICES=0 \
python llm_infer.py \
    --ckpt_dir "output/qwen1half-moe-a2_7b/vx-xxx/checkpoint-xxx" \
    --load_dataset_config true \
    --use_flash_attn true \
    --max_new_tokens 2048 \
    --temperature 0.1 \
    --top_p 0.7 \
    --repetition_penalty 1. \
    --do_sample true \
    --merge_lora false
```
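Note that `vx-xxx/checkpoint-xxx` in `--ckpt_dir` is a placeholder, left as-is here: the training script below writes checkpoints under a per-run directory of the form `output/<model_type>/<version>/checkpoint-<step>`, and the actual path must be substituted before running inference.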
Lines changed: 31 additions & 0 deletions
```sh
# Experimental environment: A100
# 42GB GPU memory
PYTHONPATH=../../.. \
CUDA_VISIBLE_DEVICES=0 \
python llm_sft.py \
    --model_type qwen1half-moe-a2_7b \
    --sft_type lora \
    --tuner_backend swift \
    --dtype AUTO \
    --output_dir output \
    --dataset dureader-robust-zh \
    --train_dataset_sample -1 \
    --num_train_epochs 1 \
    --max_length 1024 \
    --check_dataset_strategy warning \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout_p 0.05 \
    --lora_target_modules ALL \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.1 \
    --learning_rate 1e-4 \
    --gradient_accumulation_steps 16 \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 10 \
    --use_flash_attn true
```
Lines changed: 14 additions & 0 deletions
```sh
# Experimental environment: A100
# 36GB GPU memory
PYTHONPATH=../../.. \
CUDA_VISIBLE_DEVICES=0 \
python llm_infer.py \
    --ckpt_dir "output/qwen1half-moe-a2_7b-chat/vx-xxx/checkpoint-xxx" \
    --load_dataset_config true \
    --use_flash_attn true \
    --max_new_tokens 2048 \
    --temperature 0.1 \
    --top_p 0.7 \
    --repetition_penalty 1. \
    --do_sample true \
    --merge_lora false
```
Lines changed: 31 additions & 0 deletions
```sh
# Experimental environment: A100
# 42GB GPU memory
PYTHONPATH=../../.. \
CUDA_VISIBLE_DEVICES=0 \
python llm_sft.py \
    --model_type qwen1half-moe-a2_7b-chat \
    --sft_type lora \
    --tuner_backend swift \
    --dtype AUTO \
    --output_dir output \
    --dataset blossom-math-zh \
    --train_dataset_sample -1 \
    --num_train_epochs 1 \
    --max_length 1024 \
    --check_dataset_strategy warning \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout_p 0.05 \
    --lora_target_modules ALL \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.1 \
    --learning_rate 1e-4 \
    --gradient_accumulation_steps 16 \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 10 \
    --use_flash_attn true
```
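The chat-model training script is identical to the base-model one except for `--model_type` and the dataset (`blossom-math-zh` instead of `dureader-robust-zh`); all LoRA hyperparameters and the reported ~42GB A100 memory footprint are shared.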
Lines changed: 12 additions & 0 deletions
```sh
# Experimental environment: A100
CUDA_VISIBLE_DEVICES=0 \
swift infer \
    --ckpt_dir "output/qwen1half-moe-a2_7b-chat-int4/vx-xxx/checkpoint-xxx" \
    --load_dataset_config true \
    --use_flash_attn true \
    --max_new_tokens 2048 \
    --temperature 0.1 \
    --top_p 0.7 \
    --repetition_penalty 1. \
    --do_sample true \
    --merge_lora false
```
Lines changed: 28 additions & 0 deletions
```sh
# Experimental environment: A100
# 17GB GPU memory

CUDA_VISIBLE_DEVICES=0 \
swift sft \
    --model_type qwen1half-moe-a2_7b-chat-int4 \
    --sft_type lora \
    --output_dir output \
    --dataset blossom-math-zh \
    --train_dataset_sample -1 \
    --num_train_epochs 3 \
    --max_length 2048 \
    --lora_rank 8 \
    --lora_alpha 32 \
    --lora_dropout_p 0.05 \
    --lora_target_modules ALL \
    --gradient_checkpointing true \
    --batch_size 1 \
    --weight_decay 0.1 \
    --learning_rate 1e-4 \
    --gradient_accumulation_steps 16 \
    --max_grad_norm 0.5 \
    --warmup_ratio 0.03 \
    --eval_steps 100 \
    --save_steps 100 \
    --save_total_limit 2 \
    --logging_steps 10 \
    --use_flash_attn true
```
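Compared with the bf16 LoRA runs above (~42GB), training on the GPTQ-Int4 checkpoint cuts the footprint to roughly 17GB per the header comment. The matching inference script keeps `--merge_lora false`; merging LoRA weights back into quantized layers is generally not supported, so the adapter is loaded alongside the quantized base at inference time.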

swift/llm/utils/model.py

Lines changed: 28 additions & 0 deletions
```diff
@@ -61,12 +61,14 @@ class ModelType:
     qwen1half_7b = 'qwen1half-7b'
     qwen1half_14b = 'qwen1half-14b'
     qwen1half_72b = 'qwen1half-72b'
+    qwen1half_moe_a2_7b = 'qwen1half-moe-a2_7b'
     qwen1half_0_5b_chat = 'qwen1half-0_5b-chat'
     qwen1half_1_8b_chat = 'qwen1half-1_8b-chat'
     qwen1half_4b_chat = 'qwen1half-4b-chat'
     qwen1half_7b_chat = 'qwen1half-7b-chat'
     qwen1half_14b_chat = 'qwen1half-14b-chat'
     qwen1half_72b_chat = 'qwen1half-72b-chat'
+    qwen1half_moe_a2_7b_chat = 'qwen1half-moe-a2_7b-chat'
 
     # qwen1.5 gptq
     qwen1half_0_5b_chat_int4 = 'qwen1half-0_5b-chat-int4'
@@ -81,6 +83,7 @@ class ModelType:
     qwen1half_7b_chat_int8 = 'qwen1half-7b-chat-int8'
     qwen1half_14b_chat_int8 = 'qwen1half-14b-chat-int8'
     qwen1half_72b_chat_int8 = 'qwen1half-72b-chat-int8'
+    qwen1half_moe_a2_7b_chat_int4 = 'qwen1half-moe-a2_7b-chat-int4'
 
     # qwen1.5 awq
     qwen1half_0_5b_chat_awq = 'qwen1half-0_5b-chat-awq'
@@ -982,6 +985,14 @@ def cross_entropy_forward(self, inputs: Tensor,
     support_flash_attn=True,
     support_vllm=True,
     requires=['transformers>=4.37'])
+@register_model(
+    ModelType.qwen1half_moe_a2_7b,
+    'qwen/Qwen1.5-MoE-A2.7B',
+    LoRATM.qwen1half,
+    TemplateType.default_generation,
+    support_flash_attn=True,
+    support_vllm=True,
+    requires=['transformers>=4.37'])
 @register_model(
     ModelType.deepseek_coder_1_3b,
     'deepseek-ai/deepseek-coder-1.3b-base',
@@ -1404,6 +1415,14 @@ def get_model_tokenizer_aqlm(model_dir: str,
     support_flash_attn=True,
     support_vllm=True,
     requires=['transformers>=4.37'])
+@register_model(
+    ModelType.qwen1half_moe_a2_7b_chat,
+    'qwen/Qwen1.5-MoE-A2.7B-Chat',
+    LoRATM.qwen1half,
+    TemplateType.qwen,
+    support_flash_attn=True,
+    support_vllm=True,
+    requires=['transformers>=4.37'])
 def get_model_tokenizer_qwen1half(model_dir: str,
                                   torch_dtype: Dtype,
                                   model_kwargs: Dict[str, Any],
@@ -1540,6 +1559,15 @@ def get_model_tokenizer_qwen1half(model_dir: str,
     torch_dtype=torch.float16,
     function_kwargs={'bits': 8},
     support_flash_attn=True)
+@register_model(
+    ModelType.qwen1half_moe_a2_7b_chat_int4,
+    'qwen/Qwen1.5-MoE-A2.7B-Chat-GPTQ-Int4',
+    LoRATM.qwen1half,
+    TemplateType.qwen,
+    requires=['auto_gptq>=0.5', 'transformers>=4.37'],
+    torch_dtype=torch.float16,
+    function_kwargs={'bits': 4},
+    support_flash_attn=True)
 def get_model_tokenizer_qwen1half_intx(model_dir: str,
                                        torch_dtype: Dtype,
                                        model_kwargs: Dict[str, Any],
```
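The three `@register_model` additions follow the file's existing pattern: each new `ModelType` constant is bound to its ModelScope model id, LoRA target modules, chat template, and dependency requirements, and attached to the loader function it decorates. Once registered, the model type resolves through swift's loader utilities; a minimal sketch, assuming the `get_model_tokenizer` helper exported by `swift.llm` as documented in the repo:

```python
# Minimal sketch, assuming swift.llm.get_model_tokenizer as documented
# in the repo; the registration above supplies the ModelScope id,
# template, and the transformers>=4.37 requirement.
import torch
from swift.llm import get_model_tokenizer

model, tokenizer = get_model_tokenizer(
    'qwen1half-moe-a2_7b-chat',
    torch_dtype=torch.bfloat16,
    model_kwargs={'device_map': 'auto'},
    use_flash_attn=True)  # opt-in, per support_flash_attn=True above
```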
