
Commit e30783a

Merge commit 'f203a5c3cda7ee30b191e4df0619c19a5bc13c03' into feat/grok-1

* commit 'f203a5c3cda7ee30b191e4df0619c19a5bc13c03':
  fix Telechat model (modelscope#623)
  fix save dir (modelscope#622)
  support TeleChat-12b (modelscope#607)
  update ui (modelscope#621)
  fix adalora and device_map (modelscope#619)
  fix deploy safe_response (modelscope#614)
  support Mistral-7b-v0.2 (modelscope#605)

# Conflicts:
#   README.md
#   README_CN.md
#   docs/source/LLM/支持的模型和数据集.md
#   swift/llm/utils/model.py

2 parents: 08016e7 + f203a5c

19 files changed: +346 −28 lines changed


README.md

Lines changed: 1 addition & 0 deletions
@@ -40,6 +40,7 @@ Additionally, we are expanding capabilities for other modalities. Currently, we

 ## 🎉 News
 - 🔥2024.03.29: Support the fine-tuning and inference of **Grok-1** 300B MoE, please view details [here](https://github.com/modelscope/swift/tree/main/docs/source_en/LLM/Grok-1-best-practice.md).
+- 🔥2024.03.25: Supports inference and fine-tuning of TeleChat-12b model, use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh) to start training!
 - 🔥2024.03.20: Supports inference and fine-tuning for the **llava** series. For best practice, you can refer to [here](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md).
 - 🔥2024.03.12: Support inference and fine-tuning for **deepseek-vl** series. Best practices can be found [here](docs/source_en/Multi-Modal/deepseek-vl-best-practice.md).
 - 🔥2024.03.11: Support [GaLore](https://arxiv.org/abs/2403.03507) for effectively reducing memory usage to 1/2 of the original in full-parameter training.

README_CN.md

Lines changed: 1 addition & 0 deletions
@@ -41,6 +41,7 @@ SWIFT supports nearly **200 LLMs and MLLMs** (multimodal large models) for training, inference,

 ## 🎉 News
 - 🔥2024.03.29: Support inference and fine-tuning of the **Grok-1** 300B MoE model; best practices can be found [here](https://github.com/modelscope/swift/tree/main/docs/source/LLM/Grok训练和推理.md).
+- 🔥2024.03.25: Support training and inference of the TeleChat-12b model; use [this script](https://github.com/modelscope/swift/blob/main/examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh) to start training!
 - 🔥2024.03.20: Support inference and fine-tuning of the **llava** series; best practices can be found [here](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/llava最佳实践.md).
 - 🔥2024.03.12: Support inference and fine-tuning of the **deepseek-vl** series; best practices can be found [here](https://github.com/modelscope/swift/tree/main/docs/source/Multi-Modal/deepseek-vl最佳实践.md).
 - 🔥2024.03.11: Support [GaLore](https://arxiv.org/abs/2403.03507), which effectively reduces GPU memory usage in full-parameter training to 1/2 of the original.

docs/source/LLM/支持的模型和数据集.md

Lines changed: 2 additions & 0 deletions
@@ -141,6 +141,7 @@
 |openbuddy-deepseek-67b-chat|[OpenBuddy/openbuddy-deepseek-67b-v15.2](https://modelscope.cn/models/OpenBuddy/openbuddy-deepseek-67b-v15.2/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔||-|
 |openbuddy-mixtral-moe-7b-chat|[OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k](https://modelscope.cn/models/OpenBuddy/openbuddy-mixtral-7bx8-v18.1-32k/summary)|q_proj, k_proj, v_proj|openbuddy|✔|✔|transformers>=4.36|-|
 |mistral-7b|[AI-ModelScope/Mistral-7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-|
+|mistral-7b-v2|[AI-ModelScope/Mistral-7B-v0.2-hf](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-v0.2-hf/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.34|-|
 |mistral-7b-instruct|[AI-ModelScope/Mistral-7B-Instruct-v0.1](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.1/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-|
 |mistral-7b-instruct-v2|[AI-ModelScope/Mistral-7B-Instruct-v0.2](https://modelscope.cn/models/AI-ModelScope/Mistral-7B-Instruct-v0.2/summary)|q_proj, k_proj, v_proj|llama|✔|✔|transformers>=4.34|-|
 |mixtral-moe-7b|[AI-ModelScope/Mixtral-8x7B-v0.1](https://modelscope.cn/models/AI-ModelScope/Mixtral-8x7B-v0.1/summary)|q_proj, k_proj, v_proj|default-generation-bos|✔|✔|transformers>=4.36|-|
@@ -197,6 +198,7 @@
 |mamba-790m|[AI-ModelScope/mamba-790m-hf](https://modelscope.cn/models/AI-ModelScope/mamba-790m-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|
 |mamba-1.4b|[AI-ModelScope/mamba-1.4b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-1.4b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|
 |mamba-2.8b|[AI-ModelScope/mamba-2.8b-hf](https://modelscope.cn/models/AI-ModelScope/mamba-2.8b-hf/summary)|in_proj, x_proj, embeddings, out_proj|default-generation|✘|✘|transformers>=4.39.0|-|
+|telechat-12b|[TeleAI/telechat-12B](https://modelscope.cn/models/TeleAI/telechat-12B/summary)|self_attention.key_value, self_attention.query|telechat|✔|✘||-|
 |grok-1|[colossalai/grok-1-pytorch](https://modelscope.cn/models/colossalai/grok-1-pytorch/summary)|q_proj, k_proj, v_proj|default-generation|✘|✘||-|


examples/pytorch/llm/scripts/mistral_7b_v2/lora/infer.sh

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+# Experimental environment: A100
+# 16GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_infer.py \
+    --ckpt_dir "output/mistral-7b-v2/vx-xxx/checkpoint-xxx" \
+    --load_dataset_config true \
+    --use_flash_attn true \
+    --max_new_tokens 2048 \
+    --temperature 0.5 \
+    --top_p 0.7 \
+    --repetition_penalty 1. \
+    --do_sample true \
+    --merge_lora false
examples/pytorch/llm/scripts/mistral_7b_v2/lora/sft.sh

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+# Experimental environment: A100
+# 19GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_sft.py \
+    --model_id_or_path AI-ModelScope/Mistral-7B-v0.2-hf \
+    --model_revision master \
+    --sft_type lora \
+    --tuner_backend swift \
+    --template_type AUTO \
+    --dtype AUTO \
+    --output_dir output \
+    --dataset dureader-robust-zh \
+    --train_dataset_sample -1 \
+    --num_train_epochs 1 \
+    --max_length 2048 \
+    --check_dataset_strategy warning \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --lora_dropout_p 0.05 \
+    --lora_target_modules DEFAULT \
+    --gradient_checkpointing true \
+    --batch_size 1 \
+    --weight_decay 0.1 \
+    --learning_rate 1e-4 \
+    --gradient_accumulation_steps 16 \
+    --max_grad_norm 0.5 \
+    --warmup_ratio 0.03 \
+    --eval_steps 100 \
+    --save_steps 100 \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --use_flash_attn true \
+    --save_only_model true
examples/pytorch/llm/scripts/telechat_12b/lora/infer.sh

Lines changed: 17 additions & 0 deletions
@@ -0,0 +1,17 @@
+# Experimental environment: A100
+# 1 * 26GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_infer.py \
+    --ckpt_dir "output/telechat-12b/vx-xxx/checkpoint-xxx" \
+    --load_dataset_config true \
+    --max_length 2048 \
+    --use_flash_attn true \
+    --max_new_tokens 2048 \
+    --temperature 0.5 \
+    --top_p 0.7 \
+    --repetition_penalty 1. \
+    --do_sample true \
+    --merge_lora false \
+    --dtype fp16 \
+    --stream false
examples/pytorch/llm/scripts/telechat_12b/lora/sft.sh

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# Experimental environment: A100
+# 1 * 30GB GPU memory
+PYTHONPATH=../../.. \
+CUDA_VISIBLE_DEVICES=0 \
+python llm_sft.py \
+    --model_type telechat-12b \
+    --dataset dureader-robust-zh \
+    --batch_size 1 \
+    --max_length 1024 \
+    --gradient_accumulation_steps 16 \
+    --learning_rate 5e-5 \
+    --use_flash_attn true \
+    --eval_steps 1000 \
+    --save_steps 1000 \
+    --train_dataset_sample 100000 \
+    --val_dataset_sample 3000 \
+    --num_train_epochs 2 \
+    --check_dataset_strategy none \
+    --gradient_checkpointing true \
+    --weight_decay 0.1 \
+    --max_grad_norm 1.0 \
+    --warmup_ratio 0.03 \
+    --save_total_limit 2 \
+    --logging_steps 10 \
+    --sft_type lora \
+    --lora_target_modules DEFAULT \
+    --lora_rank 8 \
+    --lora_alpha 32 \
+    --dtype fp16

setup.cfg

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ ignore-words-list = patten,nd,ty,mot,hist,formating,winn,gool,datas,wan,confids
 max-line-length = 120
 select = B,C,E,F,P,T4,W,B9
 ignore = F401,F403,F405,F821,W503,E251,W504
-exclude = docs/src,*.pyi,.git
+exclude = docs/src,*.pyi,.git,peft.py

 [darglint]
 ignore=DAR101

swift/llm/deploy.py

Lines changed: 11 additions & 4 deletions
@@ -22,6 +22,7 @@
                     CompletionStreamResponse, DeltaMessage, DeployArguments,
                     Model, ModelList, UsageInfo, inference, inference_stream,
                     messages_to_history, random_uuid)
+from .utils.utils import _get_safe_print_idx

 logger = get_logger()

@@ -241,8 +242,11 @@ async def _generate_stream():
         choices = []
         for output in result.outputs:
             text = template.tokenizer.decode(output.token_ids, True)
-            delta_text = text[print_idx_list[output.index]:]
-            print_idx_list[output.index] += len(delta_text)
+            new_print_idx = _get_safe_print_idx(
+                text, print_idx_list[output.index], output.finished())
+            delta_text = text[print_idx_list[output.
+                                             index]:new_print_idx]
+            print_idx_list[output.index] = new_print_idx
             choice = ChatCompletionResponseStreamChoice(
                 index=output.index,
                 delta=DeltaMessage(
@@ -259,8 +263,11 @@ async def _generate_stream():
         choices = []
         for output in result.outputs:
             text = template.tokenizer.decode(output.token_ids, True)
-            delta_text = text[print_idx_list[output.index]:]
-            print_idx_list[output.index] += len(delta_text)
+            new_print_idx = _get_safe_print_idx(
+                text, print_idx_list[output.index], output.finished())
+            delta_text = text[print_idx_list[output.
+                                             index]:new_print_idx]
+            print_idx_list[output.index] = new_print_idx
             choice = CompletionResponseStreamChoice(
                 index=output.index,
                 text=delta_text,
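
Note: `_get_safe_print_idx` itself is not part of this diff; it is imported from `swift/llm/utils/utils.py`. A minimal sketch of the idea, assuming the helper only decides how much of the decoded text is stable enough to flush (the real implementation may differ):

# Hypothetical sketch only -- the actual helper lives in
# swift/llm/utils/utils.py and may be implemented differently.
def _get_safe_print_idx(text: str, print_idx: int, is_finished: bool) -> int:
    """Return the index up to which `text` is safe to stream out.

    While generation is running, the decoded tail can still change:
    an incomplete multi-byte character decodes to U+FFFD, and the
    next token may rewrite how the tail detokenizes. Holding that
    tail back until the next chunk (or until the sequence finishes)
    keeps clients from receiving text that is later revised.
    """
    if is_finished:
        # Nothing can change any more; flush everything.
        return len(text)
    if text.endswith('\ufffd'):
        # Incomplete character at the tail; hold it back for now.
        return max(print_idx, len(text) - 1)
    return len(text)

Either way, `delta_text` remains a clean suffix extension of what the client has already received, which is what the "fix deploy safe_response (modelscope#614)" change in this merge is addressing.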

swift/llm/utils/model.py

Lines changed: 20 additions & 2 deletions
@@ -191,6 +191,7 @@ class ModelType:
     openbuddy_mixtral_moe_7b_chat = 'openbuddy-mixtral-moe-7b-chat'
     # mistral
     mistral_7b = 'mistral-7b'
+    mistral_7b_v2 = 'mistral-7b-v2'
     mistral_7b_instruct = 'mistral-7b-instruct'
     mistral_7b_instruct_v2 = 'mistral-7b-instruct-v2'
     mixtral_moe_7b = 'mixtral-moe-7b'
@@ -263,7 +264,9 @@ class ModelType:
     mamba_790m = 'mamba-790m'
     mamba_1_4b = 'mamba-1.4b'
     mamba_2_8b = 'mamba-2.8b'
-    # grok-1
+    # teleAI
+    telechat_12b = 'telechat-12b'
+    # grok-1
     grok_1 = 'grok-1'

     @classmethod
@@ -297,7 +300,8 @@ class LoRATM(NamedTuple):
     phi = ['Wqkv']
     internlm2 = ['wqkv']
     mamba = ['in_proj', 'x_proj', 'embeddings', 'out_proj']
-    grok_1 = ['q_proj', 'k_proj', 'v_proj']
+    telechat = ['self_attention.key_value', 'self_attention.query']
+    grok_1 = ['q_proj', 'k_proj', 'v_proj']


 GetModelTokenizerFunction = Callable[..., Tuple[Optional[PreTrainedModel],
@@ -1218,6 +1222,14 @@ def cross_entropy_forward(self, inputs: Tensor,
     requires=['transformers>=4.34'],
     support_flash_attn=True,
     support_vllm=True)
+@register_model(
+    ModelType.mistral_7b_v2,
+    'AI-ModelScope/Mistral-7B-v0.2-hf',
+    LoRATM.llama2,
+    TemplateType.default_generation_bos,
+    requires=['transformers>=4.34'],
+    support_flash_attn=True,
+    support_vllm=True)
 @register_model(
     ModelType.mixtral_moe_7b,
     'AI-ModelScope/Mixtral-8x7B-v0.1',
@@ -2380,6 +2392,12 @@ def get_model_tokenizer_codellama(model_dir: str,
     support_vllm=True,
     support_gradient_checkpointing=False,
     tags=['coding'])
+@register_model(
+    ModelType.telechat_12b,
+    'TeleAI/TeleChat-12B',
+    LoRATM.telechat,
+    TemplateType.telechat,
+    support_flash_attn=True)
 def get_model_tokenizer_phi(model_dir: str,
                             torch_dtype: Dtype,
                             model_kwargs: Dict[str, Any],
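
For orientation, a `ModelType` registered this way becomes addressable through swift's regular Python entry points. A minimal usage sketch, assuming the `swift.llm` helpers of this era (`get_model_tokenizer`, `get_template`, `inference`) keep roughly these signatures; exact keyword arguments may vary between versions:

# Sketch under the assumption that swift.llm exports these helpers
# with approximately these signatures at the time of this commit.
import torch
from swift.llm import (ModelType, TemplateType, get_model_tokenizer,
                       get_template, inference)

# Resolve the newly registered model type to weights plus tokenizer.
model, tokenizer = get_model_tokenizer(
    ModelType.mistral_7b_v2, torch.bfloat16,
    model_kwargs={'device_map': 'auto'})

# Pair the model with the template it was registered under.
template = get_template(TemplateType.default_generation_bos, tokenizer)

# Single-turn generation; `history` carries the running dialogue.
response, history = inference(model, template, 'Explain LoRA in one line.')
print(response)

The same pattern applies to `ModelType.telechat_12b`; the `register_model` entries above are all the integration the rest of the pipeline needs.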
