fix xcomposer2.5 device_map #1343

Merged

13 changes: 12 additions & 1 deletion docs/source/Multi-Modal/cogvlm2-video最佳实践.md
@@ -110,7 +110,18 @@ response: The video shows a person lighting a fire in a backyard setting. The pe
 # 40GB GPU memory
 CUDA_VISIBLE_DEVICES=0 swift sft \
     --model_type cogvlm2-video-13b-chat \
-    --dataset video-chatgpt
+    --dataset video-chatgpt \
+    --num_train_epochs 3
+
+# ZeRO2
+# Experimental environment: 4 * A100
+# 4 * 40GB GPU memory
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 swift sft \
+    --model_type cogvlm2-video-13b-chat \
+    --dataset video-chatgpt \
+    --num_train_epochs 3 \
+    --deepspeed default-zero2
 ```

[Custom datasets](../LLM/自定义与拓展.md#-推荐命令行参数的形式) support json and jsonl formats. Here is an example of a custom dataset:
13 changes: 12 additions & 1 deletion docs/source_en/Multi-Modal/cogvlm2-video-best-practice.md
@@ -109,7 +109,18 @@ Fine-tuning multimodal large models usually uses **custom datasets**. Here is a
 # 40GB GPU memory
 CUDA_VISIBLE_DEVICES=0 swift sft \
     --model_type cogvlm2-video-13b-chat \
-    --dataset video-chatgpt
+    --dataset video-chatgpt \
+    --num_train_epochs 3
+
+# ZeRO2
+# Experimental environment: 4 * A100
+# 4 * 40GB GPU memory
+NPROC_PER_NODE=4 \
+CUDA_VISIBLE_DEVICES=0,1,2,3 swift sft \
+    --model_type cogvlm2-video-13b-chat \
+    --dataset video-chatgpt \
+    --num_train_epochs 3 \
+    --deepspeed default-zero2
 ```

[Custom datasets](../LLM/Customization.md#-Recommended-Command-line-arguments) support json and jsonl formats. Here is an example of a custom dataset:
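A minimal sketch of such a jsonl record, assuming swift's usual `query`/`response`/`videos` field names (they are not quoted from this diff):

```jsonl
{"query": "Describe this video.", "response": "A person lights a fire in a backyard.", "videos": ["path/to/video.mp4"]}
```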
45 changes: 30 additions & 15 deletions swift/llm/utils/model.py
@@ -3807,6 +3807,7 @@ def new_get_rank(group=None):
     eos_token='<|im_end|>',
     support_flash_attn=True,
     tags=['multi-modal', 'vision'],
+    function_kwargs={'is_v2_5': True},
     hf_model_id='internlm/internlm-xcomposer2d5-7b')
 @register_model(
     ModelType.internlm_xcomposer2_7b_chat,
@@ -3822,6 +3823,7 @@ def get_model_tokenizer_internlm_xcomposer2(model_dir: str,
                                             model_kwargs: Dict[str, Any],
                                             load_model: bool = True,
                                             **kwargs):
+    is_v2_5 = kwargs.pop('is_v2_5', False)
     model_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
     use_flash_attn = kwargs.pop('use_flash_attn', False)
     model_config._flash_attn_2_enabled = use_flash_attn
@@ -3839,23 +3841,36 @@ def get_model_tokenizer_internlm_xcomposer2(model_dir: str,
             model.model.layers[0].attention.__class__.attention_dropout = 0.

         model_cls = model.__class__
-        if not hasattr(model_cls, '__old_encode_img'):  # avoid double patching
-            model_cls.__old_encode_img = model_cls.encode_img
-
-            def _new_encode_img(self, image):
-                if image is None:
-                    return None
-                if isinstance(image, str):
-                    from PIL import Image
-                    image = Image.open(image).convert('RGB')
-                    image = self.vis_processor(image).unsqueeze(0).to(self.device)
-                else:
-                    assert isinstance(image, torch.Tensor)
-
-                img_embeds, atts_img, img_target = self.img2emb(image)
-                return img_embeds.to(device=self.device)  # FIX device_map
-
-            model_cls.encode_img = _new_encode_img
+        if is_v2_5:
+
+            def _output_device_map_hook(module, input, output):
+                output = (output[0].to(input[1].device), output[1])
+                return output
+
+            def _output_device_map_hook2(module, input, output):
+                return output.to(input[0].device)
+
+            model.vit.register_forward_hook(_output_device_map_hook)
+            model.vision_proj.register_forward_hook(_output_device_map_hook2)
+        else:
+            if not hasattr(model_cls, '__old_encode_img'):  # avoid double patching
+                model_cls.__old_encode_img = model_cls.encode_img
+
+                def _new_encode_img(self, image):
+                    if image is None:
+                        return None
+                    if isinstance(image, str):
+                        from PIL import Image
+                        image = Image.open(image).convert('RGB')
+                        image = self.vis_processor(image).unsqueeze(0).to(self.device)
+                    else:
+                        assert isinstance(image, torch.Tensor)
+
+                    img_embeds, atts_img, img_target = self.img2emb(image)
+                    return img_embeds.to(device=self.device)  # FIX device_map
+
+                model_cls.encode_img = _new_encode_img

     return model, tokenizer
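Why the hooks: with `device_map='auto'`, accelerate may place `model.vit`, `model.vision_proj`, and the language model on different GPUs, so the vision tower's output can arrive on the wrong device. The forward hooks above return each output moved back to the device the module's input arrived on, so downstream layers never see a cross-device tensor. Below is a minimal, self-contained sketch of the same mechanism using toy `nn.Linear` modules (not the xcomposer2.5 classes); on a single device the `.to()` is a no-op.

```python
import torch
import torch.nn as nn

# Toy stand-ins for model.vit and model.vision_proj; under device_map='auto'
# the real submodules can be dispatched to different GPUs.
vit = nn.Linear(16, 32)
vision_proj = nn.Linear(32, 8)

def output_to_input_device(module, inputs, output):
    # Same idea as _output_device_map_hook2 above: return the module's output
    # on the device its input arrived on. A forward hook that returns a
    # non-None value replaces the module's output.
    return output.to(inputs[0].device)

vision_proj.register_forward_hook(output_to_input_device)

x = torch.randn(1, 16)
y = vision_proj(vit(x))
assert y.device == x.device  # the output follows the caller's device
```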

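For context, a hedged usage sketch of the code path this patch targets: `function_kwargs={'is_v2_5': True}` is forwarded into `**kwargs`, which is how `kwargs.pop('is_v2_5', False)` above selects the hook-based branch. `get_model_tokenizer` is swift's public loader; the exact `ModelType.internlm_xcomposer2_5_7b_chat` attribute name is inferred from the registration above rather than quoted from this PR.

```python
from swift.llm import ModelType, get_model_tokenizer

# device_map='auto' lets accelerate shard internlm-xcomposer2d5-7b across all
# visible GPUs; the is_v2_5 branch above then registers the hooks that keep
# the vision features on the device the language model expects.
model, tokenizer = get_model_tokenizer(
    ModelType.internlm_xcomposer2_5_7b_chat,
    model_kwargs={'device_map': 'auto'})
```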