fix minicpm-v (modelscope#1010)

Jintao-Huang · web-flow · commit 80c72d2a1ea1 · 2024-05-28T08:54:49.000+08:00
diff --git a/README.md b/README.md
@@ -439,7 +439,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer \
 CUDA_VISIBLE_DEVICES=0 swift eval --model_type qwen1half-7b-chat --eval_dataset mmlu ceval
 ```
 
-### Export
+### Quantization
 
 Original model:
 ```shell
@@ -456,6 +456,7 @@ CUDA_VISIBLE_DEVICES=0 swift export \
 ```
 
 ### Deployment
+The client uses the OpenAI API for invocation; for details, refer to the [LLM deployment documentation](https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/VLLM-inference-acceleration-and-deployment.md).
 
 Original model:
 ```shell
diff --git a/README_CN.md b/README_CN.md
@@ -411,8 +411,6 @@ swift sft \
 ```
 
 
-
-
 ### 推理
 原始模型:
 ```shell
@@ -437,7 +435,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer \
 CUDA_VISIBLE_DEVICES=0 swift eval --model_type qwen1half-7b-chat --eval_dataset mmlu ceval
 ```
 
-### 导出
+### 量化
 
 原始模型:
 ```shell
@@ -454,6 +452,7 @@ CUDA_VISIBLE_DEVICES=0 swift export \
 ```
 
 ### 部署
+客户端使用OpenAI API进行调用，具体可以查看[LLM部署文档](https://github.com/modelscope/swift/blob/main/docs/source/LLM/VLLM%E6%8E%A8%E7%90%86%E5%8A%A0%E9%80%9F%E4%B8%8E%E9%83%A8%E7%BD%B2.md#%E9%83%A8%E7%BD%B2)
 
 原始模型:
 ```shell
diff --git a/docs/source/Multi-Modal/minicpm-v-2最佳实践.md b/docs/source/Multi-Modal/minicpm-v-2最佳实践.md
@@ -39,7 +39,7 @@ Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.co
 <<< clear
 <<< 计算结果是多少
 Input a media path or URL <<< http://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/math.png
-计算结果是1452 + 4530 = 5982。
+计算结果是1452 + 45304 = 46756。
 --------------------------------------------------
 <<< clear
 <<< 根据图片中的内容写首诗
diff --git a/swift/llm/utils/argument.py b/swift/llm/utils/argument.py
@@ -299,7 +299,7 @@ def _register_self_cognition(self: Union['SftArguments', 'InferArguments']) -> N
                                      'Representing the model name and model author in Chinese and English.')
                 setattr(self, k, v)
 
-    def _handle_dataset_compat(self, train_dataset: HfDataset,
+    def _handle_dataset_compat(self, train_dataset: Optional[HfDataset],
                                val_dataset: Optional[HfDataset]) -> Tuple[HfDataset, Optional[HfDataset]]:
         # compatibility. (Deprecated)
         random_state = np.random.RandomState(self.dataset_seed)
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
@@ -1081,6 +1081,8 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             image = _read_from_path(image_path)
             images.append(image)
         inputs, _ = super().encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
         input_ids = inputs['input_ids']
         labels = inputs['labels']
         idx_list = _findall(input_ids, 1)[1:]  # 1: <s>
@@ -1330,7 +1332,16 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
 register_template(TemplateType.minicpm, Template(['<s>{{SYSTEM}}'], ['<用户>{{QUERY}}<AI>'], [], ['</s>']))
 
 
-class MiniCPMVTemlate(Template):
+def _remove_idx(arr: List[int], idx_list: List[int]) -> List[int]:
+    res = []
+    idx_set = set(idx_list)
+    for i, x in enumerate(arr):
+        if i not in idx_set:
+            res.append(x)
+    return res
+
+
+class MiniCPMVTemplate(Template):
 
     def __init__(self, *args, **kwargs):
         self.is_v2_5 = kwargs.pop('is_v2_5', False)
@@ -1345,32 +1356,22 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             return inputs, {}
         input_ids = inputs['input_ids']
         labels = inputs['labels']
-
-        img_start_idxs = np.where(np.array(input_ids) == self.tokenizer.im_start_id)[0]
-        if len(img_start_idxs) > 1:  # if mutli-round, input_ids have mutli <image><unk></image>\n
-            start = 0
-            new_input_ids = []
-            new_labels = []
-            for idx in img_start_idxs[1:]:
-                new_input_ids = new_input_ids + input_ids[start:idx]
-                if labels is not None:
-                    new_labels = new_labels + labels[start:idx]
-                start = idx + 4  # skip <image><unk></image>\n
-            new_input_ids = new_input_ids + input_ids[start:]
-            input_ids = new_input_ids
+        idx_list = _findall(input_ids, -1)
+        if len(idx_list) >= 2:
+            input_ids = _remove_idx(input_ids, idx_list[1:])
             if labels is not None:
-                new_labels = new_labels + labels[start:]
-                labels = new_labels
-
-        idx = img_start_idxs[0] + 1  # first <unk>
+                labels = _remove_idx(labels, idx_list[1:])
+        idx = idx_list[0]
         config = self.model.config
         tgt_sizes = None
-        if config.slice_mode:
+        slice_mode = getattr(config, 'slice_mode', False)
+        if slice_mode:
             images, placeholder = self.model.get_slice_image_placeholder(image, self.tokenizer)
+            placeholder += '\n'
             placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
-            input_ids = (input_ids[:idx - 1] + placeholder_id + input_ids[idx + 2:])
+            input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
             if labels is not None:
-                labels = (labels[:idx - 1] + [-100] * len(placeholder_id) + labels[idx + 2:])
+                labels = (labels[:idx] + [-100] * len(placeholder_id) + labels[idx + 1:])
             input_tensor_ids = torch.tensor(input_ids)
             image_start_idx = torch.where(input_tensor_ids == self.tokenizer.im_start_id)[0]
             image_start_idx += 1
@@ -1393,9 +1394,11 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             else:
                 pixel_values = [self.model.transform(img).to(device=self.model.device) for img in images]
         else:
-            input_ids = (input_ids[:idx] + [self.tokenizer.unk_token_id] * config.query_num + input_ids[idx + 1:])
+            placeholder = '<image>' + '<unk>' * config.query_num + '</image>\n'
+            placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
+            input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
             if labels is not None:
-                labels = (labels[:idx] + [-100] * config.query_num + labels[idx + 1:])
+                labels = (labels[:idx] + [-100] * len(placeholder_id) + labels[idx + 1:])
             image_bound = [torch.tensor([[idx, idx + config.query_num]])]
             pixel_values = [self.model.transform(image).to(device=self.model.device)]
         data = {
@@ -1418,7 +1421,7 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]:
 
 register_template(
     TemplateType.minicpm_v,
-    MiniCPMVTemlate(['<s>{{SYSTEM}}'], ['<用户><image><unk></image>\n{{QUERY}}<AI>'], [], ['</s>']),
+    MiniCPMVTemplate(['<s>{{SYSTEM}}'], ['<用户>', [-1], '{{QUERY}}<AI>'], [], ['</s>']),
     use_model=True,
     lazy_tokenize=True,
     infer_media_type='dialogue',
@@ -1427,11 +1430,11 @@ def get_generate_ids(generate_ids: Tensor, input_token_len: int) -> List[int]:
 
 register_template(
     TemplateType.minicpm_v_v2_5,
-    MiniCPMVTemlate(['<|begin_of_text|>{{SYSTEM}}'], [
-        '<|start_header_id|>user<|end_header_id|>\n\n<image><unk></image>\n{{QUERY}}<|eot_id|>'
+    MiniCPMVTemplate(['<|begin_of_text|>{{SYSTEM}}'], [
+        '<|start_header_id|>user<|end_header_id|>\n\n', [-1], '{{QUERY}}<|eot_id|>'
         '<|start_header_id|>assistant<|end_header_id|>\n\n'
     ], ['<|eot_id|>'], ['<|eot_id|>'],
-                    is_v2_5=True),
+                     is_v2_5=True),
     use_model=True,
     lazy_tokenize=True,
     infer_media_type='dialogue',