modelscope · Jintao-Huang · Jul 8, 2024 · Jul 8, 2024
diff --git a/swift/llm/utils/template.py b/swift/llm/utils/template.py
@@ -929,7 +929,7 @@ def _load_image(img_path: Union[str, 'PIL.Image.Image']) -> 'PIL.Image.Image':
     return image
 
 
-def _load_video(video_path: str) -> np.ndarray:
+def _load_video_llava(video_path: str) -> np.ndarray:
     import av
     container = av.open(video_path)
     total_frames = container.streams.video[0].frames
@@ -1036,7 +1036,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         if idx_list:
             idx = idx_list[0]
             images_path = example.get('images') or []
-            image = _load_image(images_path[0])
+            image = _read_batch(images_path)[0]
             placeholder = '<|begin_of_image|><|endoftext|><|end_of_image|>'
             placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
             input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
@@ -1627,7 +1627,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             else:
                 videos_path.append(media_file)
         if len(videos_path) > 0:
-            videos = _read_batch(videos_path, _load_video)
+            videos = _read_batch(videos_path, _load_video_llava)
             video_processor = self.tokenizer.processor.video_processor
             video_inputs = video_processor(videos, return_tensors='pt').to(self.model.dtype)
             inputs['pixel_values_videos'] = video_inputs['pixel_values_videos']
@@ -1766,9 +1766,9 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         if len(inputs) == 0:
             return inputs, {}
         image_path = example.get('images') or []
-        if image_path:
-            raw_image = _load_image(image_path[0])
-            pixel_values = self.tokenizer.processor.image_processor(raw_image, return_tensors='pt')['pixel_values']
+        raw_image = _read_batch(image_path)
+        if raw_image:
+            pixel_values = self.tokenizer.processor.image_processor(raw_image[0], return_tensors='pt')['pixel_values']
             inputs['pixel_values'] = pixel_values.to(self.model.dtype)
         return inputs, {}
 
@@ -1806,9 +1806,9 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
             inputs['token_type_ids'] = [0] * n + [1] * n2
         else:
             inputs['token_type_ids'] = [0] * len(inputs['input_ids'])
-        if image_path:
-            raw_image = _load_image(image_path[0])
-            model_inputs = processor(text=example['query'], images=raw_image, return_tensors='pt')
+        raw_image = _read_batch(image_path)
+        if raw_image:
+            model_inputs = processor(text=example['query'], images=raw_image[0], return_tensors='pt')
             inputs['pixel_values'] = model_inputs['pixel_values']
         return inputs, {}
 
@@ -1949,9 +1949,9 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
         example['query'], example['history'], images_path = replace_img_tag(example['query'], history,
                                                                             '<image_placeholder>')
         inputs, _ = super().encode(example)
-        images_path.extend(example.get('images') or [])
         if len(inputs) == 0:
             return inputs, {}
+        images_path.extend(example.get('images') or [])
         images = _read_batch(images_path)
         processor = self.tokenizer.processor
         input_ids, labels = inputs['input_ids'], inputs['labels']
@@ -2024,34 +2024,35 @@ def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, exa
 
     def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         inputs, _ = super().encode(example)
-        images_path = example.get('images') or []
-        image = _load_image(images_path[0]) if len(images_path) >= 1 else []
         if len(inputs) == 0:
             return inputs, {}
+        images_path = example.get('images') or []
+        image = _read_batch(images_path)
         inputs.pop('loss_scale', None)
         model = self.model
         inputs2 = model.build_conversation_input_ids(
-            self.tokenizer, query=example['query'], history=example.get('history'), images=[image])
-        image_token_len = inputs2['token_type_ids'].sum()
+            self.tokenizer, query=example['query'], history=example.get('history'), images=image)
+        image_token_len = inputs2['token_type_ids'].sum().item()
         input_ids = inputs['input_ids']
         labels = inputs['labels']
         inputs['token_type_ids'] = [0] + [1] * image_token_len + [0] * len(input_ids[1:])
         inputs['input_ids'] = input_ids[:1] + [self.tokenizer.pad_token_id] * image_token_len + input_ids[1:]
         if labels is not None:
             inputs['labels'] = labels[:1] + [-100] * image_token_len + labels[1:]
-        dtype = model.dtype
-        inputs['images'] = [[img.to(dtype=dtype)] for img in inputs2['images']]
-        if 'cross_images' in inputs2:
-            # is cogagent
-            inputs['cross_images'] = [[cross_img.to(dtype=dtype)] for cross_img in inputs2['cross_images']]
+        if len(image) > 0:
+            dtype = model.dtype
+            inputs['images'] = [[img.to(dtype=dtype)] for img in inputs2['images']]
+            if 'cross_images' in inputs2:
+                # is cogagent
+                inputs['cross_images'] = [[cross_img.to(dtype=dtype)] for cross_img in inputs2['cross_images']]
         return inputs, {}
 
     def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
         res = super().data_collator(batch, padding_to)
-        is_cogagent = 'cross_images' in batch[0]
-        keys = ['images', 'cross_images'] if is_cogagent else ['images']
+        keys = ['images', 'cross_images']
         for key in keys:
-            res[key] = [b[key][0] for b in batch]
+            if key in batch[0]:
+                res[key] = [b[key][0] for b in batch]
         token_type_ids = [torch.tensor(b['token_type_ids']) for b in batch]
         token_type_ids = pad_sequence(token_type_ids, batch_first=True, padding_value=0)
         res['token_type_ids'] = token_type_ids
@@ -2107,10 +2108,10 @@ def check_example(self, example):
 
     def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
         inputs, _ = super().encode(example)
-        images_path = example['images']
-        image = _load_image(images_path[0])
         if len(inputs) == 0:
             return inputs, {}
+        images_path = example['images']
+        image = _load_image(images_path[0])
         input_ids = inputs['input_ids']
         labels = inputs['labels']
         idx_list = _findall(input_ids, -1)