modelscope · Jintao-Huang · May 15, 2025 · May 15, 2025
diff --git a/swift/llm/model/model/qwen.py b/swift/llm/model/model/qwen.py
@@ -1,4 +1,5 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import os
 from typing import Any, Dict, Optional, Tuple, Type
 
 import torch
@@ -557,6 +558,9 @@ def _get_cast_dtype(self) -> torch.dtype:
 def patch_qwen_vl_utils(vision_process):
     if hasattr(vision_process, '_patch'):
         return
+    if os.getenv('VIDEO_MAX_PIXELS') and not os.getenv('VIDEO_TOTAL_PIXELS'):
+        # https://github.com/QwenLM/Qwen2.5-VL/issues/1120
+        os.environ['VIDEO_TOTAL_PIXELS'] = str(int(128000 * 28 * 28 * 0.9))
     for key in [
             'image_factor', 'min_pixels', 'max_pixels', 'max_ratio', 'video_min_pixels', 'video_max_pixels',
             'video_total_pixels', 'frame_factor', 'fps', 'fps_min_frames', 'fps_max_frames'

diff --git a/swift/trainers/callback.py b/swift/trainers/callback.py
@@ -93,6 +93,7 @@ def on_epoch_end(self, args: TrainingArguments, state: TrainerState, control: Tr
         control = super().on_epoch_end(args, state, control, **kwargs)
         evaluation_strategy = args.eval_strategy if hasattr(args, 'eval_strategy') else args.evaluation_strategy
         if args.max_epochs is not None and args.max_epochs <= math.ceil(state.epoch):
+            logger.info('Training has reached `max_epochs`. The model will be saved and the training will be exited.')
             if evaluation_strategy != IntervalStrategy.NO:
                 control.should_evaluate = True
             if args.save_strategy != IntervalStrategy.NO:

diff --git a/tests/test_align/test_template/test_video.py b/tests/test_align/test_template/test_video.py
@@ -117,17 +117,19 @@ def test_valley():
 
 def test_qwen2_5_vl():
     os.environ['FPS'] = '1'
+    os.environ['VIDEO_MAX_PIXELS'] = str(360 * 420)
     pt_engine = PtEngine('Qwen/Qwen2.5-VL-7B-Instruct')
     messages = [{'role': 'user', 'content': '<video>What happened in the video?'}]
     videos = ['https://modelscope-open.oss-cn-hangzhou.aliyuncs.com/images/baby.mp4']
     response = _infer_model(pt_engine, messages=messages, videos=videos)
     pt_engine.default_template.template_backend = 'jinja'
     response2 = _infer_model(pt_engine, messages=messages, videos=videos)
     assert response == response2 == (
-        'In the video, a baby is sitting on a bed and appears to be interacting with an open book. '
-        'The baby seems curious and is touching the pages of the book, possibly exploring its contents or '
-        'simply playing with it. The setting looks like a cozy bedroom, and the baby is wearing sunglasses, '
-        'which adds a playful and endearing touch to the scene.')
+        'In the video, a young child is sitting on a bed and appears to be reading or flipping '
+        'through a book. The child is wearing sunglasses and seems focused on the book. '
+        'The setting looks like a cozy bedroom with various items such as clothes and '
+        "possibly toys around. The child's actions suggest they might be exploring or "
+        'learning about the book.')
 
 
 def test_qwen2_5_omni():