Skip to content

Commit ca8c468

Browse files
authored
fix vllm==0.4.* slower than vllm==0.3.* (#1035)
1 parent f3b5d42 commit ca8c468

File tree

12 files changed

+72
-38
lines changed

12 files changed

+72
-38
lines changed

docs/source/LLM/支持的模型和数据集.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,6 @@
162162
|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|✔|✔|transformers>=4.35|math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)|
163163
|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation|✔|✔|transformers>=4.35|math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)|
164164
|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|✔|✔|transformers>=4.35|math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)|
165-
|internvl-chat-v1_5|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|-|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)|
166-
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|-|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
167165
|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)|
168166
|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)|
169167
|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)|
@@ -297,6 +295,8 @@
297295
|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)|
298296
|llava-llama-3-8b-v1_1|[AI-ModelScope/llava-llama-3-8b-v1_1-transformers](https://modelscope.cn/models/AI-ModelScope/llava-llama-3-8b-v1_1-transformers/summary)|q_proj, k_proj, v_proj|llava-llama-instruct|✔|✘|transformers>=4.36|vision|[xtuner/llava-llama-3-8b-v1_1-transformers](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers)|
299297
|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|✔|✘||vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
298+
|internvl-chat-v1_5|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)|
299+
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
300300
|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
301301
|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
302302
|paligemma-3b-pt-448|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)|

docs/source_en/LLM/Supported-models-datasets.md

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -162,8 +162,6 @@ The table below introduces all models supported by SWIFT:
162162
|internlm2-math-7b-chat|[Shanghai_AI_Laboratory/internlm2-math-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-7b/summary)|wqkv|internlm2|✔|✔|transformers>=4.35|math|[internlm/internlm2-math-7b](https://huggingface.co/internlm/internlm2-math-7b)|
163163
|internlm2-math-20b|[Shanghai_AI_Laboratory/internlm2-math-base-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-base-20b/summary)|wqkv|default-generation|✔|✔|transformers>=4.35|math|[internlm/internlm2-math-base-20b](https://huggingface.co/internlm/internlm2-math-base-20b)|
164164
|internlm2-math-20b-chat|[Shanghai_AI_Laboratory/internlm2-math-20b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm2-math-20b/summary)|wqkv|internlm2|✔|✔|transformers>=4.35|math|[internlm/internlm2-math-20b](https://huggingface.co/internlm/internlm2-math-20b)|
165-
|internvl-chat-v1_5|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|-|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)|
166-
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|-|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
167165
|deepseek-7b|[deepseek-ai/deepseek-llm-7b-base](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[deepseek-ai/deepseek-llm-7b-base](https://huggingface.co/deepseek-ai/deepseek-llm-7b-base)|
168166
|deepseek-7b-chat|[deepseek-ai/deepseek-llm-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-llm-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek|✔|✔||-|[deepseek-ai/deepseek-llm-7b-chat](https://huggingface.co/deepseek-ai/deepseek-llm-7b-chat)|
169167
|deepseek-moe-16b|[deepseek-ai/deepseek-moe-16b-base](https://modelscope.cn/models/deepseek-ai/deepseek-moe-16b-base/summary)|q_proj, k_proj, v_proj|default-generation|✔|✔||-|[deepseek-ai/deepseek-moe-16b-base](https://huggingface.co/deepseek-ai/deepseek-moe-16b-base)|
@@ -297,6 +295,8 @@ The table below introduces all models supported by SWIFT:
297295
|yi-vl-34b-chat|[01ai/Yi-VL-34B](https://modelscope.cn/models/01ai/Yi-VL-34B/summary)|q_proj, k_proj, v_proj|yi-vl|✔|✘|transformers>=4.34|vision|[01-ai/Yi-VL-34B](https://huggingface.co/01-ai/Yi-VL-34B)|
298296
|llava-llama-3-8b-v1_1|[AI-ModelScope/llava-llama-3-8b-v1_1-transformers](https://modelscope.cn/models/AI-ModelScope/llava-llama-3-8b-v1_1-transformers/summary)|q_proj, k_proj, v_proj|llava-llama-instruct|✔|✘|transformers>=4.36|vision|[xtuner/llava-llama-3-8b-v1_1-transformers](https://huggingface.co/xtuner/llava-llama-3-8b-v1_1-transformers)|
299297
|internlm-xcomposer2-7b-chat|[Shanghai_AI_Laboratory/internlm-xcomposer2-7b](https://modelscope.cn/models/Shanghai_AI_Laboratory/internlm-xcomposer2-7b/summary)|wqkv|internlm-xcomposer2|✔|✘||vision|[internlm/internlm-xcomposer2-7b](https://huggingface.co/internlm/internlm-xcomposer2-7b)|
298+
|internvl-chat-v1_5|[AI-ModelScope/InternVL-Chat-V1-5](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5)|
299+
|internvl-chat-v1_5-int8|[AI-ModelScope/InternVL-Chat-V1-5-int8](https://modelscope.cn/models/AI-ModelScope/InternVL-Chat-V1-5-int8/summary)|wqkv|internvl|✔|✘|transformers>=4.35, timm|vision|[OpenGVLab/InternVL-Chat-V1-5-int8](https://huggingface.co/OpenGVLab/InternVL-Chat-V1-5-int8)|
300300
|deepseek-vl-1_3b-chat|[deepseek-ai/deepseek-vl-1.3b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-1.3b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-1.3b-chat)|
301301
|deepseek-vl-7b-chat|[deepseek-ai/deepseek-vl-7b-chat](https://modelscope.cn/models/deepseek-ai/deepseek-vl-7b-chat/summary)|q_proj, k_proj, v_proj|deepseek-vl|✔|✘|attrdict|vision|[deepseek-ai/deepseek-vl-7b-chat](https://huggingface.co/deepseek-ai/deepseek-vl-7b-chat)|
302302
|paligemma-3b-pt-448|[AI-ModelScope/paligemma-3b-pt-448](https://modelscope.cn/models/AI-ModelScope/paligemma-3b-pt-448/summary)|q_proj, k_proj, v_proj|paligemma|✔|✘|transformers>=4.41|vision|[google/paligemma-3b-pt-448](https://huggingface.co/google/paligemma-3b-pt-448)|

swift/llm/deploy.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,12 +15,11 @@
1515

1616
from swift.utils import get_logger, get_main, seed_everything
1717
from .infer import merge_lora, prepare_model_template
18-
from .utils import ChatCompletionResponse # noqa
19-
from .utils import (ChatCompletionRequest, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
20-
ChatCompletionStreamResponse, ChatMessage, CompletionRequest, CompletionResponse,
21-
CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage,
22-
DeployArguments, Model, ModelList, UsageInfo, inference, inference_stream, messages_to_history,
23-
random_uuid)
18+
from .utils import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice,
19+
ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, CompletionRequest,
20+
CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice,
21+
CompletionStreamResponse, DeltaMessage, DeployArguments, Model, ModelList, UsageInfo, inference,
22+
inference_stream, messages_to_history, random_uuid)
2423

2524
logger = get_logger()
2625

swift/llm/infer.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -383,6 +383,9 @@ def llm_infer(args: InferArguments) -> None:
383383
'response': response,
384384
'history': history,
385385
}
386+
images = infer_kwargs.get('images')
387+
if images is not None:
388+
obj['images'] = images
386389
history = new_history
387390
if jsonl_path is not None:
388391
append_to_jsonl(jsonl_path, obj)
@@ -438,6 +441,8 @@ def llm_infer(args: InferArguments) -> None:
438441
request['system'] = system
439442
if images is not None:
440443
request['images'] = images
444+
if args.truncation_strategy:
445+
request['truncation_strategy'] = args.truncation_strategy
441446
request_list.append(request)
442447
resp_list = inference_vllm(llm_engine, template, request_list, use_tqdm=True)
443448
result = []
@@ -452,6 +457,9 @@ def llm_infer(args: InferArguments) -> None:
452457
'label': request.pop('label', None),
453458
'history': request['history'],
454459
}
460+
images = request.get('images')
461+
if images is not None:
462+
obj['images'] = images
455463
if jsonl_path is not None:
456464
append_to_jsonl(jsonl_path, obj)
457465
result.append(obj)
@@ -491,15 +499,15 @@ def llm_infer(args: InferArguments) -> None:
491499
response, _ = inference(
492500
model, template, stream=args.stream and args.verbose, verbose=args.verbose, **kwargs)
493501
label = data.pop('response', None)
494-
if 'truncation_strategy' in kwargs:
495-
kwargs.pop('truncation_strategy')
496502
obj = {
497503
'system': kwargs['system'],
498504
'query': kwargs['query'],
499505
'response': response,
500506
'label': label,
501507
'history': kwargs['history'],
502508
}
509+
if images is not None:
510+
obj['images'] = images
503511
if jsonl_path is not None:
504512
append_to_jsonl(jsonl_path, obj)
505513
result.append(obj)

swift/llm/utils/__init__.py

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,10 @@
1111
from .preprocess import (AlpacaPreprocessor, ClsPreprocessor, ComposePreprocessor, ConversationsPreprocessor,
1212
PreprocessFunc, RenameColumnsPreprocessor, SmartPreprocessor, SwiftPreprocessor,
1313
TextGenerationPreprocessor)
14-
from .protocol import ChatCompletionResponse # noqa
15-
from .protocol import (ChatCompletionRequest, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice,
16-
ChatCompletionStreamResponse, ChatMessage, CompletionRequest, CompletionResponse,
17-
CompletionResponseChoice, CompletionResponseStreamChoice, CompletionStreamResponse, DeltaMessage,
18-
Model, ModelList, UsageInfo, XRequestConfig, random_uuid)
14+
from .protocol import (ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice,
15+
ChatCompletionResponseStreamChoice, ChatCompletionStreamResponse, ChatMessage, CompletionRequest,
16+
CompletionResponse, CompletionResponseChoice, CompletionResponseStreamChoice,
17+
CompletionStreamResponse, DeltaMessage, Model, ModelList, UsageInfo, XRequestConfig, random_uuid)
1918
from .template import (DEFAULT_SYSTEM, TEMPLATE_MAPPING, History, Prompt, StopWords, Template, TemplateType,
2019
get_template, register_template)
2120
from .utils import (LazyLLMDataset, LLMDataset, dataset_map, download_dataset, find_all_linears, find_embedding,

swift/llm/utils/argument.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1167,7 +1167,7 @@ def load_from_ckpt_dir(self) -> None:
11671167
]
11681168
for key in imported_keys:
11691169
value = getattr(self, key)
1170-
if key == 'dataset' and len(value) > 0:
1170+
if key in {'dataset', 'val_dataset'} and len(value) > 0:
11711171
continue
11721172
if key in {'dataset_test_ratio', 'system'} and value is not None:
11731173
continue
@@ -1180,6 +1180,10 @@ def load_from_ckpt_dir(self) -> None:
11801180
if self.dtype == 'AUTO':
11811181
self.dtype = sft_args.get('dtype')
11821182

1183+
# compat
1184+
if self.val_dataset is None:
1185+
self.val_dataset = []
1186+
11831187
@staticmethod
11841188
def check_ckpt_dir_correct(ckpt_dir) -> bool:
11851189
"""Check the checkpoint dir is correct, which means it must contains a `configuration.json` file.

swift/llm/utils/model.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2787,7 +2787,7 @@ def _new_forward(*args, **kwargs):
27872787
TemplateType.internvl,
27882788
requires=['transformers>=4.35', 'timm'],
27892789
support_flash_attn=True,
2790-
support_gradient_checkpointing=False,
2790+
tags=['multi-modal', 'vision'],
27912791
hf_model_id='OpenGVLab/InternVL-Chat-V1-5')
27922792
@register_model(
27932793
ModelType.internvl_chat_v1_5_int8,
@@ -2796,7 +2796,7 @@ def _new_forward(*args, **kwargs):
27962796
TemplateType.internvl,
27972797
requires=['transformers>=4.35', 'timm'],
27982798
support_flash_attn=True,
2799-
support_gradient_checkpointing=False,
2799+
tags=['multi-modal', 'vision'],
28002800
hf_model_id='OpenGVLab/InternVL-Chat-V1-5-int8')
28012801
def get_model_tokenizer_internvl(model_dir: str,
28022802
torch_dtype: Dtype,
@@ -2831,7 +2831,7 @@ def get_model_tokenizer_internvl(model_dir: str,
28312831
model.language_model.output.state.force_no_igemmlt = True
28322832

28332833
if model is not None:
2834-
_use_submodel_func(model, 'language_model', ['get_input_embeddings'])
2834+
_use_submodel_func(model, 'language_model', ['get_input_embeddings', 'gradient_checkpointing_enable'])
28352835
fix_internvl_inplace_bug(model)
28362836
if not hasattr(model, '__old_forward'): # Avoid double patching
28372837
forward = model.forward

swift/llm/utils/protocol.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,7 @@ def random_uuid() -> str:
1313
class Model:
1414
id: str # model_type
1515
is_chat: bool # chat model or generation model
16+
1617
object: str = 'model'
1718
created: int = field(default_factory=lambda: int(time.time()))
1819
owned_by: str = 'swift'

swift/llm/utils/template.py

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -784,7 +784,7 @@ def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] =
784784
['<|im_end|>'], INTERNLM_SYSTEM, ['<s><|im_start|>system\n{{SYSTEM}}<|im_end|>\n']))
785785

786786

787-
def replace_img_tab(query: str, history: History, replace_token: str) -> Tuple[str, History, List[str]]:
787+
def replace_img_tag(query: str, history: History, replace_token: str) -> Tuple[str, History, List[str]]:
788788
images_path = []
789789
pattern = r'<img>(.+?)</img>'
790790
new_history = []
@@ -818,7 +818,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
818818
history = example.pop('history', None)
819819
if history is None:
820820
history = []
821-
example['query'], example['history'], images_path = replace_img_tab(example['query'], history, '</s>')
821+
example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '</s>')
822822

823823
images = []
824824
dtype = self.model.dtype
@@ -1122,7 +1122,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
11221122
history = example.pop('history', None)
11231123
if history is None:
11241124
history = []
1125-
example['query'], example['history'], images_path = replace_img_tab(example['query'], history, '<s>')
1125+
example['query'], example['history'], images_path = replace_img_tag(example['query'], history, '<s>')
11261126
images = []
11271127
for image_path in images_path:
11281128
image = _read_from_path(image_path)
@@ -1230,7 +1230,7 @@ def encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any
12301230
history = example.pop('history', None)
12311231
if history is None:
12321232
history = []
1233-
example['query'], example['history'], images_path = replace_img_tab(example['query'], history,
1233+
example['query'], example['history'], images_path = replace_img_tag(example['query'], history,
12341234
'<image_placeholder>')
12351235

12361236
inputs, _ = super().encode(example)

swift/llm/utils/utils.py

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -556,8 +556,12 @@ def inference_stream(model: PreTrainedModel,
556556
}
557557
template.model = model
558558
inputs, tokenizer_kwargs = template.encode(example)
559-
if len(inputs) == 0:
560-
raise ValueError('input_ids exceeds `max_length`. Please increase the value of `max_length`.')
559+
560+
truncation_strategy = kwargs.pop('truncation_strategy', 'delete')
561+
if len(inputs) == 0 and truncation_strategy == 'delete':
562+
# input_ids exceeds `max_length`. Please increase the value of `max_length`.
563+
return '', history
564+
561565
inputs.pop('labels', None)
562566
tokenizer = template.tokenizer
563567
device = next(model.parameters()).device
@@ -691,8 +695,9 @@ def inference(model: PreTrainedModel,
691695
template.model = model
692696
inputs, tokenizer_kwargs = template.encode(example)
693697

694-
truncation_strategy = kwargs.pop('truncation_strategy', None)
698+
truncation_strategy = kwargs.pop('truncation_strategy', 'delete')
695699
if len(inputs) == 0 and truncation_strategy == 'delete':
700+
# input_ids exceeds `max_length`. Please increase the value of `max_length`.
696701
return '', history
697702

698703
inputs.pop('labels', None)

0 commit comments

Comments
 (0)