fix lmdeploy & vlm #1530

Merged
1 change: 1 addition & 0 deletions docs/source/LLM/LmDeploy推理加速与部署.md
@@ -1,4 +1,5 @@
# LmDeploy Inference Acceleration and Deployment
Models that support lmdeploy inference acceleration can be found in [Supported Models](支持的模型和数据集.md#模型).

## Table of Contents
- [Environment Preparation](#环境准备)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/LLM/VLLM推理加速与部署.md
@@ -1,5 +1,6 @@

# VLLM Inference Acceleration and Deployment
The models supported by vllm can be found in [Supported Models](支持的模型和数据集.md#模型).

## Table of Contents
- [Environment Preparation](#环境准备)
@@ -27,7 +28,6 @@ pip install -r requirements/llm.txt -U
```

## Inference Acceleration
The models supported by vllm can be found in [Supported Models](支持的模型和数据集.md#模型).

### Using Python
```python
Expand Down
758 changes: 379 additions & 379 deletions docs/source/LLM/支持的模型和数据集.md

Large diffs are not rendered by default.

1 change: 1 addition & 0 deletions docs/source/Multi-Modal/LmDeploy推理加速文档.md
@@ -1,4 +1,5 @@
# LmDeploy Inference Acceleration and Deployment
Multimodal models that support lmdeploy inference acceleration can be found in [Supported Models](../LLM/支持的模型和数据集.md#多模态大模型).

## Table of Contents
- [Environment Preparation](#环境准备)
Expand Down
2 changes: 1 addition & 1 deletion docs/source/Multi-Modal/vLLM推理加速文档.md
@@ -1,5 +1,5 @@
# vLLM Inference Acceleration Documentation
ms-swift has integrated vLLM to accelerate inference for multimodal models. The supported models can be found in [Supported Models and Datasets](../LLM/支持的模型和数据集.md#多模态大模型). Note that using vLLM for acceleration has a slight impact on inference quality; please make sure you can tolerate this loss in exchange for faster inference.
ms-swift has integrated vLLM to accelerate inference for multimodal models. The supported models can be found in [Supported Models and Datasets](../LLM/支持的模型和数据集.md#多模态大模型).

## Table of Contents
- [Environment Preparation](#环境准备)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# LmDeploy Inference Acceleration and Deployment
Models that support inference acceleration using lmdeploy can be found at [Supported Models](Supported-models-datasets.md#LLM).

## Table of Contents
- [Environment Preparation](#environment-preparation)
Expand Down
758 changes: 379 additions & 379 deletions docs/source_en/LLM/Supported-models-datasets.md

Large diffs are not rendered by default.

Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# VLLM Inference Acceleration and Deployment
The models supported by vllm can be found in [Supported Models](Supported-models-datasets.md#Models).

## Table of Contents
- [Environment Preparation](#environment-preparation)
@@ -24,7 +25,6 @@ pip install -r requirements/llm.txt -U
```

## Inference Acceleration
The models supported by vllm can be found in [Supported Models](Supported-models-datasets.md#Models).

### Using Python
```python
Expand Down
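The Python example in the section above is collapsed in this diff view. As orientation only, here is a minimal sketch of what vLLM-accelerated inference through ms-swift typically looks like; the helper names (get_vllm_engine, get_default_template_type, get_template, inference_vllm) and the model choice are assumptions, not part of this change:

```python
# Minimal sketch, not the collapsed example from the doc; helper names and the
# model choice are assumptions taken from the ms-swift Python API.
from swift.llm import (ModelType, get_default_template_type, get_template,
                       get_vllm_engine, inference_vllm)

model_type = ModelType.qwen_7b_chat                 # hypothetical model choice
llm_engine = get_vllm_engine(model_type)
template_type = get_default_template_type(model_type)
template = get_template(template_type, llm_engine.hf_tokenizer)

request_list = [{'query': 'Hello, who are you?'}]
resp_list = inference_vllm(llm_engine, template, request_list)
print(resp_list[0]['response'])
```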
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# LmDeploy Inference Acceleration and Deployment
MLLMs that support inference acceleration using lmdeploy can be found at [Supported Models](../LLM/Supported-models-datasets.md#MLLM).

## Table of Contents
- [Environment Preparation](#environment-preparation)
Expand Down
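The change to swift/llm/utils/lmdeploy_utils.py further down adds a vision_batch_size argument (forwarded to VisionConfig.max_batch_size) so multimodal models can be served through lmdeploy. A minimal usage sketch, assuming the swift.llm helpers below exist with these rough signatures; the model choice and image URL are hypothetical:

```python
# Minimal sketch of multimodal inference via lmdeploy in ms-swift; the helper
# names are assumptions, the model choice and image URL are hypothetical.
from swift.llm import (ModelType, get_default_template_type, get_template,
                       get_lmdeploy_engine, inference_lmdeploy)

model_type = ModelType.internvl2_2b                 # hypothetical MLLM choice
# vision_batch_size maps to VisionConfig.max_batch_size (see lmdeploy_utils.py)
lmdeploy_engine = get_lmdeploy_engine(model_type, vision_batch_size=1)
template_type = get_default_template_type(model_type)
template = get_template(template_type, lmdeploy_engine.hf_tokenizer)

# Images are encoded by the engine's vl_encoder and spliced into the prompt
# as embedding ranges (see _prepare_lmdeploy_inputs below).
request_list = [{'query': 'Describe this image.',
                 'images': ['http://example.com/cat.png']}]
resp_list = inference_lmdeploy(lmdeploy_engine, template, request_list)
print(resp_list[0]['response'])
```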
2 changes: 1 addition & 1 deletion docs/source_en/Multi-Modal/vllm-inference-acceleration.md
@@ -1,6 +1,6 @@
# vLLM Inference Acceleration Documentation

ms-swift has integrated vLLM for accelerating inference of multimodal models. Check out the supported models in [Supported Models and Datasets Documentation](../LLM/Supported-models-datasets.md). It's worth noting that using vLLM for acceleration may have a slight impact on the inference quality, so please ensure you can tolerate this loss in order to boost the inference speed.
ms-swift has integrated vLLM for accelerating inference of multimodal models. Check out the supported models in [Supported Models and Datasets Documentation](../LLM/Supported-models-datasets.md#MLLM).

## Table of Contents
- [Environment Setup](#environment-setup)
Expand Down
19 changes: 11 additions & 8 deletions scripts/utils/run_model_info.py
@@ -9,9 +9,9 @@ def get_model_info_table():
model_name_list = ModelType.get_model_name_list()
result = [
'| Model Type | Model ID | Default Lora Target Modules | Default Template |'
' Support Flash Attn | Support VLLM | Requires | Tags | HF Model ID |\n'
' Support Flash Attn | Support vLLM | Support LMDeploy | Requires | Tags | HF Model ID |\n'
'| --------- | -------- | --------------------------- | ---------------- |'
' ------------------ | ------------ | -------- | ---- | ----------- |\n'
' ------------------ | ------------ | ----------------- | -------- | ---- | ----------- |\n'
] * 2
res_llm: List[Any] = []
res_mllm: List[Any] = []
@@ -29,6 +29,8 @@ def get_model_info_table():
support_flash_attn = bool_mapping[support_flash_attn]
support_vllm = model_info.get('support_vllm', False)
support_vllm = bool_mapping[support_vllm]
support_lmdeploy = model_info.get('support_lmdeploy', False)
support_lmdeploy = bool_mapping[support_lmdeploy]
requires = ', '.join(model_info['requires'])
tags = model_info.get('tags', [])
if 'multi-modal' in tags:
@@ -43,8 +45,8 @@ def get_model_info_table():
if hf_model_id is None:
hf_model_id = '-'
r = [
model_name, model_id, lora_target_modules, template, support_flash_attn, support_vllm, requires, tags_str,
hf_model_id
model_name, model_id, lora_target_modules, template, support_flash_attn, support_vllm, support_lmdeploy,
requires, tags_str, hf_model_id
]
if is_multi_modal:
res_mllm.append(r)
@@ -55,12 +57,13 @@ def get_model_info_table():
for i, res in enumerate([res_llm, res_mllm]):
for r in res:
ms_url = f'https://modelscope.cn/models/{r[1]}/summary'
if r[8] != '-':
hf_url = f'https://huggingface.co/{r[8]}'
hf_model_id_str = f'[{r[8]}]({hf_url})'
if r[9] != '-':
hf_url = f'https://huggingface.co/{r[9]}'
hf_model_id_str = f'[{r[9]}]({hf_url})'
else:
hf_model_id_str = '-'
text[i] += f'|{r[0]}|[{r[1]}]({ms_url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|{hf_model_id_str}|\n'
text[i] += (
f'|{r[0]}|[{r[1]}]({ms_url})|{r[2]}|{r[3]}|{r[4]}|{r[5]}|{r[6]}|{r[7]}|{r[8]}|{hf_model_id_str}|\n')
result[i] += text[i]

for i, fpath in enumerate(fpaths):
Expand Down
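To make the column reindexing in run_model_info.py concrete: the new Support LMDeploy flag sits at index 6 of each row, pushing requires/tags to indices 7/8 and the HF model id to index 9, which is why the lookups change from r[8] to r[9]. A standalone sketch with hypothetical values (the check-mark symbol used by bool_mapping is illustrative):

```python
# Standalone sketch with hypothetical values mirroring the new row layout
# (index 6 = LMDeploy flag, index 9 = HF model id).
row = [
    'qwen-7b-chat',        # 0: model_name
    'qwen/Qwen-7B-Chat',   # 1: ModelScope model_id
    'c_attn',              # 2: default LoRA target modules
    'qwen',                # 3: default template
    '✔',                   # 4: support flash attn
    '✔',                   # 5: support vLLM
    '✔',                   # 6: support LMDeploy (new column)
    '-',                   # 7: requires
    '-',                   # 8: tags
    'Qwen/Qwen-7B-Chat',   # 9: HF model id
]
ms_url = f'https://modelscope.cn/models/{row[1]}/summary'
hf_model_id_str = (f'[{row[9]}](https://huggingface.co/{row[9]})'
                   if row[9] != '-' else '-')
print(f'|{row[0]}|[{row[1]}]({ms_url})|{row[2]}|{row[3]}|{row[4]}|{row[5]}'
      f'|{row[6]}|{row[7]}|{row[8]}|{hf_model_id_str}|')
```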
50 changes: 45 additions & 5 deletions swift/llm/utils/lmdeploy_utils.py
@@ -11,10 +11,11 @@

import torch
from lmdeploy import EngineGenerationConfig as _LmdeployGenerationConfig
from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, pipeline
from lmdeploy import PytorchEngineConfig, TurbomindEngineConfig, VisionConfig, pipeline
from lmdeploy.api import autoget_backend_config
from lmdeploy.serve.async_engine import AsyncEngine
from lmdeploy.serve.vl_async_engine import VLAsyncEngine
from lmdeploy.vl.constants import IMAGE_DUMMY_TOKEN_INDEX
from tqdm import tqdm
from transformers import GenerationConfig

@@ -34,6 +35,7 @@ def get_lmdeploy_engine(
revision: Optional[str] = None,
tp: int = 1,
cache_max_entry_count: float = 0.8,
vision_batch_size: int = 1, # max_batch_size in VisionConfig
engine_kwargs: Optional[Dict[str, Any]] = None,
**kwargs) -> Union[AsyncEngine, VLAsyncEngine]:
model_dir = kwargs.pop('model_dir', None)
@@ -56,10 +58,17 @@ def get_lmdeploy_engine(
if isinstance(backend_config, PytorchEngineConfig):
backend_config.thread_safe = True
logger.info(f'backend_config: {backend_config}')
lmdeploy_engine = pipeline(model_dir, backend_config=backend_config)
pipeline_kwargs = {}
is_multimodal = tokenizer.is_multimodal
if is_multimodal:
vision_config = VisionConfig(max_batch_size=vision_batch_size)
pipeline_kwargs['vision_config'] = vision_config
logger.info(f'vision_config: {vision_config}')

lmdeploy_engine = pipeline(model_dir, backend_config=backend_config, **pipeline_kwargs)
lmdeploy_engine.model_dir = model_dir
lmdeploy_engine.model_type = model_type
lmdeploy_engine.is_multimodal = tokenizer.is_multimodal
lmdeploy_engine.is_multimodal = is_multimodal
lmdeploy_engine.hf_tokenizer = tokenizer

generation_config_path = os.path.join(model_dir, 'generation_config.json')
@@ -113,6 +122,31 @@ def __init__(
**kwargs)


async def _prepare_lmdeploy_inputs(lmdeploy_engine, inputs: Dict[str, Any]) -> None:
from .template import _findall
images = inputs.pop('images', None) or []
if len(images) > 0:
images = await lmdeploy_engine.vl_encoder.async_infer(images)

input_ids = inputs['input_ids']
idx_list = _findall(input_ids, -100)
assert len(idx_list) == len(images), f'len(idx_list): {len(idx_list)}, len(images): {len(images)}'
idx_list.insert(0, -1)
new_input_ids = []
ranges = []
for i in range(len(idx_list) - 1):
_range = []
new_input_ids += input_ids[idx_list[i] + 1:idx_list[i + 1]]
_range.append(len(new_input_ids))
new_input_ids += [IMAGE_DUMMY_TOKEN_INDEX] * images[i].shape[0]
_range.append(len(new_input_ids))
ranges.append(_range)
new_input_ids += input_ids[idx_list[-1] + 1:]
inputs['input_embeddings'] = images
inputs['input_embedding_ranges'] = ranges
inputs['input_ids'] = new_input_ids


def _prepare_lmdeploy_request(lmdeploy_engine: Union[AsyncEngine, VLAsyncEngine],
template: Template,
request_list: List[Dict[str, Any]],
@@ -124,6 +158,9 @@ def _prepare_lmdeploy_request(lmdeploy_engine: Union[AsyncEngine, VLAsyncEngine]
for key in ['num_prompt_tokens', 'num_generated_tokens', 'num_samples']:
generation_info[key] = 0

if hasattr(lmdeploy_engine, 'vl_encoder'):
lmdeploy_engine.vl_encoder._loop_task = None

template.model = lmdeploy_engine
tokenizer = template.tokenizer
if tokenizer.eos_token_id is not None and tokenizer.eos_token_id not in generation_config.stop_words:
@@ -134,7 +171,7 @@ def _prepare_lmdeploy_request(lmdeploy_engine: Union[AsyncEngine, VLAsyncEngine]
generation_config.stop_words.append(token_list[0])
if isinstance(template.suffix[-1], list) and len(
template.suffix[-1]) == 1 and template.suffix[-1] not in generation_config.stop_words:
generation_config.stop_words.append(template.suffix[-1])
generation_config.stop_words.append(template.suffix[-1][0])

resp_list: List[Optional[Dict[str, Any]]] = [None] * len(request_list)
generators = []
@@ -165,7 +202,6 @@ def _prepare_inputs(request: Dict[str, Any]) -> Dict[str, Any]:
# input_ids exceeds `max_length`. Please increase the value of `max_length`.
resp_list[i] = {'response': '', 'history': request['history']}
continue
generation_info['num_prompt_tokens'] += len(inputs['input_ids'])
generator = lmdeploy_engine.get_generator(False, i)
generators.append((i, inputs, generator))

@@ -211,6 +247,8 @@ def inference_stream_lmdeploy(lmdeploy_engine: Union[AsyncEngine, VLAsyncEngine]

async def _inner_infer(i: int, inputs: Dict[str, Any], generator) -> None:
generator = await generator
await _prepare_lmdeploy_inputs(lmdeploy_engine, inputs)
generation_info['num_prompt_tokens'] += len(inputs['input_ids'])
async with lmdeploy_engine.safe_run(i):
async for output in generator.async_stream_infer(
session_id=i, **inputs, stream_output=True, gen_config=generation_config):
@@ -292,6 +330,8 @@ def inference_lmdeploy(lmdeploy_engine: Union[AsyncEngine, VLAsyncEngine],

async def _inner_infer(i: int, inputs: Dict[str, Any], generator) -> None:
generator = await generator
await _prepare_lmdeploy_inputs(lmdeploy_engine, inputs)
generation_info['num_prompt_tokens'] += len(inputs['input_ids'])
async with lmdeploy_engine.safe_run(i):
async for output in generator.async_stream_infer(
session_id=i, **inputs, stream_output=False, gen_config=generation_config):
Expand Down
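For intuition about the new _prepare_lmdeploy_inputs: each image is encoded by vl_encoder, and every -100 placeholder in input_ids is replaced by a run of dummy token ids whose span is reported to the engine as an input_embedding_range. A self-contained sketch of that expansion on toy data (it re-implements the loop rather than importing swift or lmdeploy; the dummy index value is illustrative):

```python
# Self-contained sketch of the -100 placeholder expansion performed by
# _prepare_lmdeploy_inputs; toy values, dummy token index is illustrative.
from typing import List, Tuple

IMAGE_DUMMY_TOKEN_INDEX = 0  # placeholder value for this sketch only

def expand_image_placeholders(input_ids: List[int],
                              image_token_counts: List[int]
                              ) -> Tuple[List[int], List[List[int]]]:
    idx_list = [i for i, t in enumerate(input_ids) if t == -100]
    assert len(idx_list) == len(image_token_counts)
    idx_list.insert(0, -1)
    new_input_ids: List[int] = []
    ranges: List[List[int]] = []
    for i in range(len(idx_list) - 1):
        new_input_ids += input_ids[idx_list[i] + 1:idx_list[i + 1]]
        start = len(new_input_ids)
        new_input_ids += [IMAGE_DUMMY_TOKEN_INDEX] * image_token_counts[i]
        ranges.append([start, len(new_input_ids)])
    new_input_ids += input_ids[idx_list[-1] + 1:]
    return new_input_ids, ranges

# One image whose encoded embedding spans 4 "tokens": the single -100 marker is
# replaced by 4 dummy ids and its span [2, 6) is recorded for the engine.
ids, ranges = expand_image_placeholders([1, 2, -100, 3], [4])
print(ids)     # [1, 2, 0, 0, 0, 0, 3]
print(ranges)  # [[2, 6]]
```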