Fix LLM quantization docs (#458)

Jintao-Huang · web-flow · commit 85e54b64d5c7 · 2024-02-26T15:42:31.000+08:00
diff --git a/docs/source/LLM/LLM量化文档.md b/docs/source/LLM/LLM量化文档.md
@@ -1,5 +1,5 @@
 # LLM量化文档
-swift使用awq技术对模型进行量化. 该量化技术支持vllm进行加速推理. 使用VLLM对awq量化模型进行加速推理可以查看[]().
+swift使用awq技术对模型进行量化. 该量化技术支持vllm进行加速推理.
 
 
 ## 目录
@@ -31,6 +31,7 @@ pip install -r requirements/llm.txt  -U
 ```bash
 # awq-int4量化 (使用A100大约需要20分钟)
 CUDA_VISIBLE_DEVICES=0 swift export --model_type qwen1half-7b-chat --quant_bits 4
+
 # 推理 swift量化产生的模型
 CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat --model_id_or_path qwen1half-7b-chat-int4
 # 推理 原始模型
@@ -40,7 +41,7 @@ CUDA_VISIBLE_DEVICES=0 swift infer --model_type qwen1half-7b-chat --model_id_or_
 ```
 
 
-效果区别:
+效果对比:
 ```python
 # swift量化产生的模型
 """
@@ -187,14 +188,14 @@ swift export --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
 swift export --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
     --push_to_hub true \
     --hub_model_id qwen1half-4b-chat-lora \
-    --hub_token '<your-sdk-token>'
-    --merge_lora true \
+    --hub_token '<your-sdk-token>' \
+    --merge_lora true
 
 # 推送量化后模型
 swift export --ckpt_dir output/qwen1half-4b-chat/vx-xxx/checkpoint-xxx \
     --push_to_hub true \
     --hub_model_id qwen1half-4b-chat-lora \
-    --hub_token '<your-sdk-token>'
+    --hub_token '<your-sdk-token>' \
     --merge_lora true \
     --quant_bits 4
 ```
diff --git a/docs/source/LLM/VLLM推理加速与部署.md b/docs/source/LLM/VLLM推理加速与部署.md
@@ -250,6 +250,8 @@ swift使用VLLM作为推理后端, 并兼容openai的API样式.
 **服务端:**
 ```bash
 CUDA_VISIBLE_DEVICES=0 swift deploy --model_type qwen-7b-chat
+# 多卡部署
+RAY_memory_monitor_refresh_ms=0 CUDA_VISIBLE_DEVICES=0,1,2,3 swift deploy --model_type qwen-7b-chat --tensor_parallel_size 4
 ```
 
 **客户端:**
@@ -353,6 +355,8 @@ response: 杭州有许多美食，例如西湖醋鱼、东坡肉、龙井虾仁
 **服务端:**
 ```bash
 CUDA_VISIBLE_DEVICES=0 swift deploy --model_type qwen-7b
+# 多卡部署
+RAY_memory_monitor_refresh_ms=0 CUDA_VISIBLE_DEVICES=0,1,2,3 swift deploy --model_type qwen-7b --tensor_parallel_size 4
 ```
 
 **客户端:**
diff --git a/docs/source/LLM/命令行参数.md b/docs/source/LLM/命令行参数.md
@@ -11,7 +11,7 @@
 
 ## sft 参数
 
-- `--model_type`: 表示你选择的模型类型, 默认是`None`. 可以通过`model_type`来指定对应模型默认的`lora_target_modules`, `template_type`等信息. 你可以通过只指定`model_type`进行微调. 对应的`model_id_or_path`会使用默认的设置, 从ModelScope进行下载, 并使用默认的缓存路径. model_type和model_id_or_path必须指定其中的一个. 可以选择的`model_type`可以查看[支持的模型](支持的模型和数据集.md#模型).
+- `--model_type`: 表示你选择的模型类型, 默认是`None`. `model_type`指定了对应模型默认的`lora_target_modules`, `template_type`等信息. 你可以通过只指定`model_type`进行微调. 对应的`model_id_or_path`会使用默认的设置, 从ModelScope进行下载, 并使用默认的缓存路径. `model_type`和`model_id_or_path`必须指定其中的一个. 可以选择的`model_type`可以查看[支持的模型](支持的模型和数据集.md#模型).
 - `--model_id_or_path`: 表示模型在ModelScope Hub中的`model_id`或者本地路径, 默认为`None`. 如果传入的`model_id_or_path`已经被注册, 则会根据`model_id_or_path`推断出`model_type`. 如果未被注册, 则需要同时指定`model_type`, e.g. `--model_type <model_type> --model_id_or_path <model_id_or_path>`.
 - `--model_revision`: 表示模型在ModelScope Hub中对应`model_id`的版本号, 默认为`None`. `model_revision`指定为`None`, 则使用注册在`MODEL_MAPPING`中的revision. 否则强制使用命令行传入的`model_revision`.
 - `--sft_type`: 表示微调的方式, 默认是`'lora'`. 你可以选择的值包括: 'lora', 'full', 'longlora', 'qalora'. 如果你要使用qlora, 你需设置`--sft_type lora --quantization_bit 4`.
diff --git a/swift/llm/__init__.py b/swift/llm/__init__.py
@@ -6,32 +6,30 @@
 
 if TYPE_CHECKING:
     # Recommend using `xxx_main`
-    from .app_ui import gradio_chat_demo, gradio_generation_demo, llm_app_ui, app_ui_main
-    from .deploy import llm_deploy, deploy_main
-    from .dpo import dpo_main, llm_dpo
-    from .infer import llm_infer, merge_lora, prepare_model_template, infer_main, merge_lora_main
-    from .rome import rome_infer, rome_main
-    from .sft import llm_sft, sft_main
-    from .export import llm_export, export_main
+    from .app_ui import gradio_chat_demo, gradio_generation_demo, app_ui_main
+    from .deploy import deploy_main
+    from .dpo import dpo_main
+    from .infer import merge_lora, prepare_model_template, infer_main, merge_lora_main
+    from .rome import rome_main
+    from .sft import sft_main
+    from .export import export_main
 else:
     _extra_objects = {
         k: v
         for k, v in globals().items() if not k.startswith('_')
     }
     _import_structure = {
-        'app_ui': [
-            'gradio_chat_demo', 'gradio_generation_demo', 'llm_app_ui',
-            'app_ui_main'
-        ],
-        'deploy': ['llm_deploy', 'deploy_main'],
-        'dpo': ['dpo_main', 'llm_dpo'],
+        'app_ui':
+        ['gradio_chat_demo', 'gradio_generation_demo', 'app_ui_main'],
+        'deploy': ['deploy_main'],
+        'dpo': ['dpo_main'],
         'infer': [
-            'llm_infer', 'merge_lora', 'prepare_model_template', 'infer_main',
+            'merge_lora', 'prepare_model_template', 'infer_main',
             'merge_lora_main'
         ],
-        'rome': ['rome_infer', 'rome_main'],
-        'sft': ['llm_sft', 'sft_main'],
-        'export': ['llm_export', 'export_main'],
+        'rome': ['rome_main'],
+        'sft': ['sft_main'],
+        'export': ['export_main'],
     }
 
     import sys