Commit bcb8076

vllm worker awq quantization update (#2463)
Co-authored-by: 董晓龙 <[email protected]>
1 parent a040cdc commit bcb8076

2 files changed: +7 -0 lines changed


docs/vllm_integration.md

Lines changed: 5 additions & 0 deletions
@@ -18,3 +18,8 @@ See the supported models [here](https://vllm.readthedocs.io/en/latest/models/sup
 ```
 python3 -m fastchat.serve.vllm_worker --model-path lmsys/vicuna-7b-v1.3 --tokenizer hf-internal-testing/llama-tokenizer
 ```
+
+If you use an AWQ model, try
+```
+python3 -m fastchat.serve.vllm_worker --model-path TheBloke/vicuna-7B-v1.5-AWQ --quantization awq
+```

fastchat/serve/vllm_worker.py

Lines changed: 2 additions & 0 deletions
@@ -210,6 +210,8 @@ async def api_model_details(request: Request):
     args.model = args.model_path
     if args.num_gpus > 1:
         args.tensor_parallel_size = args.num_gpus
+    if args.quantization:
+        args.quantization = args.quantization
 
     engine_args = AsyncEngineArgs.from_cli_args(args)
     engine = AsyncLLMEngine.from_engine_args(engine_args)
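
For context, here is a minimal sketch of how these flags reach vLLM. It assumes the worker's usual setup, in which vLLM's `AsyncEngineArgs.add_cli_args` registers the engine flags (including `--quantization`) on the same parser; the defaults given for `--model-path` and `--num-gpus` are illustrative, not the worker's exact definitions.

```
# Minimal sketch, not the full worker: mapping FastChat-style flags onto
# vLLM engine arguments. Assumes vLLM's AsyncEngineArgs helpers; the
# defaults below are illustrative.
import argparse

from vllm import AsyncLLMEngine
from vllm.engine.arg_utils import AsyncEngineArgs

parser = argparse.ArgumentParser()
parser.add_argument("--model-path", type=str, default="lmsys/vicuna-7b-v1.3")
parser.add_argument("--num-gpus", type=int, default=1)
# Registers vLLM's own engine flags, including --quantization, on this parser.
parser = AsyncEngineArgs.add_cli_args(parser)
args = parser.parse_args()

# Map FastChat's flag names onto the attribute names vLLM expects.
args.model = args.model_path
if args.num_gpus > 1:
    args.tensor_parallel_size = args.num_gpus

# from_cli_args reads args.quantization (e.g. "awq") along with the other
# engine flags, so the value only needs to be present on the namespace.
engine_args = AsyncEngineArgs.from_cli_args(args)
engine = AsyncLLMEngine.from_engine_args(engine_args)
```

If `--quantization` is omitted, `from_cli_args` leaves it at vLLM's default (`None`) and the model is loaded unquantized.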

0 commit comments