
Commit c0db198 (1 parent: 7f0141c)

- build a default logger in BaseModelWorker.__init__ to avoid duplicated loggers
- add 3 abstract methods
- run format.sh

File tree: 14 files changed, +302 −22 lines

README.md

Lines changed: 1 addition & 0 deletions

@@ -185,6 +185,7 @@ This requires 8-bit compression to be enabled and the bitsandbytes package to be

 #### More Platforms and Quantization
 - For AMD GPU users, please install ROCm and [the ROCm version of PyTorch](https://pytorch.org/get-started/locally/) before you install FastChat. See also this [post](https://github.com/lm-sys/FastChat/issues/104#issuecomment-1613791563).
+- FastChat supports ExLlama V2. See [docs/exllama_v2.md](/docs/exllama_v2.md).
 - FastChat supports GPTQ 4bit inference with [GPTQ-for-LLaMa](https://github.com/qwopqwop200/GPTQ-for-LLaMa). See [docs/gptq.md](/docs/gptq.md).
 - FastChat supports AWQ 4bit inference with [mit-han-lab/llm-awq](https://github.com/mit-han-lab/llm-awq). See [docs/awq.md](/docs/awq.md).
 - [MLC LLM](https://mlc.ai/mlc-llm/), backed by [TVM Unity](https://github.com/apache/tvm/tree/unity) compiler, deploys Vicuna natively on phones, consumer-class GPUs and web browsers via Vulkan, Metal, CUDA and WebGPU.

docker/Dockerfile

Lines changed: 2 additions & 1 deletion

@@ -3,4 +3,5 @@ FROM nvidia/cuda:11.7.1-runtime-ubuntu20.04
 RUN apt-get update -y && apt-get install -y python3.9 python3.9-distutils curl
 RUN curl https://bootstrap.pypa.io/get-pip.py -o get-pip.py
 RUN python3.9 get-pip.py
-RUN pip3 install fschat
+RUN pip3 install fschat
+RUN pip3 install fschat[model_worker,webui] pydantic==1.10.1

docs/exllama_v2.md

Lines changed: 61 additions & 0 deletions

@@ -0,0 +1,61 @@

# ExLlamaV2 GPTQ Inference Framework

FastChat integrates the customized [ExllamaV2](https://github.com/turboderp/exllamav2) kernels to provide **faster** GPTQ inference.

**Note: ExLlama does not yet support the embedding REST API.**

## Install ExLlamaV2

Set up the environment (see [this link](https://github.com/turboderp/exllamav2#how-to) for more details):
```bash
git clone https://github.com/turboderp/exllamav2
cd exllamav2
pip install -e .
```

Chat with the CLI:
```bash
python3 -m fastchat.serve.cli \
    --model-path models/vicuna-7B-1.1-GPTQ-4bit-128g \
    --enable-exllama
```

Start a model worker:
```bash
# Download the quantized model from Hugging Face.
# Make sure you have git-lfs installed (https://git-lfs.com).
git lfs install
git clone https://huggingface.co/TheBloke/vicuna-7B-1.1-GPTQ-4bit-128g models/vicuna-7B-1.1-GPTQ-4bit-128g

# Load the model with the default configuration (max sequence length 4096, no GPU split).
python3 -m fastchat.serve.model_worker \
    --model-path models/vicuna-7B-1.1-GPTQ-4bit-128g \
    --enable-exllama

# Load the model with a max sequence length of 2048, allocating 18 GB to CUDA:0 and 24 GB to CUDA:1.
python3 -m fastchat.serve.model_worker \
    --model-path models/vicuna-7B-1.1-GPTQ-4bit-128g \
    --enable-exllama \
    --exllama-max-seq-len 2048 \
    --exllama-gpu-split 18,24
```

## Performance

Reference: https://github.com/turboderp/exllamav2#performance

| Model     | Mode         | Size | grpsz | act | V1: 3090Ti | V1: 4090 | V2: 3090Ti | V2: 4090    |
|-----------|--------------|------|-------|-----|------------|----------|------------|-------------|
| Llama     | GPTQ         | 7B   | 128   | no  | 143 t/s    | 173 t/s  | 175 t/s    | **195** t/s |
| Llama     | GPTQ         | 13B  | 128   | no  | 84 t/s     | 102 t/s  | 105 t/s    | **110** t/s |
| Llama     | GPTQ         | 33B  | 128   | yes | 37 t/s     | 45 t/s   | 45 t/s     | **48** t/s  |
| OpenLlama | GPTQ         | 3B   | 128   | yes | 194 t/s    | 226 t/s  | 295 t/s    | **321** t/s |
| CodeLlama | EXL2 4.0 bpw | 34B  | -     | -   | -          | -        | 42 t/s     | **48** t/s  |
| Llama2    | EXL2 3.0 bpw | 7B   | -     | -   | -          | -        | 195 t/s    | **224** t/s |
| Llama2    | EXL2 4.0 bpw | 7B   | -     | -   | -          | -        | 164 t/s    | **197** t/s |
| Llama2    | EXL2 5.0 bpw | 7B   | -     | -   | -          | -        | 144 t/s    | **160** t/s |
| Llama2    | EXL2 2.5 bpw | 70B  | -     | -   | -          | -        | 30 t/s     | **35** t/s  |
| TinyLlama | EXL2 3.0 bpw | 1.1B | -     | -   | -          | -        | 536 t/s    | **635** t/s |
| TinyLlama | EXL2 4.0 bpw | 1.1B | -     | -   | -          | -        | 509 t/s    | **590** t/s |

fastchat/model/compression.py

Lines changed: 1 addition & 1 deletion

@@ -147,7 +147,7 @@ def load_compress_model(model_path, device, torch_dtype, use_fast, revision="mai
     # We don't necessarily need to download the model's repo again if there is a cache.
     # So check the default huggingface cache first.
     model_path_temp = os.path.join(
-        os.getenv("HOME"),
+        os.path.expanduser("~"),
         ".cache/huggingface/hub",
         "models--" + model_path.replace("/", "--"),
         "snapshots/",

fastchat/model/model_adapter.py

Lines changed: 40 additions & 2 deletions

@@ -27,17 +27,19 @@
 )

 from fastchat.constants import CPU_ISA
-from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
-from fastchat.modules.awq import AWQConfig, load_awq_quantized
 from fastchat.conversation import Conversation, get_conv_template
 from fastchat.model.compression import load_compress_model
 from fastchat.model.llama_condense_monkey_patch import replace_llama_with_condense
 from fastchat.model.model_chatglm import generate_stream_chatglm
 from fastchat.model.model_codet5p import generate_stream_codet5p
 from fastchat.model.model_falcon import generate_stream_falcon
+from fastchat.model.model_exllama import generate_stream_exllama
 from fastchat.model.monkey_patch_non_inplace import (
     replace_llama_attn_with_non_inplace_operations,
 )
+from fastchat.modules.awq import AWQConfig, load_awq_quantized
+from fastchat.modules.exllama import ExllamaConfig, load_exllama_model
+from fastchat.modules.gptq import GptqConfig, load_gptq_quantized
 from fastchat.utils import get_gpu_memory

 # Check an environment variable to check if we should be sharing Peft model

@@ -155,6 +157,7 @@ def load_model(
     cpu_offloading: bool = False,
     gptq_config: Optional[GptqConfig] = None,
     awq_config: Optional[AWQConfig] = None,
+    exllama_config: Optional[ExllamaConfig] = None,
     revision: str = "main",
     debug: bool = False,
 ):

@@ -279,6 +282,9 @@ def load_model(
         else:
             model.to(device)
         return model, tokenizer
+    elif exllama_config:
+        model, tokenizer = load_exllama_model(model_path, exllama_config)
+        return model, tokenizer
     kwargs["revision"] = revision

     if dtype is not None:  # Overwrite dtype if it is provided in the arguments.

@@ -325,13 +331,17 @@ def get_generate_stream_function(model: torch.nn.Module, model_path: str):
     is_falcon = "rwforcausallm" in model_type
     is_codet5p = "codet5p" in model_type
     is_peft = "peft" in model_type
+    is_exllama = "exllama" in model_type

     if is_chatglm:
         return generate_stream_chatglm
     elif is_falcon:
         return generate_stream_falcon
     elif is_codet5p:
         return generate_stream_codet5p
+    elif is_exllama:
+        return generate_stream_exllama
+
     elif peft_share_base_weights and is_peft:
         # Return a curried stream function that loads the right adapter
         # according to the model_name available in this context. This ensures

@@ -453,6 +463,23 @@ def add_model_args(parser):
         default=-1,
         help="Used for AWQ. Groupsize to use for AWQ quantization; default uses full row.",
     )
+    parser.add_argument(
+        "--enable-exllama",
+        action="store_true",
+        help="Used for exllamav2. Enable the exllamaV2 inference framework.",
+    )
+    parser.add_argument(
+        "--exllama-max-seq-len",
+        type=int,
+        default=4096,
+        help="Used for exllamav2. Max sequence length to use for the exllamav2 framework; default 4096.",
+    )
+    parser.add_argument(
+        "--exllama-gpu-split",
+        type=str,
+        default=None,
+        help="Used for exllamav2. Comma-separated list of VRAM (in GB) to use per GPU. Example: 20,7,7",
+    )


 def remove_parent_directory_name(model_path):

@@ -1625,6 +1652,16 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
         return get_conv_template("phind")


+class Llama2ChangAdapter(Llama2Adapter):
+    """The model adapter for Llama2-ko-chang (e.g., lcw99/llama2-ko-chang-instruct-chat)"""
+
+    def match(self, model_path: str):
+        return "llama2-ko-chang" in model_path.lower()
+
+    def get_default_conv_template(self, model_path: str) -> Conversation:
+        return get_conv_template("polyglot_changgpt")
+
+
 # Note: the registration order matters.
 # The one registered earlier has a higher matching priority.
 register_model_adapter(PeftModelAdapter)

@@ -1684,6 +1721,7 @@ def get_default_conv_template(self, model_path: str) -> Conversation:
 register_model_adapter(ReaLMAdapter)
 register_model_adapter(PhindCodeLlamaAdapter)
 register_model_adapter(CodeLlamaAdapter)
+register_model_adapter(Llama2ChangAdapter)

 # After all adapters, try the default base adapter.
 register_model_adapter(BaseModelAdapter)
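For context, a hedged sketch of how the new flags above can be turned into an `ExllamaConfig` and handed to `load_model`. The real glue code lives in the serving entry points (for example `fastchat.serve.model_worker`) and may differ in detail; the argument names below come from `add_model_args`.

```python
import argparse

from fastchat.model.model_adapter import add_model_args, load_model
from fastchat.modules.exllama import ExllamaConfig

parser = argparse.ArgumentParser()
add_model_args(parser)
args = parser.parse_args()

# Build an ExllamaConfig only when --enable-exllama is given; otherwise
# load_model falls through to the regular Hugging Face loading path.
exllama_config = (
    ExllamaConfig(
        max_seq_len=args.exllama_max_seq_len,
        gpu_split=args.exllama_gpu_split,
    )
    if args.enable_exllama
    else None
)

model, tokenizer = load_model(
    args.model_path,
    device=args.device,
    num_gpus=args.num_gpus,
    exllama_config=exllama_config,
)
```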

fastchat/model/model_exllama.py

Lines changed: 77 additions & 0 deletions

@@ -0,0 +1,77 @@

import gc
import sys
from typing import Dict

import torch


def generate_stream_exllama(
    model,
    tokenizer,
    params: Dict,
    device: str,
    context_len: int,
    stream_interval: int = 2,
    judge_sent_end: bool = False,
):
    try:
        from exllamav2.generator import ExLlamaV2StreamingGenerator, ExLlamaV2Sampler
    except ImportError as e:
        print(f"Error: Failed to load Exllamav2. {e}")
        sys.exit(-1)

    prompt = params["prompt"]

    generator = ExLlamaV2StreamingGenerator(model.model, model.cache, tokenizer)
    settings = ExLlamaV2Sampler.Settings()

    settings.temperature = float(params.get("temperature", 0.85))
    settings.top_k = int(params.get("top_k", 50))
    settings.top_p = float(params.get("top_p", 0.8))
    settings.token_repetition_penalty = float(params.get("repetition_penalty", 1.15))
    settings.disallow_tokens(generator.tokenizer, [generator.tokenizer.eos_token_id])

    max_new_tokens = int(params.get("max_new_tokens", 256))

    generator.set_stop_conditions(params.get("stop_token_ids", None) or [])
    echo = bool(params.get("echo", True))

    input_ids = generator.tokenizer.encode(prompt)
    prompt_tokens = input_ids.shape[-1]
    generator.begin_stream(input_ids, settings)

    generated_tokens = 0
    if echo:
        output = prompt
    else:
        output = ""
    while True:
        chunk, eos, _ = generator.stream()
        output += chunk
        generated_tokens += 1
        if generated_tokens == max_new_tokens:
            finish_reason = "length"
            break
        elif eos:
            # Stop condition reached before the length limit.
            finish_reason = "stop"
            break
        yield {
            "text": output,
            "usage": {
                "prompt_tokens": prompt_tokens,
                "completion_tokens": generated_tokens,
                "total_tokens": prompt_tokens + generated_tokens,
            },
            "finish_reason": None,
        }

    yield {
        "text": output,
        "usage": {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": generated_tokens,
            "total_tokens": prompt_tokens + generated_tokens,
        },
        "finish_reason": finish_reason,
    }
    gc.collect()
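For illustration, a hedged sketch of driving this generator, assuming a `(model, tokenizer)` pair returned by `load_exllama_model` from the next file; the prompt text and sampling values are placeholders.

```python
from fastchat.model.model_exllama import generate_stream_exllama
from fastchat.modules.exllama import ExllamaConfig, load_exllama_model

config = ExllamaConfig(max_seq_len=4096, gpu_split=None)
model, tokenizer = load_exllama_model("models/vicuna-7B-1.1-GPTQ-4bit-128g", config)

params = {
    "prompt": "USER: Tell me a joke. ASSISTANT:",
    "temperature": 0.7,
    "top_p": 0.9,
    "max_new_tokens": 128,
    "echo": False,  # do not prepend the prompt to the streamed text
}

# Each yielded dict contains the text accumulated so far plus token counts;
# the final dict carries a non-None finish_reason ("length" or "stop").
last_chunk = None
for last_chunk in generate_stream_exllama(
    model, tokenizer, params, device="cuda", context_len=4096
):
    pass

print(last_chunk["text"])
print(last_chunk["usage"], last_chunk["finish_reason"])
```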

fastchat/modules/exllama.py

Lines changed: 46 additions & 0 deletions

@@ -0,0 +1,46 @@

from dataclasses import dataclass, field
import sys


@dataclass
class ExllamaConfig:
    max_seq_len: int
    gpu_split: str = None


class ExllamaModel:
    def __init__(self, exllama_model, exllama_cache):
        self.model = exllama_model
        self.cache = exllama_cache
        self.config = self.model.config


def load_exllama_model(model_path, exllama_config: ExllamaConfig):
    try:
        from exllamav2 import (
            ExLlamaV2Config,
            ExLlamaV2Tokenizer,
            ExLlamaV2,
            ExLlamaV2Cache,
        )
    except ImportError as e:
        print(f"Error: Failed to load Exllamav2. {e}")
        sys.exit(-1)

    exllamav2_config = ExLlamaV2Config()
    exllamav2_config.model_dir = model_path
    exllamav2_config.prepare()
    exllamav2_config.max_seq_len = exllama_config.max_seq_len

    exllama_model = ExLlamaV2(exllamav2_config)
    tokenizer = ExLlamaV2Tokenizer(exllamav2_config)

    split = None
    if exllama_config.gpu_split:
        split = [float(alloc) for alloc in exllama_config.gpu_split.split(",")]
    exllama_model.load(split)

    exllama_cache = ExLlamaV2Cache(exllama_model)
    model = ExllamaModel(exllama_model=exllama_model, exllama_cache=exllama_cache)

    return model, tokenizer
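A brief usage note: `gpu_split` stays a raw comma-separated string (exactly what `--exllama-gpu-split` passes in) and is only parsed into per-GPU gigabyte allocations inside `load_exllama_model`. A small sketch of that round trip:

```python
from fastchat.modules.exllama import ExllamaConfig

# Mirrors --exllama-gpu-split 18,24: roughly 18 GB on CUDA:0 and 24 GB on CUDA:1.
config = ExllamaConfig(max_seq_len=2048, gpu_split="18,24")

# The same parsing that load_exllama_model applies before calling model.load(split).
split = [float(alloc) for alloc in config.gpu_split.split(",")]
assert split == [18.0, 24.0]
```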

fastchat/serve/base_model_worker.py

Lines changed: 25 additions & 6 deletions

@@ -2,6 +2,7 @@
 import asyncio
 import threading
 import requests
+import uuid
 from fastchat.constants import WORKER_HEART_BEAT_INTERVAL
 from fastchat.conversation import Conversation
 from fastchat.utils import pretty_print_semaphore, build_logger

@@ -10,7 +11,7 @@
 from typing import List


-worker_id = None
+worker_id = str(uuid.uuid4())[:8]
 worker = None
 logger = None


@@ -34,6 +35,8 @@ def __init__(
         limit_worker_concurrency: int,
         conv_template: str = None,
     ):
+        global logger
+
         self.controller_addr = controller_addr
         self.worker_addr = worker_addr
         self.worker_id = worker_id

@@ -50,14 +53,17 @@ def __init__(

         self.heart_beat_thread = None

+        if logger is None:
+            logger = build_logger("model_worker", f"model_worker_{worker_id}.log")
+
     def make_conv_template(
         self,
         conv_template: str = None,
         model_path: str = None,
-    )-> Conversation:
-        '''
+    ) -> Conversation:
+        """
         Can be overridden to customize the conversation template for different model workers.
-        '''
+        """
         from fastchat.conversation import get_conv_template
         from fastchat.model.model_adapter import get_conversation_template


@@ -140,8 +146,12 @@ def get_status(self):

     def count_token(self, params):
         prompt = params["prompt"]
-        input_ids = self.tokenizer(prompt).input_ids
-        input_echo_len = len(input_ids)
+
+        try:
+            input_ids = self.tokenizer(prompt).input_ids
+            input_echo_len = len(input_ids)
+        except TypeError:
+            input_echo_len = self.tokenizer.num_tokens(prompt)

         ret = {
             "count": input_echo_len,

@@ -152,6 +162,15 @@ def count_token(self, params):
     def get_conv_template(self):
         return {"conv": self.conv}

+    def generate_stream_gate(self, params):
+        raise NotImplementedError
+
+    def generate_gate(self, params):
+        raise NotImplementedError
+
+    def get_embeddings(self, params):
+        raise NotImplementedError
+

 def release_worker_semaphore():
     worker.semaphore.release()
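To make the intent of the three new stubs concrete, a minimal hypothetical subclass (not from this commit). Real workers override these hooks with actual inference and usually serialize each streamed chunk; the sketch only shows which methods a new worker is now expected to implement.

```python
from fastchat.serve.base_model_worker import BaseModelWorker


class EchoWorker(BaseModelWorker):
    """Toy worker that echoes the prompt back; illustrates the three hooks."""

    def generate_stream_gate(self, params):
        # Stream a single chunk; real workers yield many partial results.
        yield {"text": params["prompt"], "error_code": 0}

    def generate_gate(self, params):
        # Non-streaming variant: exhaust the stream and return the last chunk.
        chunk = None
        for chunk in self.generate_stream_gate(params):
            pass
        return chunk

    def get_embeddings(self, params):
        # This toy worker has no embedding model (ExLlama workers also lack one).
        raise NotImplementedError
```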
