Commit 679a98f

FEAT: Support orion series models (#933)
1 parent e16454a commit 679a98f

9 files changed: +275 -13 lines changed

doc/source/models/builtin/llm/index.rst

Lines changed: 4 additions & 0 deletions

@@ -81,6 +81,10 @@ The following is a list of built-in LLM in Xinference:

   orca

+  orion-chat
+
+  orion-chat-rag
+
   phi-2

   qwen-chat
doc/source/models/builtin/llm/orion-chat-rag.rst

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+.. _models_llm_orion-chat-rag:
+
+========================================
+orion-chat-rag
+========================================
+
+- **Context Length:** 4096
+- **Model Name:** orion-chat-rag
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 14
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** OrionStarAI/Orion-14B-Chat-RAG
+- **Model Hubs**: `Hugging Face <https://huggingface.co/OrionStarAI/Orion-14B-Chat-RAG>`_, `ModelScope <https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-RAG>`_
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name orion-chat-rag --size-in-billions 14 --model-format pytorch --quantization ${quantization}
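For readers who prefer the Python client over the CLI, a minimal sketch of the same launch plus a first chat call is shown below. It assumes a Xinference server already running at http://127.0.0.1:9997; the endpoint and the prompt text are illustrative, and the handle API follows the client conventions around this release.

```python
# Sketch only: launch orion-chat-rag through the Python client and chat once.
# Assumes a running server at the endpoint below (not part of this commit).
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="orion-chat-rag",
    model_format="pytorch",
    model_size_in_billions=14,
    quantization="none",  # or "4-bit" / "8-bit", per the spec above
)
model = client.get_model(model_uid)
response = model.chat(prompt="Summarize the retrieved passages in two sentences.")
print(response["choices"][0]["message"]["content"])
```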
doc/source/models/builtin/llm/orion-chat.rst

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+.. _models_llm_orion-chat:
+
+========================================
+orion-chat
+========================================
+
+- **Context Length:** 4096
+- **Model Name:** orion-chat
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 14
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** OrionStarAI/Orion-14B-Chat
+- **Model Hubs**: `Hugging Face <https://huggingface.co/OrionStarAI/Orion-14B-Chat>`_, `ModelScope <https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat>`_
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name orion-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (awq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4
+- **Model ID:** OrionStarAI/Orion-14B-Chat-{quantization}
+- **Model Hubs**: `Hugging Face <https://huggingface.co/OrionStarAI/Orion-14B-Chat-{quantization}>`_, `ModelScope <https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-{quantization}>`_
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name orion-chat --size-in-billions 14 --model-format awq --quantization ${quantization}
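Once either spec is launched, a multi-turn exchange looks like the sketch below. The chat_history message shape and the generate_config keys follow the client API around this release; the endpoint and the model UID are assumptions (use the UID returned by launch_model).

```python
# Sketch only: a follow-up question with explicit chat history.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
model = client.get_model("orion-chat")    # assumed UID from launch_model

history = [
    {"role": "user", "content": "Who trained the Orion-14B series?"},
    {"role": "assistant", "content": "OrionStarAI, from scratch."},
]
response = model.chat(
    prompt="Which languages does it support?",
    chat_history=history,
    generate_config={"max_tokens": 128},
)
print(response["choices"][0]["message"]["content"])
```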

xinference/model/llm/llm_family.json

Lines changed: 84 additions & 0 deletions

@@ -3253,5 +3253,89 @@
         "assistant"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat",
+        "model_revision": "ea6fb9b7e1917f3693935accbeb0bfecfd6552a7"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat-RAG",
+        "model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  }
 ]
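One detail worth noting in the awq spec: `model_id` contains a `{quantization}` placeholder, so the chosen quantization selects the actual repository (`Int4` resolves to `OrionStarAI/Orion-14B-Chat-Int4`). A tiny sketch of that substitution follows; the helper name is hypothetical, while the real equivalent happens inside the cache functions in `llm_family.py`.

```python
# Hypothetical helper mirroring how a templated model_id resolves;
# the real substitution is done inside llm_family.py's cache logic.
def resolve_model_id(model_id: str, quantization: str) -> str:
    return model_id.format(quantization=quantization)

assert (
    resolve_model_id("OrionStarAI/Orion-14B-Chat-{quantization}", "Int4")
    == "OrionStarAI/Orion-14B-Chat-Int4"
)
```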

xinference/model/llm/llm_family.py

Lines changed: 6 additions & 6 deletions

@@ -70,7 +70,7 @@ def validate_model_size_with_radix(cls, v: object) -> object:


 class PytorchLLMSpecV1(BaseModel):
-    model_format: Literal["pytorch", "gptq"]
+    model_format: Literal["pytorch", "gptq", "awq"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]

@@ -451,7 +451,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")

@@ -489,7 +489,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)

@@ -537,7 +537,7 @@ def cache_from_modelscope(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,

@@ -598,7 +598,7 @@ def cache_from_huggingface(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
         retry_download(
             huggingface_hub.snapshot_download,

@@ -679,7 +679,7 @@ def get_cache_status(
         ]
         return any(revisions)
     # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         ret = []
         for q in llm_spec.quantizations:
             assert q is not None
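The common thread of these six hunks is that `awq` now rides the existing gptq code paths: a full snapshot download per repo, plus a per-quantization meta file marking a completed download. The sketch below condenses the `_get_meta_path` branch as changed here; the non-huggingface file name is an assumption extrapolated from the pattern above, not copied from the source.

```python
import os

# Condensed sketch of the _get_meta_path branch after this commit: awq joins
# ggmlv3/ggufv2/gptq in the per-quantization meta-file branch.
def meta_path_sketch(cache_dir, model_format, model_hub, quantization=None):
    if model_format == "pytorch":
        if model_hub == "huggingface":
            return os.path.join(cache_dir, "__valid_download")
        return os.path.join(cache_dir, f"__valid_download_{model_hub}")
    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
        assert quantization is not None
        if model_hub == "huggingface":
            return os.path.join(cache_dir, f"__valid_download_{quantization}")
        # assumed naming for non-huggingface hubs, following the pattern above
        return os.path.join(cache_dir, f"__valid_download_{model_hub}_{quantization}")
    raise ValueError(f"Unsupported format: {model_format}")

print(meta_path_sketch("/cache/orion-chat-awq-14b", "awq", "huggingface", "Int4"))
# /cache/orion-chat-awq-14b/__valid_download_Int4
```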

xinference/model/llm/llm_family_modelscope.json

Lines changed: 85 additions & 0 deletions

@@ -1872,5 +1872,90 @@
         "assistant"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OrionStarAI/Orion-14B-Chat-RAG"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  }
 ]
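These entries duplicate the Hugging Face registrations with `"model_hub": "modelscope"`, so the same model names resolve when Xinference is told to download from ModelScope. If memory serves, the switch is the `XINFERENCE_MODEL_SRC` environment variable; the snippet below is a sketch of that usage, not code from this commit.

```python
import os

# Sketch: select ModelScope as the model source before the server process
# starts (set in the launching environment, not in a running worker).
os.environ["XINFERENCE_MODEL_SRC"] = "modelscope"
```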

xinference/model/llm/pytorch/core.py

Lines changed: 2 additions & 2 deletions

@@ -190,7 +190,7 @@ def load(self):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in [

@@ -408,7 +408,7 @@ def _sanitize_generate_config(
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_family.model_name in [
             "baichuan-chat",

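The two `match` hunks are the dispatch side of the change: a spec whose format is outside the accepted list never reaches the PyTorch loaders. A reduced sketch of that check, with the class machinery stripped away:

```python
# Reduced sketch of the pytorch/core.py dispatch check: awq specs are now
# routed to the PyTorch-based loaders instead of being rejected.
ACCEPTED = ["pytorch", "gptq", "awq"]

def matches_pytorch_backend(model_format: str) -> bool:
    return model_format in ACCEPTED

assert matches_pytorch_backend("awq")         # newly accepted by this commit
assert not matches_pytorch_backend("ggufv2")  # still handled by ggml backends
```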
xinference/model/llm/utils.py

Lines changed: 15 additions & 1 deletion

@@ -380,6 +380,20 @@ def get_prompt(
                 return f"USER: <<question>> {prompt} <<function>> {tools_string}\nASSISTANT: "
             else:
                 return f"USER: <<question>> {prompt}\nASSISTANT: "
+        elif prompt_style.style_name == "orion":
+            ret = "<s>"
+            for i, message in enumerate(chat_history):
+                content = message["content"]
+                role = message["role"]
+                if i % 2 == 0:  # Human
+                    assert content is not None
+                    ret += role + ": " + content + "\n\n"
+                else:  # Assistant
+                    if content:
+                        ret += role + ": </s>" + content + "</s>"
+                    else:
+                        ret += role + ": </s>"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

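To see what the new `orion` branch actually emits, here is a standalone worked example on a minimal history. The roles are already mapped to the style's names ("Human"/"assistant") by the time this branch runs, and the trailing empty assistant message is the generation slot; the inputs are illustrative.

```python
# Standalone reproduction of the "orion" prompt branch on sample inputs.
chat_history = [
    {"role": "Human", "content": "What is Orion-14B?"},
    {"role": "assistant", "content": "An open-source multilingual LLM."},
    {"role": "Human", "content": "Who trained it?"},
    {"role": "assistant", "content": ""},  # empty slot: the model replies here
]

ret = "<s>"
for i, message in enumerate(chat_history):
    content = message["content"]
    role = message["role"]
    if i % 2 == 0:  # Human turns
        ret += role + ": " + content + "\n\n"
    else:  # assistant turns; "</s>" closes the preceding exchange
        if content:
            ret += role + ": </s>" + content + "</s>"
        else:
            ret += role + ": </s>"

print(repr(ret))
# '<s>Human: What is Orion-14B?\n\nassistant: </s>An open-source multilingual LLM.</s>Human: Who trained it?\n\nassistant: </s>'
```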
@@ -597,7 +611,7 @@ def get_file_location(
     is_cached = cache_status
     assert isinstance(is_cached, bool)

-    if spec.model_format in ["pytorch", "gptq"]:
+    if spec.model_format in ["pytorch", "gptq", "awq"]:
         return cache_dir, is_cached
     elif spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(spec, GgmlLLMSpecV1)
