Commit 679a98f

FEAT: Support orion series models (#933)
1 parent e16454a commit 679a98f

9 files changed: +275 -13 lines changed

doc/source/models/builtin/llm/index.rst

Lines changed: 4 additions & 0 deletions

@@ -81,6 +81,10 @@ The following is a list of built-in LLM in Xinference:

   orca

+  orion-chat
+
+  orion-chat-rag
+
   phi-2

   qwen-chat
doc/source/models/builtin/llm/orion-chat-rag.rst

Lines changed: 30 additions & 0 deletions

@@ -0,0 +1,30 @@
+.. _models_llm_orion-chat-rag:
+
+========================================
+orion-chat-rag
+========================================
+
+- **Context Length:** 4096
+- **Model Name:** orion-chat-rag
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 14
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** OrionStarAI/Orion-14B-Chat-RAG
+- **Model Hubs**: `Hugging Face <https://huggingface.co/OrionStarAI/Orion-14B-Chat-RAG>`_, `ModelScope <https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-RAG>`_
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name orion-chat-rag --size-in-billions 14 --model-format pytorch --quantization ${quantization}
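For readers who prefer the Python client over the CLI, a minimal sketch of the same launch plus a first chat call is shown below. It assumes a Xinference server already running at http://127.0.0.1:9997; the endpoint and the prompt text are illustrative, and the handle API follows the client conventions around this release.

```python
# Sketch only: launch orion-chat-rag through the Python client and chat once.
# Assumes a running server at the endpoint below (not part of this commit).
from xinference.client import Client

client = Client("http://127.0.0.1:9997")
model_uid = client.launch_model(
    model_name="orion-chat-rag",
    model_format="pytorch",
    model_size_in_billions=14,
    quantization="none",  # or "4-bit" / "8-bit", per the spec above
)
model = client.get_model(model_uid)
response = model.chat(prompt="Summarize the retrieved passages in two sentences.")
print(response["choices"][0]["message"]["content"])
```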
doc/source/models/builtin/llm/orion-chat.rst

Lines changed: 45 additions & 0 deletions

@@ -0,0 +1,45 @@
+.. _models_llm_orion-chat:
+
+========================================
+orion-chat
+========================================
+
+- **Context Length:** 4096
+- **Model Name:** orion-chat
+- **Languages:** en, zh
+- **Abilities:** chat
+- **Description:** Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.
+
+Specifications
+^^^^^^^^^^^^^^
+
+
+Model Spec 1 (pytorch, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** pytorch
+- **Model Size (in billions):** 14
+- **Quantizations:** none, 4-bit, 8-bit
+- **Model ID:** OrionStarAI/Orion-14B-Chat
+- **Model Hubs**: `Hugging Face <https://huggingface.co/OrionStarAI/Orion-14B-Chat>`_, `ModelScope <https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat>`_
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name orion-chat --size-in-billions 14 --model-format pytorch --quantization ${quantization}
+
+
+Model Spec 2 (awq, 14 Billion)
+++++++++++++++++++++++++++++++++++++++++
+
+- **Model Format:** awq
+- **Model Size (in billions):** 14
+- **Quantizations:** Int4
+- **Model ID:** OrionStarAI/Orion-14B-Chat-{quantization}
+- **Model Hubs**: `Hugging Face <https://huggingface.co/OrionStarAI/Orion-14B-Chat-{quantization}>`_, `ModelScope <https://modelscope.cn/models/OrionStarAI/Orion-14B-Chat-{quantization}>`_
+
+Execute the following command to launch the model, remember to replace ``${quantization}`` with your
+chosen quantization method from the options listed above::
+
+   xinference launch --model-name orion-chat --size-in-billions 14 --model-format awq --quantization ${quantization}
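Once either spec is launched, a multi-turn exchange looks like the sketch below. The chat_history message shape and the generate_config keys follow the client API around this release; the endpoint and the model UID are assumptions (use the UID returned by launch_model).

```python
# Sketch only: a follow-up question with explicit chat history.
from xinference.client import Client

client = Client("http://127.0.0.1:9997")  # assumed endpoint
model = client.get_model("orion-chat")    # assumed UID from launch_model

history = [
    {"role": "user", "content": "Who trained the Orion-14B series?"},
    {"role": "assistant", "content": "OrionStarAI, from scratch."},
]
response = model.chat(
    prompt="Which languages does it support?",
    chat_history=history,
    generate_config={"max_tokens": 128},
)
print(response["choices"][0]["message"]["content"])
```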

xinference/model/llm/llm_family.json

Lines changed: 84 additions & 0 deletions

@@ -3253,5 +3253,89 @@
         "assistant"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat",
+        "model_revision": "ea6fb9b7e1917f3693935accbeb0bfecfd6552a7"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat-RAG",
+        "model_revision": "eba2e20808407fb431a76b90d5d506e04a0325f2"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  }
 ]
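One detail worth noting in the awq spec: `model_id` contains a `{quantization}` placeholder, so the chosen quantization selects the actual repository (`Int4` resolves to `OrionStarAI/Orion-14B-Chat-Int4`). A tiny sketch of that substitution follows; the helper name is hypothetical, while the real equivalent happens inside the cache functions in `llm_family.py`.

```python
# Hypothetical helper mirroring how a templated model_id resolves;
# the real substitution is done inside llm_family.py's cache logic.
def resolve_model_id(model_id: str, quantization: str) -> str:
    return model_id.format(quantization=quantization)

assert (
    resolve_model_id("OrionStarAI/Orion-14B-Chat-{quantization}", "Int4")
    == "OrionStarAI/Orion-14B-Chat-Int4"
)
```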

xinference/model/llm/llm_family.py

Lines changed: 6 additions & 6 deletions

@@ -70,7 +70,7 @@ def validate_model_size_with_radix(cls, v: object) -> object:


 class PytorchLLMSpecV1(BaseModel):
-    model_format: Literal["pytorch", "gptq"]
+    model_format: Literal["pytorch", "gptq", "awq"]
     # Must in order that `str` first, then `int`
     model_size_in_billions: Union[str, int]
     quantizations: List[str]

@@ -451,7 +451,7 @@ def _get_meta_path(
             return os.path.join(cache_dir, "__valid_download")
         else:
             return os.path.join(cache_dir, f"__valid_download_{model_hub}")
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         if model_hub == "huggingface":
             return os.path.join(cache_dir, f"__valid_download_{quantization}")

@@ -489,7 +489,7 @@ def _skip_download(
             logger.warning(f"Cache {cache_dir} exists, but it was from {hub}")
             return True
         return False
-    elif model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         assert quantization is not None
         return os.path.exists(
             _get_meta_path(cache_dir, model_format, model_hub, quantization)

@@ -537,7 +537,7 @@ def cache_from_modelscope(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         download_dir = retry_download(
             snapshot_download,
             llm_family.model_name,

@@ -598,7 +598,7 @@ def cache_from_huggingface(
     ):
         return cache_dir

-    if llm_spec.model_format in ["pytorch", "gptq"]:
+    if llm_spec.model_format in ["pytorch", "gptq", "awq"]:
         assert isinstance(llm_spec, PytorchLLMSpecV1)
         retry_download(
             huggingface_hub.snapshot_download,

@@ -679,7 +679,7 @@ def get_cache_status(
         ]
         return any(revisions)
     # just check meta file for ggml and gptq model
-    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq"]:
+    elif llm_spec.model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
         ret = []
         for q in llm_spec.quantizations:
             assert q is not None
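The common thread of these six hunks is that `awq` now rides the existing gptq code paths: a full snapshot download per repo, plus a per-quantization meta file marking a completed download. The sketch below condenses the `_get_meta_path` branch as changed here; the non-huggingface file name is an assumption extrapolated from the pattern above, not copied from the source.

```python
import os

# Condensed sketch of the _get_meta_path branch after this commit: awq joins
# ggmlv3/ggufv2/gptq in the per-quantization meta-file branch.
def meta_path_sketch(cache_dir, model_format, model_hub, quantization=None):
    if model_format == "pytorch":
        if model_hub == "huggingface":
            return os.path.join(cache_dir, "__valid_download")
        return os.path.join(cache_dir, f"__valid_download_{model_hub}")
    elif model_format in ["ggmlv3", "ggufv2", "gptq", "awq"]:
        assert quantization is not None
        if model_hub == "huggingface":
            return os.path.join(cache_dir, f"__valid_download_{quantization}")
        # assumed naming for non-huggingface hubs, following the pattern above
        return os.path.join(cache_dir, f"__valid_download_{model_hub}_{quantization}")
    raise ValueError(f"Unsupported format: {model_format}")

print(meta_path_sketch("/cache/orion-chat-awq-14b", "awq", "huggingface", "Int4"))
# /cache/orion-chat-awq-14b/__valid_download_Int4
```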

xinference/model/llm/llm_family_modelscope.json

Lines changed: 85 additions & 0 deletions

@@ -1872,5 +1872,90 @@
         "assistant"
       ]
     }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_id": "OrionStarAI/Orion-14B-Chat",
+        "model_hub": "modelscope"
+      },
+      {
+        "model_format": "awq",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "Int4"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OrionStarAI/Orion-14B-Chat-{quantization}"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  },
+  {
+    "version": 1,
+    "context_length": 4096,
+    "model_name": "orion-chat-rag",
+    "model_lang": [
+      "en",
+      "zh"
+    ],
+    "model_ability": [
+      "chat"
+    ],
+    "model_description": "Orion-14B series models are open-source multilingual large language models trained from scratch by OrionStarAI.",
+    "model_specs": [
+      {
+        "model_format": "pytorch",
+        "model_size_in_billions": 14,
+        "quantizations": [
+          "none",
+          "4-bit",
+          "8-bit"
+        ],
+        "model_hub": "modelscope",
+        "model_id": "OrionStarAI/Orion-14B-Chat-RAG"
+      }
+    ],
+    "prompt_style": {
+      "style_name": "orion",
+      "roles": [
+        "Human",
+        "assistant"
+      ],
+      "stop": [
+        "<s>",
+        "</s>",
+        "<unk>"
+      ]
+    }
+  }
 ]
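These entries duplicate the Hugging Face registrations with `"model_hub": "modelscope"`, so the same model names resolve when Xinference is told to download from ModelScope. If memory serves, the switch is the `XINFERENCE_MODEL_SRC` environment variable; the snippet below is a sketch of that usage, not code from this commit.

```python
import os

# Sketch: select ModelScope as the model source before the server process
# starts (set in the launching environment, not in a running worker).
os.environ["XINFERENCE_MODEL_SRC"] = "modelscope"
```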

xinference/model/llm/pytorch/core.py

Lines changed: 2 additions & 2 deletions

@@ -190,7 +190,7 @@ def load(self):
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         model_family = llm_family.model_family or llm_family.model_name
         if model_family in [

@@ -408,7 +408,7 @@ def _sanitize_generate_config(
     def match(
         cls, llm_family: "LLMFamilyV1", llm_spec: "LLMSpecV1", quantization: str
     ) -> bool:
-        if llm_spec.model_format not in ["pytorch", "gptq"]:
+        if llm_spec.model_format not in ["pytorch", "gptq", "awq"]:
             return False
         if llm_family.model_name in [
             "baichuan-chat",

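The two `match` hunks are the dispatch side of the change: a spec whose format is outside the accepted list never reaches the PyTorch loaders. A reduced sketch of that check, with the class machinery stripped away:

```python
# Reduced sketch of the pytorch/core.py dispatch check: awq specs are now
# routed to the PyTorch-based loaders instead of being rejected.
ACCEPTED = ["pytorch", "gptq", "awq"]

def matches_pytorch_backend(model_format: str) -> bool:
    return model_format in ACCEPTED

assert matches_pytorch_backend("awq")         # newly accepted by this commit
assert not matches_pytorch_backend("ggufv2")  # still handled by ggml backends
```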
xinference/model/llm/utils.py

Lines changed: 15 additions & 1 deletion

@@ -380,6 +380,20 @@ def get_prompt(
                 return f"USER: <<question>> {prompt} <<function>> {tools_string}\nASSISTANT: "
             else:
                 return f"USER: <<question>> {prompt}\nASSISTANT: "
+        elif prompt_style.style_name == "orion":
+            ret = "<s>"
+            for i, message in enumerate(chat_history):
+                content = message["content"]
+                role = message["role"]
+                if i % 2 == 0:  # Human
+                    assert content is not None
+                    ret += role + ": " + content + "\n\n"
+                else:  # Assistant
+                    if content:
+                        ret += role + ": </s>" + content + "</s>"
+                    else:
+                        ret += role + ": </s>"
+            return ret
         else:
             raise ValueError(f"Invalid prompt style: {prompt_style.style_name}")

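To see what the new `orion` branch actually emits, here is a standalone worked example on a minimal history. The roles are already mapped to the style's names ("Human"/"assistant") by the time this branch runs, and the trailing empty assistant message is the generation slot; the inputs are illustrative.

```python
# Standalone reproduction of the "orion" prompt branch on sample inputs.
chat_history = [
    {"role": "Human", "content": "What is Orion-14B?"},
    {"role": "assistant", "content": "An open-source multilingual LLM."},
    {"role": "Human", "content": "Who trained it?"},
    {"role": "assistant", "content": ""},  # empty slot: the model replies here
]

ret = "<s>"
for i, message in enumerate(chat_history):
    content = message["content"]
    role = message["role"]
    if i % 2 == 0:  # Human turns
        ret += role + ": " + content + "\n\n"
    else:  # assistant turns; "</s>" closes the preceding exchange
        if content:
            ret += role + ": </s>" + content + "</s>"
        else:
            ret += role + ": </s>"

print(repr(ret))
# '<s>Human: What is Orion-14B?\n\nassistant: </s>An open-source multilingual LLM.</s>Human: Who trained it?\n\nassistant: </s>'
```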
@@ -597,7 +611,7 @@ def get_file_location(
     is_cached = cache_status
     assert isinstance(is_cached, bool)

-    if spec.model_format in ["pytorch", "gptq"]:
+    if spec.model_format in ["pytorch", "gptq", "awq"]:
         return cache_dir, is_cached
     elif spec.model_format in ["ggmlv3", "ggufv2"]:
         assert isinstance(spec, GgmlLLMSpecV1)
