
Commit 1b094ba

Bump the versions, new models support (#463)
* up the versions
* fixing starcoder2 flash sa
* integrate groq / cerebras to the self-hosting (#466)
* qwen2.5 models
* upd README.md
* a warning
* get rid of the autogptq models
* version 1.8.0
* deprecated versions in the readme
* add completion support for the passthrough models
* add multiline_code_completion_default_model
* _select_default_lora_if_exists for multiline_code_completion_default_model
* _add_results_for_passthrough_provider fix
* rm deepseek-coder-v2/16b/instruct
* MAX_JOBS=8
* gpt-4 is unavailable
1 parent 31ed965 commit 1b094ba

19 files changed: +414 additions, -88 deletions

Dockerfile.base

Lines changed: 10 additions & 6 deletions
@@ -1,4 +1,4 @@
-FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04
+FROM nvidia/cuda:12.4.1-cudnn-devel-ubuntu22.04
 
 ENV INSTALL_OPTIONAL=TRUE
 ENV MAX_JOBS=8
@@ -13,24 +13,28 @@ RUN DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC apt-get install -y \
     ruby-full \
     ruby-bundler \
     build-essential \
-    cmake \
     pkg-config \
     libicu-dev \
     zlib1g-dev \
     libcurl4-openssl-dev \
     libssl-dev \
     && rm -rf /var/lib/{apt,dpkg,cache,log}
+RUN DEBIAN_FRONTEND="noninteractive" TZ=Etc/UTC apt remove cmake -y
+RUN pip install cmake --upgrade
+
 RUN git clone https://github.com/smallcloudai/linguist.git /tmp/linguist \
     && cd /tmp/linguist \
     && bundle install \
     && rake build_gem
 ENV PATH="${PATH}:/tmp/linguist/bin"
 
-RUN pip install --no-cache-dir torch==2.3.0 --index-url https://download.pytorch.org/whl/cu118
-RUN pip install --no-cache-dir xformers==0.0.26.post1 --index-url https://download.pytorch.org/whl/cu118
+RUN pip install --no-cache-dir torch==2.5.0
+RUN pip install --no-cache-dir xformers==v0.0.28.post2
 RUN pip install ninja
-RUN VLLM_INSTALL_PUNICA_KERNELS=1 pip install -v --no-build-isolation git+https://github.com/smallcloudai/vllm@refact_v0.4.2_06052024
+RUN pip install setuptools_scm
+ENV CMAKE_ARGS="-DLLAMA_CUBLAS=on -DCMAKE_CUDA_ARCHITECTURES=60;61;70;75;80;86;89;90+PTX"
+RUN pip install -v --no-build-isolation git+https://github.com/smallcloudai/vllm@refact_v0.6.3_2adb440
 
-# there is no prebuild auto-gptq with torch 2.3.0 support
+# there is no prebuild auto-gptq with torch 2.5.0 support
 ENV TORCH_CUDA_ARCH_LIST="6.0;6.1;7.0;7.5;8.0;8.6;8.9;9.0+PTX"
 RUN BUILD_CUDA_EXT=1 pip install -v --no-build-isolation git+https://github.com/PanQiWei/[email protected]
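The notable moves here: the CUDA base image goes from 11.8 to 12.4, torch from 2.3.0 (cu118 wheels) to 2.5.0, xformers to 0.0.28.post2, the vLLM fork to a refact_v0.6.3 branch, and cmake now comes from pip (after removing the apt package) so the source builds below get a newer release than Ubuntu 22.04 ships. A quick sanity check for the rebuilt image, a minimal sketch assuming only the torch pin above (the expected values reflect what the default torch 2.5.0 wheel ships, not something this commit asserts):

# sanity_check.py: run inside the rebuilt container to confirm the torch upgrade
# (illustrative helper, not part of the repo)
import torch

print("torch:", torch.__version__)            # expected: 2.5.0
print("built for CUDA:", torch.version.cuda)  # expected: 12.x
print("GPU visible:", torch.cuda.is_available())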

README.md

Lines changed: 29 additions & 15 deletions
@@ -103,21 +103,35 @@ Extensions > Refact.ai Assistant > Settings > Infurl
 
 ## Supported models
 
-| Model | Completion | Chat | Fine-tuning | [Deprecated](## "Will be removed in next versions") |
-|-------|------------|------|-------------|-----------------------------------------------------|
-| [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim) | + | | + | |
-| [starcoder2/3b/base](https://huggingface.co/bigcode/starcoder2-3b) | + | | + | |
-| [starcoder2/7b/base](https://huggingface.co/bigcode/starcoder2-7b) | + | | + | |
-| [starcoder2/15b/base](https://huggingface.co/bigcode/starcoder2-15b) | + | | + | |
-| [deepseek-coder/1.3b/base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base) | + | | + | |
-| [deepseek-coder/5.7b/mqa-base](https://huggingface.co/deepseek-ai/deepseek-coder-5.7bmqa-base) | + | | + | |
-| [magicoder/6.7b](https://huggingface.co/TheBloke/Magicoder-S-DS-6.7B-GPTQ) | | + | | |
-| [mistral/7b/instruct-v0.1](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ) | | + | | |
-| [mixtral/8x7b/instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | | + | | |
-| [deepseek-coder/6.7b/instruct](https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GPTQ) | | + | | |
-| [deepseek-coder/33b/instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) | | + | | |
-| [stable/3b/code](https://huggingface.co/stabilityai/stable-code-3b) | + | | | |
-| [llama3/8b/instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | | + | | |
+| Model | Completion | Chat | Fine-tuning | [Deprecated](## "Will be removed in next versions") |
+|-------|------------|------|-------------|-----------------------------------------------------|
+| [Refact/1.6B](https://huggingface.co/smallcloudai/Refact-1_6B-fim) | + | | + | |
+| [starcoder2/3b/base](https://huggingface.co/bigcode/starcoder2-3b) | + | | + | |
+| [starcoder2/7b/base](https://huggingface.co/bigcode/starcoder2-7b) | + | | + | |
+| [starcoder2/15b/base](https://huggingface.co/bigcode/starcoder2-15b) | + | | + | |
+| [deepseek-coder/1.3b/base](https://huggingface.co/deepseek-ai/deepseek-coder-1.3b-base) | + | | + | |
+| [deepseek-coder/5.7b/mqa-base](https://huggingface.co/deepseek-ai/deepseek-coder-5.7bmqa-base) | + | | + | |
+| [magicoder/6.7b](https://huggingface.co/TheBloke/Magicoder-S-DS-6.7B-GPTQ) | | + | | + |
+| [mistral/7b/instruct-v0.1](https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GPTQ) | | + | | + |
+| [mixtral/8x7b/instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) | | + | | |
+| [deepseek-coder/6.7b/instruct](https://huggingface.co/TheBloke/deepseek-coder-6.7B-instruct-GPTQ) | | + | | + |
+| [deepseek-coder/33b/instruct](https://huggingface.co/deepseek-ai/deepseek-coder-33b-instruct) | | + | | |
+| [stable/3b/code](https://huggingface.co/stabilityai/stable-code-3b) | + | | | |
+| [llama3/8b/instruct](https://huggingface.co/meta-llama/Meta-Llama-3-8B-Instruct) | + | + | | |
+| [llama3.1/8b/instruct](https://huggingface.co/meta-llama/Meta-Llama-3.1-8B-Instruct) | + | + | | |
+| [llama3.2/1b/instruct](https://huggingface.co/meta-llama/Llama-3.2-1B-Instruct) | + | + | | |
+| [llama3.2/3b/instruct](https://huggingface.co/meta-llama/Llama-3.2-3B-Instruct) | + | + | | |
+| [qwen2.5/coder/0.5b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-0.5B) | + | | + | |
+| [qwen2.5/coder/1.5b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B) | + | | + | |
+| [qwen2.5/coder/3b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-3B) | + | | + | |
+| [qwen2.5/coder/7b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-7B) | + | | + | |
+| [qwen2.5/coder/14b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-14B) | + | | + | |
+| [qwen2.5/coder/32b/base](https://huggingface.co/Qwen/Qwen2.5-Coder-32B) | + | | + | |
+| [qwen2.5/coder/1.5b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-1.5B-Instruct) | + | + | | |
+| [qwen2.5/coder/3b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-3B-Instruct) | + | + | | |
+| [qwen2.5/coder/7b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-7B-Instruct) | + | + | | |
+| [qwen2.5/coder/14b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-14B-Instruct) | + | + | | |
+| [qwen2.5/coder/32b/instruct](https://huggingface.co/Qwen/Qwen2.5-Coder-32B-Instruct) | + | + | | |
 
 ## Usage
 

refact_known_models/huggingface.py

Lines changed: 117 additions & 4 deletions
@@ -22,6 +22,7 @@
         "required_memory_mb": 8000,
         "T": 4096, # in fact this model allows 16k context, but we have 4k context at max in hf inference
         "filter_caps": ["chat"],
+        "deprecated": True
     },
     "mistral/7b/instruct-v0.1": {
         "backend": "autogptq",
@@ -30,6 +31,7 @@
         "required_memory_mb": 8000,
         "T": 4096, # in fact this model allows 8k context, but we have 4k context at max in hf inference
         "filter_caps": ["chat"],
+        "deprecated": True
     },
     "mixtral/8x7b/instruct-v0.1": {
         "backend": "transformers",
@@ -50,6 +52,7 @@
         "required_memory_mb": 8000,
         "T": 4096, # in fact this model allows 16k context, but we have 4k context at max in hf inference
         "filter_caps": ["chat"],
+        "deprecated": True
     },
     "deepseek-coder/33b/instruct": {
         "backend": "transformers",
@@ -113,16 +116,126 @@
         },
         "required_memory_mb": 20000,
         "T": 8192,
-        "filter_caps": ["chat"],
+        "filter_caps": ["completion", "chat"],
+    },
+    "llama3.1/8b/instruct": {
+        "backend": "transformers",
+        "model_path": "meta-llama/Llama-3.1-8B-Instruct",
+        "model_class_kwargs": {
+            "torch_dtype": "bf16",
+        },
+        "required_memory_mb": 20000,
+        "T": 16384, # in fact this model can handle 128K context
+        "filter_caps": ["completion", "chat"],
+    },
+    "llama3.2/3b/instruct": {
+        "backend": "transformers",
+        "model_path": "meta-llama/Llama-3.2-3B-Instruct",
+        "model_class_kwargs": {
+            "torch_dtype": "bf16",
+        },
+        "required_memory_mb": 12000,
+        "T": 16384, # in fact this model can handle 128K context
+        "filter_caps": ["completion", "chat"],
     },
-    "deepseek-coder-v2/16b/instruct": {
+    "llama3.2/1b/instruct": {
         "backend": "transformers",
-        "model_path": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct",
+        "model_path": "meta-llama/Llama-3.2-1B-Instruct",
         "model_class_kwargs": {
             "torch_dtype": "bf16",
         },
-        "required_memory_mb": 80000,
+        "required_memory_mb": 8000,
         "T": 16384, # in fact this model can handle 128K context
         "filter_caps": ["completion", "chat"],
     },
+    # qwen 2.5-coder instruct models
+    "qwen2.5/coder/32b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-32B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/14b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-14B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/7b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-7B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/3b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-3B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    "qwen2.5/coder/1.5b/instruct": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-1.5B-Instruct",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "chat"],
+    },
+    # qwen 2.5-coder completion models
+    "qwen2.5/coder/32b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-32B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 45000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/14b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-14B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 35000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/7b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-7B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 20000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/3b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-3B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 15000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/1.5b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-1.5B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 10000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
+    "qwen2.5/coder/0.5b/base": {
+        "backend": "transformers",
+        "model_path": "Qwen/Qwen2.5-Coder-0.5B",
+        "model_class_kwargs": {},
+        "required_memory_mb": 7000,
+        "T": 32768,
+        "filter_caps": ["completion", "finetune"],
+    },
 }
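Every entry in this registry has the same shape: a backend, a Hugging Face model_path, optional model_class_kwargs, a memory estimate, a context limit T, filter_caps naming what the model is used for, and now an optional deprecated flag. A minimal sketch of how a consumer of such a dict might select usable models for a capability; the helper name and the example dict are illustrative, not the repo's actual API:

from typing import Any, Dict, List

def models_with_cap(db: Dict[str, Dict[str, Any]], cap: str) -> List[str]:
    # Keep models that advertise the capability and are not flagged deprecated
    # (hypothetical helper, shown only to explain the entry shape above).
    return [
        name for name, rec in db.items()
        if cap in rec.get("filter_caps", []) and not rec.get("deprecated", False)
    ]

# Example with two entries shaped like the ones in this file:
example_db = {
    "qwen2.5/coder/7b/base": {"filter_caps": ["completion", "finetune"]},
    "mistral/7b/instruct-v0.1": {"filter_caps": ["chat"], "deprecated": True},
}
print(models_with_cap(example_db, "completion"))  # ['qwen2.5/coder/7b/base']
print(models_with_cap(example_db, "chat"))        # [] (deprecated entries are skipped)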
