|
22 | 22 | "required_memory_mb": 8000,
|
23 | 23 | "T": 4096, # in fact this model allows 16k context, but we have 4k context at max in hf inference
|
24 | 24 | "filter_caps": ["chat"],
|
| 25 | + "deprecated": True |
25 | 26 | },
|
26 | 27 | "mistral/7b/instruct-v0.1": {
|
27 | 28 | "backend": "autogptq",
|
|
30 | 31 | "required_memory_mb": 8000,
|
31 | 32 | "T": 4096, # in fact this model allows 8k context, but we have 4k context at max in hf inference
|
32 | 33 | "filter_caps": ["chat"],
|
| 34 | + "deprecated": True |
33 | 35 | },
|
34 | 36 | "mixtral/8x7b/instruct-v0.1": {
|
35 | 37 | "backend": "transformers",
|
|
50 | 52 | "required_memory_mb": 8000,
|
51 | 53 | "T": 4096, # in fact this model allows 16k context, but we have 4k context at max in hf inference
|
52 | 54 | "filter_caps": ["chat"],
|
| 55 | + "deprecated": True |
53 | 56 | },
|
54 | 57 | "deepseek-coder/33b/instruct": {
|
55 | 58 | "backend": "transformers",
|
|
113 | 116 | },
|
114 | 117 | "required_memory_mb": 20000,
|
115 | 118 | "T": 8192,
|
116 |
| - "filter_caps": ["chat"], |
| 119 | + "filter_caps": ["completion", "chat"], |
| 120 | + }, |
| 121 | + "llama3.1/8b/instruct": { |
| 122 | + "backend": "transformers", |
| 123 | + "model_path": "meta-llama/Llama-3.1-8B-Instruct", |
| 124 | + "model_class_kwargs": { |
| 125 | + "torch_dtype": "bf16", |
| 126 | + }, |
| 127 | + "required_memory_mb": 20000, |
| 128 | + "T": 16384, # context limit is set to 16K here; the model itself can handle 128K context |
| 129 | + "filter_caps": ["completion", "chat"], |
| 130 | + }, |
| 131 | + "llama3.2/3b/instruct": { |
| 132 | + "backend": "transformers", |
| 133 | + "model_path": "meta-llama/Llama-3.2-3B-Instruct", |
| 134 | + "model_class_kwargs": { |
| 135 | + "torch_dtype": "bf16", |
| 136 | + }, |
| 137 | + "required_memory_mb": 12000, |
| 138 | + "T": 16384, # context limit is set to 16K here; the model itself can handle 128K context |
| 139 | + "filter_caps": ["completion", "chat"], |
117 | 140 | },
|
118 |
| - "deepseek-coder-v2/16b/instruct": { |
| 141 | + "llama3.2/1b/instruct": { |
119 | 142 | "backend": "transformers",
|
120 |
| - "model_path": "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct", |
| 143 | + "model_path": "meta-llama/Llama-3.2-1B-Instruct", |
121 | 144 | "model_class_kwargs": {
|
122 | 145 | "torch_dtype": "bf16",
|
123 | 146 | },
|
124 |
| - "required_memory_mb": 80000, |
| 147 | + "required_memory_mb": 8000, |
125 | 148 | "T": 16384, # in fact this model can handle 128K context
|
126 | 149 | "filter_caps": ["completion", "chat"],
|
127 | 150 | },
|
| 151 | + # qwen 2.5-coder instruct models |
| 152 | + "qwen2.5/coder/32b/instruct": { |
| 153 | + "backend": "transformers", |
| 154 | + "model_path": "Qwen/Qwen2.5-Coder-32B-Instruct", |
| 155 | + "model_class_kwargs": {}, |
| 156 | + "required_memory_mb": 45000, |
| 157 | + "T": 32768, |
| 158 | + "filter_caps": ["completion", "chat"], |
| 159 | + }, |
| 160 | + "qwen2.5/coder/14b/instruct": { |
| 161 | + "backend": "transformers", |
| 162 | + "model_path": "Qwen/Qwen2.5-Coder-14B-Instruct", |
| 163 | + "model_class_kwargs": {}, |
| 164 | + "required_memory_mb": 45000, |
| 165 | + "T": 32768, |
| 166 | + "filter_caps": ["completion", "chat"], |
| 167 | + }, |
| 168 | + "qwen2.5/coder/7b/instruct": { |
| 169 | + "backend": "transformers", |
| 170 | + "model_path": "Qwen/Qwen2.5-Coder-7B-Instruct", |
| 171 | + "model_class_kwargs": {}, |
| 172 | + "required_memory_mb": 45000, |
| 173 | + "T": 32768, |
| 174 | + "filter_caps": ["completion", "chat"], |
| 175 | + }, |
| 176 | + "qwen2.5/coder/3b/instruct": { |
| 177 | + "backend": "transformers", |
| 178 | + "model_path": "Qwen/Qwen2.5-Coder-3B-Instruct", |
| 179 | + "model_class_kwargs": {}, |
| 180 | + "required_memory_mb": 45000, |
| 181 | + "T": 32768, |
| 182 | + "filter_caps": ["completion", "chat"], |
| 183 | + }, |
| 184 | + "qwen2.5/coder/1.5b/instruct": { |
| 185 | + "backend": "transformers", |
| 186 | + "model_path": "Qwen/Qwen2.5-Coder-1.5B-Instruct", |
| 187 | + "model_class_kwargs": {}, |
| 188 | + "required_memory_mb": 45000, |
| 189 | + "T": 32768, |
| 190 | + "filter_caps": ["completion", "chat"], |
| 191 | + }, |
| 192 | + # Qwen2.5-Coder base models: exposed for code completion and fine-tuning (no chat capability) |
| 193 | + "qwen2.5/coder/32b/base": { |
| 194 | + "backend": "transformers", |
| 195 | + "model_path": "Qwen/Qwen2.5-Coder-32B", |
| 196 | + "model_class_kwargs": {}, |
| 197 | + "required_memory_mb": 45000, |
| 198 | + "T": 32768, |
| 199 | + "filter_caps": ["completion", "finetune"], |
| 200 | + }, |
| 201 | + "qwen2.5/coder/14b/base": { |
| 202 | + "backend": "transformers", |
| 203 | + "model_path": "Qwen/Qwen2.5-Coder-14B", |
| 204 | + "model_class_kwargs": {}, |
| 205 | + "required_memory_mb": 35000, |
| 206 | + "T": 32768, |
| 207 | + "filter_caps": ["completion", "finetune"], |
| 208 | + }, |
| 209 | + "qwen2.5/coder/7b/base": { |
| 210 | + "backend": "transformers", |
| 211 | + "model_path": "Qwen/Qwen2.5-Coder-7B", |
| 212 | + "model_class_kwargs": {}, |
| 213 | + "required_memory_mb": 20000, |
| 214 | + "T": 32768, |
| 215 | + "filter_caps": ["completion", "finetune"], |
| 216 | + }, |
| 217 | + "qwen2.5/coder/3b/base": { |
| 218 | + "backend": "transformers", |
| 219 | + "model_path": "Qwen/Qwen2.5-Coder-3B", |
| 220 | + "model_class_kwargs": {}, |
| 221 | + "required_memory_mb": 15000, |
| 222 | + "T": 32768, |
| 223 | + "filter_caps": ["completion", "finetune"], |
| 224 | + }, |
| 225 | + "qwen2.5/coder/1.5b/base": { |
| 226 | + "backend": "transformers", |
| 227 | + "model_path": "Qwen/Qwen2.5-Coder-1.5B", |
| 228 | + "model_class_kwargs": {}, |
| 229 | + "required_memory_mb": 10000, |
| 230 | + "T": 32768, |
| 231 | + "filter_caps": ["completion", "finetune"], |
| 232 | + }, |
| 233 | + "qwen2.5/coder/0.5b/base": { |
| 234 | + "backend": "transformers", |
| 235 | + "model_path": "Qwen/Qwen2.5-Coder-0.5B", |
| 236 | + "model_class_kwargs": {}, |
| 237 | + "required_memory_mb": 7000, |
| 238 | + "T": 32768, |
| 239 | + "filter_caps": ["completion", "finetune"], |
| 240 | + }, |
128 | 241 | }
|
0 commit comments