@@ -1062,6 +1062,17 @@ def from_cli_args(cls, args: argparse.Namespace):
         return engine_args
 
     def create_model_config(self) -> ModelConfig:
+        # gguf file needs a specific model loader and doesn't use hf_repo
+        if check_gguf_file(self.model):
+            self.quantization = self.load_format = "gguf"
+
+        # NOTE: This is to allow model loading from S3 in CI
+        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
+                and self.model in MODELS_ON_S3
+                and self.load_format == LoadFormat.AUTO):  # noqa: E501
+            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
+            self.load_format = LoadFormat.RUNAI_STREAMER
+
         return ModelConfig(
             model=self.model,
             task=self.task,
@@ -1101,26 +1112,6 @@ def create_model_config(self) -> ModelConfig:
         )
 
     def create_load_config(self) -> LoadConfig:
-        return LoadConfig(
-            load_format=self.load_format,
-            download_dir=self.download_dir,
-            model_loader_extra_config=self.model_loader_extra_config,
-            ignore_patterns=self.ignore_patterns,
-        )
-
-    def create_engine_config(self,
-                             usage_context: Optional[UsageContext] = None
-                             ) -> VllmConfig:
-        from vllm.platforms import current_platform
-        current_platform.pre_register_and_update()
-
-        if envs.VLLM_USE_V1:
-            self._override_v1_engine_args(usage_context)
-
-        # gguf file needs a specific model loader and doesn't use hf_repo
-        if check_gguf_file(self.model):
-            self.quantization = self.load_format = "gguf"
-
         # bitsandbytes quantization needs a specific model loader
         # so we make sure the quant method and the load format are consistent
         if (self.quantization == "bitsandbytes" or
@@ -1137,19 +1128,23 @@ def create_engine_config(self,
                 "BitsAndBytes load format and QLoRA adapter only support "
                 f"'bitsandbytes' quantization, but got {self.quantization}")
 
-        assert self.cpu_offload_gb >= 0, (
-            "CPU offload space must be non-negative"
-            f", but got {self.cpu_offload_gb}")
+        return LoadConfig(
+            load_format=self.load_format,
+            download_dir=self.download_dir,
+            model_loader_extra_config=self.model_loader_extra_config,
+            ignore_patterns=self.ignore_patterns,
+        )
 
-        device_config = DeviceConfig(device=self.device)
+    def create_engine_config(self,
+                             usage_context: Optional[UsageContext] = None
+                             ) -> VllmConfig:
+        from vllm.platforms import current_platform
+        current_platform.pre_register_and_update()
 
-        # NOTE: This is to allow model loading from S3 in CI
-        if (not isinstance(self, AsyncEngineArgs) and envs.VLLM_CI_USE_S3
-                and self.model in MODELS_ON_S3
-                and self.load_format == LoadFormat.AUTO):  # noqa: E501
-            self.model = f"{MODEL_WEIGHTS_S3_BUCKET}/{self.model}"
-            self.load_format = LoadFormat.RUNAI_STREAMER
+        if envs.VLLM_USE_V1:
+            self._override_v1_engine_args(usage_context)
 
+        device_config = DeviceConfig(device=self.device)
         model_config = self.create_model_config()
 
         if (model_config.is_multimodal_model and not envs.VLLM_USE_V1
@@ -1281,16 +1276,6 @@ def create_engine_config(self,
             if speculative_config is None \
             else speculative_config.num_lookahead_slots
 
-        if not self.use_v2_block_manager:
-            logger.warning(
-                "[DEPRECATED] Block manager v1 has been removed, "
-                "and setting --use-v2-block-manager to True or False has "
-                "no effect on vLLM behavior. Please remove "
-                "--use-v2-block-manager in your engine argument. "
-                "If your use case is not supported by "
-                "SelfAttnBlockSpaceManager (i.e. block manager v2),"
-                " please file an issue with detailed information.")
-
         scheduler_config = SchedulerConfig(
             runner_type=model_config.runner_type,
             max_num_batched_tokens=self.max_num_batched_tokens,