
Commit aa3874e

♻️ fix vllm:main (#341)
It seems `model_config.task` is deprecated; from what I understand we can use `model_config.supported_tasks` instead, which is initialized when an LLM engine is instantiated:
https://github.com/vllm-project/vllm/pull/21470/files#diff-7eaad0b7dee0626bf29d10081b0f0c5e3ea15a4af97e7b182a4e0d35f8346953R705-R706

Maintaining backward compatibility is a bit tricky, since:

- Earlier versions had `model_config.task` pointing to the resolved task and `model_config.supported_tasks` holding all supported tasks, which could contain more than one task:

  ```
  model_config.task : generate
  model_config.supported_tasks : {'embed', 'reward', 'generate', 'classify'}
  ```

- Latest `main` now populates `model_config.supported_tasks` with the only task the model supports:

  ```
  model_config.task : None
  model_config.supported_tasks : ['generate']
  ```

---------

Signed-off-by: Prashant Gupta <[email protected]>
Signed-off-by: Max de Bayser <[email protected]>
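For illustration, here is a minimal standalone sketch of the compatibility rule described above. The `supports_task` helper and the `SimpleNamespace` configs are hypothetical stand-ins, not part of vLLM or this change; the patch itself inlines the same logic in `platform.py` and wraps it in properties in the two Spyre workers.

```python
from types import SimpleNamespace

def supports_task(model_config, task: str) -> bool:
    # Old behavior: `task` holds the resolved task, so compare it directly.
    if getattr(model_config, "task", None):
        return model_config.task == task
    # New behavior: `task` is None and `supported_tasks` lists what the model supports.
    return task in model_config.supported_tasks

# Old-style config (before vllm-project/vllm#21470)
old_cfg = SimpleNamespace(task="generate",
                          supported_tasks={'embed', 'reward', 'generate', 'classify'})
# New-style config (current vllm:main)
new_cfg = SimpleNamespace(task=None, supported_tasks=['generate'])

assert supports_task(old_cfg, "generate") and supports_task(new_cfg, "generate")
assert not supports_task(new_cfg, "embed")
```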
1 parent 4d98151 commit aa3874e

File tree: 3 files changed (+44, -11 lines)

vllm_spyre/platform.py

Lines changed: 15 additions & 6 deletions
```diff
@@ -1,3 +1,4 @@
+import inspect
 import sys
 
 # When running this plugin on a Mac, we assume it's for local development
@@ -80,8 +81,13 @@ class SpyrePlatform(Platform):
     def device_type(cls):
         # TODO: temporary hack while BertModels
         # inherit SupportsV0Only in vllm upstream.
+        import vllm.model_executor.models as me_models
         from vllm.config import ModelConfig
-        ModelConfig.is_v1_compatible = is_v1_compatible
+
+        # no need to patch after the model_config change
+        if 'model_config' not in \
+                inspect.getfullargspec(me_models.ModelRegistry.is_v1_compatible).args:
+            ModelConfig.is_v1_compatible = is_v1_compatible
         return cls._device_type
 
     @classmethod
@@ -106,11 +112,14 @@ def check_and_update_config(cls, vllm_config: VllmConfig) -> None:
         if scheduler_config.is_multi_step:
             raise NotImplementedError
 
-        is_decoder = model_config.task == "generate"
-        is_pooling = model_config.task == "embed"
-        if model_config.task == "auto":
-            is_pooling = "embed" in model_config.supported_tasks
-            is_decoder = "generate" in model_config.supported_tasks
+        # Can be simplified after the model_config change from vllm:main
+        is_decoder = model_config.task == "generate" \
+            if model_config.task \
+            else "generate" in model_config.supported_tasks
+
+        is_pooling = model_config.task == "embed" \
+            if model_config.task \
+            else "embed" in model_config.supported_tasks
 
         if is_decoder and not envs.VLLM_USE_V1:
             raise ValueError("Decoder models are only supported on v1")
```

vllm_spyre/v1/worker/spyre_worker.py

Lines changed: 15 additions & 3 deletions
```diff
@@ -57,6 +57,18 @@ class SpyreWorker(WorkerBaseV1):
     """A worker class that executes the model on a group of Spyre cores.
     """
 
+    @property
+    def is_pooling(self) -> bool:
+        return self.model_config.task == "embed" \
+            if self.model_config.task else \
+            "embed" in self.model_config.supported_tasks
+
+    @property
+    def is_decoder(self) -> bool:
+        return self.model_config.task == "generate" \
+            if self.model_config.task else \
+            "generate" in self.model_config.supported_tasks
+
     def get_kv_cache_spec(self) -> KVCacheSpec:
         """Get specifications for KV cache implementation.
 
@@ -85,7 +97,7 @@ def compile_or_warm_up_model(self) -> None:
                 (s["prompt_length"], s["new_tokens"], s["batch_size"])
                 for s in self.spyre_warmup_shapes
         ]):
-            if self.model_config.task != "embed":
+            if not self.is_pooling:
                 # TODO: remove if spyre supports
                 # lower number of output tokens
                 assert num_decode_tokens >= 2, (
@@ -168,7 +180,7 @@ def __init__(
         self.model_runner: \
             Union[StaticBatchingSpyreModelRunner,
                   ContinuousBatchingSpyreModelRunner, SpyrePoolingModelRunner]
-        if self.model_config.task == "embed":
+        if self.is_pooling:
             self.model_runner = SpyrePoolingModelRunner(
                 self.vllm_config, self.is_driver_worker)
             self.spyre_warmup_shapes = SpyrePlatform.get_warmup_shapes(
@@ -457,7 +469,7 @@ def _warmup_spyre_fixed_size(self, prompt_len, num_decode_tokens,
             0, len(valid_token_ids_tensor), (batch_size, prompt_len))]
 
         sampling_params, pooling_params = None, None
-        if self.model_config.task != "embed":
+        if not self.is_pooling:
             sampling_params = SamplingParams(max_tokens=num_decode_tokens)
         else:
             pooling_params = PoolingParams()
```
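Both workers centralize the task check behind `is_pooling`/`is_decoder` properties, so call sites read `if not self.is_pooling:` instead of repeating string comparisons. A compact sketch of that pattern with a stand-in worker class (not vLLM's actual `SpyreWorker`), checked against both config shapes from the commit message:

```python
from types import SimpleNamespace


class WorkerLike:
    """Illustrative stand-in showing the property-based task check."""

    def __init__(self, model_config):
        self.model_config = model_config

    @property
    def is_pooling(self) -> bool:
        return self.model_config.task == "embed" \
            if self.model_config.task else \
            "embed" in self.model_config.supported_tasks

    @property
    def is_decoder(self) -> bool:
        return self.model_config.task == "generate" \
            if self.model_config.task else \
            "generate" in self.model_config.supported_tasks


# Works for both the old and the new shape of model_config:
old_worker = WorkerLike(SimpleNamespace(task="embed",
                                        supported_tasks={'embed', 'generate'}))
new_worker = WorkerLike(SimpleNamespace(task=None, supported_tasks=['generate']))
assert old_worker.is_pooling and not old_worker.is_decoder
assert new_worker.is_decoder and not new_worker.is_pooling
```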

vllm_spyre/worker/spyre_worker.py

Lines changed: 14 additions & 2 deletions
```diff
@@ -42,6 +42,18 @@ class SpyreWorker(LoRANotSupportedWorkerBase, LocalOrDistributedWorkerBase):
     """A worker class that executes the model on a group of Spyre cores.
     """
 
+    @property
+    def is_pooling(self) -> bool:
+        return self.model_config.task == "embed" \
+            if self.model_config.task else \
+            "embed" in self.model_config.supported_tasks
+
+    @property
+    def is_decoder(self) -> bool:
+        return self.model_config.task == "generate" \
+            if self.model_config.task else \
+            "generate" in self.model_config.supported_tasks
+
     def __init__(
         self,
         vllm_config: VllmConfig,
@@ -64,7 +76,7 @@ def __init__(
             from vllm.utils import init_cached_hf_modules
             init_cached_hf_modules()
 
-        if self.model_config.task == "embed":
+        if self.is_pooling:
             self.model_runner: SpyreModelRunner = SpyreEmbeddingModelRunner(
                 self.model_config, self.parallel_config, self.scheduler_config,
                 self.device_config, self.is_driver_worker)
@@ -205,7 +217,7 @@ def load_model(self):
                 (s["prompt_length"], s["new_tokens"], s["batch_size"])
                 for s in self.spyre_warmup_shapes
         ]):
-            if self.model_config.task != "embed":
+            if not self.is_pooling:
                 # TODO: remove if spyre supports
                 # lower number of output tokens
                 assert num_decode_tokens >= 2, (
```
