[OpenVINO] Fix regression from vllm-project#8346

petersalas · petersalas · commit 52247f431ebe · 2024-11-05T17:05:28.000Z
Signed-off-by: Peter Salas <peter@fixie.ai>
diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
@@ -1,12 +1,13 @@
 from dataclasses import dataclass
-from typing import List, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type
 
 import openvino as ov
 import torch
 
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap
 
 
 def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
     # Shape: scalar
     # Type: i32
     max_context_len: torch.Tensor
+
+    # The index maps that relate multi-modal embeddings to the corresponding
+    # placeholders.
+    #
+    # N.B. These aren't really related to attention and don't belong on this
+    # type -- this is just a temporary solution to make them available to
+    # `model_executable`.
+    multi_modal_placeholder_index_maps: Optional[Dict[
+        str, MultiModalPlaceholderMap.IndexMap]]