Skip to content

Commit ced790b

Browse files
garg-amit authored and youkaichao committed
[torch.compile] fix tensor alias (vllm-project#8982)
Signed-off-by: Amit Garg <[email protected]>
1 parent 31e005d commit ced790b

File tree

3 files changed

+9
-3
lines changed

3 files changed

+9
-3
lines changed

vllm/worker/embedding_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -103,7 +103,8 @@ def execute_model(
103103
# a placeholder (it has wide hardware support).
104104
kv_caches = [
105105
torch.tensor([], dtype=torch.float32, device=self.device)
106-
] * num_layers
106+
for _ in range(num_layers)
107+
]
107108

108109
execute_model_kwargs = {
109110
"input_ids":

vllm/worker/enc_dec_model_runner.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -348,7 +348,8 @@ def profile_run(self) -> None:
348348
# a placeholder (it has wide hardware support).
349349
kv_caches = [
350350
torch.tensor([], dtype=torch.float32, device=self.device)
351-
] * num_layers
351+
for _ in range(num_layers)
352+
]
352353
finished_requests_ids = [seq.request_id for seq in seqs]
353354
model_input = self.prepare_model_input(
354355
seqs, finished_requests_ids=finished_requests_ids)

vllm/worker/model_runner.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1276,9 +1276,13 @@ def profile_run(self) -> None:
12761276
# it by reference, rather by specializing on the value ``None``.
12771277
# the `dtype` argument does not matter, and we use `float32` as
12781278
# a placeholder (it has wide hardware support).
1279+
# it is important to create tensors inside the loop, rather than
1280+
# multiplying the list, to avoid Dynamo from treating them as
1281+
# tensor aliasing.
12791282
kv_caches = [
12801283
torch.tensor([], dtype=torch.float32, device=self.device)
1281-
] * num_layers
1284+
for _ in range(num_layers)
1285+
]
12821286
finished_requests_ids = [seq.request_id for seq in seqs]
12831287
model_input = self.prepare_model_input(
12841288
seqs, finished_requests_ids=finished_requests_ids)

0 commit comments

Comments (0)