Skip to content

Commit 07eb6f1

Browse files
authored
[bugfix][distributed] fix multi-node bug for shared memory (#6597)
1 parent f0bbfaf commit 07eb6f1

File tree

1 file changed

+8
-2
lines changed

1 file changed

+8
-2
lines changed

vllm/distributed/device_communicators/shm_broadcast.py

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -108,8 +108,14 @@ def __init__(self,
108108
# created by the process. The following patch is a workaround.
109109
with patch("multiprocessing.resource_tracker.register",
110110
lambda *args, **kwargs: None):
111-
self.shared_memory = shared_memory.SharedMemory(name=name)
112-
assert self.shared_memory.size == self.total_bytes_of_buffer
111+
try:
112+
self.shared_memory = shared_memory.SharedMemory(name=name)
113+
assert self.shared_memory.size == self.total_bytes_of_buffer # noqa
114+
except FileNotFoundError:
115+
# we might deserialize the object in a different node
116+
# in this case, this object is not used,
117+
# and we should suppress the error
118+
pass
113119

114120
def __reduce__(self):
115121
return (

0 commit comments

Comments
 (0)