
Commit 93acaf1

Authored by stephanie-wang, kevin85421, and edoakes
[core][gpu objects] Integrate single-controller collective APIs with GPU objects (#53720)
Adds integration between the single-controller collective APIs introduced in #53319 and the GPU objects feature prototyped in #52938. Actor collectives created through `ray.experimental.collective.create_collective_group` will now be automatically used if a task declares a tensor transport other than the default OBJECT_STORE. This also adds support for allocating the torch tensors on the correct device (GPU for NCCL and CPU for GLOO). See updates in test_gpu_objects.py for examples.

---------

Signed-off-by: Stephanie wang <[email protected]>
Signed-off-by: Stephanie Wang <[email protected]>
Co-authored-by: Kai-Hsun Chen <[email protected]>
Co-authored-by: Edward Oakes <[email protected]>
1 parent 42c5837 commit 93acaf1
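For context, a minimal usage sketch of the flow described in the commit message: two actors, a collective group created through `ray.experimental.collective.create_collective_group`, and an actor method annotated with a non-default `tensor_transport`. The actor class, resource requests, and exact keyword forms below are illustrative assumptions; the authoritative examples live in test_gpu_objects.py.

import ray
import torch
from ray.experimental.collective import create_collective_group


# Illustrative actor; real examples are in test_gpu_objects.py.
@ray.remote(num_gpus=1)
class Worker:
    # A non-default transport keeps the returned tensors in the actor's GPU
    # object store instead of the Ray object store.
    @ray.method(tensor_transport="nccl")
    def produce(self):
        return torch.ones(4, device="cuda")

    def consume(self, tensor):
        return tensor.sum().item()


sender, receiver = Worker.remote(), Worker.remote()
# The group created here is discovered automatically when `receiver`
# consumes the GPU object produced by `sender`.
create_collective_group([sender, receiver], backend="nccl")
ref = sender.produce.remote()
print(ray.get(receiver.consume.remote(ref)))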

File tree

13 files changed (+361, -185 lines)


ci/lint/pydoclint-baseline.txt

Lines changed: 2 additions & 2 deletions
@@ -328,9 +328,9 @@ python/ray/_private/worker.py
     DOC201: Function `remote` does not have a return section in docstring
 --------------------
 python/ray/actor.py
-    DOC101: Function `method`: Docstring contains fewer arguments than in function signature.
+    DOC102: Function `method`: Docstring contains more arguments than in function signature.
     DOC106: Function `method`: The option `--arg-type-hints-in-signature` is `True` but there are no argument type hints in the signature
-    DOC103: Function `method`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**kwargs: , *args: ]. Arguments in the docstring but not in the function signature: [num_returns: ].
+    DOC103: Function `method`: Docstring arguments are different from function arguments. (Or could be other formatting issues: https://jsh9.github.io/pydoclint/violation_codes.html#notes-on-doc103 ). Arguments in the function signature but not in the docstring: [**kwargs: , *args: ]. Arguments in the docstring but not in the function signature: [concurrency_group: , max_task_retries: , num_returns: , retry_exceptions: , tensor_transport: ].
     DOC201: Function `method` does not have a return section in docstring
     DOC107: Method `ActorMethod.__init__`: The option `--arg-type-hints-in-signature` is `True` but not all args in the signature have type hints
     DOC101: Method `ActorMethod.options`: Docstring contains fewer arguments than in function signature.

python/ray/_private/custom_types.py

Lines changed: 15 additions & 11 deletions
@@ -1,9 +1,7 @@
+from enum import Enum
 from typing import Literal
 
 from ray.core.generated.common_pb2 import (
-    GLOO,
-    NCCL,
-    OBJECT_STORE,
     ErrorType,
     Language,
     TaskStatus,
@@ -122,13 +120,19 @@
 LANGUAGE = ["PYTHON", "JAVA", "CPP"]
 
 # See `common.proto` for more details.
-TENSOR_TRANSPORT = [
-    "OBJECT_STORE",
-    "NCCL",
-    "GLOO",
-]
-TypeTensorTransport = Literal[tuple(TENSOR_TRANSPORT)]
-TypeTensorTransportEnum = Literal[OBJECT_STORE, NCCL, GLOO]
+class TensorTransportEnum(Enum):
+    OBJECT_STORE = TensorTransport.Value("OBJECT_STORE")
+    NCCL = TensorTransport.Value("NCCL")
+    GLOO = TensorTransport.Value("GLOO")
+
+    @classmethod
+    def from_str(cls, name: str) -> "TensorTransportEnum":
+        name = name.upper()
+        if name not in cls.__members__:
+            raise ValueError(
+                f"Invalid tensor transport {name}, must be one of {list(cls.__members__.keys())}."
+            )
+        return cls[name]
 
 
 def validate_protobuf_enum(grpc_enum, custom_enum):
@@ -157,4 +161,4 @@ def validate_protobuf_enum(grpc_enum, custom_enum):
 validate_protobuf_enum(TaskType, TASK_TYPE)
 validate_protobuf_enum(ErrorType, ERROR_TYPE)
 validate_protobuf_enum(Language, LANGUAGE)
-validate_protobuf_enum(TensorTransport, TENSOR_TRANSPORT)
+validate_protobuf_enum(TensorTransport, list(TensorTransportEnum.__members__.keys()))
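A small usage sketch of the new enum added above (it imports the private module shown in this diff; the invalid name below is just an example):

from ray._private.custom_types import TensorTransportEnum

# Case-insensitive lookup, mirroring strings like tensor_transport="nccl".
assert TensorTransportEnum.from_str("nccl") is TensorTransportEnum.NCCL
assert TensorTransportEnum.from_str("OBJECT_STORE") is TensorTransportEnum.OBJECT_STORE

try:
    TensorTransportEnum.from_str("rdma")  # not a member of the enum
except ValueError as err:
    print(err)  # Invalid tensor transport RDMA, must be one of [...]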
python/ray/_private/gpu_object_manager.py

Lines changed: 97 additions & 76 deletions

@@ -1,16 +1,40 @@
-from collections import namedtuple
-from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
+from typing import TYPE_CHECKING, Any, Dict, List, NamedTuple, Optional, Tuple
 
+from ray._private.custom_types import TensorTransportEnum
 from ray._raylet import ObjectRef
 from ray.actor import ActorHandle
 
+# Avoid importing util until needed because it requires several external
+# dependencies like torch and cupy. These dependencies can significantly slow
+# down normal worker startup time.
+util = None
+
 if TYPE_CHECKING:
     import torch
 
-# GPUObjectMeta is a named tuple containing the source actor and tensor metadata.
-# The tensor metadata is a list of tuples, each containing the shape and dtype
-# of a tensor in the GPU object store.
-GPUObjectMeta = namedtuple("GPUObjectMeta", ["src_actor", "tensor_meta"])
+    from ray._private import gpu_object_manager_util as util
+
+
+def _get_or_import_util():
+    """Lazily import the gpu_object_manager_util module."""
+    global util
+    if util is None:
+        from ray._private import gpu_object_manager_util as util
+    return util
+
+
+# GPUObjectMeta is a named tuple containing the source actor, tensor transport
+# backend, and tensor metadata.
+# - The tensor transport backend is the backend used to transport the tensors.
+#   Currently, the supported backends are "nccl" and "torch_gloo".
+# - The tensor metadata is a list of tuples, each containing the shape and dtype
+#   of a tensor in the GPU object store.
+class GPUObjectMeta(NamedTuple):
+    src_actor: ActorHandle
+    # Must be a valid backend name as defined in
+    # `ray.util.collective.types.Backend`.
+    tensor_transport_backend: str
+    tensor_meta: List[Tuple["torch.Size", "torch.dtype"]]
 
 
 class GPUObjectManager:
@@ -55,14 +79,30 @@ def __ray_get_tensor_meta__(self, obj_id: str):
 
         return src_actor.__ray_call__.remote(__ray_get_tensor_meta__, obj_id)
 
-    def add_gpu_object_ref(self, obj_ref: ObjectRef, src_actor: ActorHandle):
-        # `obj_ref` is an ObjectRef generated by the `src_actor`'s actor task
-        # that is annotated with `@ray.method(tensor_transport=...)`. This function
-        # adds the `obj_ref` to the `gpu_object_refs` dictionary so that the coordinator
-        # process can determine whether the `obj_ref` is a GPU object reference or not.
+    def add_gpu_object_ref(
+        self,
+        obj_ref: ObjectRef,
+        src_actor: ActorHandle,
+        tensor_transport: TensorTransportEnum,
+    ):
+        """Add a GPU object reference to the GPU object manager. This should be
+        called whenever the current process calls a task that is annotated with
+        `@ray.method(tensor_transport=...)`.
+
+        Args:
+            obj_ref: The ObjectRef of the task output.
+            src_actor: The actor that executes the task and that creates the GPU object.
+            tensor_transport: The tensor transport protocol to use for the GPU object.
+        """
+        util = _get_or_import_util()
+        tensor_transport_backend = util.tensor_transport_to_collective_backend(
+            tensor_transport
+        )
         tensor_meta = self._get_tensor_meta(src_actor, obj_ref.hex())
         self.gpu_object_refs[obj_ref] = GPUObjectMeta(
-            src_actor=src_actor, tensor_meta=tensor_meta
+            src_actor=src_actor,
+            tensor_transport_backend=tensor_transport_backend,
+            tensor_meta=tensor_meta,
         )
 
     # TODO(kevin85421): Call this function to remove the `obj_ref` from the `gpu_object_refs` dictionary
@@ -76,58 +116,30 @@ def _get_gpu_object_ref(self, obj_ref: ObjectRef) -> Optional[GPUObjectMeta]:
     def _is_gpu_object_ref(self, obj_ref: ObjectRef) -> bool:
         return obj_ref in self.gpu_object_refs
 
-    def _send_gpu_object(self, src_actor: ActorHandle, obj_id: str, dst_rank: int):
+    def _send_gpu_object(
+        self, communicator_name: str, src_actor: ActorHandle, obj_id: str, dst_rank: int
+    ):
         # Send tensors stored in the `src_actor`'s GPU object store to the
         # destination rank `dst_rank`.
-        def __ray_send__(self, obj_id: str, dst_rank: int):
-            import torch.distributed as dist
-
-            from ray._private.worker import global_worker
-
-            gpu_object_manager = global_worker.gpu_object_manager
-            assert gpu_object_manager.has_gpu_object(
-                obj_id
-            ), f"obj_id={obj_id} not found in GPU object store"
-            tensors = gpu_object_manager.get_gpu_object(obj_id)
-            for tensor in tensors:
-                dist.send(tensor, dst_rank)
-            # TODO(kevin85421): The current garbage collection implementation for the
-            # in-actor object store is naive. We garbage collect each object after it
-            # is consumed once.
-            gpu_object_manager.remove_gpu_object(obj_id)
-
-        src_actor.__ray_call__.remote(__ray_send__, obj_id, dst_rank)
+        util = _get_or_import_util()
+        src_actor.__ray_call__.remote(
+            util.__ray_send__, communicator_name, obj_id, dst_rank
+        )
 
     def _recv_gpu_object(
         self,
+        communicator_name: str,
         dst_actor: ActorHandle,
         obj_id: str,
         src_rank: int,
        tensor_meta: List[Tuple["torch.Size", "torch.dtype"]],
     ):
         # Receive tensors from the source rank and store them in the
         # `dst_actor`'s GPU object store.
-        def __ray_recv__(
-            self,
-            obj_id: str,
-            src_rank: int,
-            tensor_meta: List[Tuple["torch.Size", "torch.dtype"]],
-        ):
-            import torch
-            import torch.distributed as dist
-
-            from ray._private.worker import global_worker
-
-            gpu_object_manager = global_worker.gpu_object_manager
-            tensors = []
-            for meta in tensor_meta:
-                shape, dtype = meta
-                tensor = torch.zeros(shape, dtype=dtype)
-                dist.recv(tensor, src_rank)
-                tensors.append(tensor)
-            gpu_object_manager.add_gpu_object(obj_id, tensors)
-
-        dst_actor.__ray_call__.remote(__ray_recv__, obj_id, src_rank, tensor_meta)
+        util = _get_or_import_util()
+        dst_actor.__ray_call__.remote(
+            util.__ray_recv__, communicator_name, obj_id, src_rank, tensor_meta
+        )
 
     def trigger_out_of_band_tensor_transfer(
         self, dst_actor: ActorHandle, task_args: Tuple[Any, ...]
@@ -150,11 +162,6 @@ def trigger_out_of_band_tensor_transfer(
             dst_actor: The target actor to receive tensors
             task_args: List of arguments for the target actor task that may contain ObjectRefs.
         """
-        from ray.experimental.channel import ChannelContext
-
-        ctx = ChannelContext.get_current()
-
-        actor_id_to_rank = {}
         for arg in task_args:
             # If an ObjectRef exists in `gpu_object_refs`, it means the ObjectRef
             # is in-actor tensors. Therefore, this function will trigger a tensor
@@ -164,37 +171,51 @@
 
             if not self._is_gpu_object_ref(arg):
                 continue
+
+            # Import get_collective_groups here to avoid dependency on
+            # collective libraries for default Ray installation.
+            from ray.experimental.collective import get_collective_groups
+
             gpu_object_meta = self._get_gpu_object_ref(arg)
 
             src_actor = gpu_object_meta.src_actor
             tensor_meta = gpu_object_meta.tensor_meta
-            if not actor_id_to_rank:
-                # TODO(kevin85421): Support multiple communicators.
-                if len(ctx.communicators) != 1:
-                    raise ValueError(
-                        f"There are {len(ctx.communicators)} communicators in the current context. "
-                        "Currently, GPU objects only support 1 communicator. Please make sure only "
-                        "one communicator exists."
-                    )
-                actor_id_to_rank = {
-                    a._ray_actor_id: i for i, a in enumerate(ctx.communicators[0])
-                }
-            if src_actor._ray_actor_id not in actor_id_to_rank:
+            communicators = get_collective_groups(
+                [src_actor, dst_actor], backend=gpu_object_meta.tensor_transport_backend
+            )
+            # TODO(kevin85421): Support multiple communicators.
+            if len(communicators) == 0:
+                raise ValueError(
+                    f"No communicators found for actors {src_actor} and {dst_actor}. "
+                    "Create a communicator with "
+                    "`ray.experimental.collective.create_collective_group` "
+                    "before calling actor tasks."
+                )
+            elif len(communicators) > 1:
+                raise ValueError(
+                    f"There are {len(communicators)} possible communicators that contain actors {src_actor} and {dst_actor}. "
+                    "Currently, GPU objects only support one communicator. Please make sure only "
+                    "one communicator exists."
+                )
+            communicator = communicators[0]
+            src_rank = communicator.get_rank(src_actor)
+            if src_rank == -1:
                 raise ValueError(
-                    f"Sender actor {src_actor._ray_actor_id} not found in communicator. "
+                    f"Sender actor {src_actor} not found in communicator. "
                     "Please make sure the sender and receiver are in the same communicator."
                 )
-            if dst_actor._ray_actor_id not in actor_id_to_rank:
+            dst_rank = communicator.get_rank(dst_actor)
+            if dst_rank == -1:
                 raise ValueError(
-                    f"Receiver actor {dst_actor._ray_actor_id} not found in communicator. "
+                    f"Receiver actor {dst_actor} not found in communicator. "
                    "Please make sure the sender and receiver are in the same communicator."
                 )
-            src_rank = actor_id_to_rank[src_actor._ray_actor_id]
-            dst_rank = actor_id_to_rank[dst_actor._ray_actor_id]
             if src_rank == dst_rank:
                 # If the source and destination ranks are the same, the tensors can
                 # be transferred intra-process, so we skip the out-of-band tensor
                 # transfer.
                 continue
-            self._send_gpu_object(src_actor, arg.hex(), dst_rank)
-            self._recv_gpu_object(dst_actor, arg.hex(), src_rank, tensor_meta)
+            self._send_gpu_object(communicator.name, src_actor, arg.hex(), dst_rank)
+            self._recv_gpu_object(
+                communicator.name, dst_actor, arg.hex(), src_rank, tensor_meta
+            )
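The rank-resolution logic in `trigger_out_of_band_tensor_transfer` boils down to the following standalone sketch (the helper name `resolve_transfer` is hypothetical; `get_collective_groups` and `get_rank` are used as in the diff above):

from ray.experimental.collective import get_collective_groups


def resolve_transfer(src_actor, dst_actor, backend):
    # Exactly one collective group must contain both actors for the chosen backend.
    groups = get_collective_groups([src_actor, dst_actor], backend=backend)
    if len(groups) != 1:
        raise ValueError(f"Expected exactly one communicator, found {len(groups)}.")
    group = groups[0]
    src_rank, dst_rank = group.get_rank(src_actor), group.get_rank(dst_actor)
    if -1 in (src_rank, dst_rank):
        raise ValueError("Sender and receiver must both be members of the communicator.")
    # A transfer is only needed when the ranks differ; otherwise the tensors
    # already live in the destination actor's GPU object store.
    return None if src_rank == dst_rank else (group.name, src_rank, dst_rank)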
python/ray/_private/gpu_object_manager_util.py

Lines changed: 83 additions & 0 deletions

@@ -0,0 +1,83 @@
+from typing import List, Tuple
+
+try:
+    import torch
+except ImportError:
+    raise ImportError(
+        "`tensor_transport` requires PyTorch. "
+        "Please install torch with 'pip install torch' to use this feature."
+    )
+
+import ray.util.collective as collective
+from ray._private.custom_types import TensorTransportEnum
+from ray._private.worker import global_worker
+from ray.util.collective.types import Backend
+
+TENSOR_TRANSPORT_TO_COLLECTIVE_BACKEND = {
+    TensorTransportEnum.NCCL: Backend.NCCL,
+    TensorTransportEnum.GLOO: Backend.TORCH_GLOO,
+}
+
+COLLECTIVE_BACKEND_TO_TORCH_DEVICE = {
+    Backend.NCCL: torch.device("cuda"),
+    Backend.TORCH_GLOO: torch.device("cpu"),
+}
+
+
+def tensor_transport_to_collective_backend(
+    tensor_transport: TensorTransportEnum,
+) -> Backend:
+    try:
+        return TENSOR_TRANSPORT_TO_COLLECTIVE_BACKEND[tensor_transport]
+    except KeyError:
+        raise ValueError(
+            f"Invalid tensor transport {tensor_transport.name}, must be one of {list(TENSOR_TRANSPORT_TO_COLLECTIVE_BACKEND.keys())}."
+        )
+
+
+def __ray_send__(self, communicator_name: str, obj_id: str, dst_rank: int):
+    """Helper function that runs on the src actor to send tensors to the dst actor."""
+    gpu_object_manager = global_worker.gpu_object_manager
+    assert gpu_object_manager.has_gpu_object(
+        obj_id
+    ), f"obj_id={obj_id} not found in GPU object store"
+    tensors = gpu_object_manager.get_gpu_object(obj_id)
+
+    backend = collective.get_group_handle(communicator_name).backend()
+    device = COLLECTIVE_BACKEND_TO_TORCH_DEVICE[backend]
+
+    for tensor in tensors:
+        if tensor.device.type != device.type:
+            # TODO(swang): Right now there is no way to catch this error
+            # and the receiving Ray task will hang.
+            raise ValueError(
+                f"tensor device {tensor.device} does not match device {device}"
+            )
+        collective.send(tensor, dst_rank, group_name=communicator_name)
+    # TODO(kevin85421): The current garbage collection implementation for the
+    # in-actor object store is naive. We garbage collect each object after it
+    # is consumed once.
+    gpu_object_manager.remove_gpu_object(obj_id)
+
+
+def __ray_recv__(
+    self,
+    communicator_name: str,
+    obj_id: str,
+    src_rank: int,
+    tensor_meta: List[Tuple["torch.Size", "torch.dtype"]],
+):
+    """Helper function that runs on the dst actor to receive tensors from the src actor."""
+    from ray._private.worker import global_worker
+
+    backend = collective.get_group_handle(communicator_name).backend()
+    device = COLLECTIVE_BACKEND_TO_TORCH_DEVICE[backend]
+
+    gpu_object_manager = global_worker.gpu_object_manager
+    tensors = []
+    for meta in tensor_meta:
+        shape, dtype = meta
+        tensor = torch.zeros(shape, dtype=dtype, device=device)
+        collective.recv(tensor, src_rank, group_name=communicator_name)
+        tensors.append(tensor)
+    gpu_object_manager.add_gpu_object(obj_id, tensors)
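For intuition, a stripped-down sketch of the send/recv pattern that `__ray_send__` and `__ray_recv__` wrap, using `ray.util.collective` directly with the CPU-friendly torch_gloo backend (the group name, world size, and tensor shape here are illustrative assumptions):

import ray
import torch
import ray.util.collective as collective


@ray.remote
class Peer:
    def setup(self, world_size, rank, group_name):
        collective.init_collective_group(
            world_size, rank, backend="torch_gloo", group_name=group_name
        )

    def send(self, dst_rank, group_name):
        # torch_gloo transports CPU tensors; NCCL would require device="cuda".
        collective.send(torch.arange(4, dtype=torch.float32), dst_rank, group_name=group_name)

    def recv(self, src_rank, group_name):
        # The receiver pre-allocates a buffer from the known shape/dtype,
        # just as __ray_recv__ does from the GPU object's tensor metadata.
        buf = torch.zeros(4, dtype=torch.float32)
        collective.recv(buf, src_rank, group_name=group_name)
        return buf


ray.init()
a, b = Peer.remote(), Peer.remote()
ray.get([a.setup.remote(2, 0, "demo"), b.setup.remote(2, 1, "demo")])
a.send.remote(1, "demo")
print(ray.get(b.recv.remote(0, "demo")))  # tensor([0., 1., 2., 3.])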
