Merged
26 commits
f4574ad
MG C API
viclafargue Jul 23, 2025
d00bea2
Documentation
viclafargue Jul 31, 2025
3896985
Merge branch 'branch-25.10' into mg-c-api
cjnolet Jul 31, 2025
f8d28d0
switch to MultiGpu
viclafargue Aug 4, 2025
d854345
Renaming and include removal
viclafargue Aug 5, 2025
822b966
answering review
viclafargue Aug 5, 2025
fb4e2b6
fix template instantiation
viclafargue Aug 5, 2025
b1b23ca
Updating the serialization process
viclafargue Aug 5, 2025
7f4c590
copyright edit
viclafargue Aug 11, 2025
147dc33
Merge branch 'branch-25.10' into mg-c-api
viclafargue Aug 11, 2025
5e16895
Merge branch 'branch-25.10' into mg-c-api
cjnolet Aug 14, 2025
8db9b86
Add IVF-Flat half type to MG API
viclafargue Aug 15, 2025
94b0cd5
Merge branch 'branch-25.10' into mg-c-api
viclafargue Aug 18, 2025
650b221
Merge branch 'branch-25.10' into mg-c-api
cjnolet Aug 27, 2025
a41f17c
MG Python API
viclafargue Sep 2, 2025
30363b7
Merge branch 'branch-25.10' into mg-python-api
viclafargue Sep 2, 2025
69ffaf8
Adding documentation
viclafargue Sep 3, 2025
fd1cb41
Merge branch 'branch-25.10' into mg-python-api
cjnolet Sep 3, 2025
bb44474
Answering review
viclafargue Sep 4, 2025
1dfed37
Fixing issues
viclafargue Sep 9, 2025
3ed0192
Merge branch 'branch-25.10' into mg-python-api
viclafargue Sep 9, 2025
15b8f5f
Merge branch 'branch-25.10' into mg-python-api
viclafargue Sep 10, 2025
32b4d1c
Merge branch 'branch-25.10' into mg-python-api
cjnolet Sep 16, 2025
984c6f6
Merge branch 'branch-25.10' into mg-python-api
cjnolet Sep 17, 2025
6400c0f
Merge branch 'branch-25.10' into mg-python-api
viclafargue Sep 19, 2025
9528af3
correct merge typo
viclafargue Sep 19, 2025
17 changes: 0 additions & 17 deletions cpp/include/cuvs/core/c_api.h
@@ -75,23 +75,6 @@ cuvsError_t cuvsResourcesCreate(cuvsResources_t* res);
*/
cuvsError_t cuvsResourcesDestroy(cuvsResources_t res);

/**
* @brief Create an Initialized opaque C handle for C++ type `raft::device_resources_snmg`
* for multi-GPU operations
*
* @param[in] res cuvsResources_t opaque C handle
* @return cuvsError_t
*/
cuvsError_t cuvsMultiGpuResourcesCreate(cuvsResources_t* res);

/**
* @brief Destroy and de-allocate opaque C handle for C++ type `raft::device_resources_snmg`
*
* @param[in] res cuvsResources_t opaque C handle
* @return cuvsError_t
*/
cuvsError_t cuvsMultiGpuResourcesDestroy(cuvsResources_t res);

/**
* @brief Set cudaStream_t on cuvsResources_t to queue CUDA kernels on APIs
* that accept a cuvsResources_t handle
4 changes: 2 additions & 2 deletions cpp/scripts/gitutils.py
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -155,7 +155,7 @@ def uncommittedFiles():
ret = []
for f in files.splitlines():
f = f.strip(" ")
f = re.sub("\s+", " ", f) # noqa: W605
f = re.sub(r"\s+", " ", f) # noqa: W605
tmp = f.split(" ", 1)
# only consider staged files or uncommitted files
# in other words, ignore untracked files
50 changes: 48 additions & 2 deletions cpp/src/neighbors/mg_cagra_c.cpp
@@ -267,7 +267,12 @@ extern "C" cuvsError_t cuvsMultiGpuCagraBuild(cuvsResources_t res,
cuvsMultiGpuCagraIndex_t index)
{
return cuvs::core::translate_exceptions([=] {
auto dataset = dataset_tensor->dl_tensor;
auto dataset = dataset_tensor->dl_tensor;

// Multi-GPU CAGRA requires dataset to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dataset),
"Multi-GPU CAGRA build requires dataset to have host compatible memory");

index->dtype.code = dataset.dtype.code;
index->dtype.bits = dataset.dtype.bits;

@@ -295,7 +300,29 @@ extern "C" cuvsError_t cuvsMultiGpuCagraSearch(cuvsResources_t res,
DLManagedTensor* distances_tensor)
{
return cuvs::core::translate_exceptions([=] {
auto queries = queries_tensor->dl_tensor;
auto queries = queries_tensor->dl_tensor;
auto neighbors = neighbors_tensor->dl_tensor;
auto distances = distances_tensor->dl_tensor;

// Multi-GPU CAGRA requires all tensors to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(queries),
"Multi-GPU CAGRA search requires queries to have host compatible memory");
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(neighbors),
"Multi-GPU CAGRA search requires neighbors to have host compatible memory");
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(distances),
"Multi-GPU CAGRA search requires distances to have host compatible memory");

// Validate data types
RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64,
"neighbors should be of type int64_t");
RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32,
"distances should be of type float32");

// Check type compatibility between index and queries
RAFT_EXPECTS(queries.dtype.code == index->dtype.code,
"type mismatch between index and queries");
RAFT_EXPECTS(queries.dtype.bits == index->dtype.bits,
"type mismatch between index and queries");

if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
_mg_search<float>(res, *params, *index, queries_tensor, neighbors_tensor, distances_tensor);
@@ -321,6 +348,25 @@ extern "C" cuvsError_t cuvsMultiGpuCagraExtend(cuvsResources_t res,
return cuvs::core::translate_exceptions([=] {
auto vectors = new_vectors_tensor->dl_tensor;

// Multi-GPU CAGRA requires vectors to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(vectors),
"Multi-GPU CAGRA extend requires new_vectors to have host compatible memory");

// Check type compatibility between index and vectors
RAFT_EXPECTS(vectors.dtype.code == index->dtype.code,
"type mismatch between index and new_vectors");
RAFT_EXPECTS(vectors.dtype.bits == index->dtype.bits,
"type mismatch between index and new_vectors");

// If indices are provided, they should also be in host memory
if (new_indices_tensor != nullptr) {
auto indices = new_indices_tensor->dl_tensor;
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(indices),
"Multi-GPU CAGRA extend requires new_indices to have host compatible memory");
RAFT_EXPECTS(indices.dtype.code == kDLUInt && indices.dtype.bits == 32,
"new_indices should be of type uint32_t");
}

if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) {
_mg_extend<float>(res, *index, new_vectors_tensor, new_indices_tensor);
} else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) {
50 changes: 48 additions & 2 deletions cpp/src/neighbors/mg_ivf_flat_c.cpp
@@ -264,7 +264,12 @@ extern "C" cuvsError_t cuvsMultiGpuIvfFlatBuild(cuvsResources_t res,
cuvsMultiGpuIvfFlatIndex_t index)
{
return cuvs::core::translate_exceptions([=] {
auto dataset = dataset_tensor->dl_tensor;
auto dataset = dataset_tensor->dl_tensor;

// Multi-GPU IVF-Flat requires dataset to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dataset),
"Multi-GPU IVF-Flat build requires dataset to have host compatible memory");

index->dtype.code = dataset.dtype.code;
index->dtype.bits = dataset.dtype.bits;

@@ -292,7 +297,29 @@ extern "C" cuvsError_t cuvsMultiGpuIvfFlatSearch(cuvsResources_t res,
DLManagedTensor* distances_tensor)
{
return cuvs::core::translate_exceptions([=] {
auto queries = queries_tensor->dl_tensor;
auto queries = queries_tensor->dl_tensor;
auto neighbors = neighbors_tensor->dl_tensor;
auto distances = distances_tensor->dl_tensor;

// Multi-GPU IVF-Flat requires all tensors to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(queries),
"Multi-GPU IVF-Flat search requires queries to have host compatible memory");
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(neighbors),
"Multi-GPU IVF-Flat search requires neighbors to have host compatible memory");
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(distances),
"Multi-GPU IVF-Flat search requires distances to have host compatible memory");

// Validate data types
RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64,
"neighbors should be of type int64_t");
RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32,
"distances should be of type float32");

// Check type compatibility between index and queries
RAFT_EXPECTS(queries.dtype.code == index->dtype.code,
"type mismatch between index and queries");
RAFT_EXPECTS(queries.dtype.bits == index->dtype.bits,
"type mismatch between index and queries");

if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
_mg_search<float>(res, *params, *index, queries_tensor, neighbors_tensor, distances_tensor);
@@ -318,6 +345,25 @@ extern "C" cuvsError_t cuvsMultiGpuIvfFlatExtend(cuvsResources_t res,
return cuvs::core::translate_exceptions([=] {
auto vectors = new_vectors_tensor->dl_tensor;

// Multi-GPU IVF-Flat requires vectors to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(vectors),
"Multi-GPU IVF-Flat extend requires new_vectors to have host compatible memory");

// Check type compatibility between index and vectors
RAFT_EXPECTS(vectors.dtype.code == index->dtype.code,
"type mismatch between index and new_vectors");
RAFT_EXPECTS(vectors.dtype.bits == index->dtype.bits,
"type mismatch between index and new_vectors");

// If indices are provided, they should also be in host memory
if (new_indices_tensor != nullptr) {
auto indices = new_indices_tensor->dl_tensor;
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(indices),
"Multi-GPU IVF-Flat extend requires new_indices to have host compatible memory");
RAFT_EXPECTS(indices.dtype.code == kDLInt && indices.dtype.bits == 64,
"new_indices should be of type int64_t");
}

if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) {
_mg_extend<float>(res, *index, new_vectors_tensor, new_indices_tensor);
} else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) {
76 changes: 51 additions & 25 deletions cpp/src/neighbors/mg_ivf_pq_c.cpp
@@ -256,7 +256,12 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqBuild(cuvsResources_t res,
cuvsMultiGpuIvfPqIndex_t index)
{
return cuvs::core::translate_exceptions([=] {
auto dataset = dataset_tensor->dl_tensor;
auto dataset = dataset_tensor->dl_tensor;

// Multi-GPU IVF-PQ requires dataset to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(dataset),
"Multi-GPU IVF-PQ build requires dataset to have host compatible memory");

index->dtype.code = dataset.dtype.code;
index->dtype.bits = dataset.dtype.bits;

@@ -284,7 +289,29 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqSearch(cuvsResources_t res,
DLManagedTensor* distances_tensor)
{
return cuvs::core::translate_exceptions([=] {
auto queries = queries_tensor->dl_tensor;
auto queries = queries_tensor->dl_tensor;
auto neighbors = neighbors_tensor->dl_tensor;
auto distances = distances_tensor->dl_tensor;

// Multi-GPU IVF-PQ requires all tensors to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(queries),
"Multi-GPU IVF-PQ search requires queries to have host compatible memory");
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(neighbors),
"Multi-GPU IVF-PQ search requires neighbors to have host compatible memory");
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(distances),
"Multi-GPU IVF-PQ search requires distances to have host compatible memory");

// Validate data types
RAFT_EXPECTS(neighbors.dtype.code == kDLInt && neighbors.dtype.bits == 64,
"neighbors should be of type int64_t");
RAFT_EXPECTS(distances.dtype.code == kDLFloat && distances.dtype.bits == 32,
"distances should be of type float32");

// Check type compatibility between index and queries
RAFT_EXPECTS(queries.dtype.code == index->dtype.code,
"type mismatch between index and queries");
RAFT_EXPECTS(queries.dtype.bits == index->dtype.bits,
"type mismatch between index and queries");

if (queries.dtype.code == kDLFloat && queries.dtype.bits == 32) {
_mg_search<float>(res, *params, *index, queries_tensor, neighbors_tensor, distances_tensor);
@@ -310,6 +337,25 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqExtend(cuvsResources_t res,
return cuvs::core::translate_exceptions([=] {
auto vectors = new_vectors_tensor->dl_tensor;

// Multi-GPU IVF-PQ requires vectors to be in host memory
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(vectors),
"Multi-GPU IVF-PQ extend requires new_vectors to have host compatible memory");

// Check type compatibility between index and vectors
RAFT_EXPECTS(vectors.dtype.code == index->dtype.code,
"type mismatch between index and new_vectors");
RAFT_EXPECTS(vectors.dtype.bits == index->dtype.bits,
"type mismatch between index and new_vectors");

// If indices are provided, they should also be in host memory
if (new_indices_tensor != nullptr) {
auto indices = new_indices_tensor->dl_tensor;
RAFT_EXPECTS(cuvs::core::is_dlpack_host_compatible(indices),
"Multi-GPU IVF-PQ extend requires new_indices to have host compatible memory");
RAFT_EXPECTS(indices.dtype.code == kDLInt && indices.dtype.bits == 64,
"new_indices should be of type int64_t");
}

if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 32) {
_mg_extend<float>(res, *index, new_vectors_tensor, new_indices_tensor);
} else if (vectors.dtype.code == kDLFloat && vectors.dtype.bits == 16) {
@@ -381,28 +427,8 @@ extern "C" cuvsError_t cuvsMultiGpuIvfPqDistribute(cuvsResources_t res,
cuvsMultiGpuIvfPqIndex_t index)
{
return cuvs::core::translate_exceptions([=] {
std::ifstream is(filename, std::ios::in | std::ios::binary);
if (!is) { RAFT_FAIL("Cannot open file %s", filename); }
char dtype_string[4];
is.read(dtype_string, 4);
auto dtype = raft::detail::numpy_serializer::parse_descr(std::string(dtype_string, 4));
is.close();

index->dtype.bits = dtype.itemsize * 8;
if (dtype.kind == 'f' && dtype.itemsize == 4) {
index->dtype.code = kDLFloat;
index->addr = reinterpret_cast<uintptr_t>(_mg_distribute<float>(res, filename));
} else if (dtype.kind == 'f' && dtype.itemsize == 2) {
index->dtype.code = kDLFloat;
index->addr = reinterpret_cast<uintptr_t>(_mg_distribute<half>(res, filename));
} else if (dtype.kind == 'i' && dtype.itemsize == 1) {
index->dtype.code = kDLInt;
index->addr = reinterpret_cast<uintptr_t>(_mg_distribute<int8_t>(res, filename));
} else if (dtype.kind == 'u' && dtype.itemsize == 1) {
index->dtype.code = kDLUInt;
index->addr = reinterpret_cast<uintptr_t>(_mg_distribute<uint8_t>(res, filename));
} else {
RAFT_FAIL("Unsupported index dtype");
}
index->dtype.code = kDLFloat;
index->dtype.bits = 32;
index->addr = reinterpret_cast<uintptr_t>(_mg_distribute<float>(res, filename));
Comment on lines +430 to +432

Member:

Is this change intentional? (do we only support float32 here, and you meant to remove support for the other dtypes?)

Contributor Author:

This function deserializes a single-GPU index and distributes it across multiple GPUs. The IVF-PQ index does not have a T template parameter, only IdxT (which is always int64_t). Because of this, a file containing a serialized IVF-PQ index is not prepended with an index type (float, half, uint8_t, int8_t). This makes it impossible to find the index type necessary to instantiate a multi-GPU index. This is fundamentally an issue in the multi-GPU API that should be remediated. In the meantime, we should assume that for this specific case (the replication of a single-GPU IVF-PQ index), the user is planning to search with floats.

});
}
14 changes: 13 additions & 1 deletion docs/source/python_api/neighbors.rst
@@ -5,14 +5,26 @@ Nearest Neighbors
:language: python
:class: highlight

Single-GPU Algorithms
#####################

.. toctree::
:maxdepth: 2
:caption: Contents:
:caption: Single-GPU ANN Algorithms:

neighbors_brute_force.rst
neighbors_cagra.rst
neighbors_hnsw.rst
neighbors_ivf_flat.rst
neighbors_ivf_pq.rst
neighbors_nn_decent.rst

Multi-GPU Algorithms
####################

.. toctree::
:maxdepth: 2
:caption: Multi-GPU Distributed ANN:

neighbors_multi_gpu.rst
neighbors_all_neighbors.rst
55 changes: 55 additions & 0 deletions docs/source/python_api/neighbors_mg_cagra.rst
@@ -0,0 +1,55 @@
Multi-GPU CAGRA
===============

Multi-GPU CAGRA extends the graph-based CAGRA algorithm to work across multiple GPUs, providing improved scalability and performance for large-scale vector search. It supports both replicated and sharded distribution modes.

.. role:: py(code)
:language: python
:class: highlight

.. note::
**IMPORTANT**: Multi-GPU CAGRA requires all data (datasets, queries, output arrays) to be in host memory (CPU).
If using CuPy/device arrays, transfer to host with ``array.get()`` or ``cp.asnumpy(array)`` before use.
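
A minimal usage sketch (assuming the ``build``/``search`` signatures mirror the single-GPU ``cuvs.neighbors.cagra`` API; exact parameter names and return order may differ):

.. code-block:: python

    import numpy as np
    from cuvs.neighbors import mg_cagra

    # Dataset and queries stay in host (CPU) memory, e.g. as NumPy arrays.
    dataset = np.random.random_sample((100_000, 128)).astype(np.float32)
    queries = np.random.random_sample((1_000, 128)).astype(np.float32)

    # Build a multi-GPU CAGRA index across the available GPUs.
    index = mg_cagra.build(mg_cagra.IndexParams(), dataset)

    # Search for the 10 nearest neighbors; outputs are host arrays as well.
    distances, neighbors = mg_cagra.search(mg_cagra.SearchParams(), index, queries, 10)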

Index build parameters
######################

.. autoclass:: cuvs.neighbors.mg_cagra.IndexParams
:members:

Index search parameters
#######################

.. autoclass:: cuvs.neighbors.mg_cagra.SearchParams
:members:

Index
#####

.. autoclass:: cuvs.neighbors.mg_cagra.Index
:members:

Index build
###########

.. autofunction:: cuvs.neighbors.mg_cagra.build

Index search
############

.. autofunction:: cuvs.neighbors.mg_cagra.search

Index save
##########

.. autofunction:: cuvs.neighbors.mg_cagra.save

Index load
##########

.. autofunction:: cuvs.neighbors.mg_cagra.load

Index distribute
################

.. autofunction:: cuvs.neighbors.mg_cagra.distribute
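
A sketch of building a single-GPU index and replicating it with ``distribute`` (assuming ``distribute`` takes the path of a serialized single-GPU CAGRA index, as in the C API; the exact signature may differ):

.. code-block:: python

    import numpy as np
    from cuvs.neighbors import cagra, mg_cagra

    dataset = np.random.random_sample((100_000, 128)).astype(np.float32)

    # Build and serialize a single-GPU CAGRA index first.
    sg_index = cagra.build(cagra.IndexParams(), dataset)
    cagra.save("cagra_index.bin", sg_index)

    # Replicate the serialized index across the available GPUs.
    mg_index = mg_cagra.distribute("cagra_index.bin")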