Merged (21 commits)
Changes from all commits
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
@@ -108,7 +108,7 @@ repos:
- id: verify-copyright
files: |
(?x)
[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|rs)$|
[.](cmake|cpp|cu|cuh|h|hpp|sh|pxd|py|pyx|rs|java)$|
CMakeLists[.]txt$|
CMakeLists_standalone[.]txt$|
meta[.]yaml$
4 changes: 2 additions & 2 deletions cpp/include/cuvs/cluster/kmeans.h
@@ -30,7 +30,7 @@ extern "C" {
* @{
*/

enum cuvsKMeansInitMethod {
typedef enum {
/**
* Sample the centroids using the kmeans++ strategy
*/
@@ -45,7 +45,7 @@ enum cuvsKMeansInitMethod {
* User provides the array of initial centroids
*/
Array
};
} cuvsKMeansInitMethod;
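The switch from a plain enum to a typedef mainly helps C callers, which otherwise have to repeat the enum keyword at every use site. A minimal sketch (not part of the diff) using the Array member visible in this hunk, assuming cuvs/cluster/kmeans.h is included:

/* After this change, C code can name the type directly: */
cuvsKMeansInitMethod init = Array;
/* Before it, the spelling `enum cuvsKMeansInitMethod init = Array;` was required. */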

/**
* @brief Hyper-parameters for the kmeans algorithm
29 changes: 29 additions & 0 deletions cpp/include/cuvs/core/c_api.h
@@ -17,6 +17,7 @@
#pragma once

#include <cuda_runtime.h>
#include <dlpack/dlpack.h>
#include <stdbool.h>
#include <stdint.h>

@@ -176,6 +177,34 @@ cuvsError_t cuvsRMMHostFree(void* ptr, size_t bytes);
*/
cuvsError_t cuvsVersionGet(uint16_t* major, uint16_t* minor, uint16_t* patch);

/**
* @brief Copy a matrix
*
* This function copies a matrix from src to dst. This lets you copy a matrix
* from device memory to host memory (or vice versa), while accounting for
* differences in strides.
*
* Both src and dst must have the same shape and dtype, but can have different
* strides and device type. The memory for the output dst tensor must already be
* allocated and the tensor initialized.
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] src Pointer to DLManagedTensor to copy
* @param[out] dst Pointer to DLManagedTensor to receive copy of data
*/
cuvsError_t cuvsMatrixCopy(cuvsResources_t res, DLManagedTensor* src, DLManagedTensor* dst);
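As a usage sketch (not part of this diff): copy a small float32 matrix from device memory into a host buffer. Both descriptors are prepared by the caller with identical shape and dtype; the sync before reading the host buffer assumes the cuvsStreamSync helper from this same header.

#include <cuda_runtime.h>
#include <dlpack/dlpack.h>
#include <cuvs/core/c_api.h>

cuvsResources_t res;
cuvsResourcesCreate(&res);

float host_out[6] = {0};
float* device_in  = NULL;
cudaMalloc((void**)&device_in, 6 * sizeof(float));
/* ... fill device_in with 2 x 3 = 6 float values ... */

int64_t shape[2] = {2, 3};

DLManagedTensor src = {0};
src.dl_tensor.data               = device_in;
src.dl_tensor.device.device_type = kDLCUDA;
src.dl_tensor.ndim               = 2;
src.dl_tensor.dtype.code         = kDLFloat;
src.dl_tensor.dtype.bits         = 32;
src.dl_tensor.dtype.lanes        = 1;
src.dl_tensor.shape              = shape;
src.dl_tensor.strides            = NULL;    /* compact row-major             */

DLManagedTensor dst = src;                  /* same shape, dtype and strides */
dst.dl_tensor.data               = host_out;
dst.dl_tensor.device.device_type = kDLCPU;  /* ... but backed by host memory */

cuvsMatrixCopy(res, &src, &dst);            /* device -> host copy           */
cuvsStreamSync(res);                        /* wait before reading host_out  */

cudaFree(device_in);
cuvsResourcesDestroy(res);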

/**
* @brief Slices rows from a matrix
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] src Pointer to DLManagedTensor to slice rows from
* @param[in] start First row index to include in the output
* @param[in] end End row index of the slice (exclusive)
* @param[out] dst Pointer to DLManagedTensor to receive slice from matrix
*/
cuvsError_t cuvsMatrixSliceRows(
cuvsResources_t res, DLManagedTensor* src, int64_t start, int64_t end, DLManagedTensor* dst);
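A short sketch (not part of this diff), reusing res and the device-backed src descriptor from the copy example above. Per the implementation in this PR, the result is a non-owning view over rows [start, end) of src rather than a copy, so only the freshly allocated shape/strides metadata needs to be released through the deleter.

DLManagedTensor view = {0};                   /* zero-init so unset fields stay NULL */
cuvsMatrixSliceRows(res, &src, 0, 1, &view);  /* rows [0, 1) -> a 1 x 3 view         */

/* view.dl_tensor.data aliases src's device memory; release only the view's   */
/* metadata once it is no longer needed. The underlying matrix is untouched.  */
if (view.deleter != NULL) { view.deleter(&view); }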
/** @} */

#ifdef __cplusplus
24 changes: 16 additions & 8 deletions cpp/include/cuvs/core/detail/interop.hpp
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -143,9 +143,10 @@ inline bool is_c_contiguous(DLManagedTensor* managed_tensor)

#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-function"
static void free_dlmanaged_tensor_shape(DLManagedTensor* tensor)
static void free_dlmanaged_tensor_metadata(DLManagedTensor* tensor)
{
delete[] tensor->dl_tensor.shape;
delete[] tensor->dl_tensor.strides;
}
#pragma GCC diagnostic pop

@@ -157,14 +158,21 @@ static void to_dlpack(MdspanType src, DLManagedTensor* dst)
tensor->dtype = data_type_to_DLDataType<typename MdspanType::value_type>();
tensor->device = accessor_type_to_DLDevice<typename MdspanType::accessor_type>();
tensor->ndim = MdspanType::extents_type::rank();
tensor->data = src.data_handle();

tensor->shape = new int64_t[tensor->ndim];
dst->deleter = free_dlmanaged_tensor_shape;

tensor->data = const_cast<typename MdspanType::value_type*>(src.data_handle());
tensor->shape = new int64_t[tensor->ndim];
for (int64_t i = 0; i < tensor->ndim; ++i) {
tensor->shape[i] = src.extent(i);
}
}

if constexpr (std::is_same_v<typename MdspanType::layout_type, raft::row_major>) {
tensor->strides = nullptr;
} else {
tensor->strides = new int64_t[tensor->ndim];
for (int64_t i = 0; i < tensor->ndim; ++i) {
tensor->strides[i] = src.stride(i);
}
}

dst->deleter = free_dlmanaged_tensor_metadata;
}
} // namespace cuvs::core::detail
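For context on the branch above (an illustration, not code from this PR): DLPack uses strides == NULL to mean a compact row-major layout, and when strides are present they count elements, not bytes. A consumer of the resulting tensor would index it like this (byte_offset ignored for brevity):

/* Address of element (i, j) in a 2-D float DLTensor, covering both conventions. */
static const float* element_at(const DLTensor* t, int64_t i, int64_t j)
{
  const float* base = (const float*)t->data;
  if (t->strides == NULL) {
    return base + i * t->shape[1] + j;                   /* compact row-major   */
  }
  return base + i * t->strides[0] + j * t->strides[1];   /* strides in elements */
}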
108 changes: 105 additions & 3 deletions cpp/include/cuvs/neighbors/cagra.h
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -358,6 +358,66 @@ cuvsError_t cuvsCagraIndexDestroy(cuvsCagraIndex_t index);
*/
cuvsError_t cuvsCagraIndexGetDims(cuvsCagraIndex_t index, int* dim);

/**
* @brief Get size of the CAGRA index
*
* @param[in] index CAGRA index
* @param[out] size return number of vectors in the index
* @return cuvsError_t
*/
cuvsError_t cuvsCagraIndexGetSize(cuvsCagraIndex_t index, uint32_t* size);

/**
* @brief Get graph degree of the CAGRA index
*
* @param[in] index CAGRA index
* @param[out] graph_degree return graph degree
* @return cuvsError_t
*/
cuvsError_t cuvsCagraIndexGetGraphDegree(cuvsCagraIndex_t index, uint32_t* graph_degree);

/**
* @brief Returns a view of the CAGRA dataset
*
* This function returns a non-owning view of the CAGRA dataset.
* The output will be referencing device memory that is directly used
* in CAGRA, without copying the dataset at all. This means that the
* output is only valid as long as the CAGRA index is alive; once
* cuvsCagraIndexDestroy is called on the CAGRA index, the returned
* dataset view becomes invalid.
*
* Note that the DLManagedTensor dataset returned will have an associated
* 'deleter' function that must be called when the dataset is no longer
* needed. This will free up host memory that stores the shape of the
* dataset view.
*
* @param[in] index CAGRA index
* @param[out] dataset the dataset used in cagra
* @return cuvsError_t
*/
cuvsError_t cuvsCagraIndexGetDataset(cuvsCagraIndex_t index, DLManagedTensor* dataset);

/**
* @brief Returns a view of the CAGRA graph
*
* This function returns a non-owning view of the CAGRA graph.
* The output will be referencing device memory that is directly used
* in CAGRA, without copying the graph at all. This means that the
* output is only valid as long as the CAGRA index is alive; once
* cuvsCagraIndexDestroy is called on the CAGRA index, the returned
* graph view becomes invalid.
*
* Note that the DLManagedTensor graph returned will have an associated
* 'deleter' function that must be called when the graph is no longer
* needed. This will free up host memory that stores the metadata for the
* graph view.
*
* @param[in] index CAGRA index
* @param[out] graph the output knn graph.
* @return cuvsError_t
*/
cuvsError_t cuvsCagraIndexGetGraph(cuvsCagraIndex_t index, DLManagedTensor* graph);
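A combined usage sketch for the four new accessors (not part of this diff); it assumes index is a cuvsCagraIndex_t that has already been built or deserialized:

/* The views below alias device memory owned by the index itself. */
uint32_t size = 0, graph_degree = 0;
cuvsCagraIndexGetSize(index, &size);
cuvsCagraIndexGetGraphDegree(index, &graph_degree);

DLManagedTensor dataset = {0};
DLManagedTensor graph   = {0};
cuvsCagraIndexGetDataset(index, &dataset);
cuvsCagraIndexGetGraph(index, &graph);

/* ... read dataset.dl_tensor / graph.dl_tensor while the index is alive ... */

/* Free only the host-side metadata; the device memory still belongs to the  */
/* index and stays valid until cuvsCagraIndexDestroy(index).                 */
if (dataset.deleter) { dataset.deleter(&dataset); }
if (graph.deleter)   { graph.deleter(&graph); }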

/**
* @}
*/
@@ -434,7 +494,8 @@ cuvsError_t cuvsCagraMergeParamsDestroy(cuvsCagraMergeParams_t params);
* @param[in] res cuvsResources_t opaque C handle
* @param[in] params cuvsCagraIndexParams_t used to build CAGRA index
* @param[in] dataset DLManagedTensor* training dataset
* @param[out] index cuvsCagraIndex_t Newly built CAGRA index
* @param[inout] index cuvsCagraIndex_t Newly built CAGRA index. This index needs to be already
* created with cuvsCagraIndexCreate.
* @return cuvsError_t
*/
cuvsError_t cuvsCagraBuild(cuvsResources_t res,
@@ -609,10 +670,51 @@ cuvsError_t cuvsCagraSerializeToHnswlib(cuvsResources_t res,
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] filename the name of the file that stores the index
* @param[out] index CAGRA index loaded disk
* @param[inout] index cuvsCagraIndex_t CAGRA index loaded from disk. This index needs to be already
* created with cuvsCagraIndexCreate.
*/
cuvsError_t cuvsCagraDeserialize(cuvsResources_t res, const char* filename, cuvsCagraIndex_t index);
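Given the [inout] contract above, the handle must exist before the call. A minimal ordering sketch (the filename is a placeholder):

cuvsCagraIndex_t index;
cuvsCagraIndexCreate(&index);                        /* create the handle first    */
cuvsCagraDeserialize(res, "cagra_index.bin", index); /* then populate it from disk */
/* ... search with the index ... */
cuvsCagraIndexDestroy(index);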

/**
* Load index from a dataset and graph
*
* @param[in] res cuvsResources_t opaque C handle
* @param[in] metric cuvsDistanceType to use in the index
* @param[in] graph the knn graph to use, shape (size, graph_degree)
* @param[in] dataset the dataset to use, shape (size, dim)
* @param[inout] index cuvsCagraIndex_t CAGRA index populated with the graph and dataset.
* This index needs to be already created with
* cuvsCagraIndexCreate.
*
* @code {.c}
* #include <cuvs/core/c_api.h>
* #include <cuvs/neighbors/cagra.h>
*
* // Create cuvsResources_t
* cuvsResources_t res;
* cuvsError_t res_create_status = cuvsResourcesCreate(&res);
*
* // Create CAGRA index
* cuvsCagraIndex_t index;
* cuvsError_t index_create_status = cuvsCagraIndexCreate(&index);
*
* // Assume a populated `DLManagedTensor` type here for the graph and dataset
* DLManagedTensor dataset;
* DLManagedTensor graph;
*
* cuvsDistanceType metric = L2Expanded;
*
* // Build the CAGRA Index from the graph/dataset
* cuvsError_t status = cuvsCagraIndexFromArgs(res, metric, &graph, &dataset, index);
*
* @endcode
*/
cuvsError_t cuvsCagraIndexFromArgs(cuvsResources_t res,
cuvsDistanceType metric,
DLManagedTensor* graph,
DLManagedTensor* dataset,
cuvsCagraIndex_t index);

/**
* @brief Merge multiple CAGRA indices into a single CAGRA index.
*
115 changes: 115 additions & 0 deletions cpp/src/core/c_api.cpp
@@ -20,6 +20,7 @@

#include <raft/core/resource/cuda_stream.hpp>
#include <raft/core/resources.hpp>
#include <raft/util/cudart_utils.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/device_memory_resource.hpp>
#include <rmm/mr/device/managed_memory_resource.hpp>
@@ -165,3 +166,117 @@ extern "C" cuvsError_t cuvsVersionGet(uint16_t* major, uint16_t* minor, uint16_t
*patch = CUVS_VERSION_PATCH;
return CUVS_SUCCESS;
}

namespace {
template <typename T>
void _copy_matrix(cuvsResources_t res, DLManagedTensor* src_managed, DLManagedTensor* dst_managed)
{
DLTensor& src = src_managed->dl_tensor;
DLTensor& dst = dst_managed->dl_tensor;

int64_t src_row_stride = src.strides == nullptr ? src.shape[1] : src.strides[0];
int64_t dst_row_stride = dst.strides == nullptr ? dst.shape[1] : dst.strides[0];
auto res_ptr = reinterpret_cast<raft::resources*>(res);

raft::copy_matrix<T>(static_cast<T*>(dst.data),
dst_row_stride,
static_cast<const T*>(src.data),
src_row_stride,
src.shape[1],
src.shape[0],
raft::resource::get_cuda_stream(*res_ptr));
}
} // namespace

extern "C" cuvsError_t cuvsMatrixCopy(cuvsResources_t res,
DLManagedTensor* src_managed,
DLManagedTensor* dst_managed)
{
return cuvs::core::translate_exceptions([=] {
DLTensor& src = src_managed->dl_tensor;
DLTensor& dst = dst_managed->dl_tensor;

RAFT_EXPECTS(src.ndim == 2, "src should be a 2 dimensional tensor");
RAFT_EXPECTS(dst.ndim == 2, "dst should be a 2 dimensional tensor");

for (int64_t i = 0; i < src.ndim; ++i) {
RAFT_EXPECTS(src.shape[i] == dst.shape[i], "shape mismatch between src and dst tensors");
}
RAFT_EXPECTS(src.dtype.code == dst.dtype.code, "dtype mismatch between src and dst tensors");

// at some point we could probably copy from a float32 to a float16 here, but for the
// moment this isn't supported
RAFT_EXPECTS(src.dtype.bits == dst.dtype.bits,
"dtype bits width mismatch between src and dst tensors");

if (src.dtype.code == kDLFloat && src.dtype.bits == 32) {
_copy_matrix<float>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLFloat && src.dtype.bits == 16) {
_copy_matrix<half>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLFloat && src.dtype.bits == 64) {
_copy_matrix<double>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLInt && src.dtype.bits == 8) {
_copy_matrix<int8_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLInt && src.dtype.bits == 16) {
_copy_matrix<int16_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLInt && src.dtype.bits == 32) {
_copy_matrix<int32_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLInt && src.dtype.bits == 64) {
_copy_matrix<int64_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLUInt && src.dtype.bits == 8) {
_copy_matrix<uint8_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLUInt && src.dtype.bits == 16) {
_copy_matrix<uint16_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLUInt && src.dtype.bits == 32) {
_copy_matrix<uint32_t>(res, src_managed, dst_managed);
} else if (src.dtype.code == kDLUInt && src.dtype.bits == 64) {
_copy_matrix<uint64_t>(res, src_managed, dst_managed);
} else {
RAFT_FAIL("Unsupported dtype: %d and bits: %d", src.dtype.code, src.dtype.bits);
}
});
}

extern "C" void cuvsMatrixDestroy(DLManagedTensor* tensor)
{
if (tensor->dl_tensor.shape != nullptr) {
delete[] tensor->dl_tensor.shape;
tensor->dl_tensor.shape = nullptr;
}
if (tensor->dl_tensor.strides != nullptr) {
delete[] tensor->dl_tensor.strides;
tensor->dl_tensor.strides = nullptr;
}
}

extern "C" cuvsError_t cuvsMatrixSliceRows(cuvsResources_t res,
DLManagedTensor* src_managed,
int64_t start,
int64_t end,
DLManagedTensor* dst_managed)
{
return cuvs::core::translate_exceptions([=] {
RAFT_EXPECTS(end >= start, "end index must be greater than or equal to start index");

DLTensor& src = src_managed->dl_tensor;
DLTensor& dst = dst_managed->dl_tensor;
RAFT_EXPECTS(src.ndim == 2, "src should be a 2 dimensional tensor");

dst.dtype = src.dtype;
dst.device = src.device;
dst.ndim = 2;
dst.shape = new int64_t[2];
dst.shape[0] = end - start;
dst.shape[1] = src.shape[1];

int64_t row_strides = dst.shape[1];
if (src.strides) {
dst.strides = new int64_t[2];
row_strides = dst.strides[0] = src.strides[0];
dst.strides[1] = src.strides[1];
}

dst.data = static_cast<char*>(src.data) + start * row_strides * (dst.dtype.bits / 8);
dst_managed->deleter = cuvsMatrixDestroy;
});
}
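To make the offset arithmetic above concrete, a worked example (illustration only): for a contiguous 1000 x 128 float32 source, src.strides is NULL and row_strides falls back to dst.shape[1] == 128.

/* Slicing rows [10, 20) of a contiguous 1000 x 128 float32 matrix:          */
/*   dst.shape   = {10, 128}                                                 */
/*   byte offset = start * row_strides * (bits / 8) = 10 * 128 * 4 = 5120    */
/* dst.strides is not written in the contiguous case, so callers should pass */
/* a zero-initialized dst (strides == NULL already means row-major).         */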