Add cuda 12.8 support (#605)

robertmaynard · web-flow · commit 21e0f0ead374 · 2025-01-24T06:22:24.000Z
CUDA 12.8 introduces sm_120 that requires a reduced number of threads per sm. We also need to pass -static-global-template-stub=false when building with 12.8 as we violate CUDA ODR kernel rules Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Corey J. Nolet (https://github.com/cjnolet) - https://github.com/jakirkham URL: #605
diff --git a/cpp/cmake/modules/ConfigureCUDA.cmake b/cpp/cmake/modules/ConfigureCUDA.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -35,6 +35,11 @@ if(CMAKE_COMPILER_IS_GNUCXX)
   endif()
 endif()
 
+# Allow invalid CUDA kernels in the short term
+if(CMAKE_CUDA_COMPILER_VERSION VERSION_GREATER_EQUAL 12.8.0)
+  list(APPEND CUVS_CUDA_FLAGS -static-global-template-stub=false)
+endif()
+
 if(CUDA_LOG_COMPILE_TIME)
   list(APPEND CUVS_CUDA_FLAGS "--time=nvcc_compile_log.csv")
 endif()
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -706,7 +706,8 @@ __device__ __forceinline__ void remove_duplicates(
 template <typename Index_t, typename ID_t = InternalID_t<Index_t>>
 RAFT_KERNEL
 #ifdef __CUDA_ARCH__
-#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890)
+#if (__CUDA_ARCH__) == 750 || ((__CUDA_ARCH__) >= 860 && (__CUDA_ARCH__) <= 890) || \
+  (__CUDA_ARCH__) == 1200
 __launch_bounds__(BLOCK_SIZE)
 #else
 __launch_bounds__(BLOCK_SIZE, 4)