Properly guard usage of openmp function calls (rapidsai#1435)

robertmaynard · enp1s0 · commit 945f1a80f93f · 2025-10-22T12:23:46.000+09:00
Properly supporting builds with OpenMP disabled also requires that any call to a function such as `omp_get_max_threads` be guarded by an `_OPENMP` check. This is done by adding OpenMP wrapper functions to `cuvs/core` that handle the `_OPENMP` guards and behave as required when OpenMP is disabled. Required to fix rapidsai#1322. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Divye Gala (https://github.com/divyegala) URL: rapidsai#1435
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -332,7 +332,6 @@ if(NOT BUILD_CPU_ONLY)
         src/neighbors/mg/mg_cagra_half_uint32_t.cu
         src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu
         src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu
-        src/neighbors/mg/omp_checks.cpp
     )
   endif()
 
@@ -359,6 +358,7 @@ if(NOT BUILD_CPU_ONLY)
     src/cluster/kmeans_transform_float.cu
     src/cluster/single_linkage_float.cu
     src/core/bitset.cu
+    src/core/omp_wrapper.cpp
     src/distance/detail/kernels/gram_matrix.cu
     src/distance/detail/kernels/kernel_factory.cu
     src/distance/detail/kernels/kernel_matrices.cu
diff --git a/cpp/src/core/omp_wrapper.cpp b/cpp/src/core/omp_wrapper.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -15,19 +15,43 @@
  */
 
 #include <omp.h>
+
 #include <raft/core/logger.hpp>
 
-namespace cuvs::neighbors::snmg {
+#include "omp_wrapper.hpp"
+
+namespace cuvs::core::omp {
+
+// Compile-time query: evaluates to true when this translation unit is compiled
+// with OpenMP support (i.e. the compiler defines `_OPENMP`), false otherwise.
+constexpr bool is_omp_enabled()
+{
+#ifdef _OPENMP
+  constexpr bool enabled = true;
+#else
+  constexpr bool enabled = false;
+#endif
+  return enabled;
+}
+
+int get_max_threads() { return is_omp_enabled() ? omp_get_max_threads() : 1; }
+int get_num_procs() { return is_omp_enabled() ? omp_get_num_procs() : 1; }
+int get_num_threads() { return is_omp_enabled() ? omp_get_num_threads() : 1; }
+int get_thread_num() { return is_omp_enabled() ? omp_get_thread_num() : 0; }
+
+void set_nested(int v)
+{
+  (void)v;
+  if constexpr (is_omp_enabled()) { omp_set_nested(v); }
+}
 
-void check_omp_threads(const int requirements)
+// Logs a warning (RAFT_LOG_WARN) when the value returned by get_max_threads()
+// is smaller than `requirements`; otherwise does nothing. The message reports
+// `requirements` as the number of GPUs to be driven.
+//
+// NOTE(review): with OpenMP disabled get_max_threads() returns 1, so the warning
+// is emitted for any `requirements` > 1 even though changing OMP_NUM_THREADS
+// cannot help in that configuration - confirm whether that is intended.
+void check_threads(const int requirements)
 {
-  const int max_threads = omp_get_max_threads();
-  if (max_threads < requirements)
+  const int max_threads = get_max_threads();
+  if (max_threads < requirements) {
     RAFT_LOG_WARN(
       "OpenMP is only allowed %d threads to run %d GPUs. Please increase the number of OpenMP "
       "threads to avoid NCCL hangs by modifying the environment variable OMP_NUM_THREADS.",
       max_threads,
       requirements);
+  }
 }
 
-}  // namespace cuvs::neighbors::snmg
+}  // namespace cuvs::core::omp
diff --git a/cpp/src/core/omp_wrapper.hpp b/cpp/src/core/omp_wrapper.hpp
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+// <omp.h> may not be available on toolchains without an OpenMP runtime, so it
+// is only pulled in when the compiler reports OpenMP support.
+#if defined(_OPENMP)
+#include <omp.h>
+#endif
+
+namespace cuvs::core::omp {
+
+// True when compiled with OpenMP support (`_OPENMP` defined), false otherwise.
+// NOTE(review): constexpr functions are implicitly inline, so this declaration
+// is only usable in a translation unit that also sees the definition (currently
+// omp_wrapper.cpp). Consider moving the definition into this header.
+constexpr bool is_omp_enabled();
+
+// Wrappers around the corresponding omp_get_* runtime queries. With OpenMP
+// disabled they return 1, 1, 1 and 0 respectively.
+int get_max_threads();
+int get_num_procs();
+int get_num_threads();
+int get_thread_num();
+
+// Forwards `v` to omp_set_nested() when OpenMP is enabled; no-op otherwise.
+void set_nested(int v);
+
+// Logs a warning when get_max_threads() is smaller than `requirements`.
+void check_threads(const int requirements);
+
+}  // namespace cuvs::core::omp
diff --git a/cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh b/cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh
@@ -15,6 +15,7 @@
  */
 
 #pragma once
+#include "../../core/omp_wrapper.hpp"
 #include "../detail/reachability.cuh"
 #include "all_neighbors_builder.cuh"
 #include "raft/core/logger_macros.hpp"
@@ -483,7 +484,7 @@ void batch_build(
 {
   if (raft::resource::is_multi_gpu(handle)) {
     // For efficient CPU-computation of omp parallel for regions per GPU
-    omp_set_nested(1);
+    cuvs::core::omp::set_nested(1);
   }
 
   size_t num_rows = static_cast<size_t>(dataset.extent(0));
diff --git a/cpp/src/neighbors/detail/cagra/add_nodes.cuh b/cpp/src/neighbors/detail/cagra/add_nodes.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#include "../../../core/omp_wrapper.hpp"
 #include "../ann_utils.cuh"
 #include <cuvs/neighbors/cagra.hpp>
 #include <raft/core/device_resources.hpp>
@@ -24,8 +25,6 @@
 
 #include <rmm/device_buffer.hpp>
 
-#include <omp.h>
-
 #include <cstdint>
 
 namespace cuvs::neighbors::cagra {
@@ -165,8 +164,8 @@ void add_node_core(
 #pragma omp parallel
     {
       std::vector<std::pair<IdxT, std::size_t>> detourable_node_count_list(base_degree);
-      for (std::size_t vec_i = omp_get_thread_num(); vec_i < batch.size();
-           vec_i += omp_get_num_threads()) {
+      for (std::size_t vec_i = cuvs::core::omp::get_thread_num(); vec_i < batch.size();
+           vec_i += cuvs::core::omp::get_num_threads()) {
         // Count detourable edges
         for (std::uint32_t i = 0; i < base_degree; i++) {
           std::uint32_t detourable_node_count = 0;
diff --git a/cpp/src/neighbors/detail/cagra/graph_core.cuh b/cpp/src/neighbors/detail/cagra/graph_core.cuh
@@ -25,6 +25,7 @@
 
 // TODO: This shouldn't be invoking anything from spatial/knn
 #include "../../../core/nvtx.hpp"
+#include "../../../core/omp_wrapper.hpp"
 #include "../ann_utils.cuh"
 
 #include <raft/util/bitonic_sort.cuh>
@@ -33,7 +34,6 @@
 #include <cuda_fp16.h>
 
 #include <float.h>
-#include <omp.h>
 #include <sys/time.h>
 
 #include <climits>
diff --git a/cpp/src/neighbors/detail/hnsw.hpp b/cpp/src/neighbors/detail/hnsw.hpp
@@ -17,6 +17,7 @@
 #pragma once
 
 #include "../../core/nvtx.hpp"
+#include "../../core/omp_wrapper.hpp"
 
 #include <cuvs/neighbors/brute_force.hpp>
 #include <cuvs/neighbors/hnsw.hpp>
@@ -29,7 +30,6 @@
 
 #include <filesystem>
 #include <memory>
-#include <omp.h>
 #include <random>
 #include <thread>
 
@@ -194,7 +194,7 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
     params.ef_construction);
   appr_algo->base_layer_init = false;  // tell hnswlib to build upper layers only
   [[maybe_unused]] auto num_threads =
-    params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
+    params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
 #pragma omp parallel for num_threads(num_threads)
   for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
     appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)),
@@ -284,8 +284,9 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
   std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
 {
   common::nvtx::range<common::nvtx::domain::cuvs> fun_scope("hnsw::from_cagra<GPU>");
-  auto stream      = raft::resource::get_cuda_stream(res);
-  auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
+  auto stream = raft::resource::get_cuda_stream(res);
+  auto num_threads =
+    params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
 
   /* Note: NNSW data layout
 
@@ -546,7 +547,7 @@ void extend(raft::resources const& res,
   auto current_element_count = hnswlib_index->getCurrentElementCount();
   auto new_element_count     = additional_dataset.extent(0);
   [[maybe_unused]] auto num_threads =
-    params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
+    params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
 
   hnswlib_index->resizeIndex(current_element_count + new_element_count);
 #pragma omp parallel for num_threads(num_threads)
diff --git a/cpp/src/neighbors/detail/nn_descent.cuh b/cpp/src/neighbors/detail/nn_descent.cuh
@@ -20,6 +20,7 @@
 #include "cagra/device_common.hpp"
 #include "nn_descent_gnnd.hpp"
 
+#include "../../core/omp_wrapper.hpp"
 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/nn_descent.hpp>
 
@@ -46,7 +47,6 @@
 #include <cuda_runtime.h>
 
 #include <mma.h>
-#include <omp.h>
 
 #include <limits>
 #include <numeric>
diff --git a/cpp/src/neighbors/mg/snmg.cuh b/cpp/src/neighbors/mg/snmg.cuh
@@ -22,6 +22,7 @@
 #include <raft/linalg/add.cuh>
 #include <raft/util/cuda_dev_essentials.cuh>
 
+#include "../../core/omp_wrapper.hpp"
 #include <cuvs/neighbors/cagra.hpp>
 #include <cuvs/neighbors/common.hpp>
 #include <cuvs/neighbors/ivf_flat.hpp>
@@ -42,10 +43,6 @@ void search(const raft::resources& handle,
             raft::device_matrix_view<float, int64_t, row_major> d_distances);
 }  // namespace cuvs::neighbors
 
-namespace cuvs::neighbors::snmg {
-void check_omp_threads(const int requirements);
-}  // namespace cuvs::neighbors::snmg
-
 namespace cuvs::neighbors::snmg::detail {
 using namespace cuvs::neighbors;
 using namespace raft;
@@ -215,7 +212,8 @@ void sharded_search_with_direct_merge(
       queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols);
 
     const int& requirements = index.num_ranks_;
-    check_omp_threads(requirements);  // should use at least num_ranks_ threads to avoid NCCL hang
+    cuvs::core::omp::check_threads(
+      requirements);  // should use at least num_ranks_ threads to avoid NCCL hang
 #pragma omp parallel for num_threads(index.num_ranks_)
     for (int rank = 0; rank < index.num_ranks_; rank++) {
       const raft::resources& dev_res = raft::resource::set_current_device_to_rank(clique, rank);
@@ -335,7 +333,8 @@ void sharded_search_with_tree_merge(
       queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols);
 
     const int& requirements = index.num_ranks_;
-    check_omp_threads(requirements);  // should use at least num_ranks_ threads to avoid NCCL hang
+    cuvs::core::omp::check_threads(
+      requirements);  // should use at least num_ranks_ threads to avoid NCCL hang
 #pragma omp parallel for num_threads(index.num_ranks_)
     for (int rank = 0; rank < index.num_ranks_; rank++) {
       const raft::resources& dev_res = raft::resource::set_current_device_to_rank(clique, rank);
diff --git a/cpp/src/neighbors/refine/refine_host.hpp b/cpp/src/neighbors/refine/refine_host.hpp
@@ -17,8 +17,9 @@
 #pragma once
 
 #include "../../core/nvtx.hpp"
+#include "../../core/omp_wrapper.hpp"
 #include "refine_common.hpp"
-#include <omp.h>
+
 #include <raft/core/host_mdspan.hpp>
 #include <raft/util/integer_utils.hpp>
 
@@ -376,7 +377,8 @@ template <typename DC, typename IdxT, typename DataT, typename DistanceT, typena
   cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
     "neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k);
 
-  auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads()));
+  auto suggested_n_threads =
+    std::max(1, std::min(cuvs::core::omp::get_num_procs(), cuvs::core::omp::get_max_threads()));
 
   // If the number of queries is small, separate the distance calculation and
   // the top-k calculation into separate loops, and apply finer-grained thread
@@ -438,8 +440,8 @@ template <typename DC, typename IdxT, typename DataT, typename DistanceT, typena
       suggested_n_threads, std::vector<std::tuple<DistanceT, IdxT>>(orig_k));
 #pragma omp parallel num_threads(suggested_n_threads)
     {
-      auto tid = omp_get_thread_num();
-      for (size_t i = tid; i < n_queries; i += omp_get_num_threads()) {
+      auto tid = cuvs::core::omp::get_thread_num();
+      for (size_t i = tid; i < n_queries; i += cuvs::core::omp::get_num_threads()) {
         // Compute the refined distance using original dataset vectors
         const DataT* query = queries.data_handle() + dim * i;
         for (size_t j = 0; j < orig_k; j++) {
diff --git a/cpp/src/neighbors/scann/detail/scann_avq.cuh b/cpp/src/neighbors/scann/detail/scann_avq.cuh
@@ -547,7 +547,7 @@ class cluster_loader {
       auto pinned_cluster = raft::make_pinned_matrix_view<T, int64_t>(
         cluster_buf_.data_handle(), cluster_vectors.extent(0), cluster_vectors.extent(1));
 
-      int n_threads = std::min<int>(omp_get_max_threads(), 32);
+      int n_threads = std::min<int>(cuvs::core::omp::get_max_threads(), 32);
 #pragma omp parallel for num_threads(n_threads)
       for (int i = 0; i < h_cluster_ids.extent(0); i++) {
         memcpy(pinned_cluster.data_handle() + i * pinned_cluster.extent(1),
diff --git a/cpp/src/neighbors/scann/detail/scann_common.cuh b/cpp/src/neighbors/scann/detail/scann_common.cuh
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include "../../../core/omp_wrapper.hpp"
+
 #include <raft/core/device_mdarray.hpp>
 #include <raft/core/device_mdspan.hpp>
 #include <raft/matrix/gather.cuh>
@@ -49,7 +51,7 @@ struct gather_functor {
 
     raft::resource::sync_stream(res, stream);
 
-    int n_threads = std::min<int>(omp_get_max_threads(), 32);
+    int n_threads = std::min<int>(cuvs::core::omp::get_max_threads(), 32);
 
 #pragma omp parallel for num_threads(n_threads)
     for (int i = 0; i < h_cluster_ids.extent(0); i++) {

Original file line number	Diff line number	Diff line change
`@@ -15,6 +15,7 @@`
`15`	`15`	`*/`
`16`	`16`
`17`	`17`	`#pragma once`
	`18`	`+#include "../../core/omp_wrapper.hpp"`
`18`	`19`	`#include "../detail/reachability.cuh"`
`19`	`20`	`#include "all_neighbors_builder.cuh"`
`20`	`21`	`#include "raft/core/logger_macros.hpp"`
`@@ -483,7 +484,7 @@ void batch_build(`
`483`	`484`	`{`
`484`	`485`	`if (raft::resource::is_multi_gpu(handle)) {`
`485`	`486`	`// For efficient CPU-computation of omp parallel for regions per GPU`
`486`		`- omp_set_nested(1);`
	`487`	`+ cuvs::core::omp::set_nested(1);`
`487`	`488`	`}`
`488`	`489`
`489`	`490`	`size_t num_rows = static_cast<size_t>(dataset.extent(0));`