Skip to content

Commit 945f1a8

Browse files
robertmaynardenp1s0
authored andcommitted
Properly guard usage of openmp function calls (rapidsai#1435)
Proper support to disable OpenMP also requires that any calls to functions like `omp_get_max_threads` need to be guarded by a `_OPENMP` check. This is done by adding openmp wrapper functions into `cuvs/core` that properly handle the `_OPENMP` guards and behave as required when OpenMP is disabled. Required to fix rapidsai#1322 Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Divye Gala (https://github.com/divyegala) URL: rapidsai#1435
1 parent 54d37de commit 945f1a8

File tree

12 files changed

+92
-32
lines changed

12 files changed

+92
-32
lines changed

cpp/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -332,7 +332,6 @@ if(NOT BUILD_CPU_ONLY)
332332
src/neighbors/mg/mg_cagra_half_uint32_t.cu
333333
src/neighbors/mg/mg_cagra_int8_t_uint32_t.cu
334334
src/neighbors/mg/mg_cagra_uint8_t_uint32_t.cu
335-
src/neighbors/mg/omp_checks.cpp
336335
)
337336
endif()
338337

@@ -359,6 +358,7 @@ if(NOT BUILD_CPU_ONLY)
359358
src/cluster/kmeans_transform_float.cu
360359
src/cluster/single_linkage_float.cu
361360
src/core/bitset.cu
361+
src/core/omp_wrapper.cpp
362362
src/distance/detail/kernels/gram_matrix.cu
363363
src/distance/detail/kernels/kernel_factory.cu
364364
src/distance/detail/kernels/kernel_matrices.cu

cpp/src/neighbors/mg/omp_checks.cpp renamed to cpp/src/core/omp_wrapper.cpp

Lines changed: 30 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -15,19 +15,43 @@
1515
*/
1616

1717
#include <omp.h>
18+
1819
#include <raft/core/logger.hpp>
1920

20-
namespace cuvs::neighbors::snmg {
21+
#include "omp_wrapper.hpp"
22+
23+
namespace cuvs::core::omp {
24+
25+
constexpr bool is_omp_enabled()
26+
{
27+
#if defined(_OPENMP)
28+
return true;
29+
#else
30+
return false;
31+
#endif
32+
}
33+
34+
int get_max_threads() { return is_omp_enabled() ? omp_get_max_threads() : 1; }
35+
int get_num_procs() { return is_omp_enabled() ? omp_get_num_procs() : 1; }
36+
int get_num_threads() { return is_omp_enabled() ? omp_get_num_threads() : 1; }
37+
int get_thread_num() { return is_omp_enabled() ? omp_get_thread_num() : 0; }
38+
39+
void set_nested(int v)
40+
{
41+
(void)v;
42+
if constexpr (is_omp_enabled()) { omp_set_nested(v); }
43+
}
2144

22-
void check_omp_threads(const int requirements)
45+
void check_threads(const int requirements)
2346
{
24-
const int max_threads = omp_get_max_threads();
25-
if (max_threads < requirements)
47+
const int max_threads = get_max_threads();
48+
if (max_threads < requirements) {
2649
RAFT_LOG_WARN(
2750
"OpenMP is only allowed %d threads to run %d GPUs. Please increase the number of OpenMP "
2851
"threads to avoid NCCL hangs by modifying the environment variable OMP_NUM_THREADS.",
2952
max_threads,
3053
requirements);
54+
}
3155
}
3256

33-
} // namespace cuvs::neighbors::snmg
57+
} // namespace cuvs::core::omp

cpp/src/core/omp_wrapper.hpp

Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
/*
2+
* Copyright (c) 2025, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <omp.h>
18+
19+
namespace cuvs::core::omp {
20+
21+
constexpr bool is_omp_enabled();
22+
23+
int get_max_threads();
24+
int get_num_procs();
25+
int get_num_threads();
26+
int get_thread_num();
27+
28+
void set_nested(int v);
29+
30+
void check_threads(const int requirements);
31+
32+
} // namespace cuvs::core::omp

cpp/src/neighbors/all_neighbors/all_neighbors_batched.cuh

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
*/
1616

1717
#pragma once
18+
#include "../../core/omp_wrapper.hpp"
1819
#include "../detail/reachability.cuh"
1920
#include "all_neighbors_builder.cuh"
2021
#include "raft/core/logger_macros.hpp"
@@ -483,7 +484,7 @@ void batch_build(
483484
{
484485
if (raft::resource::is_multi_gpu(handle)) {
485486
// For efficient CPU-computation of omp parallel for regions per GPU
486-
omp_set_nested(1);
487+
cuvs::core::omp::set_nested(1);
487488
}
488489

489490
size_t num_rows = static_cast<size_t>(dataset.extent(0));

cpp/src/neighbors/detail/cagra/add_nodes.cuh

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2024, NVIDIA CORPORATION.
2+
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -13,6 +13,7 @@
1313
* See the License for the specific language governing permissions and
1414
* limitations under the License.
1515
*/
16+
#include "../../../core/omp_wrapper.hpp"
1617
#include "../ann_utils.cuh"
1718
#include <cuvs/neighbors/cagra.hpp>
1819
#include <raft/core/device_resources.hpp>
@@ -24,8 +25,6 @@
2425

2526
#include <rmm/device_buffer.hpp>
2627

27-
#include <omp.h>
28-
2928
#include <cstdint>
3029

3130
namespace cuvs::neighbors::cagra {
@@ -165,8 +164,8 @@ void add_node_core(
165164
#pragma omp parallel
166165
{
167166
std::vector<std::pair<IdxT, std::size_t>> detourable_node_count_list(base_degree);
168-
for (std::size_t vec_i = omp_get_thread_num(); vec_i < batch.size();
169-
vec_i += omp_get_num_threads()) {
167+
for (std::size_t vec_i = cuvs::core::omp::get_thread_num(); vec_i < batch.size();
168+
vec_i += cuvs::core::omp::get_num_threads()) {
170169
// Count detourable edges
171170
for (std::uint32_t i = 0; i < base_degree; i++) {
172171
std::uint32_t detourable_node_count = 0;

cpp/src/neighbors/detail/cagra/graph_core.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525

2626
// TODO: This shouldn't be invoking anything from spatial/knn
2727
#include "../../../core/nvtx.hpp"
28+
#include "../../../core/omp_wrapper.hpp"
2829
#include "../ann_utils.cuh"
2930

3031
#include <raft/util/bitonic_sort.cuh>
@@ -33,7 +34,6 @@
3334
#include <cuda_fp16.h>
3435

3536
#include <float.h>
36-
#include <omp.h>
3737
#include <sys/time.h>
3838

3939
#include <climits>

cpp/src/neighbors/detail/hnsw.hpp

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#pragma once
1818

1919
#include "../../core/nvtx.hpp"
20+
#include "../../core/omp_wrapper.hpp"
2021

2122
#include <cuvs/neighbors/brute_force.hpp>
2223
#include <cuvs/neighbors/hnsw.hpp>
@@ -29,7 +30,6 @@
2930

3031
#include <filesystem>
3132
#include <memory>
32-
#include <omp.h>
3333
#include <random>
3434
#include <thread>
3535

@@ -194,7 +194,7 @@ std::enable_if_t<hierarchy == HnswHierarchy::CPU, std::unique_ptr<index<T>>> fro
194194
params.ef_construction);
195195
appr_algo->base_layer_init = false; // tell hnswlib to build upper layers only
196196
[[maybe_unused]] auto num_threads =
197-
params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
197+
params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
198198
#pragma omp parallel for num_threads(num_threads)
199199
for (int64_t i = 0; i < host_dataset_view.extent(0); i++) {
200200
appr_algo->addPoint((void*)(host_dataset_view.data_handle() + i * host_dataset_view.extent(1)),
@@ -284,8 +284,9 @@ std::enable_if_t<hierarchy == HnswHierarchy::GPU, std::unique_ptr<index<T>>> fro
284284
std::optional<raft::host_matrix_view<const T, int64_t, raft::row_major>> dataset)
285285
{
286286
common::nvtx::range<common::nvtx::domain::cuvs> fun_scope("hnsw::from_cagra<GPU>");
287-
auto stream = raft::resource::get_cuda_stream(res);
288-
auto num_threads = params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
287+
auto stream = raft::resource::get_cuda_stream(res);
288+
auto num_threads =
289+
params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
289290

290291
/* Note: NNSW data layout
291292
@@ -546,7 +547,7 @@ void extend(raft::resources const& res,
546547
auto current_element_count = hnswlib_index->getCurrentElementCount();
547548
auto new_element_count = additional_dataset.extent(0);
548549
[[maybe_unused]] auto num_threads =
549-
params.num_threads == 0 ? omp_get_max_threads() : params.num_threads;
550+
params.num_threads == 0 ? cuvs::core::omp::get_max_threads() : params.num_threads;
550551

551552
hnswlib_index->resizeIndex(current_element_count + new_element_count);
552553
#pragma omp parallel for num_threads(num_threads)

cpp/src/neighbors/detail/nn_descent.cuh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@
2020
#include "cagra/device_common.hpp"
2121
#include "nn_descent_gnnd.hpp"
2222

23+
#include "../../core/omp_wrapper.hpp"
2324
#include <cuvs/distance/distance.hpp>
2425
#include <cuvs/neighbors/nn_descent.hpp>
2526

@@ -46,7 +47,6 @@
4647
#include <cuda_runtime.h>
4748

4849
#include <mma.h>
49-
#include <omp.h>
5050

5151
#include <limits>
5252
#include <numeric>

cpp/src/neighbors/mg/snmg.cuh

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,7 @@
2222
#include <raft/linalg/add.cuh>
2323
#include <raft/util/cuda_dev_essentials.cuh>
2424

25+
#include "../../core/omp_wrapper.hpp"
2526
#include <cuvs/neighbors/cagra.hpp>
2627
#include <cuvs/neighbors/common.hpp>
2728
#include <cuvs/neighbors/ivf_flat.hpp>
@@ -42,10 +43,6 @@ void search(const raft::resources& handle,
4243
raft::device_matrix_view<float, int64_t, row_major> d_distances);
4344
} // namespace cuvs::neighbors
4445

45-
namespace cuvs::neighbors::snmg {
46-
void check_omp_threads(const int requirements);
47-
} // namespace cuvs::neighbors::snmg
48-
4946
namespace cuvs::neighbors::snmg::detail {
5047
using namespace cuvs::neighbors;
5148
using namespace raft;
@@ -215,7 +212,8 @@ void sharded_search_with_direct_merge(
215212
queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols);
216213

217214
const int& requirements = index.num_ranks_;
218-
check_omp_threads(requirements); // should use at least num_ranks_ threads to avoid NCCL hang
215+
cuvs::core::omp::check_threads(
216+
requirements); // should use at least num_ranks_ threads to avoid NCCL hang
219217
#pragma omp parallel for num_threads(index.num_ranks_)
220218
for (int rank = 0; rank < index.num_ranks_; rank++) {
221219
const raft::resources& dev_res = raft::resource::set_current_device_to_rank(clique, rank);
@@ -335,7 +333,8 @@ void sharded_search_with_tree_merge(
335333
queries.data_handle() + query_offset, n_rows_of_current_batch, n_cols);
336334

337335
const int& requirements = index.num_ranks_;
338-
check_omp_threads(requirements); // should use at least num_ranks_ threads to avoid NCCL hang
336+
cuvs::core::omp::check_threads(
337+
requirements); // should use at least num_ranks_ threads to avoid NCCL hang
339338
#pragma omp parallel for num_threads(index.num_ranks_)
340339
for (int rank = 0; rank < index.num_ranks_; rank++) {
341340
const raft::resources& dev_res = raft::resource::set_current_device_to_rank(clique, rank);

cpp/src/neighbors/refine/refine_host.hpp

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,9 @@
1717
#pragma once
1818

1919
#include "../../core/nvtx.hpp"
20+
#include "../../core/omp_wrapper.hpp"
2021
#include "refine_common.hpp"
21-
#include <omp.h>
22+
2223
#include <raft/core/host_mdspan.hpp>
2324
#include <raft/util/integer_utils.hpp>
2425

@@ -376,7 +377,8 @@ template <typename DC, typename IdxT, typename DataT, typename DistanceT, typena
376377
cuvs::common::nvtx::range<cuvs::common::nvtx::domain::cuvs> fun_scope(
377378
"neighbors::refine_host(%zu, %zu -> %zu)", n_queries, orig_k, refined_k);
378379

379-
auto suggested_n_threads = std::max(1, std::min(omp_get_num_procs(), omp_get_max_threads()));
380+
auto suggested_n_threads =
381+
std::max(1, std::min(cuvs::core::omp::get_num_procs(), cuvs::core::omp::get_max_threads()));
380382

381383
// If the number of queries is small, separate the distance calculation and
382384
// the top-k calculation into separate loops, and apply finer-grained thread
@@ -438,8 +440,8 @@ template <typename DC, typename IdxT, typename DataT, typename DistanceT, typena
438440
suggested_n_threads, std::vector<std::tuple<DistanceT, IdxT>>(orig_k));
439441
#pragma omp parallel num_threads(suggested_n_threads)
440442
{
441-
auto tid = omp_get_thread_num();
442-
for (size_t i = tid; i < n_queries; i += omp_get_num_threads()) {
443+
auto tid = cuvs::core::omp::get_thread_num();
444+
for (size_t i = tid; i < n_queries; i += cuvs::core::omp::get_num_threads()) {
443445
// Compute the refined distance using original dataset vectors
444446
const DataT* query = queries.data_handle() + dim * i;
445447
for (size_t j = 0; j < orig_k; j++) {

0 commit comments

Comments
 (0)