
Commit 6729def

Update RMM adaptors, containers and tests to use get/set_current_device_resource_ref() (#1661)
Closes #1660. This adds a constructor to each MR adaptor that takes a `device_async_resource_ref` rather than an `Upstream*`. It also updates RMM to use `get_current_device_resource_ref()` everywhere: in containers, tests, adaptors, the Thrust allocator, the polymorphic allocator, `exec_policy`, and elsewhere.

Importantly, this PR also modifies `set_current_device_resource()` to effectively call `set_current_device_resource_ref()`. This is necessary because, while RMM C++ now uses `get_current_device_resource_ref()` everywhere, the Python API still uses the raw-pointer API `set_current_device_resource()`; the latter must therefore update the state read by the former. This is a temporary bootstrap to help with the refactoring.

Authors:
- Mark Harris (https://github.com/harrism)

Approvers:
- Michael Schellenberger Costa (https://github.com/miscco)
- Lawrence Mitchell (https://github.com/wence-)
- Rong Ou (https://github.com/rongou)
- Bradley Dice (https://github.com/bdice)

URL: #1661
1 parent 687ed5c commit 6729def
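
For code migrating to the new API, a minimal before/after sketch based on the changes in this commit (the pool-over-CUDA setup mirrors the benchmarks below):

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/mr/device/pool_memory_resource.hpp>

int main()
{
  rmm::mr::cuda_memory_resource cuda_mr{};
  rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> pool_mr{
    &cuda_mr, rmm::percent_of_free_device_memory(50)};

  // Before: rmm::mr::set_current_device_resource(&pool_mr);
  // After: pass the resource by reference, not by pointer.
  rmm::mr::set_current_device_resource_ref(pool_mr);

  // ... allocations through the current device resource ...

  // Before: rmm::mr::set_current_device_resource(nullptr);
  // After: restore the initial (cuda_memory_resource) default.
  rmm::mr::reset_current_device_resource_ref();
}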


44 files changed: +639 −387 lines changed (a subset of the changed files is shown below)

benchmarks/device_uvector/device_uvector_bench.cu

Lines changed: 9 additions & 9 deletions

@@ -40,7 +40,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)
   rmm::mr::cuda_memory_resource cuda_mr{};
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
     &cuda_mr, rmm::percent_of_free_device_memory(50)};
-  rmm::mr::set_current_device_resource(&mr);
+  rmm::mr::set_current_device_resource_ref(mr);

   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
     rmm::device_uvector<std::int32_t> vec(state.range(0), rmm::cuda_stream_view{});
@@ -49,7 +49,7 @@ void BM_UvectorSizeConstruction(benchmark::State& state)

   state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

-  rmm::mr::set_current_device_resource(nullptr);
+  rmm::mr::reset_current_device_resource_ref();
 }

 BENCHMARK(BM_UvectorSizeConstruction)
@@ -62,7 +62,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)
   rmm::mr::cuda_memory_resource cuda_mr{};
   rmm::mr::pool_memory_resource<rmm::mr::cuda_memory_resource> mr{
     &cuda_mr, rmm::percent_of_free_device_memory(50)};
-  rmm::mr::set_current_device_resource(&mr);
+  rmm::mr::set_current_device_resource_ref(mr);

   for (auto _ : state) {  // NOLINT(clang-analyzer-deadcode.DeadStores)
     rmm::device_vector<std::int32_t> vec(state.range(0));
@@ -71,7 +71,7 @@ void BM_ThrustVectorSizeConstruction(benchmark::State& state)

   state.SetItemsProcessed(static_cast<std::int64_t>(state.iterations()));

-  rmm::mr::set_current_device_resource(nullptr);
+  rmm::mr::reset_current_device_resource_ref();
 }

 BENCHMARK(BM_ThrustVectorSizeConstruction)
@@ -140,7 +140,7 @@ template <typename Vector>
 void BM_VectorWorkflow(benchmark::State& state)
 {
   rmm::mr::cuda_async_memory_resource cuda_async_mr{};
-  rmm::mr::set_current_device_resource(&cuda_async_mr);
+  rmm::mr::set_current_device_resource_ref(cuda_async_mr);

   rmm::cuda_stream input_stream;
   std::vector<rmm::cuda_stream> streams(4);
@@ -158,7 +158,7 @@ void BM_VectorWorkflow(benchmark::State& state)
   auto const bytes = num_elements * sizeof(std::int32_t) * num_accesses;
   state.SetBytesProcessed(static_cast<std::int64_t>(state.iterations() * bytes));

-  rmm::mr::set_current_device_resource(nullptr);
+  rmm::mr::reset_current_device_resource_ref();
 }

 BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
@@ -167,9 +167,9 @@ BENCHMARK_TEMPLATE(BM_VectorWorkflow, thrust_vector)  // NOLINT
   ->Unit(benchmark::kMicrosecond)
   ->UseManualTime();

-// The only difference here is that `rmm::device_vector` uses `rmm::current_device_resource()`
-// for allocation while `thrust::device_vector` uses cudaMalloc/cudaFree. In the benchmarks we use
-// `cuda_async_memory_resource`, which is faster.
+// The only difference here is that `rmm::device_vector` uses
+// `rmm::get_current_device_resource_ref()` for allocation while `thrust::device_vector` uses
+// cudaMalloc/cudaFree. In the benchmarks we use `cuda_async_memory_resource`, which is faster.
 BENCHMARK_TEMPLATE(BM_VectorWorkflow, rmm_vector)  // NOLINT
   ->RangeMultiplier(10)  // NOLINT
   ->Range(100'000, 100'000'000)  // NOLINT
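
Each benchmark pairs `set_current_device_resource_ref()` with `reset_current_device_resource_ref()` at function exit. A hypothetical RAII guard (not part of this commit) makes that pairing exception-safe; it resets to the default resource on scope exit, matching the benchmarks' behavior:

#include <rmm/mr/device/per_device_resource.hpp>
#include <rmm/resource_ref.hpp>

// Hypothetical helper: installs `mr` as the current device resource and
// resets to the default when the scope ends.
struct [[nodiscard]] current_resource_guard {
  explicit current_resource_guard(rmm::device_async_resource_ref mr)
  {
    rmm::mr::set_current_device_resource_ref(mr);
  }
  ~current_resource_guard() { rmm::mr::reset_current_device_resource_ref(); }
  current_resource_guard(current_resource_guard const&)            = delete;
  current_resource_guard& operator=(current_resource_guard const&) = delete;
};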

benchmarks/multi_stream_allocations/multi_stream_allocations_bench.cu

Lines changed: 2 additions & 2 deletions

@@ -75,7 +75,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con
 {
   auto mr = factory();

-  rmm::mr::set_current_device_resource(mr.get());
+  rmm::mr::set_current_device_resource_ref(mr.get());

   auto num_streams = state.range(0);
   auto num_kernels = state.range(1);
@@ -92,7 +92,7 @@ static void BM_MultiStreamAllocations(benchmark::State& state, MRFactoryFunc con

   state.SetItemsProcessed(static_cast<int64_t>(state.iterations() * num_kernels));

-  rmm::mr::set_current_device_resource(nullptr);
+  rmm::mr::reset_current_device_resource_ref();
 }

 inline auto make_cuda() { return std::make_shared<rmm::mr::cuda_memory_resource>(); }
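
Note that a `device_async_resource_ref` is non-owning: in this benchmark the `std::shared_ptr` returned by `factory()` stays alive for the whole run, and the current resource is reset before the pointer is destroyed. A sketch of that lifetime pattern (function name assumed for illustration):

#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

void run_with_temporary_resource()
{
  auto mr = std::make_shared<rmm::mr::cuda_memory_resource>();
  rmm::mr::set_current_device_resource_ref(mr.get());  // the ref does not own *mr

  // ... work that allocates through the current device resource ...

  // Reset before `mr` is destroyed so no dangling ref stays installed.
  rmm::mr::reset_current_device_resource_ref();
}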

include/rmm/device_buffer.hpp

Lines changed: 7 additions & 7 deletions

@@ -41,7 +41,7 @@ namespace RMM_NAMESPACE {
  *
  * This class allocates untyped and *uninitialized* device memory using a
  * `device_async_resource_ref`. If not explicitly specified, the memory resource
- * returned from `get_current_device_resource()` is used.
+ * returned from `get_current_device_resource_ref()` is used.
  *
  * @note Unlike `std::vector` or `thrust::device_vector`, the device memory
  * allocated by a `device_buffer` is uninitialized. Therefore, it is undefined
@@ -95,7 +95,7 @@ class device_buffer {
   // `__host__ __device__` specifiers to the defaulted constructor when it is called within the
   // context of both host and device functions. Specifically, the `cudf::type_dispatcher` is a host-
   // device function. This causes warnings/errors because this ctor invokes host-only functions.
-  device_buffer() : _mr{rmm::mr::get_current_device_resource()} {}
+  device_buffer() : _mr{rmm::mr::get_current_device_resource_ref()} {}

   /**
    * @brief Constructs a new device buffer of `size` uninitialized bytes
@@ -109,7 +109,7 @@
    */
   explicit device_buffer(std::size_t size,
                          cuda_stream_view stream,
-                         device_async_resource_ref mr = mr::get_current_device_resource())
+                         device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _stream{stream}, _mr{mr}
   {
     cuda_set_device_raii dev{_device};
@@ -138,7 +138,7 @@
   device_buffer(void const* source_data,
                 std::size_t size,
                 cuda_stream_view stream,
-                device_async_resource_ref mr = mr::get_current_device_resource())
+                device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _stream{stream}, _mr{mr}
   {
     cuda_set_device_raii dev{_device};
@@ -169,7 +169,7 @@
    */
   device_buffer(device_buffer const& other,
                 cuda_stream_view stream,
-                device_async_resource_ref mr = mr::get_current_device_resource())
+                device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : device_buffer{other.data(), other.size(), stream, mr}
   {
   }
@@ -419,8 +419,8 @@
   cuda_stream_view _stream{};  ///< Stream to use for device memory deallocation

   rmm::device_async_resource_ref _mr{
-    rmm::mr::get_current_device_resource()};  ///< The memory resource used to
-                                              ///< allocate/deallocate device memory
+    rmm::mr::get_current_device_resource_ref()};  ///< The memory resource used to
+                                                  ///< allocate/deallocate device memory
   cuda_device_id _device{get_current_cuda_device()};

   /**
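
With these defaults, a `device_buffer` constructed without an explicit resource captures `get_current_device_resource_ref()` at construction time. A short usage sketch:

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_buffer.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

void make_buffers(rmm::cuda_stream_view stream)
{
  // Uses the resource returned by rmm::mr::get_current_device_resource_ref().
  rmm::device_buffer buf{1024, stream};

  // Equivalent, with the default argument spelled out.
  rmm::device_buffer buf2{1024, stream, rmm::mr::get_current_device_resource_ref()};
}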

include/rmm/device_scalar.hpp

Lines changed: 3 additions & 3 deletions

@@ -95,7 +95,7 @@
    * @param mr Optional, resource with which to allocate.
    */
   explicit device_scalar(cuda_stream_view stream,
-                         device_async_resource_ref mr = mr::get_current_device_resource())
+                         device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _storage{1, stream, mr}
   {
   }
@@ -118,7 +118,7 @@
    */
   explicit device_scalar(value_type const& initial_value,
                          cuda_stream_view stream,
-                         device_async_resource_ref mr = mr::get_current_device_resource())
+                         device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _storage{1, stream, mr}
   {
     set_value_async(initial_value, stream);
@@ -138,7 +138,7 @@
    */
   device_scalar(device_scalar const& other,
                 cuda_stream_view stream,
-                device_async_resource_ref mr = mr::get_current_device_resource())
+                device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _storage{other._storage, stream, mr}
   {
   }
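
The same default applies to `device_scalar`; a brief sketch:

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_scalar.hpp>

void scalar_example(rmm::cuda_stream_view stream)
{
  // Allocates one int from get_current_device_resource_ref() by default.
  rmm::device_scalar<int> scalar{42, stream};
  int host_value = scalar.value(stream);  // copy back to host on `stream`
}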

include/rmm/device_uvector.hpp

Lines changed: 3 additions & 3 deletions

@@ -48,7 +48,7 @@ namespace RMM_NAMESPACE {
  *
  * Example:
  * @code{.cpp}
- * rmm::mr::device_memory_resource * mr = new my_custom_resource();
+ * auto mr = new my_custom_resource();
  * rmm::cuda_stream_view s{};
  *
  * // Allocates *uninitialized* device memory on stream `s` sufficient for 100 ints using the
@@ -126,7 +126,7 @@ class device_uvector {
    */
   explicit device_uvector(std::size_t size,
                           cuda_stream_view stream,
-                          device_async_resource_ref mr = mr::get_current_device_resource())
+                          device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _storage{elements_to_bytes(size), stream, mr}
   {
   }
@@ -142,7 +142,7 @@ class device_uvector {
    */
   explicit device_uvector(device_uvector const& other,
                           cuda_stream_view stream,
-                          device_async_resource_ref mr = mr::get_current_device_resource())
+                          device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : _storage{other._storage, stream, mr}
   {
   }
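
Matching the updated documentation example, construction with the default or an explicit resource looks like this (sketch):

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <cstdint>

void uvector_example(rmm::cuda_stream_view stream)
{
  // Uninitialized storage for 100 ints from the current device resource.
  rmm::device_uvector<std::int32_t> vec(100, stream);

  // Any resource can also be passed explicitly as a device_async_resource_ref.
  rmm::device_uvector<std::int32_t> vec2(100, stream,
                                         rmm::mr::get_current_device_resource_ref());
}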

include/rmm/exec_policy.hpp

Lines changed: 2 additions & 2 deletions

@@ -57,7 +57,7 @@ class exec_policy : public thrust_exec_policy_t {
    * @param mr The resource to use for allocating temporary memory
    */
   explicit exec_policy(cuda_stream_view stream = cuda_stream_default,
-                       device_async_resource_ref mr = mr::get_current_device_resource())
+                       device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : thrust_exec_policy_t(
         thrust::cuda::par(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
   {
@@ -81,7 +81,7 @@ using thrust_exec_policy_nosync_t =
 class exec_policy_nosync : public thrust_exec_policy_nosync_t {
  public:
   explicit exec_policy_nosync(cuda_stream_view stream = cuda_stream_default,
-                              device_async_resource_ref mr = mr::get_current_device_resource())
+                              device_async_resource_ref mr = mr::get_current_device_resource_ref())
     : thrust_exec_policy_nosync_t(
         thrust::cuda::par_nosync(mr::thrust_allocator<char>(stream, mr)).on(stream.value()))
   {
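
`exec_policy` routes Thrust's temporary allocations through the given resource; with the new default that is `get_current_device_resource_ref()`. A usage sketch:

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/sort.h>

void sort_example(rmm::device_uvector<int>& vec, rmm::cuda_stream_view stream)
{
  // Thrust's temporary allocations are served by the current device
  // resource (the default `mr` argument) on `stream`.
  thrust::sort(rmm::exec_policy(stream), vec.begin(), vec.end());
}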

include/rmm/mr/device/aligned_resource_adaptor.hpp

Lines changed: 34 additions & 17 deletions

@@ -20,6 +20,7 @@
 #include <rmm/detail/error.hpp>
 #include <rmm/detail/export.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
+#include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>

 #include <cstddef>
@@ -59,20 +60,40 @@ class aligned_resource_adaptor final : public device_memory_resource {
   /**
    * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
    *
-   * @throws rmm::logic_error if `upstream == nullptr`
    * @throws rmm::logic_error if `allocation_alignment` is not a power of 2
    *
    * @param upstream The resource used for allocating/deallocating device memory.
    * @param alignment The size used for allocation alignment.
    * @param alignment_threshold Only allocations with a size larger than or equal to this threshold
    * are aligned.
    */
-  explicit aligned_resource_adaptor(Upstream* upstream,
+  explicit aligned_resource_adaptor(device_async_resource_ref upstream,
                                     std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
                                     std::size_t alignment_threshold = default_alignment_threshold)
     : upstream_{upstream}, alignment_{alignment}, alignment_threshold_{alignment_threshold}
   {
-    RMM_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
+    RMM_EXPECTS(rmm::is_supported_alignment(alignment),
+                "Allocation alignment is not a power of 2.");
+  }
+
+  /**
+   * @brief Construct an aligned resource adaptor using `upstream` to satisfy allocation requests.
+   *
+   * @throws rmm::logic_error if `upstream == nullptr`
+   * @throws rmm::logic_error if `alignment` is not a power of 2
+   *
+   * @param upstream The resource used for allocating/deallocating device memory.
+   * @param alignment The size used for allocation alignment.
+   * @param alignment_threshold Only allocations with a size larger than or equal to this threshold
+   * are aligned.
+   */
+  explicit aligned_resource_adaptor(Upstream* upstream,
+                                    std::size_t alignment = rmm::CUDA_ALLOCATION_ALIGNMENT,
+                                    std::size_t alignment_threshold = default_alignment_threshold)
+    : upstream_{to_device_async_resource_ref_checked(upstream)},
+      alignment_{alignment},
+      alignment_threshold_{alignment_threshold}
+  {
     RMM_EXPECTS(rmm::is_supported_alignment(alignment),
                 "Allocation alignment is not a power of 2.");
   }
@@ -92,11 +113,6 @@ class aligned_resource_adaptor final : public device_memory_resource {
     return upstream_;
   }

-  /**
-   * @briefreturn{Upstream* to the upstream memory resource}
-   */
-  [[nodiscard]] Upstream* get_upstream() const noexcept { return upstream_; }
-
   /**
    * @brief The default alignment used by the adaptor.
    */
@@ -106,8 +122,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
   using lock_guard = std::lock_guard<std::mutex>;

   /**
-   * @brief Allocates memory of size at least `bytes` using the upstream resource with the specified
-   * alignment.
+   * @brief Allocates memory of size at least `bytes` using the upstream resource with the
+   * specified alignment.
    *
    * @throws rmm::bad_alloc if the requested allocation could not be fulfilled
    * by the upstream resource.
@@ -119,10 +135,10 @@ class aligned_resource_adaptor final : public device_memory_resource {
   void* do_allocate(std::size_t bytes, cuda_stream_view stream) override
   {
     if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
-      return upstream_->allocate(bytes, stream);
+      return get_upstream_resource().allocate_async(bytes, 1, stream);
     }
     auto const size = upstream_allocation_size(bytes);
-    void* pointer = upstream_->allocate(size, stream);
+    void* pointer = get_upstream_resource().allocate_async(size, 1, stream);
     // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
     auto const address = reinterpret_cast<std::size_t>(pointer);
     auto const aligned_address = rmm::align_up(address, alignment_);
@@ -145,7 +161,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
   void do_deallocate(void* ptr, std::size_t bytes, cuda_stream_view stream) override
   {
     if (alignment_ == rmm::CUDA_ALLOCATION_ALIGNMENT || bytes < alignment_threshold_) {
-      upstream_->deallocate(ptr, bytes, stream);
+      get_upstream_resource().deallocate_async(ptr, bytes, 1, stream);
     } else {
       {
         lock_guard lock(mtx_);
@@ -155,7 +171,7 @@ class aligned_resource_adaptor final : public device_memory_resource {
           pointers_.erase(iter);
         }
       }
-      upstream_->deallocate(ptr, upstream_allocation_size(bytes), stream);
+      get_upstream_resource().deallocate_async(ptr, upstream_allocation_size(bytes), 1, stream);
     }
   }

@@ -176,8 +192,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
   }

   /**
-   * @brief Calculate the allocation size needed from upstream to account for alignments of both the
-   * size and the base pointer.
+   * @brief Calculate the allocation size needed from upstream to account for alignments of both
+   * the size and the base pointer.
    *
    * @param bytes The requested allocation size.
    * @return Allocation size needed from upstream to align both the size and the base pointer.
@@ -188,7 +204,8 @@ class aligned_resource_adaptor final : public device_memory_resource {
     return aligned_size + alignment_ - rmm::CUDA_ALLOCATION_ALIGNMENT;
   }

-  Upstream* upstream_;  ///< The upstream resource used for satisfying allocation requests
+  /// The upstream resource used for satisfying allocation requests
+  device_async_resource_ref upstream_;
   std::unordered_map<void*, void*> pointers_;  ///< Map of aligned pointers to upstream pointers.
   std::size_t alignment_;  ///< The size used for allocation alignment
   std::size_t alignment_threshold_;  ///< The size above which allocations should be aligned
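
Both constructors now coexist: the raw-pointer form (validated via `to_device_async_resource_ref_checked`) and the new `device_async_resource_ref` form. A construction sketch (the 4096-byte alignment is an arbitrary power-of-two example):

#include <rmm/mr/device/aligned_resource_adaptor.hpp>
#include <rmm/mr/device/cuda_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

void aligned_example()
{
  rmm::mr::cuda_memory_resource cuda_mr{};

  // Raw-pointer constructor, as before (null pointers are rejected):
  rmm::mr::aligned_resource_adaptor<rmm::mr::cuda_memory_resource> by_ptr{&cuda_mr, 4096};

  // New resource_ref constructor; the template argument is still required
  // because the class remains templated on Upstream.
  rmm::mr::aligned_resource_adaptor<rmm::mr::cuda_memory_resource> by_ref{
    rmm::device_async_resource_ref{cuda_mr}, 4096};
}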
