
Commit 18f332b

add cuda variant and add build guards for cpu
1 parent 484ff1a commit 18f332b

File tree

- include/RAJA/pattern/launch/launch_core.hpp
- include/RAJA/policy/cuda/launch.hpp
- include/RAJA/policy/hip/launch.hpp

3 files changed: +60 -27 lines changed

include/RAJA/pattern/launch/launch_core.hpp

Lines changed: 12 additions & 11 deletions

@@ -185,27 +185,28 @@ class LaunchContext
 
   void* shared_mem_ptr;
 
-  const size_t thread_id[3];
-  const size_t block_dim[3];
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+  const dim3 thread_id;
+  const dim3 block_dim;
+#endif
 
 #if defined(RAJA_ENABLE_SYCL)
   mutable ::sycl::nd_item<3>* itm;
 #endif
 
-  RAJA_HOST_DEVICE LaunchContext()
+  RAJA_HOST_DEVICE LaunchContext()
       : shared_mem_offset(0),
-        shared_mem_ptr(nullptr),
-        thread_id{1, 1, 1},
-        block_dim{1, 1, 1}
+        shared_mem_ptr(nullptr)
   {}
 
-  RAJA_HOST_DEVICE LaunchContext(const size_t tx, const size_t ty, const size_t tz,
-                                 const size_t bx, const size_t by, const size_t bz)
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+  RAJA_HOST_DEVICE LaunchContext(dim3 thread_id_, dim3 block_id_)
       : shared_mem_offset(0),
         shared_mem_ptr(nullptr),
-        thread_id{tx, ty, tz},
-        block_dim{bx, by, bz}
-  {}
+        thread_id {thread_id_},
+        block_dim {block_id_}
+  {}
+#endif
 
   // TODO handle alignment
   template<typename T>
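The practical effect of the new guards is that thread_id and block_dim only exist as dim3 members when a CUDA or HIP backend is enabled. A minimal sketch of what that means for downstream code (not part of this commit; the helper name and flattening formula are hypothetical):

// Hypothetical helper, for illustration only: any access to the context's
// dim3 copies must sit behind the same CUDA/HIP guard, since CPU-only builds
// compile these members out entirely.
RAJA_HOST_DEVICE inline int flat_thread_id(const LaunchContext& ctx)
{
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
  return ctx.thread_id.x +
         ctx.thread_id.y * ctx.block_dim.x +
         ctx.thread_id.z * ctx.block_dim.x * ctx.block_dim.y;
#else
  return 0;  // host-only backends carry no per-thread coordinates in the context
#endif
}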

include/RAJA/policy/cuda/launch.hpp

Lines changed: 36 additions & 2 deletions

@@ -33,7 +33,7 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
     body_in,
     ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -143,7 +143,7 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
     body_in,
     ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -245,6 +245,40 @@ struct LaunchExecute<
   }
 };
 
+/*
+  Loop methods which rely on a copy of threadIdx/blockDim
+  for performance. In collaboration with AMD we have found this
+  to be more performant.
+*/
+
+template<named_dim DIM>
+struct hip_ctx_thread_loop;
+
+using hip_ctx_thread_loop_x = hip_ctx_thread_loop<named_dim::x>;
+using hip_ctx_thread_loop_y = hip_ctx_thread_loop<named_dim::y>;
+using hip_ctx_thread_loop_z = hip_ctx_thread_loop<named_dim::z>;
+
+template<typename SEGMENT, named_dim DIM>
+struct LoopExecute<hip_ctx_thread_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    constexpr int int_dim = static_cast<int>(DIM);
+
+    for (int i = ::RAJA::internal::HipDimHelper<DIM>::get(ctx.thread_id);
+         i < len; i += ::RAJA::internal::HipDimHelper<DIM>::get(ctx.block_dim))
+    {
+      body(*(segment.begin() + i));
+    }
+  }
+};
+
 /*
   CUDA generic loop implementations
 */
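As a rough usage sketch (not from this commit), the new policy slots into the usual RAJA::launch / RAJA::loop pattern. The hip_ctx_thread_loop_x alias comes straight from the diff; the launch policy spelling, launch parameters, and the captured N, x, y are illustrative assumptions and may differ across RAJA versions:

// Illustrative sketch: stride a 1D loop by the thread/block copies cached in
// the LaunchContext rather than re-reading threadIdx/blockDim each iteration.
using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;

RAJA::launch<launch_pol>(
    RAJA::LaunchParams(RAJA::Teams(64), RAJA::Threads(256)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
      RAJA::loop<RAJA::hip_ctx_thread_loop_x>(
          ctx, RAJA::RangeSegment(0, N),
          [&](int i) { y[i] += x[i]; });  // N, x, y captured from host code (illustrative)
    });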

include/RAJA/policy/hip/launch.hpp

Lines changed: 12 additions & 14 deletions

@@ -32,8 +32,7 @@ template<typename BODY, typename ReduceParams>
 __global__ void launch_new_reduce_global_fcn(const BODY body_in,
                                              ReduceParams reduce_params)
 {
-  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
-                    blockDim.x, blockDim.y, blockDim.z);
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -138,8 +137,7 @@ __launch_bounds__(num_threads, 1) __global__
 void launch_new_reduce_global_fcn_fixed(const BODY body_in,
                                         ReduceParams reduce_params)
 {
-  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
-                    blockDim.x, blockDim.y, blockDim.z);
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -241,6 +239,12 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
   }
 };
 
+/*
+  Loop methods which rely on a copy of threadIdx/blockDim
+  for performance. In collaboration with AMD we have found this
+  to be more performant.
+*/
+
 template<named_dim DIM>
 struct hip_ctx_thread_loop;
 
@@ -258,23 +262,17 @@ struct LoopExecute<hip_ctx_thread_loop<DIM>, SEGMENT>
                                            BODY const& body)
   {
 
-    const int len = segment.end() - segment.begin();
+    const int len = segment.end() - segment.begin();
     constexpr int int_dim = static_cast<int>(DIM);
 
-    //for(int i=::RAJA::internal::HipDimHelper<DIM>::get(threadIdx);
-    for(int i = ctx.thread_id[int_dim];
-        i < len;
-        i+=ctx.block_dim[int_dim])
-    //i+=4)
+    for (int i = ::RAJA::internal::HipDimHelper<DIM>::get(ctx.thread_id);
+         i < len; i += ::RAJA::internal::HipDimHelper<DIM>::get(ctx.block_dim))
     {
-      body(*(segment.begin() + i));
+      body(*(segment.begin() + i));
     }
-
   }
 };
 
-
-
 /*
   HIP generic loop implementations
 */
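For readers unfamiliar with the dim helper used above: the old loop indexed a size_t[3] with a runtime integer, while the new loop selects a dim3 component at compile time. A plausible reading of that accessor, written as an assumption for illustration rather than RAJA's actual internal implementation:

// Hypothetical stand-in for the compile-time component selection the loop
// relies on; RAJA::internal::HipDimHelper may be implemented differently.
template<named_dim DIM>
struct ctx_dim_get;

template<> struct ctx_dim_get<named_dim::x> {
  static RAJA_DEVICE int get(const dim3& d) { return static_cast<int>(d.x); }
};
template<> struct ctx_dim_get<named_dim::y> {
  static RAJA_DEVICE int get(const dim3& d) { return static_cast<int>(d.y); }
};
template<> struct ctx_dim_get<named_dim::z> {
  static RAJA_DEVICE int get(const dim3& d) { return static_cast<int>(d.z); }
};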
