
Commit 18f332b

add cuda variant and add build guards for cpu
1 parent 484ff1a commit 18f332b

File tree

- include/RAJA/pattern/launch/launch_core.hpp
- include/RAJA/policy/cuda/launch.hpp
- include/RAJA/policy/hip/launch.hpp

3 files changed: +60 -27 lines changed

include/RAJA/pattern/launch/launch_core.hpp

Lines changed: 12 additions & 11 deletions

@@ -185,27 +185,28 @@ class LaunchContext
 
   void* shared_mem_ptr;
 
-  const size_t thread_id[3];
-  const size_t block_dim[3];
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+  const dim3 thread_id;
+  const dim3 block_dim;
+#endif
 
 #if defined(RAJA_ENABLE_SYCL)
   mutable ::sycl::nd_item<3>* itm;
 #endif
 
-  RAJA_HOST_DEVICE LaunchContext()
+  RAJA_HOST_DEVICE LaunchContext()
       : shared_mem_offset(0),
-        shared_mem_ptr(nullptr),
-        thread_id{1, 1, 1},
-        block_dim{1, 1, 1}
+        shared_mem_ptr(nullptr)
   {}
 
-  RAJA_HOST_DEVICE LaunchContext(const size_t tx, const size_t ty, const size_t tz,
-                                 const size_t bx, const size_t by, const size_t bz)
+#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
+  RAJA_HOST_DEVICE LaunchContext(dim3 thread_id_, dim3 block_id_)
       : shared_mem_offset(0),
         shared_mem_ptr(nullptr),
-        thread_id{tx, ty, tz},
-        block_dim{bx, by, bz}
-  {}
+        thread_id {thread_id_},
+        block_dim {block_id_}
+  {}
+#endif
 
   // TODO handle alignment
   template<typename T>
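The practical effect of the new guards is that thread_id and block_dim only exist as dim3 members when a CUDA or HIP backend is enabled. A minimal sketch of what that means for downstream code (not part of this commit; the helper name and flattening formula are hypothetical):

// Hypothetical helper, for illustration only: any access to the context's
// dim3 copies must sit behind the same CUDA/HIP guard, since CPU-only builds
// compile these members out entirely.
RAJA_HOST_DEVICE inline int flat_thread_id(const LaunchContext& ctx)
{
#if defined(RAJA_ENABLE_CUDA) || defined(RAJA_ENABLE_HIP)
  return ctx.thread_id.x +
         ctx.thread_id.y * ctx.block_dim.x +
         ctx.thread_id.z * ctx.block_dim.x * ctx.block_dim.y;
#else
  return 0;  // host-only backends carry no per-thread coordinates in the context
#endif
}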

include/RAJA/policy/cuda/launch.hpp

Lines changed: 36 additions & 2 deletions

@@ -33,7 +33,7 @@ __global__ void launch_new_reduce_global_fcn(const RAJA_CUDA_GRID_CONSTANT BODY
     body_in,
     ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -143,7 +143,7 @@ __launch_bounds__(num_threads, BLOCKS_PER_SM) __global__
     body_in,
     ReduceParams reduce_params)
 {
-  LaunchContext ctx;
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -245,6 +245,40 @@ struct LaunchExecute<
   }
 };
 
+/*
+  Loop methods which rely on a copy of threadIdx/blockDim
+  for performance. In collaboration with AMD we have found this
+  to be more performant.
+*/
+
+template<named_dim DIM>
+struct hip_ctx_thread_loop;
+
+using hip_ctx_thread_loop_x = hip_ctx_thread_loop<named_dim::x>;
+using hip_ctx_thread_loop_y = hip_ctx_thread_loop<named_dim::y>;
+using hip_ctx_thread_loop_z = hip_ctx_thread_loop<named_dim::z>;
+
+template<typename SEGMENT, named_dim DIM>
+struct LoopExecute<hip_ctx_thread_loop<DIM>, SEGMENT>
+{
+
+  template<typename BODY>
+  static RAJA_INLINE RAJA_DEVICE void exec(LaunchContext const& ctx,
+                                           SEGMENT const& segment,
+                                           BODY const& body)
+  {
+
+    const int len = segment.end() - segment.begin();
+    constexpr int int_dim = static_cast<int>(DIM);
+
+    for (int i = ::RAJA::internal::HipDimHelper<DIM>::get(ctx.thread_id);
+         i < len; i += ::RAJA::internal::HipDimHelper<DIM>::get(ctx.block_dim))
+    {
+      body(*(segment.begin() + i));
+    }
+  }
+};
+
 /*
   CUDA generic loop implementations
 */
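As a rough usage sketch (not from this commit), the new policy slots into the usual RAJA::launch / RAJA::loop pattern. The hip_ctx_thread_loop_x alias comes straight from the diff; the launch policy spelling, launch parameters, and the captured N, x, y are illustrative assumptions and may differ across RAJA versions:

// Illustrative sketch: stride a 1D loop by the thread/block copies cached in
// the LaunchContext rather than re-reading threadIdx/blockDim each iteration.
using launch_pol = RAJA::LaunchPolicy<RAJA::cuda_launch_t<false>>;

RAJA::launch<launch_pol>(
    RAJA::LaunchParams(RAJA::Teams(64), RAJA::Threads(256)),
    [=] RAJA_HOST_DEVICE(RAJA::LaunchContext ctx) {
      RAJA::loop<RAJA::hip_ctx_thread_loop_x>(
          ctx, RAJA::RangeSegment(0, N),
          [&](int i) { y[i] += x[i]; });  // N, x, y captured from host code (illustrative)
    });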

include/RAJA/policy/hip/launch.hpp

Lines changed: 12 additions & 14 deletions

@@ -32,8 +32,7 @@ template<typename BODY, typename ReduceParams>
 __global__ void launch_new_reduce_global_fcn(const BODY body_in,
                                              ReduceParams reduce_params)
 {
-  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
-                    blockDim.x, blockDim.y, blockDim.z);
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -138,8 +137,7 @@ __launch_bounds__(num_threads, 1) __global__
 void launch_new_reduce_global_fcn_fixed(const BODY body_in,
                                         ReduceParams reduce_params)
 {
-  LaunchContext ctx(threadIdx.x, threadIdx.y, threadIdx.z,
-                    blockDim.x, blockDim.y, blockDim.z);
+  LaunchContext ctx(threadIdx, blockDim);
 
   using RAJA::internal::thread_privatize;
   auto privatizer = thread_privatize(body_in);
@@ -241,6 +239,12 @@ struct LaunchExecute<RAJA::policy::hip::hip_launch_t<async, nthreads>>
   }
 };
 
+/*
+  Loop methods which rely on a copy of threadIdx/blockDim
+  for performance. In collaboration with AMD we have found this
+  to be more performant.
+*/
+
 template<named_dim DIM>
 struct hip_ctx_thread_loop;
 
@@ -258,23 +262,17 @@ struct LoopExecute<hip_ctx_thread_loop<DIM>, SEGMENT>
                                            BODY const& body)
   {
 
-    const int len = segment.end() - segment.begin();
+    const int len = segment.end() - segment.begin();
     constexpr int int_dim = static_cast<int>(DIM);
 
-    //for(int i=::RAJA::internal::HipDimHelper<DIM>::get(threadIdx);
-    for(int i = ctx.thread_id[int_dim];
-        i < len;
-        i+=ctx.block_dim[int_dim])
-    //i+=4)
+    for (int i = ::RAJA::internal::HipDimHelper<DIM>::get(ctx.thread_id);
+         i < len; i += ::RAJA::internal::HipDimHelper<DIM>::get(ctx.block_dim))
     {
-      body(*(segment.begin() + i));
+      body(*(segment.begin() + i));
     }
-
   }
 };
 
-
-
 /*
   HIP generic loop implementations
 */
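For readers unfamiliar with the dim helper used above: the old loop indexed a size_t[3] with a runtime integer, while the new loop selects a dim3 component at compile time. A plausible reading of that accessor, written as an assumption for illustration rather than RAJA's actual internal implementation:

// Hypothetical stand-in for the compile-time component selection the loop
// relies on; RAJA::internal::HipDimHelper may be implemented differently.
template<named_dim DIM>
struct ctx_dim_get;

template<> struct ctx_dim_get<named_dim::x> {
  static RAJA_DEVICE int get(const dim3& d) { return static_cast<int>(d.x); }
};
template<> struct ctx_dim_get<named_dim::y> {
  static RAJA_DEVICE int get(const dim3& d) { return static_cast<int>(d.y); }
};
template<> struct ctx_dim_get<named_dim::z> {
  static RAJA_DEVICE int get(const dim3& d) { return static_cast<int>(d.z); }
};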
