Optimize OpenCL Addition

gkisalapl · gkisalapl · commit c912f6663894 · 2025-08-13T16:56:33.000+02:00
Unify the OpenCL addition path for FP16 and FP32
Allow passing nullptr as the local work size
Add an SVM (shared virtual memory) option

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test:   [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Grzegorz Kisala <gkisala@gmail.com>
diff --git a/nntrainer/opencl/opencl_command_queue_manager.cpp b/nntrainer/opencl/opencl_command_queue_manager.cpp
@@ -387,4 +387,22 @@ bool CommandQueueManager::DispatchCommand(
   return true;
 }
 
+bool CommandQueueManager::DispatchCommandAndWait(
+  const cl_kernel kernel, const uint32_t work_dim,
+  const size_t *global_work_size, const size_t *local_work_size) {
+  // Enqueue the kernel, then block until the queue drains; fail on either.
+  cl_int error_code = clEnqueueNDRangeKernel(
+    command_queue_, kernel, work_dim, nullptr, global_work_size,
+    local_work_size, 0, nullptr, nullptr);
+  if (error_code != CL_SUCCESS) {
+    ml_loge("Failed to clEnqueueNDRangeKernel. OpenCL error code: %d",
+            error_code);
+    return false;
+  }
+  error_code = clFinish(command_queue_); // blocking wait; status is checked
+  if (error_code != CL_SUCCESS)
+    ml_loge("Failed to clFinish. OpenCL error code: %d", error_code);
+  return error_code == CL_SUCCESS;
+}
+
 } // namespace nntrainer::opencl
diff --git a/nntrainer/opencl/opencl_command_queue_manager.h b/nntrainer/opencl/opencl_command_queue_manager.h
@@ -191,6 +191,19 @@ class CommandQueueManager : public Singleton<CommandQueueManager> {
                        const int (&work_group_size)[3],
                        cl_event *event = nullptr);
 
+  /**
+   * @brief Enqueue a kernel and block until the command queue has finished.
+   *
+   * @param kernel OpenCL kernel
+   * @param work_dim Number of dimensions described by the work-size arrays
+   * @param global_work_size Total number of work items per dimension
+   * @param local_work_size Work items per work group; may be nullptr
+   * @return true if command queue execution is successful or false otherwise
+   */
+  bool DispatchCommandAndWait(const cl_kernel kernel, const uint32_t work_dim,
+                              const size_t *global_work_size,
+                              const size_t *local_work_size);
+
   /**
    * @brief Get the OpenCL Command Queue object
    *
diff --git a/nntrainer/tensor/cl_operations/blas_kernel_interface.cpp b/nntrainer/tensor/cl_operations/blas_kernel_interface.cpp
@@ -220,23 +220,18 @@ void add_i_cl(Tensor &result, Tensor const &input) {
        result.channel() == input.channel() &&
        result.height() == input.height() && result.width() == input.width())) {
 
-    if (result.getDataType() == ml::train::TensorDim::DataType::FP32) {
-      float *Y = result.getData();
-      const float *X = input.getData();
+    const unsigned int size_input = input.size();
+    const unsigned int size_res = result.size();
 
-      for (unsigned int i = 0; i < result.batch() / input.batch(); ++i) {
-        axpy_cl(input.size(), 1.0f, X, Y);
-        Y += input.size();
-      }
+    if (result.getDataType() == ml::train::TensorDim::DataType::FP32) {
+      const auto *data_input = input.getData<float>();
+      auto *data_res = result.getData<float>();
+      addition_cl(data_input, data_res, size_input, size_res);
     } else if (result.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
-      unsigned int size_res = result.size();
-      unsigned int size_input = input.size();
-      _FP16 *data_res = result.getData<_FP16>();
-      const _FP16 *data_input = input.getData<_FP16>();
-
+      const auto *data_input = input.getData<_FP16>();
+      auto *data_res = result.getData<_FP16>();
       addition_cl(data_input, data_res, size_input, size_res);
-
 #else
       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif
diff --git a/nntrainer/tensor/cl_operations/blas_kernel_strings.cpp b/nntrainer/tensor/cl_operations/blas_kernel_strings.cpp
@@ -394,7 +394,6 @@ const std::string &getSgemmClTransABKernel() {
 const std::string &getAdditionClKernel() {
   static const std::string addition_cl_kernel_ =
     R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) {
-        #pragma printf_support
         size_t idx = get_global_id(0);
         if (idx < size_res) {
             output[idx] = output[idx] + input[idx % size_input];
diff --git a/nntrainer/tensor/cl_operations/blas_kernels.cpp b/nntrainer/tensor/cl_operations/blas_kernels.cpp
@@ -272,7 +272,7 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
 }
 
 void addition_cl(const float *input, float *res, unsigned int size_input,
-                 unsigned int size_res) {
+                 unsigned int size_res, const bool use_svm) {
   bool result = false;
   auto *blas_cc =
     static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
@@ -284,7 +284,7 @@ void addition_cl(const float *input, float *res, unsigned int size_input,
   }
 
   addition_cl_internal<float>(kernel_addition_ptr, input, res, size_input,
-                              size_res);
+                              size_res, use_svm);
 }
 
 void sscal_cl(float *X, const unsigned int N, const float alpha) {
diff --git a/nntrainer/tensor/cl_operations/blas_kernels.h b/nntrainer/tensor/cl_operations/blas_kernels.h
@@ -88,7 +88,7 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,
  * @param[in] size_res number of elements in result vector
  */
 void addition_cl(const float *input, float *res, unsigned int size_input,
-                 unsigned int size_res);
+                 unsigned int size_res, const bool use_svm = false);
 
 /**
  * @brief     sscal value element by element immediately
diff --git a/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp b/nntrainer/tensor/cl_operations/blas_kernels_fp16.cpp
@@ -96,7 +96,7 @@ void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input,
   }
 
   addition_cl_internal<_FP16>(kernel_addition_fp16_ptr, input, res, size_input,
-                              size_res);
+                              size_res, false);
 }
 
 void sscal_cl(_FP16 *X, const unsigned int N, const float alpha) {
diff --git a/nntrainer/tensor/cl_operations/blas_kernels_templates.h b/nntrainer/tensor/cl_operations/blas_kernels_templates.h
@@ -258,38 +258,56 @@ sgemm_cl_internal(ClContext::SharedPtrClKernel kernel, bool TransA, bool TransB,
 template <typename T>
 inline static void
 addition_cl_internal(ClContext::SharedPtrClKernel kernel, const T *input,
-                     T *res, unsigned int size_input, unsigned int size_res) {
+                     T *res, unsigned int size_input, unsigned int size_res,
+                     const bool use_svm) {
   bool result = false;
 
   auto *blas_cc =
     static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
-  auto &clbuffInstance = ClBufferManager::Global();
 
   size_t dim1_size = sizeof(T) * size_input;
   size_t dim2_size = sizeof(T) * size_res;
 
-  result = clbuffInstance.getInBufferA()->WriteDataRegion(
-    blas_cc->command_queue_inst_, dim1_size, input);
-  if (!result) {
-    return;
-  }
+  if (use_svm) { // input/res are treated as SVM pointers on this path
+    blas_cc->command_queue_inst_.enqueueSVMMap(const_cast<T *>(input),
+                                               dim1_size, false);
+    blas_cc->command_queue_inst_.enqueueSVMMap(res, dim2_size, false);
 
-  result = clbuffInstance.getOutBufferA()->WriteDataRegion(
-    blas_cc->command_queue_inst_, dim2_size, res);
-  if (!result) {
-    return;
-  }
+    result = kernel->SetKernelSVMArguments(0, input);
+    if (!result) {
+      return;
+    }
 
-  result = kernel->SetKernelArguments(0, clbuffInstance.getInBufferA(),
-                                      sizeof(cl_mem));
-  if (!result) {
-    return;
-  }
+    result = kernel->SetKernelSVMArguments(1, res);
+    if (!result) {
+      return;
+    }
+  } else {
+    auto &clbuffInstance = ClBufferManager::Global();
+    result = clbuffInstance.getInBufferA()->WriteDataRegion(
+      blas_cc->command_queue_inst_, dim1_size, input);
+    if (!result) {
+      return;
+    }
 
-  result = kernel->SetKernelArguments(1, clbuffInstance.getOutBufferA(),
-                                      sizeof(cl_mem));
-  if (!result) {
-    return;
+    result = clbuffInstance.getOutBufferA()->WriteDataRegion(
+      blas_cc->command_queue_inst_, dim2_size, res);
+    if (!result) {
+      return;
+    }
+    // Bind the buffers' cl_mem handles as kernel arguments 0 and 1.
+    auto bufferInA = clbuffInstance.getInBufferA()->GetBuffer();
+    auto bufferOutA = clbuffInstance.getOutBufferA()->GetBuffer();
+
+    result = kernel->SetKernelArguments(0, &bufferInA, sizeof(cl_mem));
+    if (!result) {
+      return;
+    }
+
+    result = kernel->SetKernelArguments(1, &bufferOutA, sizeof(cl_mem));
+    if (!result) {
+      return;
+    }
   }
 
   result = kernel->SetKernelArguments(2, &size_input, sizeof(int));
@@ -302,20 +320,26 @@ addition_cl_internal(ClContext::SharedPtrClKernel kernel, const T *input,
     return;
   }
 
-  const int work_groups_count[3] = {(int)size_res, 1, 1};
-  /// @todo: create a group size by device & input
-  const int work_group_size[3] = {1, 1, 1}; // test-value
-  result = blas_cc->command_queue_inst_.DispatchCommand(
-    kernel, work_groups_count, work_group_size);
+  std::array<size_t, 3> global_work_size = {size_res, 1, 1};
+  // nullptr local size leaves the work-group size to the OpenCL runtime.
+  result = blas_cc->command_queue_inst_.DispatchCommandAndWait(
+    kernel->GetKernel(), global_work_size.size(), global_work_size.data(),
+    nullptr);
   if (!result) {
     return;
   }
 
-  result = clbuffInstance.getOutBufferA()->ReadDataRegion(
-    blas_cc->command_queue_inst_, dim2_size, res);
+  if (use_svm) {
+    blas_cc->command_queue_inst_.enqueueSVMUnmap(const_cast<T *>(input));
+    blas_cc->command_queue_inst_.enqueueSVMUnmap(res);
+  } else {
+    auto &clbuffInstance = ClBufferManager::Global();
+    result = clbuffInstance.getOutBufferA()->ReadDataRegion(
+      blas_cc->command_queue_inst_, dim2_size, res);
 
-  if (!result) {
-    return;
+    if (!result) {
+      return;
+    }
   }
 }
 
diff --git a/test/unittest/unittest_blas_kernels_cl.cpp b/test/unittest/unittest_blas_kernels_cl.cpp
@@ -441,8 +441,8 @@ TEST(blas_kernels, dot_gemm_50_768_2048_transAB) {
 TEST(blas_kernels, addition_i) {
   const int batch = 12;
   const int channel = 1;
-  const int height = 26;
-  const int width = 26;
+  const int height = 2048;
+  const int width = 2048;
 
   const int batch_b = 1;
 
@@ -474,8 +474,20 @@ TEST(blas_kernels, addition_i) {
                             MOD) *
                              alpha);
 
+  auto t1 = std::chrono::high_resolution_clock::now();
   A_fp32.add_i(B_fp32);
+  auto t2 = std::chrono::high_resolution_clock::now();
+  auto dt_cpu = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+
+  auto t3 = std::chrono::high_resolution_clock::now();
   add_i_cl(C_fp32, D_fp32);
+  auto t4 = std::chrono::high_resolution_clock::now();
+  auto dt_gpu = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3);
+
+  std::cout << "FP32 ADD : N: " << batch << " C: " << channel
+            << " H: " << height << " W: " << width << std::endl;
+  std::cout << " - time : CPU = " << dt_cpu.count() << " us" << std::endl;
+  std::cout << " - time : GPU = " << dt_gpu.count() << " us" << std::endl;
 
   float mseError =
     mse<float>(A_fp32.getData<float>(), C_fp32.getData<float>(), A_fp32.size());
@@ -489,6 +501,94 @@ TEST(blas_kernels, addition_i) {
   EXPECT_IN_RANGE((float)cosSim, 0.99, 1);
 }
 
+// Checks addition_cl's SVM path (use_svm = true) against Tensor::add_i.
+TEST(blas_kernels, addition_i_cl) {
+  const int batch = 12;
+  const int channel = 1;
+  const int height = 2048;
+  const int width = 2048;
+
+  const int batch_b = 1;
+
+  const float alpha = 1e-1;
+  const int MOD = 10;
+
+  nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
+    nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
+
+  nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32);
+  nntrainer::Tensor B_fp32(batch_b, channel, height, width, t_type_nchw_fp32);
+  nntrainer::Tensor C_fp32(batch, channel, height, width, t_type_nchw_fp32);
+  nntrainer::Tensor D_fp32(batch_b, channel, height, width, t_type_nchw_fp32);
+
+  GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_C(B_fp32, ((i * (batch_b * height * channel) +
+                             j * (batch_b * height) + k * (width) + l + 1) %
+                            MOD) *
+                             alpha);
+  GEN_TEST_INPUT(C_fp32, ((i * (batch * height * channel) +
+                           j * (batch * height) + k * (width) + l + 1) %
+                          MOD) *
+                           alpha);
+  GEN_TEST_INPUT_C(D_fp32, ((i * (batch_b * height * channel) +
+                             j * (batch_b * height) + k * (width) + l + 1) %
+                            MOD) *
+                             alpha);
+
+  auto t1 = std::chrono::high_resolution_clock::now();
+  A_fp32.add_i(B_fp32);
+  auto t2 = std::chrono::high_resolution_clock::now();
+  auto dt_cpu = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+
+  auto *cl_context =
+    static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));
+
+  // SVM copies of C (accumulator/result) and D (addend), typed once as float.
+  const size_t c_bytes = C_fp32.size() * sizeof(float);
+  const size_t d_bytes = D_fp32.size() * sizeof(float);
+  auto *C_fp32_svm = static_cast<float *>(
+    cl_context->context_inst_.createSVMRegion(c_bytes));
+  auto *D_fp32_svm = static_cast<float *>(
+    cl_context->context_inst_.createSVMRegion(d_bytes));
+
+  // Stage the tensor contents: map both regions, copy, then unmap.
+  cl_context->command_queue_inst_.enqueueSVMMap(C_fp32_svm, c_bytes, false);
+  cl_context->command_queue_inst_.enqueueSVMMap(D_fp32_svm, d_bytes, false);
+  std::memcpy(C_fp32_svm, C_fp32.getData<float>(), c_bytes);
+  std::memcpy(D_fp32_svm, D_fp32.getData<float>(), d_bytes);
+  cl_context->command_queue_inst_.enqueueSVMUnmap(C_fp32_svm);
+  cl_context->command_queue_inst_.enqueueSVMUnmap(D_fp32_svm);
+
+  auto t3 = std::chrono::high_resolution_clock::now();
+  addition_cl(D_fp32_svm, C_fp32_svm, D_fp32.size(), C_fp32.size(), true);
+  auto t4 = std::chrono::high_resolution_clock::now();
+  auto dt_gpu = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3);
+
+  std::cout << "FP32 ADD : N: " << batch << " C: " << channel
+            << " H: " << height << " W: " << width << std::endl;
+  std::cout << " - time : CPU = " << dt_cpu.count() << " us" << std::endl;
+  std::cout << " - time : GPU = " << dt_gpu.count() << " us" << std::endl;
+
+  // NOTE(review): addition_cl's SVM path unmaps both regions before returning,
+  // so C_fp32_svm is read below while unmapped - confirm this is valid here.
+  float mseError =
+    mse<float>(A_fp32.getData<float>(), C_fp32_svm, A_fp32.size());
+
+  double cosSim =
+    cosine_similarity<float>(A_fp32.getData<float>(), C_fp32_svm, A_fp32.size());
+
+  const float epsilon = 1e-3 * width;
+
+  cl_context->context_inst_.releaseSVMRegion(C_fp32_svm);
+  cl_context->context_inst_.releaseSVMRegion(D_fp32_svm);
+
+  EXPECT_IN_RANGE(mseError, 0, epsilon);
+  EXPECT_IN_RANGE((float)cosSim, 0.99, 1);
+}
+
 TEST(blas_kernels, l2norm) {
   const int batch = 1;
   const int channel = 1;

Original file line number	Diff line number	Diff line change
`@@ -272,7 +272,7 @@ void sgemm_cl(bool TransA, bool TransB, const float *A, const float *B,`
`272`	`272`	`}`
`273`	`273`
`274`	`274`	`void addition_cl(const float *input, float *res, unsigned int size_input,`
`275`		`- unsigned int size_res) {`
	`275`	`+ unsigned int size_res, const bool use_svm) {`
`276`	`276`	`bool result = false;`
`277`	`277`	`auto *blas_cc =`
`278`	`278`	`static_cast<ClContext *>(Engine::Global().getRegisteredContext("gpu"));`
`@@ -284,7 +284,7 @@ void addition_cl(const float *input, float *res, unsigned int size_input,`
`284`	`284`	`}`
`285`	`285`
`286`	`286`	`addition_cl_internal<float>(kernel_addition_ptr, input, res, size_input,`
`287`		`- size_res);`
	`287`	`+ size_res, use_svm);`
`288`	`288`	`}`
`289`	`289`
`290`	`290`	`void sscal_cl(float *X, const unsigned int N, const float alpha) {`
Original file line number	Diff line number	Diff line change
`@@ -96,7 +96,7 @@ void addition_cl(const _FP16 *input, _FP16 *res, unsigned int size_input,`
`96`	`96`	`}`
`97`	`97`
`98`	`98`	`addition_cl_internal<_FP16>(kernel_addition_fp16_ptr, input, res, size_input,`
`99`		`- size_res);`
	`99`	`+ size_res, false);`
`100`	`100`	`}`
`101`	`101`
`102`	`102`	`void sscal_cl(_FP16 *X, const unsigned int N, const float alpha) {`