Commit b9da07d

Optimize OpenCL Addition
Unify OpenCL addition for FP16/32
Add the possibility to pass nullptr as the local work size

**Self-evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: Grzegorz Kisala <[email protected]>
1 parent 508e105 commit b9da07d

File tree: 6 files changed, +63 -25 lines changed

nntrainer/opencl/opencl_command_queue_manager.cpp

Lines changed: 18 additions & 0 deletions
@@ -387,4 +387,22 @@ bool CommandQueueManager::DispatchCommand(
   return true;
 }
 
+bool CommandQueueManager::DispatchCommandAndWait(
+  const cl_kernel kernel, const uint32_t work_dim,
+  const size_t *global_work_size, const size_t *local_work_size) {
+
+  const auto error_code = clEnqueueNDRangeKernel(
+    command_queue_, kernel, work_dim, nullptr, global_work_size,
+    local_work_size, 0, nullptr, nullptr);
+  if (error_code != CL_SUCCESS) {
+    ml_loge("Failed to clEnqueueNDRangeKernel. OpenCL error code: %d",
+            error_code);
+    return false;
+  }
+
+  clFinish(command_queue_);
+
+  return true;
+}
+
 } // namespace nntrainer::opencl
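The new DispatchCommandAndWait pairs the enqueue with a blocking clFinish, so callers get a completed kernel back without managing events. The sketch below is not part of the commit; it is a minimal, assumption-level illustration of the same pattern against the raw OpenCL C API (the helper name enqueue_and_wait is hypothetical). Per the OpenCL specification, passing nullptr for local_work_size lets the implementation choose the work-group size.

#include <CL/cl.h>
#include <cstdio>

// Illustrative helper (not nntrainer code): enqueue an NDRange kernel and
// block until the command queue has drained.
static bool enqueue_and_wait(cl_command_queue queue, cl_kernel kernel,
                             cl_uint work_dim, const size_t *global_work_size,
                             const size_t *local_work_size) {
  // local_work_size may be nullptr: the OpenCL runtime then picks the
  // work-group size for this kernel and device.
  cl_int err = clEnqueueNDRangeKernel(queue, kernel, work_dim, nullptr,
                                      global_work_size, local_work_size,
                                      0, nullptr, nullptr);
  if (err != CL_SUCCESS) {
    std::fprintf(stderr, "clEnqueueNDRangeKernel failed: %d\n", err);
    return false;
  }
  // Wait for all previously queued commands to complete.
  return clFinish(queue) == CL_SUCCESS;
}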

nntrainer/opencl/opencl_command_queue_manager.h

Lines changed: 13 additions & 0 deletions
@@ -191,6 +191,19 @@ class CommandQueueManager : public Singleton<CommandQueueManager> {
                        const int (&work_group_size)[3],
                        cl_event *event = nullptr);
 
+  /**
+   * @brief Overloaded function to initiate execution of the command queue.
+   *
+   * @param kernel OpenCL kernel
+   * @param global_work_size Total number of work items that will execute the
+   * kernel function
+   * @param local_work_size Number of work items that make up a work group
+   * @return true if command queue execution is successful or false otherwise
+   */
+  bool DispatchCommandAndWait(const cl_kernel kernel, const uint32_t work_dim,
+                              const size_t *global_work_size,
+                              const size_t *local_work_size);
+
   /**
    * @brief Get the OpenCL Command Queue object
    *
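For reference, a hedged usage sketch of the new overload, mirroring the call site updated later in this commit (blas_kernels_templates.h). The function name dispatch_addition and the caller-supplied queue_manager, kernel, and size_res are assumptions, not repository code.

#include <array>
#include <cstddef>

// Assumed caller context: an initialized CommandQueueManager reference and a
// built cl_kernel for the addition kernel.
bool dispatch_addition(nntrainer::opencl::CommandQueueManager &queue_manager,
                       cl_kernel kernel, size_t size_res) {
  std::array<size_t, 3> global_work_size = {size_res, 1, 1};
  return queue_manager.DispatchCommandAndWait(
    kernel, global_work_size.size(), global_work_size.data(),
    /*local_work_size=*/nullptr); // defer the group size to the driver
}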

nntrainer/tensor/cl_operations/blas_kernel_interface.cpp

Lines changed: 8 additions & 13 deletions
@@ -220,23 +220,18 @@ void add_i_cl(Tensor &result, Tensor const &input) {
         result.channel() == input.channel() &&
         result.height() == input.height() && result.width() == input.width())) {
 
-    if (result.getDataType() == ml::train::TensorDim::DataType::FP32) {
-      float *Y = result.getData();
-      const float *X = input.getData();
+    const unsigned int size_input = input.size();
+    const unsigned int size_res = result.size();
 
-      for (unsigned int i = 0; i < result.batch() / input.batch(); ++i) {
-        axpy_cl(input.size(), 1.0f, X, Y);
-        Y += input.size();
-      }
+    if (result.getDataType() == ml::train::TensorDim::DataType::FP32) {
+      const auto *data_input = input.getData<float>();
+      auto *data_res = result.getData<float>();
+      addition_cl(data_input, data_res, size_input, size_res);
     } else if (result.getDataType() == ml::train::TensorDim::DataType::FP16) {
 #ifdef ENABLE_FP16
-      unsigned int size_res = result.size();
-      unsigned int size_input = input.size();
-      _FP16 *data_res = result.getData<_FP16>();
-      const _FP16 *data_input = input.getData<_FP16>();
-
+      const auto *data_input = input.getData<_FP16>();
+      auto *data_res = result.getData<_FP16>();
       addition_cl(data_input, data_res, size_input, size_res);
-
 #else
       throw std::invalid_argument("Error: enable-fp16 is not enabled");
 #endif

nntrainer/tensor/cl_operations/blas_kernel_strings.cpp

Lines changed: 0 additions & 1 deletion
@@ -394,7 +394,6 @@ const std::string &getSgemmClTransABKernel() {
 const std::string &getAdditionClKernel() {
   static const std::string addition_cl_kernel_ =
     R"(__kernel void addition_cl(const __global float* input, __global float* output, unsigned int size_input, unsigned int size_res) {
-    #pragma printf_support
     size_t idx = get_global_id(0);
     if (idx < size_res) {
       output[idx] = output[idx] + input[idx % size_input];
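The kernel's indexing is what lets one code path serve both the equal-size case and the batch-broadcast case in add_i_cl: when size_input is smaller than size_res, the input buffer is reused cyclically. A hedged scalar reference (not from the repository) of the same computation:

// Scalar reference for what addition_cl computes: an in-place add with the
// input repeated every size_input elements.
template <typename T>
void addition_reference(const T *input, T *output, unsigned int size_input,
                        unsigned int size_res) {
  for (unsigned int idx = 0; idx < size_res; ++idx) {
    output[idx] = output[idx] + input[idx % size_input];
  }
}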

nntrainer/tensor/cl_operations/blas_kernels_templates.h

Lines changed: 10 additions & 9 deletions
@@ -280,14 +280,15 @@ addition_cl_internal(ClContext::SharedPtrClKernel kernel, const T *input,
     return;
   }
 
-  result = kernel->SetKernelArguments(0, clbuffInstance.getInBufferA(),
-                                      sizeof(cl_mem));
+  auto bufferInA = clbuffInstance.getInBufferA()->GetBuffer();
+  auto bufferOutA = clbuffInstance.getOutBufferA()->GetBuffer();
+
+  result = kernel->SetKernelArguments(0, &bufferInA, sizeof(cl_mem));
   if (!result) {
     return;
   }
 
-  result = kernel->SetKernelArguments(1, clbuffInstance.getOutBufferA(),
-                                      sizeof(cl_mem));
+  result = kernel->SetKernelArguments(1, &bufferOutA, sizeof(cl_mem));
   if (!result) {
     return;
   }
@@ -302,11 +303,11 @@ addition_cl_internal(ClContext::SharedPtrClKernel kernel, const T *input,
     return;
   }
 
-  const int work_groups_count[3] = {(int)size_res, 1, 1};
-  /// @todo: create a group size by device & input
-  const int work_group_size[3] = {1, 1, 1}; // test-value
-  result = blas_cc->command_queue_inst_.DispatchCommand(
-    kernel, work_groups_count, work_group_size);
+  std::array<size_t, 3> global_work_size = {size_res, 1, 1};
+
+  result = blas_cc->command_queue_inst_.DispatchCommandAndWait(
+    kernel->GetKernel(), global_work_size.size(), global_work_size.data(),
+    nullptr);
   if (!result) {
     return;
   }
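The replaced code pinned the local work size to {1, 1, 1}, one work-item per work-group, and carried a TODO to size groups per device; passing nullptr now leaves that decision to the OpenCL implementation. If explicit sizing is ever revisited, the standard per-kernel hints can be queried. The sketch below is illustrative only and not part of this commit:

#include <CL/cl.h>

// Illustrative query of work-group sizing hints for a kernel on a device.
void query_group_size_hints(cl_kernel kernel, cl_device_id device,
                            size_t &max_wg_size, size_t &preferred_multiple) {
  // Largest work-group size this kernel can be launched with on the device.
  clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE,
                           sizeof(max_wg_size), &max_wg_size, nullptr);
  // Performance hint: preferred multiple for the work-group size.
  clGetKernelWorkGroupInfo(kernel, device,
                           CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE,
                           sizeof(preferred_multiple), &preferred_multiple,
                           nullptr);
}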

test/unittest/unittest_blas_kernels_cl.cpp

Lines changed: 14 additions & 2 deletions
@@ -441,8 +441,8 @@ TEST(blas_kernels, dot_gemm_50_768_2048_transAB) {
 TEST(blas_kernels, addition_i) {
   const int batch = 12;
   const int channel = 1;
-  const int height = 26;
-  const int width = 26;
+  const int height = 2048;
+  const int width = 2048;
 
   const int batch_b = 1;
 
@@ -474,8 +474,20 @@ TEST(blas_kernels, addition_i) {
                    MOD) *
                   alpha);
 
+  auto t1 = std::chrono::high_resolution_clock::now();
   A_fp32.add_i(B_fp32);
+  auto t2 = std::chrono::high_resolution_clock::now();
+  auto dt_cpu = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
+
+  auto t3 = std::chrono::high_resolution_clock::now();
   add_i_cl(C_fp32, D_fp32);
+  auto t4 = std::chrono::high_resolution_clock::now();
+  auto dt_gpu = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3);
+
+  std::cout << "FP32 ADD : N: " << batch << " C: " << channel
+            << " H: " << height << " W: " << width << std::endl;
+  std::cout << " - time : CPU = " << dt_cpu.count() << " us" << std::endl;
+  std::cout << " - time : GPU = " << dt_gpu.count() << " us" << std::endl;
 
   float mseError =
     mse<float>(A_fp32.getData<float>(), C_fp32.getData<float>(), A_fp32.size());
