@@ -441,8 +441,8 @@ TEST(blas_kernels, dot_gemm_50_768_2048_transAB) {
441
441
TEST (blas_kernels, addition_i) {
442
442
const int batch = 12 ;
443
443
const int channel = 1 ;
444
- const int height = 26 ;
445
- const int width = 26 ;
444
+ const int height = 2048 ;
445
+ const int width = 2048 ;
446
446
447
447
const int batch_b = 1 ;
448
448
@@ -474,8 +474,20 @@ TEST(blas_kernels, addition_i) {
474
474
MOD) *
475
475
alpha);
476
476
477
+ auto t1 = std::chrono::high_resolution_clock::now ();
477
478
A_fp32.add_i (B_fp32);
479
+ auto t2 = std::chrono::high_resolution_clock::now ();
480
+ auto dt_cpu = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
481
+
482
+ auto t3 = std::chrono::high_resolution_clock::now ();
478
483
add_i_cl (C_fp32, D_fp32);
484
+ auto t4 = std::chrono::high_resolution_clock::now ();
485
+ auto dt_gpu = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3);
486
+
487
+ std::cout << " FP32 ADD : N: " << batch << " C: " << channel
488
+ << " H: " << height << " W: " << width << std::endl;
489
+ std::cout << " - time : CPU = " << dt_cpu.count () << " us" << std::endl;
490
+ std::cout << " - time : GPU = " << dt_gpu.count () << " us" << std::endl;
479
491
480
492
float mseError =
481
493
mse<float >(A_fp32.getData <float >(), C_fp32.getData <float >(), A_fp32.size ());
@@ -489,6 +501,94 @@ TEST(blas_kernels, addition_i) {
489
501
EXPECT_IN_RANGE ((float )cosSim, 0.99 , 1 );
490
502
}
491
503
504
// Test blas_kernels.addition_i_cl: runs an in-place FP32 tensor addition once
// through Tensor::add_i and once through addition_cl on SVM buffers, prints
// the wall-clock duration of each call, and checks that the two results agree
// via mean squared error and cosine similarity.
+ TEST (blas_kernels, addition_i_cl) {
505
// A/C are built with `batch` (12) as the first dimension, B/D with `batch_b`
// (1); all four share channel = 1 and height = width = 2048.
+ const int batch = 12 ;
506
+ const int channel = 1 ;
507
+ const int height = 2048 ;
508
+ const int width = 2048 ;
509
+
510
+ const int batch_b = 1 ;
511
+
512
// alpha and MOD are only used inside the generator expressions passed to
// GEN_TEST_INPUT / GEN_TEST_INPUT_C below.
+ const float alpha = 1e-1 ;
513
+ const int MOD = 10 ;
514
+
515
+ nntrainer::TensorDim::TensorType t_type_nchw_fp32 = {
516
+ nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32};
517
+
518
// A/B feed the Tensor::add_i call, C/D feed the addition_cl call.
+ nntrainer::Tensor A_fp32 (batch, channel, height, width, t_type_nchw_fp32);
519
+ nntrainer::Tensor B_fp32 (batch_b, channel, height, width, t_type_nchw_fp32);
520
+ nntrainer::Tensor C_fp32 (batch, channel, height, width, t_type_nchw_fp32);
521
+ nntrainer::Tensor D_fp32 (batch_b, channel, height, width, t_type_nchw_fp32);
522
+
523
// A and C receive the same generator expression, as do B and D, so both code
// paths are given matching inputs (assuming the macros are deterministic --
// their definitions are not visible here; TODO confirm).
+ GEN_TEST_INPUT (A_fp32, ((i * (batch * height * channel) +
524
+ j * (batch * height) + k * (width) + l + 1 ) %
525
+ MOD) *
526
+ alpha);
527
+ GEN_TEST_INPUT_C (B_fp32, ((i * (batch_b * height * channel) +
528
+ j * (batch_b * height) + k * (width) + l + 1 ) %
529
+ MOD) *
530
+ alpha);
531
+ GEN_TEST_INPUT (C_fp32, ((i * (batch * height * channel) +
532
+ j * (batch * height) + k * (width) + l + 1 ) %
533
+ MOD) *
534
+ alpha);
535
+ GEN_TEST_INPUT_C (D_fp32, ((i * (batch_b * height * channel) +
536
+ j * (batch_b * height) + k * (width) + l + 1 ) %
537
+ MOD) *
538
+ alpha);
539
+
540
// Reference path: time only the add_i call; A_fp32 is later compared against
// the addition_cl output.
+ auto t1 = std::chrono::high_resolution_clock::now ();
541
+ A_fp32.add_i (B_fp32);
542
+ auto t2 = std::chrono::high_resolution_clock::now ();
543
+ auto dt_cpu = std::chrono::duration_cast<std::chrono::microseconds>(t2 - t1);
544
+
545
// NOTE(review): the pointer returned by getRegisteredContext is cast and then
// dereferenced below without a null check -- confirm the context is always
// registered when this test runs.
+ auto *cl_context =
546
+ static_cast <ClContext *>(Engine::Global ().getRegisteredContext (" gpu" ));
547
+
548
// One SVM region per tensor, sized as element count * sizeof(float).
+ void *C_fp32_svm =
549
+ cl_context->context_inst_ .createSVMRegion (C_fp32.size () * sizeof (float ));
550
+ void *D_fp32_svm =
551
+ cl_context->context_inst_ .createSVMRegion (D_fp32.size () * sizeof (float ));
552
+
553
// Map both regions, copy the host tensor contents in, then unmap.
// NOTE(review): the meaning of the trailing `false` argument is not visible
// here (blocking flag vs. read-only flag) -- confirm against the
// enqueueSVMMap declaration, since the memcpy below writes to the mapping
// immediately.
+ cl_context->command_queue_inst_ .enqueueSVMMap (
554
+ C_fp32_svm, C_fp32.size () * sizeof (float ), false );
555
+ cl_context->command_queue_inst_ .enqueueSVMMap (
556
+ D_fp32_svm, D_fp32.size () * sizeof (float ), false );
557
+
558
+ std::memcpy (C_fp32_svm, C_fp32.getData <float >(),
559
+ C_fp32.size () * sizeof (float ));
560
+ std::memcpy (D_fp32_svm, D_fp32.getData <float >(),
561
+ D_fp32.size () * sizeof (float ));
562
+
563
+ cl_context->command_queue_inst_ .enqueueSVMUnmap (C_fp32_svm);
564
+ cl_context->command_queue_inst_ .enqueueSVMUnmap (D_fp32_svm);
565
+
566
// Only the addition_cl call sits between t3 and t4; allocation, mapping and
// the copies above are outside the timed region.
// NOTE(review): whether addition_cl returns only after the device work has
// completed is not visible here, so dt_gpu may not cover kernel execution --
// confirm. Presumably the result is accumulated into C_fp32_svm, since that
// buffer is what gets compared against A_fp32 below -- confirm.
+ auto t3 = std::chrono::high_resolution_clock::now ();
567
+ addition_cl ((float *)D_fp32_svm, (float *)C_fp32_svm, D_fp32.size (),
568
+ C_fp32.size (), true );
569
+ auto t4 = std::chrono::high_resolution_clock::now ();
570
+ auto dt_gpu = std::chrono::duration_cast<std::chrono::microseconds>(t4 - t3);
571
+
572
+ std::cout << " FP32 ADD : N: " << batch << " C: " << channel
573
+ << " H: " << height << " W: " << width << std::endl;
574
+ std::cout << " - time : CPU = " << dt_cpu.count () << " us" << std::endl;
575
+ std::cout << " - time : GPU = " << dt_gpu.count () << " us" << std::endl;
576
+
577
// C_fp32_svm is read directly from host code here; no map call appears
// between addition_cl and these reads.
// NOTE(review): confirm that direct host access is valid for the kind of SVM
// memory createSVMRegion allocates.
+ float mseError =
578
+ mse<float >(A_fp32.getData <float >(), (float *)C_fp32_svm, A_fp32.size ());
579
+
580
+ double cosSim = cosine_similarity<float >(A_fp32.getData <float >(),
581
+ (float *)C_fp32_svm, A_fp32.size ());
582
+
583
// Upper bound for the MSE check, scaled by the tensor width.
+ const float epsilon = 1e-3 * width;
584
+
585
// The SVM regions are released before the expectations run; mseError and
// cosSim have already been computed, so the checks no longer touch them.
+ cl_context->context_inst_ .releaseSVMRegion (C_fp32_svm);
586
+ cl_context->context_inst_ .releaseSVMRegion (D_fp32_svm);
587
+
588
+ EXPECT_IN_RANGE (mseError, 0 , epsilon);
589
+ EXPECT_IN_RANGE ((float )cosSim, 0.99 , 1 );
590
+ }
591
+
492
592
TEST (blas_kernels, l2norm) {
493
593
const int batch = 1 ;
494
594
const int channel = 1 ;
0 commit comments