Add bfp16 and unit tests for the CK implicit gemm bwd and fwd xdlops solvers (#3521)

JonathanLichtnerAMD · bghimireamd · web-flow · commit b1cc950a484e · 2025-03-05T13:21:01.000-07:00
* Enable bfp16 for implicit gemm fwd and bwd xdlops solvers

* Add unit tests for implicit gemm fwd and bwd xdlops solvers

* Add int8 handling to the gtest unit conv solver

* Remove custom smoke test for ConvHipImplicitGemmFwdXdlops

---------

Co-authored-by: Bibek Ghimire <bghimire@amd.com>
diff --git a/src/solver/conv/conv_hip_implicit_gemm_bwd_data_xdlops.cpp b/src/solver/conv/conv_hip_implicit_gemm_bwd_data_xdlops.cpp
@@ -181,12 +181,12 @@ void PerformanceConfigHipImplicitGemmBwdXdlops::HeuristicInit(
     {
     case miopenHalf: Init<ck::half_t>(problem); break;
     case miopenFloat: Init<float>(problem); break;
+    case miopenBFloat16: Init<ck::bhalf_t>(problem); break;
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
     case miopenInt8:
     case miopenInt32:
     case miopenInt64:
-    case miopenBFloat16:
     case miopenDouble: break;
     }
 #endif
@@ -223,12 +223,12 @@ bool PerformanceConfigHipImplicitGemmBwdXdlops::IsValid(
     {
     case miopenHalf: return CheckIsSupportCKArgs<ck::half_t>(problem);
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
+    case miopenBFloat16: return CheckIsSupportCKArgs<ck::bhalf_t>(problem);
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
     case miopenInt8:
     case miopenInt32:
     case miopenInt64:
-    case miopenBFloat16:
     case miopenDouble: break;
     }
 #endif
@@ -304,12 +304,12 @@ bool ConvHipImplicitGemmBwdXdlops::IsApplicable(
     {
     case miopenHalf: return CheckCKApplicability<ck::half_t>(problem);
     case miopenFloat: return CheckCKApplicability<float>(problem);
+    case miopenBFloat16: return CheckCKApplicability<ck::bhalf_t>(problem);
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
     case miopenInt8:
     case miopenInt32:
     case miopenInt64:
-    case miopenBFloat16:
     case miopenDouble: break;
     }
 #endif
@@ -334,10 +334,14 @@ ConvSolution ConvHipImplicitGemmBwdXdlops::GetSolution(
                                       CKArgs,
                                       miopen::conv::DataInvokeParams>(
             ctx, problem, config.kernel_id);
+    case miopenBFloat16:
+        return InitInvokerFactoryNHWC<DeviceOpBwdPtrs<ck::bhalf_t>,
+                                      CKArgs,
+                                      miopen::conv::DataInvokeParams>(
+            ctx, problem, config.kernel_id);
     case miopenInt8:
     case miopenInt32:
     case miopenInt64:
-    case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
diff --git a/src/solver/conv/conv_hip_implicit_gemm_fwd_xdlops.cpp b/src/solver/conv/conv_hip_implicit_gemm_fwd_xdlops.cpp
@@ -182,11 +182,11 @@ void PerformanceConfigHipImplicitGemmFwdXdlops::HeuristicInit(
     case miopenInt8: Init<int8_t>(problem); break;
     case miopenHalf: Init<ck::half_t>(problem); break;
     case miopenFloat: Init<float>(problem); break;
+    case miopenBFloat16: Init<ck::bhalf_t>(problem); break;
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
     case miopenInt64:
     case miopenInt32:
-    case miopenBFloat16:
     case miopenDouble: break;
     }
 #endif
@@ -225,11 +225,11 @@ bool PerformanceConfigHipImplicitGemmFwdXdlops::IsValid(
     case miopenInt8: return CheckIsSupportCKArgs<int8_t>(problem);
     case miopenHalf: return CheckIsSupportCKArgs<ck::half_t>(problem);
     case miopenFloat: return CheckIsSupportCKArgs<float>(problem);
+    case miopenBFloat16: return CheckIsSupportCKArgs<ck::bhalf_t>(problem);
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
     case miopenInt64:
     case miopenInt32:
-    case miopenBFloat16:
     case miopenDouble: break;
     }
 #endif
@@ -306,11 +306,11 @@ bool ConvHipImplicitGemmFwdXdlops::IsApplicable(
     case miopenInt8: return CheckCKApplicability<int8_t>(problem);
     case miopenHalf: return CheckCKApplicability<ck::half_t>(problem);
     case miopenFloat: return CheckCKApplicability<float>(problem);
+    case miopenBFloat16: return CheckCKApplicability<ck::bhalf_t>(problem);
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
     case miopenInt64:
     case miopenInt32:
-    case miopenBFloat16:
     case miopenDouble: break;
     }
 #endif
@@ -336,9 +336,13 @@ ConvSolution ConvHipImplicitGemmFwdXdlops::GetSolution(
     case miopenFloat:
         return InitInvokerFactoryNHWC<DeviceOpPtrs<float>, CKArgs, miopen::conv::DataInvokeParams>(
             ctx, problem, config.kernel_id);
+    case miopenBFloat16:
+        return InitInvokerFactoryNHWC<DeviceOpPtrs<ck::bhalf_t>,
+                                      CKArgs,
+                                      miopen::conv::DataInvokeParams>(
+            ctx, problem, config.kernel_id);
     case miopenInt64:
     case miopenInt32:
-    case miopenBFloat16:
     case miopenDouble:
     case miopenFloat8_fnuz:
     case miopenBFloat8_fnuz:
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
@@ -836,11 +836,6 @@ endif()
 #   message output to the log, which happens if something is broken in the tuning machinery.
 # * Use MIOPEN_DEBUG_TUNING_ITERATIONS_MAX to save testing time.
 
-add_custom_test(smoke_solver_ConvHipImplicitGemmFwdXdlops GFX900_DISABLED GFX906_DISABLED GFX90A_DISABLED GFX94X_ENABLED HALF_ENABLED INT8_ENABLED
-    ENVIRONMENT MIOPEN_FIND_ENFORCE=SEARCH_DB_UPDATE MIOPEN_DEBUG_TUNING_ITERATIONS_MAX=5 MIOPEN_FIND_MODE=normal MIOPEN_DEBUG_FIND_ONLY_SOLVER=ConvHipImplicitGemmFwdXdlops
-    COMMAND $<TARGET_FILE:test_conv2d> ${TEST_CONV_VERBOSE_F} --input 128 64 56 56 --weights 64 64 1 1 --pads_strides_dilations 0 0 1 1 1 1 ${MIOPEN_TEST_CONV_INT8_OUTPUT_TYPE_INT8} --in_layout NHWC --fil_layout NHWC --out_layout NHWC ${MIOPEN_TEST_FLAGS_ARGS}
-)
-
 # FP16 ALT attribute is disabled to enable the backward solver on MI200 for HALF.
 add_custom_test(smoke_solver_ConvWinograd3x3MultipassWrW_3x2 HALF_ENABLED BF16_ENABLED SKIP_XNACK_ON
     ENVIRONMENT MIOPEN_DEBUG_CONVOLUTION_ATTRIB_FP16_ALT_IMPL=0 MIOPEN_FIND_MODE=normal MIOPEN_DEBUG_FIND_ONLY_SOLVER='ConvWinograd3x3MultipassWrW<3-2>'
diff --git a/test/gtest/unit_conv_solver.cpp b/test/gtest/unit_conv_solver.cpp
@@ -737,6 +737,9 @@ void RunSolver(const miopen::solver::conv::ConvSolverInterface& solver,
         case miopenBFloat16:
             RunSolver<bfloat16, bfloat16>(solver, params, direction, conv_config, algo);
             return;
+        case miopenInt8:
+            RunSolver<int8_t, int8_t>(solver, params, direction, conv_config, algo);
+            return;
         default:
             throw std::runtime_error("handling of this data type is not yet implemented");
         }
diff --git a/test/gtest/unit_conv_solver_ConvHipImplicitGemmBwdXdlops.cpp b/test/gtest/unit_conv_solver_ConvHipImplicitGemmBwdXdlops.cpp
@@ -0,0 +1,152 @@
+/*******************************************************************************
+ *
+ * MIT License
+ *
+ * Copyright (c) 2025 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ *******************************************************************************/
+
+#include "unit_conv_solver.hpp"
+
+namespace {
+
+auto GetConvSmokeTestCases(miopenDataType_t datatype)
+{
+    using TestCase = miopen::unit_tests::ConvTestCase;
+
+    return std::vector{
+        // clang-format off
+        TestCase{{datatype, miopenTensorNHWC, {1, 32, 8, 8}},
+                 {datatype, miopenTensorNHWC, {32, 32, 1, 1}},
+                 datatype, {{0, 0}, {1, 1}, {1, 1}}},
+        // clang-format on
+    };
+}
+
+auto GetConvFullTestCases(miopenDataType_t datatype)
+{
+    using TestCase = miopen::unit_tests::ConvTestCase;
+
+    return std::vector{
+        // clang-format off
+        TestCase{{datatype, miopenTensorNHWC, {1, 32, 8, 8}},
+                 {datatype, miopenTensorNHWC, {32, 32, 3, 3}},
+                 datatype, {{1, 1}, {1, 1}, {1, 1}}}, // non-zero padding
+        TestCase{{datatype, miopenTensorNHWC, {1, 64, 24, 48}},
+                 {datatype, miopenTensorNHWC, {96, 64, 1, 1}},
+                 datatype, {{0, 0}, {2, 2}, {1, 1}}}, // stride > 1
+        TestCase{{datatype, miopenTensorNHWC, {1, 32, 8, 8}},
+                 {datatype, miopenTensorNHWC, {32, 32, 3, 3}},
+                 datatype, {{0, 0}, {1, 1}, {3, 3}}}, // dilation > 1
+        TestCase{{datatype, miopenTensorNHWC, {1, 64, 24, 48}},
+                 {datatype, miopenTensorNHWC, {96, 64, 1, 1}},
+                 datatype, {{0, 0}, {1, 1}, {1, 1}}}, // some different NCHW and k parameters
+        // clang-format on
+    };
+}
+
+auto GetTestParams(miopenDataType_t datatype)
+{
+    Gpu supportedDevices = Gpu::gfx908 | Gpu::gfx90A | Gpu::gfx94X;
+    auto params          = miopen::unit_tests::UnitTestConvSolverParams(supportedDevices);
+    params.Tunable(5);
+    if(datatype == miopenHalf)
+    {
+        // Enable the backward solver on MI200 for fp16 by disabling the alternate implementation
+        params.SetConvAttrFp16Alt(0);
+    }
+
+    return params;
+}
+
+} // namespace
+
+using GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP16  = GPU_UnitTestConvSolverBwd_FP16;
+using GPU_UnitTestConvSolverImplicitGemmBwdXdlops_BFP16 = GPU_UnitTestConvSolverBwd_BFP16;
+using GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP32  = GPU_UnitTestConvSolverBwd_FP32;
+using CPU_UnitTestConvSolverImplicitGemmBwdXdlopsDevApplicability_FP16 =
+    CPU_UnitTestConvSolverDevApplicabilityBwd_NONE;
+
+TEST_P(GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP16, ConvHipImplicitGemmBwdXdlops)
+{
+    this->RunTest(miopen::solver::conv::ConvHipImplicitGemmBwdXdlops{});
+};
+
+TEST_P(GPU_UnitTestConvSolverImplicitGemmBwdXdlops_BFP16, ConvHipImplicitGemmBwdXdlops)
+{
+    this->RunTest(miopen::solver::conv::ConvHipImplicitGemmBwdXdlops{});
+};
+
+TEST_P(GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP32, ConvHipImplicitGemmBwdXdlops)
+{
+    this->RunTest(miopen::solver::conv::ConvHipImplicitGemmBwdXdlops{});
+};
+
+TEST_P(CPU_UnitTestConvSolverImplicitGemmBwdXdlopsDevApplicability_FP16,
+       ConvHipImplicitGemmBwdXdlops)
+{
+    this->RunTest(miopen::solver::conv::ConvHipImplicitGemmBwdXdlops{});
+};
+
+// Smoke tests
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP16,
+                         testing::Combine(testing::Values(GetTestParams(miopenHalf)),
+                                          testing::Values(miopenConvolutionAlgoImplicitGEMM),
+                                          testing::ValuesIn(GetConvSmokeTestCases(miopenHalf))));
+
+INSTANTIATE_TEST_SUITE_P(
+    Smoke,
+    GPU_UnitTestConvSolverImplicitGemmBwdXdlops_BFP16,
+    testing::Combine(testing::Values(GetTestParams(miopenBFloat16)),
+                     testing::Values(miopenConvolutionAlgoImplicitGEMM),
+                     testing::ValuesIn(GetConvSmokeTestCases(miopenBFloat16))));
+
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP32,
+                         testing::Combine(testing::Values(GetTestParams(miopenFloat)),
+                                          testing::Values(miopenConvolutionAlgoImplicitGEMM),
+                                          testing::ValuesIn(GetConvSmokeTestCases(miopenFloat))));
+
+// Full tests
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP16,
+                         testing::Combine(testing::Values(GetTestParams(miopenHalf)),
+                                          testing::Values(miopenConvolutionAlgoImplicitGEMM),
+                                          testing::ValuesIn(GetConvFullTestCases(miopenHalf))));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_UnitTestConvSolverImplicitGemmBwdXdlops_BFP16,
+                         testing::Combine(testing::Values(GetTestParams(miopenBFloat16)),
+                                          testing::Values(miopenConvolutionAlgoImplicitGEMM),
+                                          testing::ValuesIn(GetConvFullTestCases(miopenBFloat16))));
+
+INSTANTIATE_TEST_SUITE_P(Full,
+                         GPU_UnitTestConvSolverImplicitGemmBwdXdlops_FP32,
+                         testing::Combine(testing::Values(GetTestParams(miopenFloat)),
+                                          testing::Values(miopenConvolutionAlgoImplicitGEMM),
+                                          testing::ValuesIn(GetConvFullTestCases(miopenFloat))));
+
+// Device applicability tests
+INSTANTIATE_TEST_SUITE_P(Smoke,
+                         CPU_UnitTestConvSolverImplicitGemmBwdXdlopsDevApplicability_FP16,
+                         testing::Combine(testing::Values(GetTestParams(miopenHalf)),
+                                          testing::Values(GetConvSmokeTestCases(miopenHalf)[0])));
diff --git a/test/gtest/unit_conv_solver_ConvHipImplicitGemmFwdXdlops.cpp b/test/gtest/unit_conv_solver_ConvHipImplicitGemmFwdXdlops.cpp