 #include "gemm_dequant.h"
 #include "cutlass_helper.h"

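+// CUDA-core fallback for skinny int8 GEMM: computes C[m, n] = A[m, k] *
+// B[n, k]^T and dequantizes with a per-output-channel scale. Each block
+// owns a CtaM x CtaN output tile; threads stride over K in 128-bit chunks,
+// accumulate with __dp4a, and combine partial sums across warps at the end.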
+template <typename Type, int CtaM, int CtaN, int Threads>
+__global__ void int8_sq(int8_t const* act,
+                        int8_t const* weight,
+                        float const* scale,
+                        Type* output,
+                        int m,
+                        int n,
+                        int k) {
+  using VecType = int4;
+  // One 128-bit (int4) vector load carries 16 int8 values per thread.
+  static constexpr int kStepK = 128 / (8 * sizeof(int8_t));
+  static constexpr int CtaK = kStepK * Threads;
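+  // Illustrative guard (not part of the original change): the shuffle
+  // reduction below assumes a block made of whole warps, which holds for
+  // the BLOCK_SIZE = 256 used by the launcher.
+  static_assert(Threads % 32 == 0, "block must be a whole number of warps");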
+  int tile_id_m = blockIdx.x * CtaM;
+  int tile_id_n = blockIdx.y * CtaN;
+  int tid = threadIdx.x;
+  int8_t tile_a[kStepK], tile_w[CtaN * kStepK];
+  int acc[CtaM * CtaN];
+#pragma unroll
+  for (int i = 0; i < CtaM * CtaN; ++i) {
+    acc[i] = 0;
+  }
+  // Advance the base pointers to this block's tile.
+  act += tile_id_m * k;
+  weight += tile_id_n * k;
+  scale += tile_id_n;
+  output += tile_id_m * n + tile_id_n;
+  for (int idx_k = tid * kStepK; idx_k < k; idx_k += CtaK) {
+    // Stage one 128-bit slice of each of the CtaN weight rows in registers.
+#pragma unroll
+    for (int i = 0; i < CtaN; ++i) {
+      reinterpret_cast<VecType*>(tile_w)[i] =
+          reinterpret_cast<VecType const*>(weight + i * k + idx_k)[0];
+    }
+#pragma unroll
+    for (int i = 0; i < CtaM; ++i) {
+      reinterpret_cast<VecType*>(tile_a)[0] =
+          reinterpret_cast<VecType const*>(act + i * k + idx_k)[0];
+#pragma unroll
+      for (int j = 0; j < CtaN; ++j) {
+        // __dp4a accumulates four packed int8 products into an int32,
+        // so the inner loop consumes kStepK bytes four at a time.
+#pragma unroll
+        for (int l = 0; l < kStepK; l += 4) {
+          acc[i * CtaN + j] =
+              __dp4a(reinterpret_cast<int*>(tile_a + l)[0],
+                     reinterpret_cast<int*>(tile_w + j * kStepK + l)[0],
+                     acc[i * CtaN + j]);
+        }
+      }
+    }
+  }
+
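+  // Each thread now holds CtaM * CtaN partial sums over its K-slices.
+  // Butterfly shuffles reduce within each warp; lane 0 of every warp then
+  // parks its partials in shared memory for the final cross-warp sum.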
+  static constexpr int kWarpSize = 32;
+  static constexpr int kWarpNum = Threads / kWarpSize;
+  __shared__ int shmem[CtaM * CtaN * kWarpNum];
+  int warp_id = tid / kWarpSize, lane_id = tid % kWarpSize;
+#pragma unroll
+  for (int i = 0; i < CtaM; ++i) {
+#pragma unroll
+    for (int j = 0; j < CtaN; ++j) {
+      int val = acc[i * CtaN + j];
+      val += __shfl_xor_sync(~0u, val, 16);
+      val += __shfl_xor_sync(~0u, val, 8);
+      val += __shfl_xor_sync(~0u, val, 4);
+      val += __shfl_xor_sync(~0u, val, 2);
+      val += __shfl_xor_sync(~0u, val, 1);
+      if (lane_id == 0) {
+        shmem[i * CtaN + j + warp_id * CtaM * CtaN] = val;
+      }
+    }
+  }
+  __syncthreads();
+#pragma unroll
+  for (int ii = tid; ii < CtaM * CtaN; ii += Threads) {
+    int mid = ii / CtaN, nid = ii % CtaN;
+    int val = 0;
+#pragma unroll
+    for (int jj = 0; jj < kWarpNum; ++jj) {
+      val += shmem[jj * CtaM * CtaN + ii];
+    }
+    // Dequantize with the per-column scale and store in the output type.
+    output[mid * n + nid] =
+        static_cast<Type>(static_cast<float>(val) * scale[nid]);
+  }
+}
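+// Shape sketch (illustrative numbers, not from this change): with
+// Threads = 256, CtaK = 16 * 256 = 4096, so k = 8192 needs two trips through
+// the K loop; an (m = 4, n = 4096) problem with CtaM = 4, CtaN = 2 launches
+// a (1, 2048) grid.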
+
+template <typename InputType,
+          typename OutputType,
+          int32_t TILE_M,
+          int32_t TILE_N,
+          int32_t BLOCK_SIZE>
+void int8_sq_kernel(GemmDequantParams const& params) {
+  dim3 block(BLOCK_SIZE);
+  dim3 grid(params.m / TILE_M, params.n / TILE_N);
+  int8_sq<OutputType, TILE_M, TILE_N, BLOCK_SIZE>
+      <<<grid, block, 0, params.stream>>>(
+          reinterpret_cast<InputType const*>(params.act),
+          reinterpret_cast<InputType const*>(params.weight),
+          reinterpret_cast<float const*>(params.scale),
+          reinterpret_cast<OutputType*>(params.output),
+          params.m,
+          params.n,
+          params.k);
+}
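+// Note: the grid above assumes params.m % TILE_M == 0 and
+// params.n % TILE_N == 0; any remainder rows or columns would be silently
+// skipped. The dispatch below only takes this path when params.m == TILE_M,
+// which covers the M side; n must be a multiple of TILE_N (2 here).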
+
+template <typename InputType,
+          typename OutputType,
+          int TILE_M,
+          int TILE_N,
+          int BLOCK_SIZE>
+bool int8_sq_kernel_caller(GemmDequantParams const& params) {
+  constexpr int cudaCoreGemmTemplateMaxM = 16;
+  if (params.m == TILE_M) {
+    int8_sq_kernel<InputType, OutputType, TILE_M, TILE_N, BLOCK_SIZE>(params);
+    return true;
+  }
+  if constexpr (TILE_M < cudaCoreGemmTemplateMaxM) {
+    return int8_sq_kernel_caller<InputType,
+                                 OutputType,
+                                 TILE_M + 1,
+                                 TILE_N,
+                                 BLOCK_SIZE>(params);
+  }
+  return false;
+}
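+// Compile-time dispatch: `if constexpr` unrolls this recursion into kernel
+// instantiations for every TILE_M in [1, cudaCoreGemmTemplateMaxM], and the
+// run-time check selects the one whose tile height equals params.m.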
+
+template <typename InputType, typename OutputType>
+bool int8_sq_kernel_launcher(GemmDequantParams const& params) {
+  return int8_sq_kernel_caller<InputType, OutputType, 1, 2, 256>(params);
+}
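+// Entry point used below: TILE_N = 2 and BLOCK_SIZE = 256 are fixed;
+// TILE_M is chosen from params.m by the caller chain above.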
+
 template <paddle::DataType D, typename T>
 void RunGemmDequant(const int8_t* a,
                     const int8_t* b,  // Transposed
@@ -114,6 +239,49 @@ std::vector<paddle::Tensor> GemmDequant(const paddle::Tensor& x, |
   int64_t m = x_dims[x_dims.size() - 2];
   int64_t k = x_dims[x_dims.size() - 1];
   int64_t n = y_dims[y_dims.size() - 2];
+
+  if (m <= 4) {
+    if (out_dtype == "bfloat16") {
+      paddle::Tensor out =
+          paddle::empty({m, n}, paddle::DataType::BFLOAT16, x.place());
+      GemmDequantParams params = {
+          reinterpret_cast<const void*>(x.data<int8_t>()),
+          reinterpret_cast<const void*>(y.data<int8_t>()),
+          reinterpret_cast<const void*>(scale.data<float>()),
+          reinterpret_cast<void*>(out.data<paddle::bfloat16>()),
+          m,
+          n,
+          k,
+          x.stream()};
+      if (!int8_sq_kernel_launcher<int8_t, __nv_bfloat16>(params)) {
+        PADDLE_THROW(common::errors::Fatal("gemm dequant kernel launch failed"));
+      }
+      return {out};
+    } else if (out_dtype == "float16") {
+      paddle::Tensor out =
+          paddle::empty({m, n}, paddle::DataType::FLOAT16, x.place());
+      GemmDequantParams params = {
+          reinterpret_cast<const void*>(x.data<int8_t>()),
+          reinterpret_cast<const void*>(y.data<int8_t>()),
+          reinterpret_cast<const void*>(scale.data<float>()),
+          reinterpret_cast<void*>(out.data<paddle::float16>()),
+          m,
+          n,
+          k,
+          x.stream()};
+      if (!int8_sq_kernel_launcher<int8_t, half>(params)) {
+        PADDLE_THROW(common::errors::Fatal("gemm dequant kernel launch failed"));
+      }
+      return {out};
+    } else {
+      PADDLE_THROW(phi::errors::InvalidArgument(
+          "Only bfloat16 and float16 are supported, but got %s", out_dtype));
+    }
+  }
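+  // For m > 4, fall through to the existing RunGemmDequant path below.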
+
   if (out_dtype == "bfloat16") {
     paddle::Tensor out = paddle::empty({m, n}, paddle::DataType::BFLOAT16, x.place());
     RunGemmDequant<paddle::DataType::BFLOAT16, paddle::bfloat16>(x.data<int8_t>(),