#pragma once

#include <algorithm>
#include <cstdint>

#include "paddle/extension.h"

#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#include <hip/hip_fp16.h>
#include <hip/hip_bfloat16.h>
#include <hipcub/hipcub.hpp>
#include <hiprand.h>
#include <hiprand_kernel.h>
namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#include <curand_kernel.h>
#endif
2030
// Threads per block assumed by GetNumBlocks when sizing grids.
constexpr int kBlockSize = 256;
// Grid-size cap factor: at most kNumWaves "waves" of resident blocks
// (see the sm_count * tpm / kBlockSize * kNumWaves bound in GetNumBlocks).
constexpr int kNumWaves = 16;
2333
34+ #ifdef PADDLE_WITH_HIP
// Chooses a grid size for an elementwise kernel over `n` items launched with
// kBlockSize-thread blocks: enough blocks to cover `n`, capped at kNumWaves
// "waves" of maximally-resident blocks so the grid does not oversubscribe
// the device.
//
// Returns hipSuccess, or the first failing HIP runtime-API error code (in
// which case *num_blocks is left unmodified). Result is always >= 1.
inline hipError_t GetNumBlocks(int64_t n, int* num_blocks) {
  int dev = 0;
  hipError_t err = hipGetDevice(&dev);
  if (err != hipSuccess) {
    return err;
  }

  int sm_count = 0;
  err = hipDeviceGetAttribute(&sm_count, hipDeviceAttributeMultiprocessorCount,
                              dev);
  if (err != hipSuccess) {
    return err;
  }

  int tpm = 0;  // max resident threads per multiprocessor
  err = hipDeviceGetAttribute(
      &tpm, hipDeviceAttributeMaxThreadsPerMultiProcessor, dev);
  if (err != hipSuccess) {
    return err;
  }

  // Blocks needed to cover n (ceil-div); stays in 64-bit so large n cannot
  // overflow the intermediate.
  const int64_t blocks_for_n = (n + kBlockSize - 1) / kBlockSize;
  // Device-wide cap: kNumWaves waves of fully-resident blocks. Widen before
  // multiplying — the original computed sm_count * tpm in 32-bit.
  const int64_t max_blocks =
      static_cast<int64_t>(sm_count) * tpm / kBlockSize * kNumWaves;
  // Explicit cast replaces the original implicit int64_t -> int narrowing in
  // std::max<int>(1, std::min<int64_t>(...)); the capped value fits in int
  // because max_blocks does.
  *num_blocks = static_cast<int>(
      std::max<int64_t>(1, std::min(blocks_for_n, max_blocks)));
  return hipSuccess;
}
55+ #else
2456inline cudaError_t GetNumBlocks (int64_t n, int * num_blocks) {
2557 int dev;
2658 {
@@ -41,6 +73,7 @@ inline cudaError_t GetNumBlocks(int64_t n, int* num_blocks) {
4173 sm_count * tpm / kBlockSize * kNumWaves ));
4274 return cudaSuccess;
4375}
76+ #endif
4477
4578template <typename T>
4679__device__ T max_func (const T a, const T b) {
@@ -74,7 +107,11 @@ class PDTraits<paddle::DataType::FLOAT16> {
// PDTraits specialization tying paddle's BFLOAT16 dtype to the backend's
// native 16-bit bfloat type: hip_bfloat16 under ROCm, __nv_bfloat16 under
// CUDA. data_t stays the framework-facing paddle::bfloat16.
template <>
class PDTraits<paddle::DataType::BFLOAT16> {
 public:
#ifdef PADDLE_WITH_HIP
  using DataType = hip_bfloat16;
#else
  using DataType = __nv_bfloat16;
#endif
  using data_t = paddle::bfloat16;
};
80117