Skip to content

Commit 4da1a0f

Browse files
authored
[PHI decoupling] remove "gpu_device_function.h" in fluid. (#48117)
* move "paddle/phi/backends/gpu/gpu_device_function.h" to phi
* update copyright years
* rm "fluid/platform/device/gpu/gpu_device_function.h" in phi
* rm dependence to "gpu_device_function.h" in fluid
* rm gpu_device_function.h etc in fluid
* fix rocm-compile bugs
* fix cuda_helper_test.cu bugs
1 parent 2995f74 commit 4da1a0f

17 files changed (+34 additions, −413 deletions)

paddle/fluid/operators/activation_op.kps

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ limitations under the License. */
1313
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
1414
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
1515
#include "paddle/fluid/platform/bfloat16.h"
16-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
16+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
1717
#include "paddle/phi/kernels/funcs/activation_functor.h"
1818

1919
namespace paddle {

paddle/fluid/operators/elementwise/elementwise_op_function.h

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,7 @@ limitations under the License. */
4242

4343
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
4444
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
45-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
45+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
4646
#include "paddle/phi/backends/gpu/gpu_primitives.h"
4747
#include "paddle/phi/kernels/gpu/elementwise_grad.h"
4848

@@ -982,7 +982,7 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
982982
#pragma unroll
983983
for (int i = BLOCK_X >> 1; i > 0; i >>= 1) {
984984
// reduce sum with wrap
985-
val += platform::CudaShuffleXorSync(0xFFFFFFFF, val, i);
985+
val += phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, val, i);
986986
}
987987

988988
size_t idx_j = j + threadIdx.y;
@@ -1004,7 +1004,8 @@ static __global__ void FusedElemwiseAndActGradBroadcast1CUDAKernel(
10041004
#pragma unroll
10051005
for (int i = BLOCK_X >> 1; i > 0; i >>= 1) {
10061006
// reduce sum with wrap
1007-
inter_val += platform::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i);
1007+
inter_val +=
1008+
phi::backends::gpu::CudaShuffleXorSync(0xFFFFFFFF, inter_val, i);
10081009
}
10091010
if (threadIdx.x == 0 && (idx_j < w)) d_intermediate[idx_j] = inter_val;
10101011
}
@@ -1160,22 +1161,22 @@ static __global__ void FusedElemwiseAndActGradBroadcast2CUDAKernel(
11601161
h = h > ELEMWISE_MAX_BLOCK_DIM ? ELEMWISE_MAX_BLOCK_DIM : h;
11611162
if (BcastY) {
11621163
if (dy) {
1163-
val = paddle::platform::reduceSum(val, tid, h);
1164+
val = phi::backends::gpu::reduceSum(val, tid, h);
11641165
if (threadIdx.x == 0) {
11651166
dy[j] = val;
11661167
}
11671168
}
11681169
} else {
11691170
if (dx) {
1170-
val = paddle::platform::reduceSum(val, tid, h);
1171+
val = phi::backends::gpu::reduceSum(val, tid, h);
11711172
if (threadIdx.x == 0) {
11721173
dx[j] = val;
11731174
}
11741175
}
11751176
}
11761177
if (!SameShapeOfIntermediateOutAndOut) {
11771178
if (d_intermediate) {
1178-
inter_val = paddle::platform::reduceSum(inter_val, tid, h);
1179+
inter_val = phi::backends::gpu::reduceSum(inter_val, tid, h);
11791180
if (threadIdx.x == 0) {
11801181
d_intermediate[j] = inter_val;
11811182
}

paddle/fluid/operators/fused/fused_attention_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,9 +22,9 @@ limitations under the License. */
2222
#include "paddle/fluid/operators/fused/attn_gemm.h"
2323
#include "paddle/fluid/operators/fused/fmha_ref.h"
2424
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
25-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
2625
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
2726
#include "paddle/phi/api/include/tensor.h"
27+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
2828
#include "paddle/phi/kernels/funcs/broadcast_function.h"
2929
#include "paddle/phi/kernels/funcs/elementwise_functor.h"
3030
#include "paddle/phi/kernels/funcs/math_function.h"

paddle/fluid/operators/fused/fused_bias_dropout_residual_layer_norm_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,8 @@ limitations under the License. */
1919
#include "paddle/fluid/framework/op_registry.h"
2020
#include "paddle/fluid/framework/operator.h"
2121
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
22-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
2322
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
23+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
2424

2525
namespace paddle {
2626
namespace operators {

paddle/fluid/operators/fused/fused_dropout_common.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ limitations under the License. */
2222
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
2323
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
2424
#include "paddle/fluid/operators/layer_norm_kernel.cu.h"
25-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
2625
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
2726
#include "paddle/fluid/platform/device_context.h"
2827
#include "paddle/fluid/platform/float16.h"
28+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
2929
#include "paddle/phi/kernels/funcs/aligned_vector.h"
3030
#include "paddle/phi/kernels/funcs/functors.h"
3131

paddle/fluid/operators/fused/fused_fc_elementwise_layernorm_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ namespace cub = hipcub;
2525
#endif
2626

2727
#include "paddle/fluid/framework/op_registry.h"
28-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
2928
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
29+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
3030
#include "paddle/phi/kernels/funcs/blas/blas.h"
3131

3232
namespace paddle {

paddle/fluid/operators/fused/fused_gate_attention_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@ limitations under the License. */
1616
#include "paddle/fluid/framework/operator.h"
1717
#include "paddle/fluid/operators/fused/attn_gemm.h"
1818
#include "paddle/fluid/operators/fused/fused_gate_attention.h"
19-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
19+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
2020
#include "paddle/phi/kernels/funcs/math_function.h"
2121

2222
namespace paddle {

paddle/fluid/operators/fused/fused_multi_transformer_op.cu.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,9 +26,9 @@ limitations under the License. */
2626
#include "paddle/fluid/operators/fused/attn_gemm.h"
2727
#include "paddle/fluid/operators/fused/fmha_ref.h"
2828
#include "paddle/fluid/operators/fused/fused_dropout_helper.h"
29-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
3029
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
3130
#include "paddle/phi/api/include/tensor.h"
31+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
3232
#include "paddle/phi/kernels/funcs/math_function.h"
3333

3434
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)

paddle/fluid/operators/group_norm_op.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ namespace cub = hipcub;
2121
#endif
2222

2323
#include "paddle/fluid/operators/group_norm_op.h"
24-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
24+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
2525
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2626

2727
namespace paddle {

paddle/fluid/operators/layer_norm_kernel.cu.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,8 +25,8 @@ namespace cub = hipcub;
2525
#include <iostream>
2626

2727
#include "paddle/fluid/operators/fused/quant_dequant_kernel.h"
28-
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
2928
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
29+
#include "paddle/phi/backends/gpu/gpu_device_function.h"
3030
#include "paddle/phi/core/ddim.h"
3131
#include "paddle/phi/kernels/funcs/aligned_vector.h"
3232

@@ -55,7 +55,7 @@ static __forceinline__ __device__ U WarpReduceSum(U val) {
5555
unsigned mask = 0u;
5656
CREATE_SHFL_MASK(mask, true);
5757
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
58-
val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
58+
val += phi::backends::gpu::CudaShuffleDownSync(mask, val, offset);
5959
}
6060
return val;
6161
}

0 commit comments

Comments (0)