@@ -189,45 +189,29 @@ struct BroadcastDataLoader<Index, VecSize, false, kElementwise> {
   }
 };
 
-// Common broadcast data loader.
-template <int Index, int VecSize, bool IsBoundary>
-struct BroadcastDataLoader<Index, VecSize, IsBoundary, kBroadcast> {
-  template <typename Array1, typename Array2, typename Array3, typename ArgsT>
-  static __device__ __forceinline__ void Apply(const Array1 &ins,
-                                               ArgsT *args,
-                                               const Array2 &configs,
-                                               const Array3 &use_broadcast,
-                                               const int block_offset,
-                                               const int num,
-                                               const uint32_t numel) {
+template <int Index, int VecSize>
+struct BroadcastDataInit {
+  template <typename ArgsT>
+  static __device__ __forceinline__ void Apply(ArgsT *args) {
     using Type = std::tuple_element_t<Index, ArgsT>;
-    uint32_t index_bc[VecSize];
 #pragma unroll
     for (int k = 0; k < VecSize; ++k) {
-      index_bc[k] = 0;
       std::get<Index>(args[k]) = static_cast<Type>(1);
     }
+  }
+};
 
-    uint32_t thread_offset = block_offset + threadIdx.x * VecSize;
-#pragma unroll
-    for (int k = 0; k < VecSize; ++k) {
-      uint32_t idx = thread_offset + k;
-      if (IsBoundary && idx == numel) {
-        break;
-      }
-#pragma unroll
-      for (int i = 0; i < phi::DDim::kMaxRank; ++i) {
-        if (i == configs[0].rank) break;
-        auto fast_divmoder = configs[0].divmoders[i].Divmod(idx);
-        idx = fast_divmoder.val[0];
-        index_bc[k] += fast_divmoder.val[1] * configs[Index].strides[i];
-      }
-    }
-
+template <int Index, int VecSize>
+struct BroadcastDataSetter {
+  template <typename Array, typename ArgsT>
+  static __device__ __forceinline__ void Apply(const Array &ins,
+                                               ArgsT *args,
+                                               uint32_t index_bc[][VecSize]) {
+    using Type = std::tuple_element_t<Index, ArgsT>;
 #pragma unroll
     for (int k = 0; k < VecSize; ++k) {
       std::get<Index>(args[k]) =
-          reinterpret_cast<const _ptr_ Type *>(ins[Index])[index_bc[k]];
+          reinterpret_cast<const _ptr_ Type *>(ins[Index])[index_bc[Index][k]];
     }
   }
 };
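
The old `kBroadcast` specialization of `BroadcastDataLoader` mixed three jobs: defaulting `args`, computing per-input offsets, and loading. This hunk splits the first and last into `BroadcastDataInit` and `BroadcastDataSetter`, so the offset computation can move into the kernel body and be shared across all inputs (next hunk). Both functors are expanded once per input index by an `Unroller`; below is a minimal sketch of such a helper, assuming it simply recurses over `Index` — the real one lives elsewhere in Paddle's kps primitives and may differ in detail.

```cpp
#include <utility>  // std::forward

// Minimal sketch (assumption): instantiate Func<Index, VecSize>::Apply for
// every Index in [0, End), e.g. once per input tensor.
template <template <int, int> class Func, int VecSize, int End, int Begin = 0>
struct Unroller {
  template <typename... Args>
  static __device__ __forceinline__ void step(Args &&...args) {
    Func<Begin, VecSize>::Apply(std::forward<Args>(args)...);
    Unroller<Func, VecSize, End, Begin + 1>::step(std::forward<Args>(args)...);
  }
};

// Recursion terminates when Begin reaches End.
template <template <int, int> class Func, int VecSize, int End>
struct Unroller<Func, VecSize, End, End> {
  template <typename... Args>
  static __device__ __forceinline__ void step(Args &&...args) {}
};
```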
@@ -285,8 +269,30 @@ __device__ void VectorizedBroadcastKernelImpl(
   __simd__ ArgsT args[VecSize];
   __simd__ ConditionalT<OutT, NumOuts> result[VecSize];
 
-  BcUnroller<BroadcastDataLoader, IsBoundary, LoadType, VecSize, Arity>::step(
-      ins, args, configs, use_broadcast, block_offset, num, numel);
+  if (LoadType == kBroadcast) {
+    uint32_t index_bc[Arity][VecSize] = {0};
+    Unroller<BroadcastDataInit, VecSize, Arity>::step(args);
+    uint32_t thread_offset = block_offset + threadIdx.x * VecSize;
+#pragma unroll
+    for (int k = 0; k < VecSize; ++k) {
+      uint32_t idx = thread_offset + k;
+      if (IsBoundary && idx == numel) break;
+#pragma unroll
+      for (int i = 0; i < phi::DDim::kMaxRank; ++i) {
+        if (i == configs[0].rank) break;
+        auto fast_divmoder = configs[0].divmoders[i].Divmod(idx);
+        idx = fast_divmoder.val[0];
+#pragma unroll
+        for (int j = 0; j < Arity; ++j) {
+          index_bc[j][k] += fast_divmoder.val[1] * configs[j].strides[i];
+        }
+      }
+    }
+    Unroller<BroadcastDataSetter, VecSize, Arity>::step(ins, args, index_bc);
+  } else {
+    BcUnroller<BroadcastDataLoader, IsBoundary, LoadType, VecSize, Arity>::step(
+        ins, args, configs, use_broadcast, block_offset, num, numel);
+  }
 
   SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
                                      VecSize,
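
This is the payoff of the refactor: the old per-input loader ran the rank-deep `Divmod` chain once per input, i.e. roughly `Arity × VecSize × rank` divisions, while the fused loop above decomposes each output index once and reuses every per-dimension remainder for all `Arity` inputs, cutting that to `VecSize × rank`. A host-side sketch of the same arithmetic, with plain `/` and `%` standing in for `FastDivMod` (names are illustrative, not Paddle's API):

```cpp
#include <cstdint>
#include <vector>

// Offsets of output element `idx` inside each broadcast input, given the
// output's per-dimension divisors and each input's (possibly zero) strides.
std::vector<uint32_t> BroadcastOffsets(
    uint32_t idx,
    const std::vector<uint32_t> &out_divisors,
    const std::vector<std::vector<uint32_t>> &strides) {
  std::vector<uint32_t> offsets(strides.size(), 0);
  for (size_t i = 0; i < out_divisors.size(); ++i) {
    uint32_t r = idx % out_divisors[i];  // coordinate along dimension i
    idx /= out_divisors[i];              // one divide per dimension...
    for (size_t j = 0; j < strides.size(); ++j) {
      offsets[j] += r * strides[j][i];   // ...reused by every input
    }
  }
  return offsets;
}
```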
@@ -783,11 +789,7 @@ struct LaunchBroadcastKernelWithInt64IndexHelper<OutT,
 };
 #endif
 
-template <ElementwiseType ET,
-          typename OutT,
-          typename Functor,
-          int kArity,
-          int NumOuts = 1>
+template <typename OutT, typename Functor, int kArity, int NumOuts = 1>
 void BroadcastKernelForDifferentVecSize(
     const KPDevice &ctx,
     const std::vector<const DenseTensor *> &ins,
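
The `ElementwiseType ET` parameter is dropped because it duplicated information the functor already carries: `BroadcastKernel` below derives `kArity` from `phi::funcs::FunctionTraits<Functor>`. A sketch of how such a trait can read the arity off `operator()` (illustrative; not Paddle's exact definition):

```cpp
// Deduce arity from a functor's call operator.
template <typename T>
struct FunctionTraits : FunctionTraits<decltype(&T::operator())> {};

template <typename ClassT, typename ReturnT, typename... Args>
struct FunctionTraits<ReturnT (ClassT::*)(Args...) const> {
  static constexpr int arity = sizeof...(Args);
};

struct AddFunctor {
  float operator()(float a, float b) const { return a + b; }
};
static_assert(FunctionTraits<AddFunctor>::arity == 2, "binary functor");
```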
@@ -922,16 +924,12 @@ void BroadcastKernelForDifferentVecSize(
   }
 }
 
-template <ElementwiseType ET,
-          typename InT,
-          typename OutT,
-          typename Functor,
-          int NumOuts = 1>
+template <typename OutT, typename Functor, int NumOuts = 1>
 void BroadcastKernel(const KPDevice &ctx,
                      const std::vector<const DenseTensor *> &ins,
                      std::vector<DenseTensor *> *outs,
-                     int axis,
-                     Functor func) {
+                     Functor func,
+                     int axis = -1) {
   // When there are multiple inputs, the output's rank should equal the
   // maximum rank of all inputs.
   using Traits = phi::funcs::FunctionTraits<Functor>;
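
Besides dropping `ET` and `InT` (both recoverable from `Functor` and the tensors), this moves `axis` behind `func` with a `-1` default, meaning "derive it from the rank difference" via the `axis = axis == -1 ? max_rank - min_rank : axis;` line further down. A hedged call-site sketch (`AddFunctor` and the tensors are illustrative, not from this file):

```cpp
std::vector<const DenseTensor *> ins = {&x, &y};
std::vector<DenseTensor *> outs = {&z};
// Old shape: BroadcastKernel<kBinary, T, OutT, AddFunctor>(ctx, ins, &outs, -1, AddFunctor());
// New shape: the ET/InT arguments are gone and axis may simply be omitted.
BroadcastKernel<float, AddFunctor>(ctx, ins, &outs, AddFunctor());
```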
@@ -968,23 +966,22 @@ void BroadcastKernel(const KPDevice &ctx,
     max_rank = std::max(max_rank, (*outs)[0]->dims().size());
   }
   axis = axis == -1 ? max_rank - min_rank : axis;
-  BroadcastKernelForDifferentVecSize<ET, OutT, Functor, kArity, NumOuts>(
+  BroadcastKernelForDifferentVecSize<OutT, Functor, kArity, NumOuts>(
       ctx, ins, outs, axis, func);
 }
 
 template <typename Functor, typename T, typename OutType = T>
 void ElementwiseCompute(const GPUContext &dev_ctx,
                         const DenseTensor &x,
                         const DenseTensor &y,
-                        int axis,
                         Functor func,
-                        DenseTensor *z) {
+                        DenseTensor *z,
+                        int axis = -1) {
   std::vector<const DenseTensor *> ins = {&x, &y};
   std::vector<DenseTensor *> outs = {z};
   dev_ctx.template Alloc<OutType>(z);
 
-  BroadcastKernel<ElementwiseType::kBinary, T, OutType, Functor, 1>(
-      dev_ctx, ins, &outs, axis, func);
+  BroadcastKernel<OutType, Functor, 1>(dev_ctx, ins, &outs, func, axis);
 }
 
 template <typename DeviceContext,
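
The two hunks below update `DefaultElementwiseOperator` mechanically for the new parameter order. Callers that were passing `axis = -1` explicitly can now drop the argument entirely; a hedged before/after (`AddFunctor` and the surrounding variables are assumptions):

```cpp
// Before: axis sat between the inputs and the functor.
//   funcs::ElementwiseCompute<AddFunctor, float>(dev_ctx, x, y, -1, AddFunctor(), z);
// After: axis is a trailing default, so the common case shrinks to:
funcs::ElementwiseCompute<AddFunctor, float>(dev_ctx, x, y, AddFunctor(), z);
```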
@@ -999,7 +996,7 @@ void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
   auto x_dims = x.dims();
   auto y_dims = y.dims();
   dev_ctx.template Alloc<T>(z);
-  funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
+  funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, Functor(), z, axis);
 }
 
 #else
@@ -1017,10 +1014,10 @@ void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
   auto y_dims = y.dims();
   dev_ctx.template Alloc<T>(z);
   if (x_dims.size() >= y_dims.size()) {
-    funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
+    funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, Functor(), z, axis);
   } else {
     funcs::ElementwiseCompute<InverseFunctor, T>(
-        dev_ctx, x, y, axis, InverseFunctor(), z);
+        dev_ctx, x, y, InverseFunctor(), z, axis);
   }
 }
 #endif