PaddlePaddle · JamesLim-sy · Jan 5, 2022 · Dec 10, 2021 · Dec 13, 2021 · Dec 13, 2021
diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.cu b/paddle/fluid/operators/elementwise/elementwise_div_op.cu
@@ -14,6 +14,7 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/elementwise/elementwise_div_op.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/float16.h"
 
@@ -29,13 +30,11 @@ static __global__ void SimpleElemwiseDivGradCUDAKernel(const T* x, const T* y,
                                                        const T* dout,
                                                        int64_t size, T* dx,
                                                        T* dy) {
-  int col = blockIdx.x * blockDim.x + threadIdx.x;
-
-  while (col < size) {
-    T o = dout[col];
-    dx[col] = o / y[col];
-    dy[col] = -o * out[col] / y[col];
-    col += blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    T o = dout[i];
+    dx[i] = o / y[i];
+    dy[i] = -o * out[i] / y[i];
   }
 }
 
@@ -48,16 +47,14 @@ SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex<float>>(
     const paddle::platform::complex<float>* dout, int64_t size,
     paddle::platform::complex<float>* dx,
     paddle::platform::complex<float>* dy) {
-  int col = blockIdx.x * blockDim.x + threadIdx.x;
-
-  while (col < size) {
-    paddle::platform::complex<float> o = dout[col];
-    paddle::platform::complex<float> y_conj(y[col].real, -y[col].imag);
-    paddle::platform::complex<float> out_div_y_conj((out[col] / y[col]).real,
-                                                    -(out[col] / y[col]).imag);
-    dx[col] = o / y_conj;
-    dy[col] = -o * out_div_y_conj;
-    col += blockDim.x * gridDim.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::complex<float> o = dout[i];
+    paddle::platform::complex<float> y_conj(y[i].real, -y[i].imag);
+    paddle::platform::complex<float> out_div_y_conj((out[i] / y[i]).real,
+                                                    -(out[i] / y[i]).imag);
+    dx[i] = o / y_conj;
+    dy[i] = -dout[i] * out_div_y_conj;
   }
 }
 
@@ -70,16 +67,102 @@ SimpleElemwiseDivGradCUDAKernel<paddle::platform::complex<double>>(
     const paddle::platform::complex<double>* dout, int64_t size,
     paddle::platform::complex<double>* dx,
     paddle::platform::complex<double>* dy) {
-  int col = blockIdx.x * blockDim.x + threadIdx.x;
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < size;
+       i += blockDim.x * gridDim.x) {
+    paddle::platform::complex<double> o = dout[i];
+    paddle::platform::complex<double> y_conj(y[i].real, -y[i].imag);
+    paddle::platform::complex<double> out_div_y_conj((out[i] / y[i]).real,
+                                                     -(out[i] / y[i]).imag);
+    dx[i] = o / y_conj;
+    dy[i] = -dout[i] * out_div_y_conj;
+  }
+}
+
+template <typename T>
+void reduce_functor(const framework::ExecutionContext& ctx,
+                    const framework::Tensor* in, const framework::Tensor* out,
+                    framework::Tensor* src, framework::Tensor* dst) {
+  const auto& dev_ctx =
+      ctx.template device_context<platform::CUDADeviceContext>();
+  if (dst->dims() == out->dims()) {
+    dst->ShareDataWith(*src);
+    return;
+  }
+  int axis = ctx.Attr<int>("axis");
+  std::vector<int> reduce_dims = GetReduceDim(in->dims(), out->dims(), axis);
+  gpuStream_t stream = ctx.cuda_device_context().stream();
+  TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
+      *src, dst, kps::IdentityFunctor<T>(), reduce_dims, stream);
+}
+
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+default_elementwise_div_grad(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y,
+                             const framework::Tensor* out,
+                             const framework::Tensor* dout,
+                             framework::Tensor* dx, framework::Tensor* dy) {
+  int axis = ctx.Attr<int>("axis");
+  auto* dout_data = dout->data<T>();
+  dim3 block_size = dim3(ELEMENTWISE_BLOCK_SIZE, 1);
+  const auto& dev_ctx =
+      ctx.template device_context<platform::CUDADeviceContext>();
+  framework::Tensor tmp_dx;
+  tmp_dx.mutable_data<T>(dout->dims(), ctx.GetPlace());
+  framework::Tensor tmp_dy;
+  tmp_dy.mutable_data<T>(dout->dims(), ctx.GetPlace());
+  if (dx != nullptr && dy != nullptr) {
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
+    // For inplace strategy, dx will be stored in addr of dout, which makes
+    // the result of dy wrong.
+    if (dx->IsSharedBufferWith(*dout)) {
+      dx->clear();
+      dx->mutable_data<T>(x->dims(), ctx.GetPlace());
+    }
+    // dout.dims==out.dims
+    std::vector<const framework::Tensor*> ins = {dout, out, y};
+    std::vector<framework::Tensor*> outs = {&tmp_dx, &tmp_dy};
+    auto functor = DivGradXYFunctor<T, T>();
+    LaunchElementwiseCudaKernel<ElementwiseType::kTernary, T, T,
+                                decltype(functor), 2>(dev_ctx, ins, &outs, axis,
+                                                      functor);
 
-  while (col < size) {
-    paddle::platform::complex<double> o = dout[col];
-    paddle::platform::complex<double> y_conj(y[col].real, -y[col].imag);
-    paddle::platform::complex<double> out_div_y_conj((out[col] / y[col]).real,
-                                                     -(out[col] / y[col]).imag);
-    dx[col] = o / y_conj;
-    dy[col] = -o * out_div_y_conj;
-    col += blockDim.x * gridDim.x;
+    if (dx->dims() == dout->dims() && dy->dims() == dout->dims()) {
+      dx->ShareDataWith(tmp_dx);
+      dy->ShareDataWith(tmp_dy);
+    } else {
+      reduce_functor<T>(ctx, x, out, &tmp_dx, dx);
+      reduce_functor<T>(ctx, y, out, &tmp_dy, dy);
+    }
+  } else if (dx != nullptr && dy == nullptr) {
+    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    if (dx->IsSharedBufferWith(*dout)) {
+      dx->clear();
+      dx->mutable_data<T>(x->dims(), ctx.GetPlace());
+    }
+    std::vector<const framework::Tensor*> ins = {dout, y};
+    std::vector<framework::Tensor*> outs = {&tmp_dx};
+    LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>(
+        dev_ctx, ins, &outs, axis, DivGradFunctor<T>());
+    if (dx->dims() != dout->dims()) {
+      reduce_functor<T>(ctx, x, out, &tmp_dx, dx);
+    } else {
+      dx->ShareDataWith(tmp_dx);
+    }
+  } else if (dy != nullptr && dx == nullptr) {
+    auto* dy_data = dy->mutable_data<T>(ctx.GetPlace());
+    std::vector<const framework::Tensor*> ins = {dout, out, y};
+    std::vector<framework::Tensor*> outs = {&tmp_dy};
+    LaunchElementwiseCudaKernel<ElementwiseType::kTernary, T, T>(
+        dev_ctx, ins, &outs, axis, DivGradYFunctor<T>());
+    if (dy->dims() != dout->dims()) {
+      reduce_functor<T>(ctx, y, out, &tmp_dy, dy);
+    } else {
+      dy->ShareDataWith(tmp_dy);
+    }
   }
 }
 

diff --git a/paddle/fluid/operators/elementwise/elementwise_div_op.h b/paddle/fluid/operators/elementwise/elementwise_div_op.h
@@ -108,6 +108,21 @@ struct DivDoubleDY {
   }
 };
 
+template <typename DeviceContext, typename T>
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
+default_elementwise_div_grad(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y,
+                             const framework::Tensor* out,
+                             const framework::Tensor* dout,
+                             framework::Tensor* dx, framework::Tensor* dy) {
+  int axis = ctx.Attr<int>("axis");
+
+  ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
+      ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
+}
+
 template <typename DeviceContext, typename T>
 typename std::enable_if<
     std::is_same<DeviceContext, platform::CPUDeviceContext>::value>::type
@@ -116,13 +131,21 @@ elementwise_div_grad(const framework::ExecutionContext& ctx,
                      const framework::Tensor* out,
                      const framework::Tensor* dout, framework::Tensor* dx,
                      framework::Tensor* dy) {
-  int axis = ctx.Attr<int>("axis");
-  ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
-      ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(), DivGradDY<T>());
+  default_elementwise_div_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
 }
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+template <typename DeviceContext, typename T>
 // cuda definition
+typename std::enable_if<
+    std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
+default_elementwise_div_grad(const framework::ExecutionContext& ctx,
+                             const framework::Tensor* x,
+                             const framework::Tensor* y,
+                             const framework::Tensor* out,
+                             const framework::Tensor* dout,
+                             framework::Tensor* dx, framework::Tensor* dy);
+
 template <typename DeviceContext, typename T>
 typename std::enable_if<
     std::is_same<DeviceContext, platform::CUDADeviceContext>::value>::type
@@ -146,14 +169,12 @@ class ElementwiseDivGradKernel : public ElemwiseGradKernel<T> {
     auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
-    int axis = ctx.Attr<int>("axis");
 
     if (dx != nullptr && dy != nullptr && (dx->dims() == dy->dims())) {
       elementwise_div_grad<DeviceContext, T>(ctx, x, y, out, dout, dx, dy);
     } else {
-      ElemwiseGradCompute<DeviceContext, T, DivGradDX<T>, DivGradDY<T>>(
-          ctx, *x, *y, *out, *dout, axis, dx, dy, DivGradDX<T>(),
-          DivGradDY<T>());
+      default_elementwise_div_grad<DeviceContext, T>(ctx, x, y, out, dout, dx,
+                                                     dy);
     }
   }
 };

diff --git a/paddle/fluid/operators/elementwise/elementwise_functor.h b/paddle/fluid/operators/elementwise/elementwise_functor.h
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #pragma once
 
+#include "paddle/fluid/framework/array.h"
+#include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/hostdevice.h"
@@ -113,6 +115,70 @@ struct MinFunctor {
   }
 };
 
+template <typename T>
+using Complex = paddle::platform::complex<T>;
+
+template <typename InT, typename OutT>
+struct DivGradXYFunctor {
+  inline HOSTDEVICE paddle::framework::Array<OutT, 2> operator()(InT a, InT b,
+                                                                 InT c) {
+    // dx = dout / y
+    // dy = - dout * out / y
+    paddle::framework::Array<OutT, 2> outs;
+    outs[0] = a / c;
+    outs[1] = -a * b / c;
+    return outs;
+  }
+};
+
+template <typename InT, typename OutT>
+struct DivGradXYFunctor<Complex<InT>, Complex<OutT>> {
+  inline HOSTDEVICE paddle::framework::Array<Complex<OutT>, 2> operator()(
+      Complex<InT> a, Complex<InT> b, Complex<InT> c) {
+    paddle::framework::Array<Complex<OutT>, 2> outs;
+    Complex<InT> c_conj(c.real, -c.imag);
+    Complex<InT> out_div_y_conj((b / c).real, -(b / c).imag);
+    outs[0] = a / c_conj;
+    outs[1] = -a * out_div_y_conj;
+    return outs;
+  }
+};
+
+// Float div grad
+template <typename T>
+struct DivGradFunctor {
+  inline HOSTDEVICE T operator()(const T& a, const T& b) const { return a / b; }
+};
+
+// Complex div grad
+template <typename T>
+struct DivGradFunctor<Complex<T>> {
+  inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
+                                          const Complex<T>& b) const {
+    Complex<T> b_conj(b.real, -b.imag);
+    return a / b_conj;
+  }
+};
+
+// Float mul and div
+template <typename T>
+struct DivGradYFunctor {
+  inline HOSTDEVICE T operator()(const T& a, const T& b, const T& c) const {
+    return -a * b / c;
+  }
+};
+
+// Complex mul and div
+template <typename T>
+struct DivGradYFunctor<Complex<T>> {
+  inline HOSTDEVICE Complex<T> operator()(const Complex<T>& a,
+                                          const Complex<T>& b,
+                                          const Complex<T>& c) const {
+    Complex<T> out_div_y_conj((b / c).real, -(b / c).imag);
+    return -a * out_div_y_conj;
+  }
+};
+
 // Fmax
 template <typename T>
 struct FMaxFunctor {