
Commit fef307a

[Paddle Tensor Standardization, Phase 2] pow support complex (PaddlePaddle#71230)
1 parent a79a9aa commit fef307a
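
This commit adds complex<float> and complex<double> support to Paddle's pow operator. Four of the ten changed files appear below: they introduce complex specializations of the CPU and CUDA pow functors and extend the pow, pow_grad, pow_double_grad, and pow_triple_grad kernel registrations. For a real exponent n, the new backward functors implement the conjugate-gradient rule

$$ \frac{\partial L}{\partial x} \;=\; \frac{\partial L}{\partial \mathrm{out}} \cdot \overline{n\,x^{\,n-1}}, \qquad n \in \mathbb{R}, $$

which matches the element-wise expressions in the functors below.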

File tree

10 files changed: +543 −78 lines changed

paddle/phi/kernels/cpu/activation_grad_kernel.cc

Lines changed: 41 additions & 3 deletions

@@ -17,6 +17,7 @@ limitations under the License. */
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/impl/activation_grad_impl.h"
 
 namespace phi {
@@ -248,6 +249,35 @@ void HardSwishGradKernel(const Context& dev_ctx,
       dev_ctx, &x, nullptr, &dout, dx, functor);
 }
 
+template <typename T, typename Context>
+void PowGradKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const DenseTensor& dout,
+                   const Scalar& factor,
+                   DenseTensor* dx) {
+  if (factor.to<float>() == 0) {
+    std::vector<int64_t> vec_dims = common::vectorize(dx->dims());
+    phi::Full<T, Context>(
+        dev_ctx, phi::IntArray(vec_dims), static_cast<T>(0), dx);
+    return;
+  }
+  PADDLE_ENFORCE_NOT_NULL(
+      dx,
+      errors::InvalidArgument("The output DenseTensor dx can not be nullptr"));
+  dev_ctx.template Alloc<T>(dx);
+  auto dout_flatten = EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(&dout, "Input", "Out@GRAD", "PowGrad"));
+  auto dx_flatten = EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(dx, "Output", "X@GRAD", "PowGrad"));
+  auto x_flatten =
+      EigenVector<T>::Flatten(GET_DATA_SAFELY(&x, "Input", "X", "PowGrad"));
+  auto* place = dev_ctx.eigen_device();
+  phi::funcs::PowGradFunctor<T> functor;
+  auto attrs = functor.GetAttrs();
+  *(attrs[0].second) = factor.to<float>();
+  functor(*place, x_flatten, nullptr, dout_flatten, dx_flatten);
+}
+
 }  // namespace phi
 
 PD_REGISTER_KERNEL(
@@ -462,20 +492,28 @@ PD_REGISTER_KERNEL(pow_grad,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
 PD_REGISTER_KERNEL(pow_double_grad,
                    CPU,
                    ALL_LAYOUT,
                    phi::PowDoubleGradKernel,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
+
 PD_REGISTER_KERNEL(pow_triple_grad,
                    CPU,
                    ALL_LAYOUT,
                    phi::PowTripleGradKernel,
                    float,
                    double,
                    int,
-                   int64_t) {}
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
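
PowGradKernel above hands the runtime exponent to the functor through its GetAttrs() name/pointer pair rather than a constructor argument. A minimal standalone sketch of that pattern, with a hypothetical ToyPowFunctor rather than Paddle's types:

#include <cmath>
#include <cstdio>
#include <string>
#include <utility>
#include <vector>

// Stand-in for BaseActivationFunctor's attribute plumbing: the functor
// exposes a (name, pointer) pair and the caller writes through the pointer.
struct ToyPowFunctor {
  float factor;
  std::vector<std::pair<std::string, float*>> GetAttrs() {
    return {{"factor", &factor}};
  }
  float operator()(float x) const { return std::pow(x, factor); }
};

int main() {
  ToyPowFunctor functor;
  auto attrs = functor.GetAttrs();
  *(attrs[0].second) = 3.0f;  // the kernel writes the scalar attribute here
  std::printf("%g\n", functor(2.0f));  // prints 8
  return 0;
}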

paddle/phi/kernels/cpu/activation_kernel.cc

Lines changed: 56 additions & 14 deletions

@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "paddle/phi/backends/cpu/cpu_context.h"
 #include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/full_kernel.h"
 #include "paddle/phi/kernels/funcs/activation_functor.h"
 #include "paddle/phi/kernels/impl/activation_impl.h"
 
@@ -173,6 +174,35 @@ void RoundKernel(const Context& dev_ctx,
       dev_ctx, x, out, functor);
 }
 
+template <typename T, typename Context>
+void PowKernel(const Context& dev_ctx,
+               const DenseTensor& x,
+               const Scalar& factor,
+               DenseTensor* out) {
+  PADDLE_ENFORCE_NOT_NULL(
+      out, errors::InvalidArgument("Output Out should not be nullptr"));
+  dev_ctx.template Alloc<T>(out);
+  if (factor.to<float>() == 0) {
+    std::vector<int64_t> vec_dims = common::vectorize(out->dims());
+    phi::Full<T, Context>(
+        dev_ctx, phi::IntArray(vec_dims), static_cast<T>(1), out);
+    return;
+  }
+  if (factor.to<float>() == 1) {
+    phi::Copy<Context>(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+    return;
+  }
+  auto x_flatten = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(&x, "Input", "X", "Activation"));
+  auto out_flatten = phi::EigenVector<T>::Flatten(
+      GET_DATA_SAFELY(out, "Output", "Out", "Activation"));
+  auto* place = dev_ctx.eigen_device();
+  phi::funcs::PowFunctor<T> functor;
+  auto attrs = functor.GetAttrs();
+  *(attrs[0].second) = factor.to<float>();
+  functor(*place, x_flatten, out_flatten);
+}
+
 }  // namespace phi
 PD_REGISTER_KERNEL(relu, CPU, ALL_LAYOUT, phi::ReluKernel, float, double) {}
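The two early returns in PowKernel encode the usual conventions: factor == 0 fills the output with ones (x^0 == 1 for every element, including x == 0, matching the C/IEEE definition of pow), and factor == 1 degenerates to a copy. A quick host-side check of the first convention, independent of Paddle:

#include <cassert>
#include <cmath>

int main() {
  // C standard / IEEE-754 pow: pow(x, 0) returns 1 for every x, including 0.
  assert(std::pow(0.0, 0.0) == 1.0);
  assert(std::pow(-3.5, 0.0) == 1.0);
  return 0;
}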

@@ -215,6 +245,18 @@ PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(reciprocal, ReciprocalKernel)
 PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sqrt, SqrtKernel)
 PD_REGISTER_ACTIVATION_KERNEL(rsqrt, RsqrtKernel)
 PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softplus, SoftplusKernel)
+PD_REGISTER_ACTIVATION_KERNEL(logit, LogitKernel)
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softsign, SoftsignKernel)
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sigmoid, SigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(logsigmoid, LogSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(hardsigmoid, HardSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
+PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(hardswish, HardSwishKernel)
+PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel)
+PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
+PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel)
+PD_REGISTER_ACTIVATION_KERNEL(celu, CeluKernel)
 
 PD_REGISTER_KERNEL(exp,
                    CPU,
@@ -240,7 +282,6 @@ PD_REGISTER_KERNEL(expm1,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
-PD_REGISTER_KERNEL(logit, CPU, ALL_LAYOUT, phi::LogitKernel, float, double) {}
 PD_REGISTER_KERNEL(square,
                    CPU,
                    ALL_LAYOUT,
@@ -251,12 +292,6 @@ PD_REGISTER_KERNEL(square,
                    int64_t,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
-PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(softsign, SoftsignKernel)
-PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(sigmoid, SigmoidKernel)
-PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(logsigmoid, LogSigmoidKernel)
-PD_REGISTER_ACTIVATION_KERNEL(hardsigmoid, HardSigmoidKernel)
-PD_REGISTER_ACTIVATION_KERNEL(swish, SwishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(relu6, Relu6Kernel)
 
 PD_REGISTER_KERNEL(log,
                    CPU,
@@ -270,6 +305,7 @@ PD_REGISTER_KERNEL(log,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
 PD_REGISTER_KERNEL(log2,
                    CPU,
                    ALL_LAYOUT,
@@ -282,6 +318,7 @@ PD_REGISTER_KERNEL(log2,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
 PD_REGISTER_KERNEL(log10,
                    CPU,
                    ALL_LAYOUT,
@@ -294,6 +331,7 @@ PD_REGISTER_KERNEL(log10,
                    phi::dtype::bfloat16,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
+
 PD_REGISTER_KERNEL(log1p,
                    CPU,
                    ALL_LAYOUT,
@@ -307,10 +345,6 @@ PD_REGISTER_KERNEL(log1p,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
 
-PD_REGISTER_ACTIVATION_KERNEL_WITH_COMPLEX(hardswish, HardSwishKernel)
-PD_REGISTER_ACTIVATION_KERNEL(round, RoundKernel)
-PD_REGISTER_ACTIVATION_KERNEL(floor, FloorKernel)
-PD_REGISTER_ACTIVATION_KERNEL(ceil, CeilKernel)
 PD_REGISTER_KERNEL(negative,
                    CPU,
                    ALL_LAYOUT,
@@ -322,6 +356,14 @@ PD_REGISTER_KERNEL(negative,
                    int64_t,
                    phi::dtype::complex<float>,
                    phi::dtype::complex<double>) {}
-PD_REGISTER_ACTIVATION_KERNEL(celu, CeluKernel)
-PD_REGISTER_KERNEL(
-    pow, CPU, ALL_LAYOUT, phi::PowKernel, float, double, int, int64_t) {}
+
+PD_REGISTER_KERNEL(pow,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::PowKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}

paddle/phi/kernels/funcs/activation_functor.h

Lines changed: 113 additions & 0 deletions

@@ -31,6 +31,7 @@
 
 #include "paddle/phi/common/amp_type_traits.h"
 #include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/complex.h"
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/core/enforce.h"
@@ -2935,6 +2936,19 @@ struct PowFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T>
+struct PowFunctor<ComplexType<T>>
+    : public BaseActivationFunctor<ComplexType<T>> {
+  float factor;
+  typename BaseActivationFunctor<ComplexType<T>>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  template <typename Device, typename X, typename Out>
+  void operator()(Device d, X x, Out out) const {
+    out.device(d) = x.pow(static_cast<ComplexType<T>>(factor));  // NOLINT
+  }
+};
+
 template <typename T>
 struct PowGradFunctor : public BaseActivationFunctor<T> {
   float factor;
@@ -2954,6 +2968,27 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
   static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
 };
 
+template <typename T>
+struct PowGradFunctor<ComplexType<T>>
+    : public BaseActivationFunctor<ComplexType<T>> {
+  float factor;
+  typename BaseActivationFunctor<ComplexType<T>>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device d, X x, Out out UNUSED, dOut dout, dX dx) const {
+    dx.device(d) =
+        dout * static_cast<ComplexType<T>>(factor) *
+        x.pow(static_cast<ComplexType<T>>(factor - 1)).unaryExpr(Conj<T>());
+  }
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 // floor(x) = flooring(x)
 template <typename T>
 struct FloorFunctor : public BaseActivationFunctor<T> {
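
A standalone numerical sanity check of this backward rule, using std::complex on the host instead of Paddle's ComplexType and Eigen expressions (a sketch under that substitution, not the functor itself):

#include <complex>
#include <cstdio>

int main() {
  const std::complex<double> x(1.0, 2.0);
  const std::complex<double> dout(0.5, -0.25);
  const double factor = 3.0;
  // dx = dout * factor * conj(x^(factor - 1)), mirroring the functor above.
  const std::complex<double> dx =
      dout * factor * std::conj(std::pow(x, factor - 1.0));
  std::printf("dx = %g%+gi\n", dx.real(), dx.imag());
  return 0;
}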
@@ -5194,6 +5229,84 @@ struct CudaCeilFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+template <typename T, typename MPType>
+__device__ __forceinline__
+    typename std::enable_if<std::is_integral<T>::value, T>::type
+    compute_pow(const T a, const T b) {
+  // TODO(wujionghao): A potential speed improvement is supporting different
+  // types in C++.
+  // On CUDAPlace, pow(3, 1) calls pow(float, float), and
+  // it will return a float number like 2.99... , which floor to 2
+  // when cast to int by default and it is wrong.
+  // Use llrint to cast it to the nearest integer, which is 3.
+  return llrint(pow(static_cast<double>(a), static_cast<double>(b)));
+}
+
+template <typename T, typename MPType>
+__device__ __forceinline__
+    typename std::enable_if<!std::is_integral<T>::value, T>::type
+    compute_pow(const T a, const T b) {
+  MPType a_val = static_cast<MPType>(a);
+  MPType b_val = static_cast<MPType>(b);
+  return static_cast<T>(pow(a_val, b_val));
+}
+
+template <typename T>
+struct CudaPowFunctor : public BaseActivationFunctor<T> {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  float factor;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  __device__ __forceinline__ T operator()(const T x) const {
+    return compute_pow<T, MT>(x, static_cast<T>(factor));
+  }
+};
+
+template <typename T>
+struct CudaPowGradFunctor : public BaseActivationFunctor<T> {
+  using MT = typename phi::dtype::MPTypeTrait<T>::Type;
+  float factor;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  // dx = dout * n * pow(x, n - 1)
+  __device__ __forceinline__ T operator()(const T dout, const T x) const {
+    return dout * static_cast<T>(factor) *
+           compute_pow<T, MT>(x, static_cast<T>(factor - 1));
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
+template <typename T>
+struct CudaPowFunctor<ComplexType<T>>
+    : public BaseActivationFunctor<ComplexType<T>> {
+  float factor;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  __device__ __forceinline__ ComplexType<T> operator()(
+      const ComplexType<T> x) const {
+    return pow(x, static_cast<ComplexType<T>>(factor));
+  }
+};
+
+template <typename T>
+struct CudaPowGradFunctor<ComplexType<T>>
+    : public BaseActivationFunctor<ComplexType<T>> {
+  float factor;
+  typename BaseActivationFunctor<T>::AttrPair GetAttrs() {
+    return {{"factor", &factor}};
+  }
+  // dx = dout * n * pow(x, n - 1)
+  __device__ __forceinline__ ComplexType<T> operator()(
+      const ComplexType<T> dout, const ComplexType<T> x) const {
+    return dout * conj(static_cast<ComplexType<T>>(factor) *
+                       pow(x, static_cast<ComplexType<T>>(factor - 1)));
+  }
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+};
+
 template <typename T>
 struct CudaFloorFunctor : public BaseActivationFunctor<T> {
   using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
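
Note that the CPU functor conjugates only the power term (dout * n * conj(x^(n-1))) while CudaPowGradFunctor conjugates the whole product (dout * conj(n * x^(n-1))). The two agree because conjugation distributes over multiplication and the exponent is real:

$$ \overline{n\,x^{\,n-1}} \;=\; \bar{n}\,\overline{x^{\,n-1}} \;=\; n\,\overline{x^{\,n-1}}, \qquad n \in \mathbb{R}. $$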

paddle/phi/kernels/funcs/elementwise_functor.h

Lines changed: 2 additions & 5 deletions

@@ -959,21 +959,18 @@ inline HOSTDEVICE typename std::enable_if<std::is_integral<T>::value, T>::type
 compute_pow(const T a, const T b) {
   // TODO(wujionghao): A potential speed improvement is supporting different
   // types in C++.
-  // On CUDAPlace, std::pow(3, 1) calls pow(float, float), and
+  // On CUDAPlace, pow(3, 1) calls pow(float, float), and
   // it will return a float number like 2.99... , which floor to 2
   // when cast to int by default and it is wrong.
   // Use llrint to cast it to the nearest integer, which is 3.
-  return std::llrint(std::pow(static_cast<double>(a), static_cast<double>(b)));
+  return llrint(pow(static_cast<double>(a), static_cast<double>(b)));
 }
 template <typename T, typename MPType>
 inline HOSTDEVICE typename std::enable_if<!std::is_integral<T>::value, T>::type
 compute_pow(const T a, const T b) {
   MPType a_val = static_cast<MPType>(a);
   MPType b_val = static_cast<MPType>(b);
-#ifdef PADDLE_WITH_XPU_KP
   return static_cast<T>(pow(a_val, b_val));
-#endif
-  return static_cast<T>(std::pow(a_val, b_val));
 }
 #else
 template <typename T, typename MPType>
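
The comment above is easy to reproduce on the host. Whether pow(3, 1) actually lands below 3 depends on the libm, so this sketch simulates the near-miss value; it only shows why truncation is fragile and round-to-nearest is not:

#include <cmath>
#include <cstdio>

int main() {
  // Simulated pow(float, float) result landing just below the true integer.
  const double p = 2.9999998;
  const int truncated = static_cast<int>(p);  // truncates toward zero: 2 (wrong)
  const long long rounded = std::llrint(p);   // rounds to nearest: 3 (correct)
  std::printf("truncated=%d rounded=%lld\n", truncated, rounded);
  return 0;
}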
