
Commit 446d55b

Fix (#67894)
1 parent d722a24 commit 446d55b


10 files changed: +308 -72 lines changed


paddle/fluid/operators/CMakeLists.txt

Lines changed: 1 addition & 7 deletions
@@ -84,13 +84,7 @@ op_library(quantize_linear_op DEPS phi common)
 op_library(save_combine_op DEPS string_array phi common)
 op_library(load_combine_op DEPS string_array)
 
-if (WITH_GPU OR WITH_ROCM)
-  op_library(activation_op SRCS activation_op.cc soft_relu_op.cu DEPS ${OP_HEADER_DEPS})
-elseif (WITH_XPU_KP)
-  op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS})
-else()
-  op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS})
-endif()
+op_library(activation_op SRCS activation_op.cc DEPS ${OP_HEADER_DEPS})
 
 if (WITH_GPU OR WITH_ROCM)
   op_library(sync_batch_norm_op DEPS phi common)

paddle/fluid/operators/activation_op.cc

Lines changed: 0 additions & 12 deletions
@@ -376,18 +376,6 @@ namespace ops = paddle::operators;
 
 FOR_EACH_ACTIVATION_OP(REGISTER_ACTIVATION_OP);
 
-#define REGISTER_ACTIVATION_CPU_KERNEL(act_type, op_name)                \
-  PD_REGISTER_STRUCT_KERNEL(                                             \
-      act_type, CPU, ALL_LAYOUT, ops::op_name##Kernel, float, double) {} \
-  PD_REGISTER_STRUCT_KERNEL(act_type##_grad,                             \
-                            CPU,                                         \
-                            ALL_LAYOUT,                                  \
-                            ops::op_name##GradKernel,                    \
-                            float,                                       \
-                            double) {}
-
-REGISTER_ACTIVATION_CPU_KERNEL(soft_relu, SoftRelu)
-
 REGISTER_ACTIVATION_OP(mish, Mish, MishFunctor, MishGradFunctor);
 
 /* ========================== register checkpoint ===========================*/
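For orientation (expanded by hand, not present in the diff): the removed REGISTER_ACTIVATION_CPU_KERNEL(soft_relu, SoftRelu) invocation registered the old fluid CPU kernels roughly as follows; the new phi kernel files added below take over these registrations.

    PD_REGISTER_STRUCT_KERNEL(
        soft_relu, CPU, ALL_LAYOUT, ops::SoftReluKernel, float, double) {}
    PD_REGISTER_STRUCT_KERNEL(soft_relu_grad,
                              CPU,
                              ALL_LAYOUT,
                              ops::SoftReluGradKernel,
                              float,
                              double) {}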

paddle/fluid/operators/soft_relu_op.cu

Lines changed: 0 additions & 50 deletions
This file was deleted.
New file · Lines changed: 87 additions & 0 deletions
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>

#include <algorithm>
#include <cmath>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif

#include <type_traits>

#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"

namespace phi {

template <typename T>
struct SoftReluGradFunctor {
  float threshold;
  void SetAttrs(float threshold_) { threshold = threshold_; }

  template <typename Device,
            typename X,
            typename Out,
            typename dOut,
            typename dX>
  void operator()(Device d, X x UNUSED, Out out, dOut dout, dX dx) {
    auto tmp = static_cast<T>(threshold);
    auto temp = ((out > -tmp) * (out < tmp)).template cast<T>();
    dx.device(d) = dout * (static_cast<T>(1) - (-out).exp()) * temp;
  }
};

template <typename T, typename Context>
void SoftmaxGradKernel(const Context& dev_ctx,
                       const DenseTensor& x_in,
                       const DenseTensor& out_in,
                       const DenseTensor& out_grad,
                       float threshold,
                       DenseTensor* x_grad) {
  dev_ctx.template Alloc<T>(x_grad);
  auto dout = phi::EigenVector<T>::Flatten(out_grad);
  auto out = phi::EigenVector<T>::Flatten(out_in);
  auto dx = phi::EigenVector<T>::Flatten(*x_grad);
  auto x = phi::EigenVector<T>::Flatten(x_in);
  auto* eigen_dev = dev_ctx.eigen_device();
  SoftReluGradFunctor<T> functor;
  functor.SetAttrs(threshold);
  // use 32bit index to speed up computation
  bool use_32bit_index = out.size() < Eigen::NumTraits<int>::highest();
  bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU;
  if (use_32bit_index && is_gpu_place) {
    functor(*eigen_dev,
            To32BitIndex(x),
            To32BitIndex(out),
            To32BitIndex(dout),
            To32BitIndex(dx));
  } else {
    functor(*eigen_dev, x, out, dout, dx);
  }
}
} // namespace phi

PD_REGISTER_KERNEL(
    soft_relu_grad, CPU, ALL_LAYOUT, phi::SoftmaxGradKernel, float, double) {}
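For reference, the math implemented by the new functors, read directly off the code (here \theta is the threshold attribute and y the forward output):

    \mathrm{soft\_relu}(x) = \log\!\left(1 + e^{\mathrm{clip}(x,\,-\theta,\,\theta)}\right)

Inside the unclipped band, the chain rule gives

    \frac{dy}{dx} = \frac{e^{x}}{1 + e^{x}} = 1 - e^{-y}

which is the dout * (1 - exp(-out)) expression in SoftReluGradFunctor (and in the CUDA grad functor further below); outside the band the gradient is zeroed, with the band test applied to the forward output out rather than to x.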
New file · Lines changed: 77 additions & 0 deletions
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <glog/logging.h>

#include <algorithm>
#include <cmath>
#include <memory>
#include <string>
#include <unordered_set>
#include <utility>
#include <vector>
#ifndef _USE_MATH_DEFINES
#define _USE_MATH_DEFINES
#endif

#include <type_traits>

#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"

namespace phi {

template <typename T>
struct SoftReluFunctor {
  float threshold;
  void SetAttrs(float threshold_) { threshold = threshold_; }

  template <typename Device, typename X, typename Out>
  void operator()(Device d, X x, Out out) {
    auto tmp = static_cast<T>(threshold);
    auto temp = x.cwiseMax(-tmp).cwiseMin(tmp);
    out.device(d) = (static_cast<T>(1) + temp.exp()).log();
  }
};

template <typename T, typename Context>
void SoftmaxKernel(const Context& dev_ctx,
                   const DenseTensor& x,
                   float threshold,
                   DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);

  auto x_flatten = phi::EigenVector<T>::Flatten(x);
  auto out_flatten = phi::EigenVector<T>::Flatten(*out);
  auto* eigen_dev = dev_ctx.eigen_device();
  SoftReluFunctor<T> functor;
  functor.SetAttrs(threshold);
  // use 32bit index to speed up computation
  bool use_32bit_index = out_flatten.size() < Eigen::NumTraits<int>::highest();
  bool is_gpu_place = dev_ctx.GetPlace().GetType() == phi::AllocationType::GPU;
  if (use_32bit_index && is_gpu_place) {
    functor(*eigen_dev, To32BitIndex(x_flatten), To32BitIndex(out_flatten));
  } else {
    functor(*eigen_dev, x_flatten, out_flatten);
  }
}

} // namespace phi

PD_REGISTER_KERNEL(
    soft_relu, CPU, ALL_LAYOUT, phi::SoftmaxKernel, float, double) {}
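As a quick illustration (not part of the commit), here is a minimal standalone C++ sketch of what SoftReluFunctor computes per element — clamp to [-threshold, threshold], then softplus — checked against a std::clamp/std::log1p reference; the threshold value is only illustrative:

    #include <algorithm>
    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Scalar mirror of SoftReluFunctor: log(1 + exp(clip(x, -threshold, threshold))).
    double soft_relu_ref(double x, double threshold) {
      return std::log1p(std::exp(std::clamp(x, -threshold, threshold)));
    }

    int main() {
      const double threshold = 20.0;  // illustrative attribute value
      for (double x : {-100.0, -1.0, 0.0, 0.5, 3.0, 100.0}) {
        // Same shape as the Eigen expression: cwiseMax/cwiseMin, then log(1 + exp(.)).
        double clipped = std::min(std::max(x, -threshold), threshold);
        double eigen_like = std::log(1.0 + std::exp(clipped));
        assert(std::abs(eigen_like - soft_relu_ref(x, threshold)) < 1e-12);
        std::printf("soft_relu(%g) = %g\n", x, eigen_like);
      }
      return 0;
    }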
New file · Lines changed: 71 additions & 0 deletions
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/funcs/elementwise/elementwise_op_impl.cu.h"

namespace phi {

template <typename T>
struct CudaSoftReluGradFunctor {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold;

  void SetAttrs(float threshold_) { threshold = threshold_; }

  // dx = (out > -threshold && out < threshold) ? dout * (1 - exp(-out)) : 0
  // threshold should not be negative
  __device__ __forceinline__ T operator()(const T arg_dout, const T arg_out) {
    MPType dout = static_cast<MPType>(arg_dout);
    MPType out = static_cast<MPType>(arg_out);
    MPType t = static_cast<MPType>(threshold);
    return (out > -t && out < t) ? static_cast<T>(dout * (one - exp(-out)))
                                 : static_cast<T>(0.0f);
  }
};

template <typename T, typename Context>
void SoftReluGradCudaKernel(const Context& dev_ctx,
                            const DenseTensor& x_in UNUSED,
                            const DenseTensor& out_in,
                            const DenseTensor& out_grad,
                            float threshold,
                            DenseTensor* x_grad) {
  dev_ctx.template Alloc<T>(x_grad);
  CudaSoftReluGradFunctor<T> functor;
  functor.SetAttrs(threshold);

  std::vector<const phi::DenseTensor*> ins = {&out_grad};
  std::vector<phi::DenseTensor*> outs = {x_grad};

  // Only need forward output Out
  ins.push_back(&out_in);
  phi::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
      dev_ctx, ins, &outs, functor);
}
} // namespace phi

PD_REGISTER_KERNEL(soft_relu_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::SoftReluGradCudaKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}
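A hedged numerical sanity check (standalone C++, not part of the commit): inside the unclipped band, the analytic factor both grad functors use, 1 - exp(-out), should match a central finite difference of the forward formula (dout = 1 here; the threshold is illustrative):

    #include <cassert>
    #include <cmath>
    #include <cstdio>

    // Forward reference: log(1 + exp(clip(x, -threshold, threshold))).
    double soft_relu(double x, double threshold) {
      double clipped = std::fmin(std::fmax(x, -threshold), threshold);
      return std::log1p(std::exp(clipped));
    }

    int main() {
      const double threshold = 20.0;  // illustrative attribute value
      const double h = 1e-6;
      for (double x : {-3.0, -0.5, 0.0, 0.7, 2.5}) {  // points inside the band
        double out = soft_relu(x, threshold);
        double analytic = 1.0 - std::exp(-out);  // matches the grad functors
        double numeric =
            (soft_relu(x + h, threshold) - soft_relu(x - h, threshold)) / (2 * h);
        assert(std::abs(analytic - numeric) < 1e-5);
        std::printf("x=%5.2f  analytic=%.6f  numeric=%.6f\n", x, analytic, numeric);
      }
      return 0;
    }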
New file · Lines changed: 65 additions & 0 deletions
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/activation_functor.h"
#include "paddle/phi/kernels/funcs/elementwise/elementwise_op_impl.cu.h"

namespace phi {

template <typename T>
struct CudaSoftReluFunctor {
  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
  MPType one = static_cast<MPType>(1.0f);
  float threshold;

  void SetAttrs(float threshold_) { threshold = threshold_; }

  // soft_relu(x) = log(1 + exp(max(min(x, threshold), -threshold)))
  // threshold should not be negative
  __device__ __forceinline__ T operator()(const T arg_x) {
    MPType x = static_cast<MPType>(arg_x);
    MPType t = static_cast<MPType>(threshold);
    MPType temp_min = x < t ? x : t;
    MPType temp_max = temp_min > -t ? temp_min : -t;
    return static_cast<T>(log(one + exp(temp_max)));
  }
};

template <typename T, typename Context>
void SoftReluCudaKernel(const Context& dev_ctx,
                        const DenseTensor& x,
                        float threshold,
                        DenseTensor* out) {
  dev_ctx.template Alloc<T>(out);
  std::vector<const phi::DenseTensor*> ins = {&x};
  std::vector<phi::DenseTensor*> outs = {out};
  CudaSoftReluFunctor<T> functor;
  functor.SetAttrs(threshold);
  phi::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
      dev_ctx, ins, &outs, functor);
}
} // namespace phi

PD_REGISTER_KERNEL(soft_relu,
                   GPU,
                   ALL_LAYOUT,
                   phi::SoftReluCudaKernel,
                   float,
                   double,
                   phi::dtype::float16,
                   phi::dtype::bfloat16) {}
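One design note: the CUDA functors cast their arguments to MPType before calling exp/log, so the transcendental math runs at higher precision than the storage type when T is float16 or bfloat16 (assuming MPTypeTrait maps those to float, as its use here suggests), and only the final result is narrowed back to T. A rough plain-C++ illustration of the pattern, not part of the commit, with float standing in for the storage type and double for the compute type (names are hypothetical, not Paddle APIs):

    #include <cmath>
    #include <cstdio>

    // Hypothetical illustration: compute in a wider type, store in a narrower one.
    // float plays the role of T (storage), double the role of MPType (compute).
    float soft_relu_mixed(float x_storage, float threshold) {
      double x = static_cast<double>(x_storage);  // widen before transcendentals
      double t = static_cast<double>(threshold);
      double clipped = x < t ? x : t;             // same branchless clamp as the CUDA functor
      clipped = clipped > -t ? clipped : -t;
      return static_cast<float>(std::log(1.0 + std::exp(clipped)));  // narrow at the end
    }

    int main() {
      std::printf("soft_relu_mixed(0.5) = %f\n", soft_relu_mixed(0.5f, 20.0f));
      return 0;
    }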

paddle/phi/ops/yaml/inconsistent/onednn_ops_extra.yaml

Lines changed: 4 additions & 0 deletions
@@ -269,6 +269,10 @@
 
 - op : sigmoid_grad
 
+- op : soft_relu
+
+- op : soft_relu_grad
+
 - op : slice
   extra_args : str mkldnn_data_type="float32"
 
