Commit 8d1354a

[XPU] Add layernorm_relu pass and kernel (PaddlePaddle#68451)

1 parent 8ce0de5 commit 8d1354a

File tree

11 files changed, +442 -1 lines changed

paddle/fluid/framework/ir/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -297,6 +297,8 @@ if(WITH_XPU)
                ${XPU_PASS_DEPS})
   pass_library(add_layernorm_xpu_fuse_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})
+  pass_library(layer_norm_relu_xpu_fuse_pass inference DIR xpu DEPS
+               ${XPU_PASS_DEPS})
   pass_library(xpu_delete_cast_op_pass inference DIR xpu DEPS ${XPU_PASS_DEPS})
   pass_library(fold_interp_outsize_fuse_pass inference DIR xpu DEPS
                ${XPU_PASS_DEPS})

paddle/fluid/framework/ir/pass.cc

Lines changed: 1 addition & 0 deletions
@@ -60,6 +60,7 @@ static const std::vector<std::string> xpu_support_subgraph_passes = {
     "constant_folding_pass",
     "delete_elementwise_mul_op_pass",
     "generate_sequence_xpu_fuse_pass",
+    "layer_norm_relu_xpu_fuse_pass",
     "embedding_with_eltwise_add_xpu_fuse_pass",
    "multi_encoder_xpu_fuse_pass",
    "multi_encoder_xpu_adaptive_seqlen_fuse_pass",
paddle/fluid/framework/ir/xpu/layer_norm_relu_xpu_fuse_pass.cc

Lines changed: 214 additions & 0 deletions

@@ -0,0 +1,214 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+
+#include "glog/logging.h"
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+#include "paddle/fluid/framework/ir/pass.h"
+#include "paddle/fluid/framework/ir/xpu/pass_utils.h"
+#include "paddle/fluid/framework/ir/xpu/quant_utils.h"
+#include "paddle/fluid/framework/op_version_registry.h"
+#include "paddle/fluid/platform/enforce.h"
+
+namespace phi {
+class DenseTensor;
+}  // namespace phi
+
+namespace paddle {
+namespace framework {
+class Scope;
+}  // namespace framework
+}  // namespace paddle
+
+namespace paddle {
+namespace framework {
+namespace ir {
+namespace patterns {
+
+/*
+Fuse a layer_norm + relu block into a single layer_norm_relu_xpu op.
+For example:
+graph:
+                    X
+      Scale         |         Bias
+           \        |        /
+             layer_norm
+           /        |        \
+          /         |         \
+   variance         |          mean
+                    |
+                  relu
+                    |
+                 output
+------------------------------------------------------
+After the pass is applied:
+                    X
+      Scale         |         Bias
+           \        |        /
+        layer_norm_relu_xpu
+                    |
+                   Out
+*/
+struct LayerNormalizeReluXPUPattern : public PatternBase {
+  LayerNormalizeReluXPUPattern(PDPattern* pattern,
+                               const std::string& name_scope);
+  // declare operator node's name
+  PATTERN_DECL_NODE(ln);
+  PATTERN_DECL_NODE(relu);
+  // declare variable node's name
+  PATTERN_DECL_NODE(ln_x);
+  PATTERN_DECL_NODE(ln_bias);
+  PATTERN_DECL_NODE(ln_scale);
+  PATTERN_DECL_NODE(ln_y);
+  PATTERN_DECL_NODE(ln_mean);
+  PATTERN_DECL_NODE(ln_variance);
+  PATTERN_DECL_NODE(relu_out);
+};
+
+LayerNormalizeReluXPUPattern::LayerNormalizeReluXPUPattern(
+    PDPattern* pattern, const std::string& name_scope)
+    : PatternBase(pattern, name_scope, name_scope) {
+  auto ln = pattern->NewNode(ln_repr())->assert_is_op("layer_norm");
+  auto ln_x = pattern->NewNode(ln_x_repr())
+                  ->assert_is_op_input("layer_norm", "X")
+                  ->AsInput();
+  auto ln_bias = pattern->NewNode(ln_bias_repr())
+                     ->assert_is_op_input("layer_norm", "Bias")
+                     ->assert_is_persistable_var()
+                     ->AsInput();
+  auto ln_scale = pattern->NewNode(ln_scale_repr())
+                      ->assert_is_op_input("layer_norm", "Scale")
+                      ->assert_is_persistable_var()
+                      ->AsInput();
+  auto ln_y = pattern->NewNode(ln_y_repr())
+                  ->assert_is_op_output("layer_norm", "Y")
+                  ->assert_is_op_input("relu", "X")
+                  ->assert_has_n_outputs(1);
+  auto ln_mean = pattern->NewNode(ln_mean_repr())
+                     ->assert_is_op_output("layer_norm", "Mean")
+                     ->assert_has_n_outputs(0);
+  auto ln_variance = pattern->NewNode(ln_variance_repr())
+                         ->assert_is_op_output("layer_norm", "Variance")
+                         ->assert_has_n_outputs(0);
+  ln->LinksFrom({ln_x, ln_bias, ln_scale})
+      .LinksTo({ln_y, ln_mean, ln_variance});
+
+  auto relu = pattern->NewNode(relu_repr())->assert_is_op("relu");
+  auto relu_out = pattern->NewNode(relu_out_repr())
+                      ->AsOutput()
+                      ->assert_is_op_output("relu", "Out");
+  relu->LinksFrom({ln_y}).LinksTo({relu_out});
+}
+
+}  // namespace patterns
+
+class LayerNormalizeReluXPUFusePass : public FusePassBase {
+ protected:
+  void ApplyImpl(ir::Graph* graph) const override;
+
+ private:
+  void FuseLayerNormalizeRelu(ir::Graph* graph) const;
+
+  const std::string name_scope_{"layer_norm_relu_xpu_fuse_pass"};
+};
+
+void LayerNormalizeReluXPUFusePass::ApplyImpl(ir::Graph* graph) const {
+  PADDLE_ENFORCE_NOT_NULL(
+      graph, common::errors::PreconditionNotMet("graph should not be null."));
+  Init(name_scope_, graph);
+  auto* dev_ctx = static_cast<phi::CPUContext*>(
+      phi::DeviceContextPool::Instance().Get(phi::XPUPlace()));
+  auto version =
+      phi::backends::xpu::get_xpu_version(dev_ctx->GetPlace().GetDeviceId());
+  if (version == phi::backends::xpu::XPUVersion::XPU2) {
+    FuseLayerNormalizeRelu(graph);
+  }
+}
+
+void LayerNormalizeReluXPUFusePass::FuseLayerNormalizeRelu(
+    ir::Graph* graph) const {
+  GraphPatternDetector gpd;
+  patterns::LayerNormalizeReluXPUPattern pattern(gpd.mutable_pattern(),
+                                                 name_scope_);
+
+  int found_subgraph_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* graph) {
+    VLOG(4) << "handle LayerNormalizeReluXPUFusePass fuse";
+    // operator nodes
+    GET_IR_NODE(ln);
+    GET_IR_NODE(relu);
+    // variable nodes
+    GET_IR_NODE(ln_x);
+    GET_IR_NODE(ln_bias);
+    GET_IR_NODE(ln_scale);
+    GET_IR_NODE(ln_y);
+    GET_IR_NODE(ln_mean);
+    GET_IR_NODE(ln_variance);
+    GET_IR_NODE(relu_out);
+
+    auto* block = ln->Op()->Block();
+    auto* scope = param_scope();
+    PADDLE_ENFORCE_NOT_NULL(
+        scope, common::errors::InvalidArgument("Scope cannot be nullptr."));
+    // nodes that become useless after fusion and should be removed
+    std::unordered_set<const Node*> delete_nodes;
+
+    float eps = PADDLE_GET_CONST(float, ln->Op()->GetAttr("epsilon"));
+    int begin_norm_axis =
+        PADDLE_GET_CONST(int, ln->Op()->GetAttr("begin_norm_axis"));
+
+    std::string fused_op_out_name;
+    fused_op_out_name = relu_out->Name();
+    // Generate the layer_norm_relu_xpu fused op
+    framework::OpDesc fused_op_desc(block);
+
+    fused_op_desc.SetType("layer_norm_relu_xpu");
+    // set attrs for fused op
+    fused_op_desc.SetAttr("begin_norm_axis", begin_norm_axis);
+    fused_op_desc.SetInput("x", {ln_x->Name()});
+    fused_op_desc.SetInput("bias", {ln_bias->Name()});
+    fused_op_desc.SetInput("scale", {ln_scale->Name()});
+    fused_op_desc.SetAttr("epsilon", eps);
+    fused_op_desc.SetOutput("out", {fused_op_out_name});
+    // relink fused op
+    auto* fused_op = graph->CreateOpNode(&fused_op_desc);
+    IR_NODE_LINK_TO(ln_x, fused_op);
+    IR_NODE_LINK_TO(ln_bias, fused_op);
+    IR_NODE_LINK_TO(ln_scale, fused_op);
+    IR_NODE_LINK_TO(fused_op, relu_out);
+
+    delete_nodes.insert({ln, relu, ln_y, ln_mean, ln_variance});
+    GraphSafeRemoveNodes(graph, delete_nodes);
+    found_subgraph_count++;
+  };
+
+  gpd(graph, handler);
+  AddStatis(found_subgraph_count);
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(layer_norm_relu_xpu_fuse_pass,
+              paddle::framework::ir::LayerNormalizeReluXPUFusePass);
+
+REGISTER_PASS_CAPABILITY(layer_norm_relu_xpu_fuse_pass)
+    .AddCombination(
+        paddle::framework::compatible::OpVersionComparatorCombination().EQ(
+            "layer_norm_relu_xpu", 0));

paddle/fluid/inference/api/paddle_pass_builder.cc

Lines changed: 1 addition & 0 deletions
@@ -535,6 +535,7 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) {
       "cast_embedding_trans_ids_to_int32_pass",
       "delete_elementwise_mul_op_pass",
       "generate_sequence_xpu_fuse_pass",
+      "layer_norm_relu_xpu_fuse_pass",
       "embedding_with_eltwise_add_xpu_fuse_pass",
       "multi_encoder_xpu_fuse_pass",
       "multi_encoder_xpu_adaptive_seqlen_fuse_pass",

paddle/phi/api/yaml/fused_ops.yaml

Lines changed: 10 additions & 0 deletions
@@ -371,6 +371,16 @@
     func : layer_norm_act_xpu
     data_type : x
 
+- op : layer_norm_relu_xpu
+  args : (Tensor x, Tensor scale, Tensor bias, int begin_norm_axis, float epsilon = 1e-5)
+  output : Tensor(out)
+  infer_meta :
+    func : LayerNormalizeReluXPUInferMeta
+  kernel :
+    func : layer_norm_relu_xpu
+    data_type : x
+  optional : scale, bias
+
 - op : multi_encoder_xpu
   args : (Tensor x, Tensor[] fc_weight, Tensor[] fc_weight_max, Tensor[] fc_bias, Tensor[] ln_scale, Tensor[] ln_bias, Tensor mask, Tensor seq_lod, Tensor max_seq_len, int layer_num, bool norm_before, int hidden_dim, int head_num, int size_per_head, int ffn_hidden_dim_scale, int act_type, int relative_type, int slice_idx)
   output : Tensor(out), Tensor(x_fp16), Tensor(out_fp16)
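
From this yaml entry, Paddle's op/API code generator is expected to emit a C++ signature along these lines; the exact header and namespace are generator details, shown here only as a sketch:

// Sketch of the generated C++ API (namespace and header are assumptions).
// scale and bias are optional per the `optional : scale, bias` line above.
paddle::Tensor layer_norm_relu_xpu(
    const paddle::Tensor& x,
    const paddle::optional<paddle::Tensor>& scale,
    const paddle::optional<paddle::Tensor>& bias,
    int begin_norm_axis,
    float epsilon = 1e-5f);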

paddle/phi/backends/xpu/xpu2_op_list.cc

Lines changed: 2 additions & 0 deletions
@@ -501,6 +501,8 @@ XPUOpMap& get_kl2_ops() {
                           phi::DataType::FLOAT32})},
       {"grid_sampler_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"grid_sampler", XPUKernelSet({phi::DataType::FLOAT32})},
+      {"layer_norm_relu_xpu",
+       XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
       {"hard_sigmoid_grad", XPUKernelSet({phi::DataType::FLOAT32})},
       {"hard_sigmoid",
        XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},

paddle/phi/infermeta/fusion.cc

Lines changed: 12 additions & 0 deletions
@@ -116,6 +116,18 @@ void AddLayernormXPUInferMeta(const MetaTensor& x,
   out->share_lod(x);
 }
 
+void LayerNormalizeReluXPUInferMeta(const MetaTensor& x,
+                                    const MetaTensor& scale,
+                                    const MetaTensor& bias,
+                                    int begin_norm_axis,
+                                    float epsilon,
+                                    MetaTensor* out) {
+  out->set_dims(x.dims());
+  // out->share_lod(x);
+  out->set_dtype(x.dtype());
+  out->set_layout(x.layout());
+}
+
 void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv,
                                       const MetaTensor& key_cache,
                                       const MetaTensor& value_cache,
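
The infer-meta contract is shape-preserving; a worked example of what it implies:

// Worked example (illustrative values):
//   x.dims() = [2, 128, 768], begin_norm_axis = 2
//   => out.dims() = [2, 128, 768]; dtype and layout mirror x.
// begin_norm_axis never changes the output shape: it only tells the kernel
// where to flatten x into a [2 * 128, 768] matrix whose rows are normalized.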

paddle/phi/infermeta/fusion.h

Lines changed: 7 additions & 0 deletions
@@ -38,6 +38,13 @@ void AddLayernormXPUInferMeta(const MetaTensor& x,
                               float epsilon,
                               MetaTensor* out);
 
+void LayerNormalizeReluXPUInferMeta(const MetaTensor& x,
+                                    const MetaTensor& scale,
+                                    const MetaTensor& bias,
+                                    int begin_norm_axis,
+                                    float epsilon,
+                                    MetaTensor* out);
+
 void BlockMultiheadAttentionInferMeta(const MetaTensor& qkv,
                                       const MetaTensor& key_cache,
                                       const MetaTensor& value_cache,
paddle/phi/kernels/fusion/xpu/layer_norm_relu_xpu_kernel.cc

Lines changed: 99 additions & 0 deletions

@@ -0,0 +1,99 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/backends/xpu/enforce_xpu.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/funcs/norm_utils.h"
+
+namespace phi {
+namespace fusion {
+
+template <typename T, typename Context>
+void LayerNormalizeReluXPUKernel(const Context& ctx,
+                                 const DenseTensor& x,
+                                 const paddle::optional<DenseTensor>& scale,
+                                 const paddle::optional<DenseTensor>& bias,
+                                 int begin_norm_axis,
+                                 float epsilon,
+                                 DenseTensor* y) {
+  using XPUType = typename XPUTypeTrait<T>::Type;
+  const auto& x_dims = x.dims();
+  auto matrix_dim = common::flatten_to_2d(x_dims, begin_norm_axis);
+  int left = static_cast<int>(matrix_dim[0]);
+  int right = static_cast<int>(matrix_dim[1]);
+  const auto* x_data = x.data<T>();
+
+  xpu::ctx_guard RAII_GUARD(ctx.x_context());
+
+  // scale: the fused XDNN call expects fp32, so cast an fp16 scale up
+  const float* scale_data_fp32 = nullptr;
+  const auto* scale_ptr = scale.get_ptr();
+  if (scale_ptr == nullptr) {
+    // no scale, do nothing
+  } else if (scale_ptr->dtype() == phi::DataType::FLOAT16) {
+    float* scale_data_temp =
+        RAII_GUARD.alloc_l3_or_gm<float>(scale_ptr->numel());
+    int r = xpu::cast<XPUType, float>(
+        ctx.x_context(),
+        reinterpret_cast<const XPUType*>(scale_ptr->data<T>()),
+        scale_data_temp,
+        scale_ptr->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    scale_data_fp32 = scale_data_temp;
+  } else {
+    // already fp32, no need to cast
+    scale_data_fp32 = scale_ptr->data<float>();
+  }
+
+  // bias: same dtype handling as scale
+  const float* bias_data_fp32 = nullptr;
+  const auto* bias_ptr = bias.get_ptr();
+  if (bias_ptr == nullptr) {
+    // no bias, do nothing
+  } else if (bias_ptr->dtype() == phi::DataType::FLOAT16) {
+    float* bias_data_temp = RAII_GUARD.alloc_l3_or_gm<float>(bias_ptr->numel());
+    int r = xpu::cast<XPUType, float>(
+        ctx.x_context(),
+        reinterpret_cast<const XPUType*>(bias_ptr->data<T>()),
+        bias_data_temp,
+        bias_ptr->numel());
+    PADDLE_ENFORCE_XDNN_SUCCESS(r, "cast");
+    bias_data_fp32 = bias_data_temp;
+  } else {
+    // already fp32, no need to cast
+    bias_data_fp32 = bias_ptr->data<float>();
+  }
+
+  auto* out_data = ctx.template Alloc<T>(y);
+
+  int r = xpu::layer_norm_relu_fusion(ctx.x_context(),
+                                      reinterpret_cast<const XPUType*>(x_data),
+                                      reinterpret_cast<XPUType*>(out_data),
+                                      left,
+                                      right,
+                                      epsilon,
+                                      scale_data_fp32,
+                                      bias_data_fp32);
+  PADDLE_ENFORCE_XDNN_SUCCESS(r, "layer_norm_relu_fusion");
+}
+
+}  // namespace fusion
+}  // namespace phi
+
+PD_REGISTER_KERNEL(layer_norm_relu_xpu,
+                   XPU,
+                   ALL_LAYOUT,
+                   phi::fusion::LayerNormalizeReluXPUKernel,
+                   float,
+                   phi::dtype::float16) {}
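
For readers without the XDNN sources, the fused call computes, per row of the flattened [left, right] matrix, layer normalization followed by relu. A plain-C++ reference of the intended semantics (a sketch, not the XDNN implementation):

#include <cmath>

// Reference semantics of xpu::layer_norm_relu_fusion above (fp32 only):
// normalize each of the `left` rows of length `right`, apply the optional
// per-column scale/bias, then clamp negatives to zero (the fused relu).
void layer_norm_relu_reference(const float* x, float* out,
                               int left, int right, float epsilon,
                               const float* scale, const float* bias) {
  for (int i = 0; i < left; ++i) {
    const float* row = x + i * right;
    float mean = 0.f, var = 0.f;
    for (int j = 0; j < right; ++j) mean += row[j];
    mean /= right;
    for (int j = 0; j < right; ++j) var += (row[j] - mean) * (row[j] - mean);
    var /= right;
    const float inv_std = 1.f / std::sqrt(var + epsilon);
    for (int j = 0; j < right; ++j) {
      float v = (row[j] - mean) * inv_std;
      if (scale != nullptr) v *= scale[j];
      if (bias != nullptr) v += bias[j];
      out[i * right + j] = v > 0.f ? v : 0.f;  // fused relu
    }
  }
}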
