Skip to content

Commit 57d95d1

Browse files
authored
[XPU] bind Addmm (#68560)

* [XPU] bind Addmm
* fix
1 parent b7624cf commit 57d95d1

File tree

10 files changed

+637
-19
lines changed

10 files changed

+637
-19
lines changed

paddle/phi/backends/xpu/xpu2_op_list.cc

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,9 @@ XPUOpMap& get_kl2_ops() {
4646
{"adagrad", XPUKernelSet({phi::DataType::FLOAT32})},
4747
{"addcmul_xpu",
4848
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
49+
{"addmm", XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
50+
{"addmm_grad",
51+
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
4952
{"arange_tensor",
5053
XPUKernelSet({phi::DataType::FLOAT32,
5154
phi::DataType::INT32,

paddle/phi/backends/xpu/xpu3_op_list.cc

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,14 @@ XPUOpMap& get_kl3_ops() {
4040
{"adagrad", XPUKernelSet({phi::DataType::FLOAT32})},
4141
{"addcmul_xpu",
4242
XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})},
43+
{"addmm",
44+
XPUKernelSet({phi::DataType::FLOAT32,
45+
phi::DataType::FLOAT16,
46+
phi::DataType::BFLOAT16})},
47+
{"addmm_grad",
48+
XPUKernelSet({phi::DataType::FLOAT32,
49+
phi::DataType::FLOAT16,
50+
phi::DataType::BFLOAT16})},
4351
{"arange_tensor",
4452
XPUKernelSet({phi::DataType::FLOAT32,
4553
phi::DataType::INT32,

paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -246,10 +246,10 @@ void FFNGrad(const phi::XPUContext& dev_ctx,
246246
}
247247

248248
phi::MatMulXPUFunction<XPUTypeT>(
249-
xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, true);
249+
xpu_ctx, a_1, b_1, c_1, info_d_dropout1, 1.0f, 0.f, true);
250250

251251
phi::MatMulXPUFunction<XPUTypeT>(
252-
xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, true);
252+
xpu_ctx, a_2, b_2, c_2, info_dw2, 1.0f, 0.f, true);
253253

254254
// dropout_grad1
255255
DropoutGrad(xpu_ctx,
@@ -335,10 +335,11 @@ void FFNGrad(const phi::XPUContext& dev_ctx,
335335

336336
std::tie(info_dx, info_dw1, a_1, b_1, a_2, b_2) = fc_info;
337337

338-
phi::MatMulXPUFunction<XPUTypeT>(xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, true);
338+
phi::MatMulXPUFunction<XPUTypeT>(
339+
xpu_ctx, a_1, b_1, c_1, info_dx, 1.0f, 0.f, true);
339340

340341
phi::MatMulXPUFunction<XPUTypeT>(
341-
xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, true);
342+
xpu_ctx, a_2, b_2, c_2, info_dw1, 1.0f, 0.f, true);
342343

343344
if (pre_layer_norm) {
344345
r = xpu::layer_norm_grad(xpu_ctx,

paddle/phi/kernels/fusion/xpu/fused_gemm_epilogue_kernel.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ void FusedGemmEpilogueKernel(const Context& dev_ctx,
7373
"FusedGemm do not support batched fc now, but got batch size %d.",
7474
batch_size));
7575
MatMulXPUFunction<XPUType>(
76-
xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f, false, act);
76+
xpu_ctx, x_ptr, y_ptr, out_ptr, fc_info, 1.0f, 0.f, false, act);
7777
}
7878

7979
} // namespace fusion
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/kernels/addmm_grad_kernel.h"
16+
#include "paddle/phi/backends/xpu/enforce_xpu.h"
17+
#include "paddle/phi/backends/xpu/xpu_context.h"
18+
#include "paddle/phi/core/kernel_registry.h"
19+
#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
20+
21+
namespace phi {
22+
23+
template <typename T, typename Context>
24+
void AddmmGradKernel(const Context& dev_ctx,
25+
const DenseTensor& input,
26+
const DenseTensor& x,
27+
const DenseTensor& y,
28+
const DenseTensor& out_grad,
29+
float alpha,
30+
float beta,
31+
DenseTensor* input_grad,
32+
DenseTensor* x_grad,
33+
DenseTensor* y_grad) {
34+
using XPUType = typename XPUTypeTrait<T>::Type;
35+
36+
xpu::Context* xpu_ctx = dev_ctx.x_context();
37+
xpu::ctx_guard RAII_GUARD(xpu_ctx);
38+
int r;
39+
40+
if (input_grad) {
41+
dev_ctx.template Alloc<T>(input_grad);
42+
XPUType* input_grad_ptr = reinterpret_cast<XPUType*>(input_grad->data<T>());
43+
r = xpu::constant(xpu_ctx, input_grad_ptr, input.numel(), (XPUType)(beta));
44+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
45+
if (input_grad->dims().size() == 1 && out_grad.dims()[0] > 1) {
46+
r = xpu::scale<XPUType>(xpu_ctx,
47+
input_grad_ptr,
48+
input_grad_ptr,
49+
input_grad->numel(),
50+
true,
51+
static_cast<float>(out_grad.dims()[0]),
52+
0.f);
53+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "scale");
54+
}
55+
}
56+
if (x_grad) {
57+
dev_ctx.template Alloc<T>(x_grad);
58+
}
59+
if (y_grad) {
60+
dev_ctx.template Alloc<T>(y_grad);
61+
}
62+
63+
const XPUType* out_grad_ptr =
64+
reinterpret_cast<const XPUType*>(out_grad.data<T>());
65+
const XPUType* x_ptr = reinterpret_cast<const XPUType*>(x.data<T>());
66+
const XPUType* y_ptr = reinterpret_cast<const XPUType*>(y.data<T>());
67+
68+
XpuFcInfo info_forward;
69+
GetFCInfo(x.dims(), y.dims(), false, false, &info_forward);
70+
// begin calculate
71+
const XPUType* a_1 = nullptr;
72+
const XPUType* b_1 = nullptr;
73+
const XPUType* a_2 = nullptr;
74+
const XPUType* b_2 = nullptr;
75+
XPUType* c_1 = reinterpret_cast<XPUType*>(x_grad->data<T>());
76+
XPUType* c_2 = reinterpret_cast<XPUType*>(y_grad->data<T>());
77+
78+
if (x_grad && info_forward.is_x_need_broadcast) {
79+
c_1 = RAII_GUARD.alloc_l3_or_gm<XPUType>(info_forward.bs * info_forward.m *
80+
info_forward.k);
81+
PADDLE_ENFORCE_XDNN_NOT_NULL(c_1);
82+
}
83+
84+
if (y_grad && info_forward.is_y_need_broadcast) {
85+
c_2 = RAII_GUARD.alloc_l3_or_gm<XPUType>(info_forward.bs * info_forward.k *
86+
info_forward.n);
87+
PADDLE_ENFORCE_XDNN_NOT_NULL(c_2);
88+
}
89+
90+
XpuFcInfo info_x_grad;
91+
XpuFcInfo info_y_grad;
92+
std::tuple<XpuFcInfo,
93+
XpuFcInfo,
94+
const XPUType*,
95+
const XPUType*,
96+
const XPUType*,
97+
const XPUType*>
98+
fc_info = MatmulGradFcInfo(xpu_ctx,
99+
&RAII_GUARD,
100+
info_forward,
101+
false,
102+
false,
103+
x_ptr,
104+
y_ptr,
105+
out_grad_ptr);
106+
std::tie(info_x_grad, info_y_grad, a_1, b_1, a_2, b_2) = fc_info;
107+
if (x_grad) {
108+
MatMulXPUFunction<XPUType>(xpu_ctx, a_1, b_1, c_1, info_x_grad, alpha, 0.f);
109+
if (info_forward.is_x_need_broadcast) {
110+
r = xpu::reduce_sum<XPUType>(
111+
xpu_ctx,
112+
c_1,
113+
reinterpret_cast<XPUType*>(x_grad->data<T>()),
114+
{info_forward.bs, info_forward.m, info_forward.k},
115+
{0});
116+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
117+
}
118+
}
119+
if (y_grad) {
120+
MatMulXPUFunction<XPUType>(xpu_ctx, a_2, b_2, c_2, info_y_grad, alpha, 0.f);
121+
if (info_forward.is_y_need_broadcast) {
122+
r = xpu::reduce_sum<XPUType>(
123+
xpu_ctx,
124+
c_2,
125+
reinterpret_cast<XPUType*>(y_grad->data<T>()),
126+
{info_forward.bs, info_forward.k, info_forward.n},
127+
{0});
128+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
129+
}
130+
}
131+
}
132+
} // namespace phi
133+
134+
PD_REGISTER_KERNEL(addmm_grad,
135+
XPU,
136+
ALL_LAYOUT,
137+
phi::AddmmGradKernel,
138+
float,
139+
phi::dtype::bfloat16,
140+
phi::dtype::float16) {}
Lines changed: 170 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,170 @@
1+
// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2+
//
3+
// Licensed under the Apache License, Version 2.0 (the "License");
4+
// you may not use this file except in compliance with the License.
5+
// You may obtain a copy of the License at
6+
//
7+
// http://www.apache.org/licenses/LICENSE-2.0
8+
//
9+
// Unless required by applicable law or agreed to in writing, software
10+
// distributed under the License is distributed on an "AS IS" BASIS,
11+
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
// See the License for the specific language governing permissions and
13+
// limitations under the License.
14+
15+
#include "paddle/phi/kernels/addmm_kernel.h"
16+
#include "paddle/phi/backends/xpu/enforce_xpu.h"
17+
#include "paddle/phi/backends/xpu/xpu_context.h"
18+
#include "paddle/phi/core/kernel_registry.h"
19+
#include "xblas/cublasLt.h"
20+
21+
#ifndef PADDLE_WITH_XPU_XRE5
22+
#include "paddle/phi/kernels/xpu/xpu_api_wrapper.h"
23+
#endif
24+
25+
namespace xblas = baidu::xpu::xblas;
26+
27+
namespace phi {
28+
29+
template <typename T, typename Context>
30+
void AddmmKernel(const Context& dev_ctx,
31+
const DenseTensor& input,
32+
const DenseTensor& x,
33+
const DenseTensor& y,
34+
float beta,
35+
float alpha,
36+
DenseTensor* out) {
37+
using XPUType = typename XPUTypeTrait<T>::Type;
38+
39+
auto input_dims = input.dims();
40+
auto x_dims = x.dims();
41+
auto y_dims = y.dims();
42+
PADDLE_ENFORCE_EQ(
43+
input_dims.size() == 2 || input_dims.size() == 1,
44+
true,
45+
common::errors::InvalidArgument(
46+
"Variable 'input' of AddmmOp must be 1-dimensional or 2-dimensional, "
47+
"but received shape: [%s]",
48+
input_dims));
49+
PADDLE_ENFORCE_EQ(x_dims.size() == 2,
50+
true,
51+
common::errors::InvalidArgument(
52+
"Variable 'x' of AddmmOp must be 2-dimensional, "
53+
"but received shape: [%s]",
54+
input_dims));
55+
PADDLE_ENFORCE_EQ(y_dims.size() == 2,
56+
true,
57+
common::errors::InvalidArgument(
58+
"Variable 'y' of AddmmOp must be 2-dimensional, "
59+
"but received shape: [%s]",
60+
input_dims));
61+
62+
dev_ctx.template Alloc<T>(out);
63+
const XPUType* x_ptr = reinterpret_cast<const XPUType*>(x.data<T>());
64+
const XPUType* y_ptr = reinterpret_cast<const XPUType*>(y.data<T>());
65+
const XPUType* input_ptr = reinterpret_cast<const XPUType*>(input.data<T>());
66+
XPUType* out_ptr = reinterpret_cast<XPUType*>(out->data<T>());
67+
68+
int r;
69+
if (alpha == 0.f) {
70+
if (beta == 0.f) {
71+
r = xpu::constant(dev_ctx.x_context(),
72+
out_ptr,
73+
out->numel(),
74+
static_cast<XPUType>(0.0f));
75+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
76+
} else {
77+
xpu::ctx_guard RAII_GUARD(dev_ctx.x_context());
78+
T* beta_xpu = RAII_GUARD.alloc_l3_or_gm<T>(1);
79+
r = xpu::constant(dev_ctx.x_context(),
80+
reinterpret_cast<XPUType*>(beta_xpu),
81+
out->numel(),
82+
static_cast<XPUType>(beta));
83+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
84+
auto input_dims_vec = common::vectorize<int64_t>(input.dims());
85+
auto out_dims_vec = common::vectorize<int64_t>(out->dims());
86+
r = xpu::broadcast_mul<XPUType>(dev_ctx.x_context(),
87+
input_ptr,
88+
reinterpret_cast<XPUType*>(beta_xpu),
89+
out_ptr,
90+
input_dims_vec,
91+
out_dims_vec);
92+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_mul");
93+
}
94+
#ifdef PADDLE_WITH_XPU_XRE5
95+
} else {
96+
xblas::FcFusionTensor<const XPUType> t_input{
97+
input_ptr,
98+
nullptr,
99+
input.dims()[0],
100+
input.dims()[1],
101+
input.dims()[1],
102+
false,
103+
};
104+
xblas::FcFusionTensor<const XPUType> t_x{
105+
x_ptr,
106+
nullptr,
107+
x.dims()[0],
108+
x.dims()[1],
109+
x.dims()[1],
110+
false,
111+
};
112+
xblas::FcFusionTensor<const XPUType> t_y{
113+
y_ptr,
114+
nullptr,
115+
y.dims()[0],
116+
y.dims()[1],
117+
y.dims()[1],
118+
false,
119+
};
120+
xblas::FcFusionTensor<XPUType> t_out{
121+
out_ptr,
122+
nullptr,
123+
out->dims()[0],
124+
out->dims()[1],
125+
out->dims()[1],
126+
false,
127+
};
128+
xblas::FcFusionDesc<float, float, XPUType> desc{
129+
alpha,
130+
beta,
131+
};
132+
xblas::FcFusionEpilogue<float, float> epilogue{
133+
xdnn::Activation_t::LINEAR,
134+
nullptr,
135+
nullptr,
136+
nullptr,
137+
0,
138+
0,
139+
nullptr,
140+
};
141+
r = xblas::fc_fusion<XPUType,
142+
XPUType,
143+
XPUType,
144+
XPUType,
145+
float,
146+
float,
147+
XPUType,
148+
float,
149+
float>(
150+
dev_ctx.x_context(), t_x, t_y, t_input, t_out, desc, epilogue);
151+
PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_fusion");
152+
#else
153+
} else {
154+
Copy(dev_ctx, input, dev_ctx.GetPlace(), false, out);
155+
XpuFcInfo fc_info;
156+
GetFCInfo(x_dims, y_dims, false, false, &fc_info);
157+
MatMulXPUFunction<XPUType>(
158+
dev_ctx.x_context(), x_ptr, y_ptr, out_ptr, fc_info, alpha, beta);
159+
#endif
160+
}
161+
}
162+
} // namespace phi
163+
164+
PD_REGISTER_KERNEL(addmm,
165+
XPU,
166+
ALL_LAYOUT,
167+
phi::AddmmKernel,
168+
float,
169+
phi::dtype::bfloat16,
170+
phi::dtype::float16) {}

0 commit comments

Comments
 (0)