
Commit aba4dbe

[XPU] reduce_xxx and broadcast_xxx use int64_t shape (PaddlePaddle#71361)
Parent: b0df879


53 files changed: +266 additions, −246 deletions
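The commit message only names the signature change, but the likely motivation (my inference, not stated in the commit) is integer overflow: shape arithmetic carried in 32-bit ints breaks once a dimension or element count exceeds 2^31 − 1, which int64_t shape vectors avoid. A minimal standalone C++ sketch of that failure mode, independent of the Paddle/XDNN code:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> shape = {3000000000LL, 4};  // first extent > INT32_MAX
  int64_t numel = 1;
  for (int64_t d : shape) numel *= d;  // 12,000,000,000: safe in 64 bits
  std::cout << "numel as int64_t: " << numel << "\n";
  // The same count narrowed to 32 bits wraps (modulo semantics since C++20,
  // implementation-defined before that) and comes out wrong.
  std::cout << "numel as int32_t: " << static_cast<int32_t>(numel) << "\n";
  return 0;
}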

paddle/phi/kernels/funcs/fused_gemm_epilogue_xpu.h

Lines changed: 5 additions & 2 deletions
@@ -143,8 +143,11 @@ void ComputeFusedGemmEpilogueBackwardXPU(const phi::XPUContext& dev_ctx,
     XPUType* dbias_ptr;
     auto* dbias_tmp_ptr = dev_ctx.template Alloc<T>(dbias);
     dbias_ptr = reinterpret_cast<XPUType*>(dbias_tmp_ptr);
-    r = xpu::reduce_sum(
-        xpu_ctx, dout_fc_ptr, dbias_ptr, {info_forward.m, info_forward.n}, {0});
+    r = xpu::reduce_sum(xpu_ctx,
+                        dout_fc_ptr,
+                        dbias_ptr,
+                        {(int64_t)info_forward.m, (int64_t)info_forward.n},
+                        {0LL});
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
   }
 }
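For readers outside the Paddle tree, here is a self-contained sketch of the updated call shape. reduce_sum_stub is a hypothetical stand-in, not the real xpu::reduce_sum (which also takes an xpu::Context* and XPU device pointers); it only mirrors how the call site now widens each extent and writes the reduce axis as 0LL:

#include <cstdint>
#include <vector>

// Hypothetical stand-in for the post-change xpu::reduce_sum signature;
// the real XDNN call returns an error code checked by
// PADDLE_ENFORCE_XDNN_SUCCESS.
int reduce_sum_stub(const float* /*x*/, float* /*y*/,
                    const std::vector<int64_t>& xshape,
                    const std::vector<int64_t>& reduce_dims) {
  return reduce_dims.size() <= xshape.size() ? 0 : -1;
}

int main() {
  int m = 4096, n = 1024;  // 32-bit extents, standing in for info_forward.m/n
  std::vector<float> x(static_cast<size_t>(m) * n, 1.0f), y(n, 0.0f);
  // Widen each extent explicitly when building the shape list, and write
  // the axis literal as 0LL, mirroring the commit's call sites.
  return reduce_sum_stub(x.data(), y.data(),
                         {(int64_t)m, (int64_t)n}, {0LL});
}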

paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc

Lines changed: 7 additions & 4 deletions
@@ -190,8 +190,11 @@ void FFNGrad(const phi::XPUContext& dev_ctx,
                 dropout_param2,
                 bsz_seq * d_model);
   // linear_grad2
-  r = xpu::reduce_sum(
-      xpu_ctx, d_dropout2_out_ptr, d_linear2_bias_ptr, {bsz_seq, d_model}, {0});
+  r = xpu::reduce_sum(xpu_ctx,
+                      d_dropout2_out_ptr,
+                      d_linear2_bias_ptr,
+                      {(int64_t)bsz_seq, (int64_t)d_model},
+                      {0LL});
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");

   phi::XpuFcInfo linear2_fc_info;
@@ -285,8 +288,8 @@ void FFNGrad(const phi::XPUContext& dev_ctx,
   r = xpu::reduce_sum(xpu_ctx,
                       d_act_out_ptr,
                       d_linear1_bias_ptr,
-                      {bsz_seq, dim_feedforward},
-                      {0});
+                      {(int64_t)bsz_seq, (int64_t)dim_feedforward},
+                      {0LL});
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");

   phi::XpuFcInfo linear1_fc_info;

paddle/phi/kernels/legacy/xpu/compare_kernel.cc

Lines changed: 17 additions & 16 deletions
@@ -23,24 +23,25 @@
 namespace phi {

 template <typename T, typename XPUType, typename Context>
-void XPUCompareRawKernelImpl(const Context& dev_ctx,
-                             const DenseTensor& x,
-                             const DenseTensor& y,
-                             DenseTensor* out,
-                             std::function<int(xpu::Context*,
-                                               const XPUType*,
-                                               const XPUType*,
-                                               bool*,
-                                               const std::vector<int>&,
-                                               const std::vector<int>&)> func) {
-  auto x_shape = common::vectorize<int>(x.dims());
-  auto y_shape = common::vectorize<int>(y.dims());
+void XPUCompareRawKernelImpl(
+    const Context& dev_ctx,
+    const DenseTensor& x,
+    const DenseTensor& y,
+    DenseTensor* out,
+    std::function<int(xpu::Context*,
+                      const XPUType*,
+                      const XPUType*,
+                      bool*,
+                      const std::vector<int64_t>&,
+                      const std::vector<int64_t>&)> func) {
+  auto x_shape = common::vectorize<int64_t>(x.dims());
+  auto y_shape = common::vectorize<int64_t>(y.dims());

   if (x.dims().size() == 0) {
-    x_shape = std::vector<int>({1});
+    x_shape = std::vector<int64_t>({1});
   }
   if (y.dims().size() == 0) {
-    y_shape = std::vector<int>({1});
+    y_shape = std::vector<int64_t>({1});
   }

   auto x_data = reinterpret_cast<const XPUType*>(x.data<T>());
@@ -64,8 +65,8 @@ void XPUCompareRawKernelImpl(const Context& dev_ctx,
                 const XPUType* x,                        \
                 const XPUType* y,                        \
                 bool* z,                                 \
-                const std::vector<int>& xshape,          \
-                const std::vector<int>& yshape) {        \
+                const std::vector<int64_t>& xshape,      \
+                const std::vector<int64_t>& yshape) {    \
       return functor(ctx, x, y, z, xshape, yshape);      \
     };                                                   \
     XPUCompareRawKernelImpl<T, XPUType, Context>(dev_ctx, x, y, out, f); \
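The compare kernels route every primitive through a std::function parameter, so the int64_t change must land in three places at once: the std::function signature, each per-op lambda, and the vectorized shapes. A standalone sketch of that plumbing with stand-in types (equal_stub, compare_impl, and Shape are all hypothetical, not the phi/XDNN API):

#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

using Shape = std::vector<int64_t>;

// Stand-in primitive in the role of an xpu::broadcast_* comparison.
int equal_stub(const float* /*x*/, const float* /*y*/, bool* z,
               const Shape& xshape, const Shape& yshape) {
  *z = (xshape == yshape);  // toy semantics, enough to show the plumbing
  return 0;                 // 0 == success, like an XDNN error code
}

// Shared impl in the role of XPUCompareRawKernelImpl: it only sees the
// std::function, so every wrapped primitive must use int64_t shapes.
int compare_impl(const float* x, const float* y, bool* out,
                 const Shape& xs, const Shape& ys,
                 std::function<int(const float*, const float*, bool*,
                                   const Shape&, const Shape&)> func) {
  return func(x, y, out, xs, ys);
}

int main() {
  float a = 1.0f, b = 2.0f;
  bool z = false;
  // Per-op lambda, like the `f` each DEFINE_XPU_COMPARE_KERNEL expands to.
  auto f = [](const float* x, const float* y, bool* z,
              const Shape& xs, const Shape& ys) {
    return equal_stub(x, y, z, xs, ys);
  };
  int r = compare_impl(&a, &b, &z, {1}, {1}, f);
  std::cout << r << " " << std::boolalpha << z << "\n";  // 0 true
  return 0;
}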

paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -40,8 +40,8 @@ void AddRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_add<XPUType>(ctx, x, y, z, xshape, yshape);
   };

paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -35,8 +35,8 @@ void DivideRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_div<XPUType>(ctx, x, y, z, xshape, yshape);
   };

paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc

Lines changed: 10 additions & 10 deletions
@@ -30,8 +30,8 @@ void MaximumRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_max<XPUType>(ctx, x, y, z, xshape, yshape);
   };

@@ -49,8 +49,8 @@ void MinimumRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_min<XPUType>(ctx, x, y, z, xshape, yshape);
   };

@@ -68,8 +68,8 @@ void RemainderRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_mod<XPUType>(ctx, x, y, z, xshape, yshape);
   };

@@ -87,8 +87,8 @@ void FloorDivideRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_floordiv<XPUType>(ctx, x, y, z, xshape, yshape);
   };

@@ -106,8 +106,8 @@ void ElementwisePowRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_pow<XPUType>(ctx, x, y, z, xshape, yshape);
   };

paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -35,8 +35,8 @@ void MultiplyRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_mul<XPUType>(ctx, x, y, z, xshape, yshape);
   };

paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -30,8 +30,8 @@ void SubtractRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_sub<XPUType>(ctx, x, y, z, xshape, yshape);
   };

paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -33,8 +33,8 @@ void MaxRawKernel(const Context& dev_ctx,
   auto f = [](xpu::Context* ctx,
               const T* x,
               T* y,
-              const std::vector<int>& xdims,
-              const std::vector<int>& reduce_dims) {
+              const std::vector<int64_t>& xdims,
+              const std::vector<int64_t>& reduce_dims) {
     return xpu::reduce_max<XPUType>(ctx,
                                     reinterpret_cast<const XPUType*>(x),
                                     reinterpret_cast<XPUType*>(y),

paddle/phi/kernels/xpu/activation_grad_kernel.cc

Lines changed: 2 additions & 2 deletions
@@ -180,11 +180,11 @@ struct XPULogGradFunctor : public funcs::BaseActivationFunctor<T> {
         dev_ctx.x_context(), tmp, x->numel(), static_cast<T>(1.0));
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");

-    auto x_dims = common::vectorize<int>(x->dims());
+    auto x_dims = common::vectorize<int64_t>(x->dims());

     // use [1] to replace [], because xpu not support []
     if (x_dims.size() == 0) {
-      x_dims = std::vector<int>({1});
+      x_dims = std::vector<int64_t>({1});
     }
     // dx.device(d) = dout * (static_cast<T>(1) / x);
     r = xpu::broadcast_div(dev_ctx.x_context(),
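Both this functor and compare_kernel.cc repeat the same rank-0 convention: XPU primitives reject an empty shape [], so scalar tensors are given shape {1} after widening. A hypothetical helper (to_xpu_shape is my name, not Paddle's) capturing that convention:

#include <cassert>
#include <cstdint>
#include <vector>

// Rank-0 (scalar) dims become {1}; everything else passes through unchanged.
std::vector<int64_t> to_xpu_shape(const std::vector<int64_t>& dims) {
  return dims.empty() ? std::vector<int64_t>{1} : dims;
}

int main() {
  assert((to_xpu_shape({}) == std::vector<int64_t>{1}));        // scalar -> {1}
  assert((to_xpu_shape({2, 3}) == std::vector<int64_t>{2, 3})); // unchanged
  return 0;
}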
