7 changes: 5 additions & 2 deletions paddle/phi/kernels/funcs/fused_gemm_epilogue_xpu.h
@@ -143,8 +143,11 @@ void ComputeFusedGemmEpilogueBackwardXPU(const phi::XPUContext& dev_ctx,
     XPUType* dbias_ptr;
     auto* dbias_tmp_ptr = dev_ctx.template Alloc<T>(dbias);
     dbias_ptr = reinterpret_cast<XPUType*>(dbias_tmp_ptr);
-    r = xpu::reduce_sum(
-        xpu_ctx, dout_fc_ptr, dbias_ptr, {info_forward.m, info_forward.n}, {0});
+    r = xpu::reduce_sum(xpu_ctx,
+                        dout_fc_ptr,
+                        dbias_ptr,
+                        {(int64_t)info_forward.m, (int64_t)info_forward.n},
+                        {0LL});
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
   }
 }
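Note: the pattern repeated throughout this change is widening the XDNN shape and axis arguments from int to int64_t, so flattened element counts past INT32_MAX no longer overflow. A minimal standalone sketch of the failure mode this avoids (plain C++; reduce_sum_sketch is a hypothetical stand-in, not the real xpu::reduce_sum signature):

#include <cstdint>
#include <iostream>
#include <vector>

// Hypothetical stand-in for an XDNN-style reduce_sum after the widening:
// shapes and reduce axes are passed as int64_t vectors.
int reduce_sum_sketch(const std::vector<int64_t>& shape,
                      const std::vector<int64_t>& reduce_dims) {
  (void)reduce_dims;  // axis list unused in this sketch
  int64_t numel = 1;
  for (int64_t d : shape) numel *= d;  // 64-bit product: no overflow here
  std::cout << "numel = " << numel << "\n";
  return 0;  // XDNN-style success code
}

int main() {
  // m * n = 2^32, which overflows a 32-bit int but fits easily in int64_t.
  int64_t m = 1LL << 20;  // rows of dout
  int64_t n = 1LL << 12;  // columns of dout
  return reduce_sum_sketch({m, n}, {0});  // reduce over rows, as for dbias
}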
11 changes: 7 additions & 4 deletions paddle/phi/kernels/fusion/xpu/fused_feedforward_grad_kernel.cc
@@ -190,8 +190,11 @@ void FFNGrad(const phi::XPUContext& dev_ctx,
                      dropout_param2,
                      bsz_seq * d_model);
   // linear_grad2
-  r = xpu::reduce_sum(
-      xpu_ctx, d_dropout2_out_ptr, d_linear2_bias_ptr, {bsz_seq, d_model}, {0});
+  r = xpu::reduce_sum(xpu_ctx,
+                      d_dropout2_out_ptr,
+                      d_linear2_bias_ptr,
+                      {(int64_t)bsz_seq, (int64_t)d_model},
+                      {0LL});
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
 
   phi::XpuFcInfo linear2_fc_info;
@@ -285,8 +288,8 @@ void FFNGrad(const phi::XPUContext& dev_ctx,
   r = xpu::reduce_sum(xpu_ctx,
                       d_act_out_ptr,
                       d_linear1_bias_ptr,
-                      {bsz_seq, dim_feedforward},
-                      {0});
+                      {(int64_t)bsz_seq, (int64_t)dim_feedforward},
+                      {0LL});
   PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
 
   phi::XpuFcInfo linear1_fc_info;
33 changes: 17 additions & 16 deletions paddle/phi/kernels/legacy/xpu/compare_kernel.cc
@@ -23,24 +23,25 @@
 namespace phi {
 
 template <typename T, typename XPUType, typename Context>
-void XPUCompareRawKernelImpl(const Context& dev_ctx,
-                             const DenseTensor& x,
-                             const DenseTensor& y,
-                             DenseTensor* out,
-                             std::function<int(xpu::Context*,
-                                               const XPUType*,
-                                               const XPUType*,
-                                               bool*,
-                                               const std::vector<int>&,
-                                               const std::vector<int>&)> func) {
-  auto x_shape = common::vectorize<int>(x.dims());
-  auto y_shape = common::vectorize<int>(y.dims());
+void XPUCompareRawKernelImpl(
+    const Context& dev_ctx,
+    const DenseTensor& x,
+    const DenseTensor& y,
+    DenseTensor* out,
+    std::function<int(xpu::Context*,
+                      const XPUType*,
+                      const XPUType*,
+                      bool*,
+                      const std::vector<int64_t>&,
+                      const std::vector<int64_t>&)> func) {
+  auto x_shape = common::vectorize<int64_t>(x.dims());
+  auto y_shape = common::vectorize<int64_t>(y.dims());
 
   if (x.dims().size() == 0) {
-    x_shape = std::vector<int>({1});
+    x_shape = std::vector<int64_t>({1});
   }
   if (y.dims().size() == 0) {
-    y_shape = std::vector<int>({1});
+    y_shape = std::vector<int64_t>({1});
   }
 
   auto x_data = reinterpret_cast<const XPUType*>(x.data<T>());
@@ -64,8 +65,8 @@ void XPUCompareRawKernelImpl(const Context& dev_ctx,
                      const XPUType* x,                                    \
                      const XPUType* y,                                    \
                      bool* z,                                             \
-                     const std::vector<int>& xshape,                      \
-                     const std::vector<int>& yshape) {                    \
+                     const std::vector<int64_t>& xshape,                  \
+                     const std::vector<int64_t>& yshape) {                \
      return functor(ctx, x, y, z, xshape, yshape);                        \
    };                                                                     \
    XPUCompareRawKernelImpl<T, XPUType, Context>(dev_ctx, x, y, out, f);   \
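Note: both compare_kernel.cc files (this legacy one and its twin at the end of the diff) funnel every comparison functor through a std::function whose shape parameters are now int64_t vectors. A condensed sketch of that dispatch shape, with illustrative names (RunCompare and CompareFunc are not Paddle APIs):

#include <cstdint>
#include <functional>
#include <iostream>
#include <vector>

// One std::function type covers every comparison operator; only the
// element type would vary across the kernel instantiations.
using CompareFunc = std::function<int(const float*, const float*, bool*,
                                      const std::vector<int64_t>&,
                                      const std::vector<int64_t>&)>;

int RunCompare(const float* x, const float* y, bool* z,
               const std::vector<int64_t>& x_shape,
               const std::vector<int64_t>& y_shape, CompareFunc func) {
  return func(x, y, z, x_shape, y_shape);  // the kernels broadcast here
}

int main() {
  float x = 2.0f, y = 3.0f;
  bool z = false;
  // A lambda binds to CompareFunc exactly as the macro-generated lambdas
  // bind to the widened std::function parameter in the kernel.
  CompareFunc less_than = [](const float* a, const float* b, bool* out,
                             const std::vector<int64_t>&,
                             const std::vector<int64_t>&) {
    *out = *a < *b;  // scalar case; the real kernels compare element-wise
    return 0;
  };
  RunCompare(&x, &y, &z, {1}, {1}, less_than);
  std::cout << std::boolalpha << z << "\n";  // prints: true
}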
4 changes: 2 additions & 2 deletions paddle/phi/kernels/legacy/xpu/elementwise_add_kernel.cc
@@ -40,8 +40,8 @@ void AddRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_add<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
4 changes: 2 additions & 2 deletions paddle/phi/kernels/legacy/xpu/elementwise_divide_kernel.cc
@@ -35,8 +35,8 @@ void DivideRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_div<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
20 changes: 10 additions & 10 deletions paddle/phi/kernels/legacy/xpu/elementwise_kernel.cc
@@ -30,8 +30,8 @@ void MaximumRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_max<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
@@ -49,8 +49,8 @@ void MinimumRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_min<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
@@ -68,8 +68,8 @@ void RemainderRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_mod<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
@@ -87,8 +87,8 @@ void FloorDivideRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_floordiv<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
@@ -106,8 +106,8 @@ void ElementwisePowRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_pow<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
4 changes: 2 additions & 2 deletions paddle/phi/kernels/legacy/xpu/elementwise_multiply_kernel.cc
@@ -35,8 +35,8 @@ void MultiplyRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_mul<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
4 changes: 2 additions & 2 deletions paddle/phi/kernels/legacy/xpu/elementwise_subtract_kernel.cc
@@ -30,8 +30,8 @@ void SubtractRawKernel(const Context& dev_ctx,
               const XPUType* x,
               const XPUType* y,
               XPUType* z,
-              const std::vector<int>& xshape,
-              const std::vector<int>& yshape) {
+              const std::vector<int64_t>& xshape,
+              const std::vector<int64_t>& yshape) {
     return xpu::broadcast_sub<XPUType>(ctx, x, y, z, xshape, yshape);
   };
 
4 changes: 2 additions & 2 deletions paddle/phi/kernels/legacy/xpu/reduce_max_kernel.cc
@@ -33,8 +33,8 @@ void MaxRawKernel(const Context& dev_ctx,
   auto f = [](xpu::Context* ctx,
               const T* x,
               T* y,
-              const std::vector<int>& xdims,
-              const std::vector<int>& reduce_dims) {
+              const std::vector<int64_t>& xdims,
+              const std::vector<int64_t>& reduce_dims) {
     return xpu::reduce_max<XPUType>(ctx,
                                     reinterpret_cast<const XPUType*>(x),
                                     reinterpret_cast<XPUType*>(y),
4 changes: 2 additions & 2 deletions paddle/phi/kernels/xpu/activation_grad_kernel.cc
@@ -180,11 +180,11 @@ struct XPULogGradFunctor : public funcs::BaseActivationFunctor<T> {
         dev_ctx.x_context(), tmp, x->numel(), static_cast<T>(1.0));
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "constant");
 
-    auto x_dims = common::vectorize<int>(x->dims());
+    auto x_dims = common::vectorize<int64_t>(x->dims());
 
     // use [1] to replace [], because xpu not support []
     if (x_dims.size() == 0) {
-      x_dims = std::vector<int>({1});
+      x_dims = std::vector<int64_t>({1});
    }
     // dx.device(d) = dout * (static_cast<T>(1) / x);
     r = xpu::broadcast_div(dev_ctx.x_context(),
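Note: the comment in this hunk explains the shape fixup: the XDNN broadcast routines reject rank-0 shapes, so an empty dims vector is promoted to {1}. A small sketch of that promotion, assuming common::vectorize yields an empty vector for a 0-D tensor (PromoteScalarShape is an illustrative name):

#include <cstdint>
#include <iostream>
#include <vector>

// Promote a rank-0 (scalar) shape to {1}: the scalar still has exactly
// one element, but the broadcast routines need at least one dimension.
std::vector<int64_t> PromoteScalarShape(std::vector<int64_t> dims) {
  if (dims.empty()) {
    dims = {1};
  }
  return dims;
}

int main() {
  auto shape = PromoteScalarShape({});                   // 0-D input
  std::cout << shape.size() << " " << shape[0] << "\n";  // prints: 1 1
}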
12 changes: 8 additions & 4 deletions paddle/phi/kernels/xpu/addmm_grad_kernel.cc
@@ -111,8 +111,10 @@ void AddmmGradKernel(const Context& dev_ctx,
         xpu_ctx,
         c_1,
         reinterpret_cast<XPUType*>(x_grad->data<T>()),
-        {info_forward.bs, info_forward.m, info_forward.k},
-        {0});
+        {(int64_t)info_forward.bs,
+         (int64_t)info_forward.m,
+         (int64_t)info_forward.k},
+        {0LL});
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
   }
 }
@@ -123,8 +125,10 @@ void AddmmGradKernel(const Context& dev_ctx,
         xpu_ctx,
         c_2,
         reinterpret_cast<XPUType*>(y_grad->data<T>()),
-        {info_forward.bs, info_forward.k, info_forward.n},
-        {0});
+        {(int64_t)info_forward.bs,
+         (int64_t)info_forward.k,
+         (int64_t)info_forward.n},
+        {0LL});
     PADDLE_ENFORCE_XDNN_SUCCESS(r, "reduce_sum");
   }
 }
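Note: both hunks collapse a batched gradient back onto an un-batched operand: when the forward pass broadcast an input over the batch dimension, the backward must reduce_sum the {bs, m, k} gradient over axis 0. A plain-C++ sketch of that reduction, with loops standing in for xpu::reduce_sum (ReduceSumAxis0 is an illustrative name):

#include <cstdint>
#include <iostream>
#include <vector>

// Sum a {bs, m, k} gradient over axis 0, yielding the {m, k} gradient of
// an operand that was broadcast across the batch in the forward pass.
void ReduceSumAxis0(const std::vector<float>& grad, std::vector<float>* out,
                    int64_t bs, int64_t m, int64_t k) {
  out->assign(m * k, 0.0f);
  for (int64_t b = 0; b < bs; ++b) {
    for (int64_t i = 0; i < m * k; ++i) {
      (*out)[i] += grad[b * m * k + i];
    }
  }
}

int main() {
  const int64_t bs = 3, m = 2, k = 2;
  std::vector<float> grad(bs * m * k, 1.0f);  // each batch contributes 1
  std::vector<float> x_grad;
  ReduceSumAxis0(grad, &x_grad, bs, m, k);
  std::cout << x_grad[0] << "\n";  // prints: 3
}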
6 changes: 3 additions & 3 deletions paddle/phi/kernels/xpu/affine_channel_grad_kernel.cc
@@ -54,9 +54,9 @@ void AffineChannelGradXPUKernel(const Context& dev_ctx,
   T* dscale_d = dscale ? dev_ctx.template Alloc<T>(dscale) : nullptr;
   T* dbias_d = dbias ? dev_ctx.template Alloc<T>(dbias) : nullptr;
 
-  std::vector<int> x_shape;
-  std::vector<int> b_shape;
-  std::vector<int> rdims;
+  std::vector<int64_t> x_shape;
+  std::vector<int64_t> b_shape;
+  std::vector<int64_t> rdims;
   if (layout == phi::DataLayout::kNCHW) {
     x_shape.push_back(N);
     x_shape.push_back(C);
4 changes: 2 additions & 2 deletions paddle/phi/kernels/xpu/affine_channel_kernel.cc
@@ -48,8 +48,8 @@ void AffineChannelXPUKernel(const Context& dev_ctx,
 
   auto* x_d = x->data<T>();
   auto* y_d = y->data<T>();
-  std::vector<int> x_shape;
-  std::vector<int> b_shape;
+  std::vector<int64_t> x_shape;
+  std::vector<int64_t> b_shape;
   if (layout == phi::DataLayout::kNCHW) {
     x_shape.push_back(N);
     x_shape.push_back(C);
8 changes: 4 additions & 4 deletions paddle/phi/kernels/xpu/batch_norm_grad_kernel.cc
@@ -36,8 +36,8 @@ static int CalculateInvBNY(xpu::Context *ctx,
       y,
       common::errors::InvalidArgument(
           "X and Y should be inplaced in inplace mode"));
-  std::vector<int> tensor_shape_vec({N, C, M});
-  std::vector<int> array_shape_vec({1, C, 1});
+  std::vector<int64_t> tensor_shape_vec({N, C, M});
+  std::vector<int64_t> array_shape_vec({1, C, 1});
   // y - bias
   int r1 =
       xpu::broadcast_sub<T>(ctx, bias, y, x, array_shape_vec, tensor_shape_vec);
@@ -62,8 +62,8 @@ static int CalculateInvVar(xpu::Context *ctx,
                            T *epsilon_data,
                            T *inv_var) {
   int r1 = constant(ctx, epsilon_data, 1, epsilon);
-  std::vector<int> tensor_shape_vec({C});
-  std::vector<int> array_shape_vec({1});
+  std::vector<int64_t> tensor_shape_vec({C});
+  std::vector<int64_t> array_shape_vec({1});
   int r2 = xpu::broadcast_add<T>(
       ctx, epsilon_data, var, inv_var, array_shape_vec, tensor_shape_vec);
   int r3 = xpu::rsqrt<T>(ctx, inv_var, inv_var, C);
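Note: CalculateInvBNY leans on broadcasting a per-channel {1, C, 1} array against an {N, C, M} tensor; only the index type of the shape vectors changes here. A self-contained sketch of that broadcast subtract, with nested loops standing in for xpu::broadcast_sub (BroadcastSubSketch is an illustrative name):

#include <cstdint>
#include <iostream>
#include <vector>

// out[n][c][m] = bias[c] - y[n][c][m]: shape {1, C, 1} broadcast against
// {N, C, M}, mirroring the array/tensor shape vectors in the kernel.
void BroadcastSubSketch(const std::vector<float>& bias,
                        const std::vector<float>& y, std::vector<float>* out,
                        int64_t N, int64_t C, int64_t M) {
  out->resize(N * C * M);
  for (int64_t n = 0; n < N; ++n) {
    for (int64_t c = 0; c < C; ++c) {
      for (int64_t m = 0; m < M; ++m) {
        (*out)[(n * C + c) * M + m] = bias[c] - y[(n * C + c) * M + m];
      }
    }
  }
}

int main() {
  const int64_t N = 2, C = 3, M = 4;
  std::vector<float> bias = {10.0f, 20.0f, 30.0f};
  std::vector<float> y(N * C * M, 1.0f);
  std::vector<float> out;
  BroadcastSubSketch(bias, y, &out, N, C, M);
  std::cout << out[0] << " " << out[M] << "\n";  // prints: 9 19
}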
12 changes: 6 additions & 6 deletions paddle/phi/kernels/xpu/c_softmax_with_cross_entropy_kernel.cc
@@ -184,8 +184,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::XPUContext, T> {
     auto f = [](xpu::Context* ctx,
                 const T* x,
                 T* y,
-                const std::vector<int>& xdims,
-                const std::vector<int>& reduce_dims) {
+                const std::vector<int64_t>& xdims,
+                const std::vector<int64_t>& reduce_dims) {
       return xpu::reduce_max<XPUType>(ctx,
                                       reinterpret_cast<const XPUType*>(x),
                                       reinterpret_cast<XPUType*>(y),
@@ -210,8 +210,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::XPUContext, T> {
                 const XPUType* x,
                 const XPUType* y,
                 XPUType* z,
-                const std::vector<int>& xshape,
-                const std::vector<int>& yshape) {
+                const std::vector<int64_t>& xshape,
+                const std::vector<int64_t>& yshape) {
       return xpu::broadcast_sub<XPUType>(ctx, x, y, z, xshape, yshape);
     };
     phi::XPUElementwise<T, XPUType>(
@@ -277,8 +277,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::XPUContext, T> {
     auto f = [](xpu::Context* ctx,
                 const T* x,
                 T* y,
-                const std::vector<int>& xdims,
-                const std::vector<int>& reduce_dims) {
+                const std::vector<int64_t>& xdims,
+                const std::vector<int64_t>& reduce_dims) {
       return xpu::reduce_sum<XPUType>(ctx,
                                       reinterpret_cast<const XPUType*>(x),
                                       reinterpret_cast<XPUType*>(y),
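Note: the three hunks above trace the standard numerically stable softmax pipeline: reduce the row max, broadcast-subtract it, then reduce_sum the exponentials. A compact sketch of why the max subtraction matters, in plain C++ over an {n, d} row-major matrix (SoftmaxRows is an illustrative name, not the kernel's code path):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <vector>

// Stable softmax over the last axis: subtracting the row max before exp()
// keeps every exponent <= 0, so exp() cannot overflow for large logits.
std::vector<float> SoftmaxRows(const std::vector<float>& x, int64_t n,
                               int64_t d) {
  std::vector<float> out(n * d);
  for (int64_t i = 0; i < n; ++i) {
    float row_max = x[i * d];
    for (int64_t j = 1; j < d; ++j) row_max = std::max(row_max, x[i * d + j]);
    float sum = 0.0f;
    for (int64_t j = 0; j < d; ++j) {
      out[i * d + j] = std::exp(x[i * d + j] - row_max);
      sum += out[i * d + j];
    }
    for (int64_t j = 0; j < d; ++j) out[i * d + j] /= sum;
  }
  return out;
}

int main() {
  // Naive exp(1000.0f) is inf; the max-subtracted form stays finite.
  auto p = SoftmaxRows({1000.0f, 1001.0f}, 1, 2);
  std::cout << p[0] << " " << p[1] << "\n";  // ~0.269 0.731
}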
33 changes: 17 additions & 16 deletions paddle/phi/kernels/xpu/compare_kernel.cc
@@ -23,24 +23,25 @@
 namespace phi {
 
 template <typename T, typename XPUType, typename Context>
-void XPUCompareKernelImpl(const Context& dev_ctx,
-                          const DenseTensor& x,
-                          const DenseTensor& y,
-                          DenseTensor* out,
-                          std::function<int(xpu::Context*,
-                                            const XPUType*,
-                                            const XPUType*,
-                                            bool*,
-                                            const std::vector<int>&,
-                                            const std::vector<int>&)> func) {
-  auto x_shape = common::vectorize<int>(x.dims());
-  auto y_shape = common::vectorize<int>(y.dims());
+void XPUCompareKernelImpl(
+    const Context& dev_ctx,
+    const DenseTensor& x,
+    const DenseTensor& y,
+    DenseTensor* out,
+    std::function<int(xpu::Context*,
+                      const XPUType*,
+                      const XPUType*,
+                      bool*,
+                      const std::vector<int64_t>&,
+                      const std::vector<int64_t>&)> func) {
+  auto x_shape = common::vectorize<int64_t>(x.dims());
+  auto y_shape = common::vectorize<int64_t>(y.dims());
 
   if (x.dims().size() == 0) {
-    x_shape = std::vector<int>({1});
+    x_shape = std::vector<int64_t>({1});
   }
   if (y.dims().size() == 0) {
-    y_shape = std::vector<int>({1});
+    y_shape = std::vector<int64_t>({1});
   }
 
   auto x_data = reinterpret_cast<const XPUType*>(x.data<T>());
@@ -63,8 +64,8 @@ void XPUCompareKernelImpl(const Context& dev_ctx,
                      const XPUType* x,                                    \
                      const XPUType* y,                                    \
                      bool* z,                                             \
-                     const std::vector<int>& xshape,                      \
-                     const std::vector<int>& yshape) {                    \
+                     const std::vector<int64_t>& xshape,                  \
+                     const std::vector<int64_t>& yshape) {                \
      return functor(ctx, x, y, z, xshape, yshape);                        \
    };                                                                     \
    XPUCompareKernelImpl<T, XPUType, Context>(dev_ctx, x, y, out, f);      \