fix xpu comm stream (#71289)

AndSonder · web-flow · commit c3f8dc3591a1 · 2025-02-27T14:00:22.000+08:00
diff --git a/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h b/paddle/phi/core/distributed/auto_parallel/reshard/reshard_utils.h
@@ -27,6 +27,10 @@
 #include "paddle/phi/core/tensor_base.h"
 #include "paddle/phi/core/visit_type.h"
 
+#if defined(PADDLE_WITH_XPU)
+#include <xpu/runtime.h>
+#endif
+
 namespace phi {
 class DeviceContext;
 
@@ -86,6 +90,18 @@ phi::DDim InferShapeForReshardFromReplicate(
 #define DEVICE_CONTEXT CustomContext
 #endif
 
+#if defined(PADDLE_WITH_XPU)
+#define DEVICE_WAIT(dev_ctx) \
+  do {                       \
+    xpu_wait();              \
+    (dev_ctx)->Wait();       \
+  } while (0)
+#else
+#define DEVICE_WAIT(dev_ctx) \
+  do {                       \
+  } while (0)  // no-op: no need to wait on other devices.
+#endif
+
 // Some reshard function supports fewer data types on xpu than on gpu. For
 // example, `Transpose`, `Split`, and `Divide` do not support double type.
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
@@ -125,12 +141,14 @@ phi::DDim InferShapeForReshardFromReplicate(
                             __VA_ARGS__);                                   \
           }));                                                              \
     } else if (DEVICE_CONTEXT::classof(dev_ctx)) {                          \
+      DEVICE_WAIT(dev_ctx);                                                 \
       VLOG(4) << "Call `" << #fn_name << "` in Resharding on device.";      \
       PD_VISIT_RESHARD_TYPES(                                               \
           dtype, #fn_name, ([&] {                                           \
             fn_name<data_t>(static_cast<const DEVICE_CONTEXT&>(*dev_ctx),   \
                             __VA_ARGS__);                                   \
           }));                                                              \
+      DEVICE_WAIT(dev_ctx);                                                 \
     } else {                                                                \
       PADDLE_THROW(common::errors::Unimplemented(                           \
           "The %s in reshard only supported on CPU, GPU, and XPU for now.", \
diff --git a/paddle/phi/kernels/xpu/all_gather_kernel.cc b/paddle/phi/kernels/xpu/all_gather_kernel.cc
@@ -46,7 +46,7 @@ void AllGatherKernel(const Context& dev_ctx,
       errors::InvalidArgument(
           "nranks: %s should equal to %s", nranks, comm_ctx->GetSize()));
 
-  XPUStream stream = comm_ctx->GetStream();
+  XPUStream stream = dev_ctx.stream();
   comm_ctx->AllGather(out, x, stream);
 #else
   PADDLE_THROW(common::errors::PreconditionNotMet(
diff --git a/paddle/phi/kernels/xpu/all_reduce_kernel.cc b/paddle/phi/kernels/xpu/all_reduce_kernel.cc
@@ -37,7 +37,8 @@ void AllReduceKernel(const Context& dev_ctx,
                     common::errors::Unavailable(
                         "BKCLCommContext is nullptr, collective op should "
                         "has ring_id attr."));
-  XPUStream stream = comm_ctx->GetStream();
+
+  XPUStream stream = dev_ctx.stream();
 
   BKCLOp bkcl_reduce_type = BKCL_ADD;
   switch (static_cast<ReduceType>(reduce_type)) {
diff --git a/paddle/phi/kernels/xpu/all_to_all_kernel.cc b/paddle/phi/kernels/xpu/all_to_all_kernel.cc
@@ -38,7 +38,7 @@ void AllToAllKernel(const Context& dev_ctx,
                         "BKCLCommContext is nullptr, collective op should "
                         "has ring_id attr."));
 
-  XPUStream stream = comm_ctx->GetStream();
+  XPUStream stream = dev_ctx.stream();
   int nranks = comm_ctx->GetSize();
   PADDLE_ENFORCE_EQ(
       x_dims[0] % nranks,
diff --git a/paddle/phi/kernels/xpu/barrier_kernel.cc b/paddle/phi/kernels/xpu/barrier_kernel.cc
@@ -42,7 +42,7 @@ void BarrierKernel(const Context &dev_ctx,
                     common::errors::Unavailable(
                         "BKCLCommContext is nullptr, collective op should "
                         "has ring_id attr."));
-  XPUStream stream = comm_ctx->GetStream();
+  XPUStream stream = dev_ctx.stream();
   BKCLOp bkcl_reduce_type = BKCL_ADD;
   comm_ctx->AllReduce(out, *in, bkcl_reduce_type, stream);
   XPUStreamSync(stream);
diff --git a/paddle/phi/kernels/xpu/reduce_scatter_kernel.cc b/paddle/phi/kernels/xpu/reduce_scatter_kernel.cc
@@ -47,7 +47,7 @@ void ReduceScatterKernel(const Context& dev_ctx,
                         "BKCLCommContext is nullptr, collective op should "
                         "has ring_id attr."));
 
-  XPUStream stream = comm_ctx->GetStream();
+  XPUStream stream = dev_ctx.stream();
   comm_ctx->ReduceScatter(out, x, BKCL_ADD, stream);
 #else
   PADDLE_THROW(common::errors::PreconditionNotMet(