10 changes: 10 additions & 0 deletions paddle/fluid/pybind/eager_functions.cc
@@ -139,6 +139,7 @@ static PyObject* eager_api_scale(PyObject* self,
paddle::Tensor ret;
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
ret = egr::scale(tensor, scale, bias, bias_after_scale, trace_backward);
}
return ToPyObject(ret);
@@ -159,6 +160,7 @@ static PyObject* eager_api_run_backward(PyObject* self,
}
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
egr::Backward(tensors, grad_tensors, retain_graph);
}
RETURN_PY_NONE
@@ -189,6 +191,7 @@ static PyObject* eager_api_run_partial_grad(PyObject* self,
std::vector<paddle::Tensor> result;
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
result = egr::Grad(tensors,
inputs,
grad_tensors,
@@ -216,6 +219,7 @@ static PyObject* eager_api_tensor_copy(PyObject* self,

{
eager_gil_scoped_release guard;
EagerSetDeviceId();
dst = src.copy_to(place, blocking);
egr::EagerUtils::autograd_meta(&dst)->SetStopGradient(
egr::EagerUtils::autograd_meta(&(src))->StopGradient());
@@ -461,6 +465,7 @@ static PyObject* eager_api_jit_function_call(PyObject* self,
std::vector<paddle::Tensor> outs;
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
outs = (*function)(ins);
}
return ToPyObject(outs);
@@ -672,6 +677,7 @@ PyObject* eager_api_run_custom_op(PyObject* self,

{
eager_gil_scoped_release guard;
EagerSetDeviceId();
ctx.ConstructInplaceIndex(inputs, outputs, inplace_map);
const auto& inplace_reverse_idx_map = ctx.GetInplaceReverseIndexMap();
for (size_t out_idx = 0; out_idx < outputs.size(); ++out_idx) {
@@ -872,6 +878,7 @@ static PyObject* eager_api_sparse_coo_tensor(PyObject* self,
paddle::Tensor tensor;
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
PADDLE_ENFORCE(
non_zero_indices.is_dense_tensor(),
common::errors::Fatal("the non-zero indices must be a DenseTensor."));
@@ -916,6 +923,7 @@ static PyObject* eager_api_sparse_csr_tensor(PyObject* self,
paddle::Tensor tensor;
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
PADDLE_ENFORCE(non_zero_crows.is_dense_tensor(),
common::errors::Fatal(
"the compressed non-zero rows must be a DenseTensor."));
@@ -996,6 +1004,7 @@ static PyObject* eager_api_async_read(PyObject* self,

{
eager_gil_scoped_release guard;
EagerSetDeviceId();
PADDLE_ENFORCE_EQ(
src.is_gpu_pinned(),
true,
@@ -1175,6 +1184,7 @@ static PyObject* eager_api_async_write(PyObject* self,
}
{
eager_gil_scoped_release guard;
EagerSetDeviceId();
PADDLE_ENFORCE_EQ(
src.is_gpu(),
true,
16 changes: 16 additions & 0 deletions paddle/fluid/pybind/eager_method.cc
@@ -619,6 +619,9 @@ static PyObject* tensor_method__copy_to(TensorObject* self,
paddle::Tensor cp_tensor;
{
eager_gil_scoped_release guard;

EagerSetDeviceId();

cp_tensor = self->tensor.copy_to(place, blocking);
if (!blocking) {
IncreaseTensorReferenceCountUntilCopyComplete(self->tensor, place);
@@ -690,6 +693,9 @@ static PyObject* tensor_method_copy_(TensorObject* self,
<< self->tensor.name();
if (!self->tensor.initialized()) {
eager_gil_scoped_release guard;

EagerSetDeviceId();

egr::EagerUtils::autograd_meta(&(self->tensor))
->SetStopGradient(
egr::EagerUtils::autograd_meta(&(src_tensor))->StopGradient());
@@ -702,6 +708,9 @@
} else {
if (src_tensor.has_allocation()) {
eager_gil_scoped_release guard;

EagerSetDeviceId();

self->tensor.copy_(src_tensor, self->tensor.place(), blocking);
}
}
@@ -766,6 +775,9 @@ static PyObject* tensor_method_clone(TensorObject* self,
paddle::Tensor out;
{
eager_gil_scoped_release guard;

EagerSetDeviceId();

PADDLE_ENFORCE_EQ(
self->tensor.initialized(),
true,
@@ -922,6 +934,7 @@ static PyObject* tensor_clear_gradient(TensorObject* self,
->unsafe_mutable_value();
}
if (set_to_zero) {
EagerSetDeviceId();
auto* dev_ctx =
phi::DeviceContextPool::Instance().Get(grad_t->place());
phi::funcs::set_constant(*dev_ctx, grad_t, 0.0);
@@ -952,6 +965,7 @@ static PyObject* tensor__zero_grads(TensorObject* self,

if (egr::EagerUtils::IsLeafTensor(self->tensor)) {
eager_gil_scoped_release guard;
EagerSetDeviceId();
// Add RetainGrad as PostHook to AccumulationNode
paddle::Tensor* grad = egr::EagerUtils::mutable_grad(self->tensor);
PADDLE_ENFORCE(
@@ -976,6 +990,7 @@
}
} else {
eager_gil_scoped_release guard;
EagerSetDeviceId();
auto meta = egr::EagerUtils::unsafe_autograd_meta(self->tensor);
if (meta->MutableGrad()->initialized()) {
if (meta->MutableGrad()->is_dense_tensor() ||
@@ -3250,6 +3265,7 @@ static PyObject* tensor_contiguous(TensorObject* self,
return reinterpret_cast<PyObject*>(self);
} else {
eager_gil_scoped_release guard;
EagerSetDeviceId();
*dense_tensor = paddle::experimental::Trans2Contiguous(*dense_tensor);
Py_INCREF(self);
return reinterpret_cast<PyObject*>(self);
35 changes: 35 additions & 0 deletions paddle/fluid/pybind/eager_utils.cc
@@ -2797,4 +2797,39 @@ CvtPlacements(Placements placements, int ndim) {
return {dim_map, partial_status};
}

void EagerSetDeviceId() {
auto expected_place = egr::Controller::Instance().GetExpectedPlace();

if (phi::is_gpu_place(expected_place)) {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
phi::backends::gpu::SetDeviceId(expected_place.device);
VLOG(4) << "CurrentDeviceId: " << phi::backends::gpu::GetCurrentDeviceId()
<< " from " << (int)expected_place.device; // NOLINT
#else
PADDLE_THROW(common::errors::PreconditionNotMet(
"PaddlePaddle should compile with GPU if use CUDAPlace."));
#endif
} else if (phi::is_custom_place(expected_place)) {
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
phi::DeviceManager::SetDevice(expected_place);
VLOG(4) << "CurrentDeviceId: "
<< phi::DeviceManager::GetDevice(expected_place.GetDeviceType())
<< " from " << (int)expected_place.device; // NOLINT
#else
PADDLE_THROW(common::errors::PreconditionNotMet(
"PaddlePaddle should compile with CUSTOM_DEVICE if use CustomPlace."));
#endif
} else if (phi::is_xpu_place(expected_place)) {
#if defined(PADDLE_WITH_XPU)
phi::backends::xpu::SetXPUDeviceId(expected_place.device);
VLOG(4) << "CurrentDeviceId: "
<< phi::backends::xpu::GetXPUCurrentDeviceId() << " from "
<< (int)expected_place.device; // NOLINT
#else
PADDLE_THROW(common::errors::PreconditionNotMet(
"PaddlePaddle should compile with XPU if use XPUPlace."));
#endif
}
}

} // namespace paddle::pybind
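
Context for the call-site pattern above: EagerSetDeviceId() is invoked right after the GIL is released (eager_gil_scoped_release) and before any kernel work, so the thread entering the C++ side is aligned with the place returned by egr::Controller::Instance().GetExpectedPlace(). The snippet below is a minimal, self-contained sketch of that idea, not Paddle code: a thread_local int stands in for the driver's per-thread "current device" (what cudaSetDevice/cudaGetDevice manage on CUDA), and expected_device stands in for the expected place; all names inside the sketch namespace are illustrative.

// Illustrative sketch only: models why each Python-facing eager entry point
// re-asserts the device before launching work. The current device is
// per-thread state, while the user's chosen place is process-wide, so a
// worker thread may otherwise still point at device 0.
#include <cassert>
#include <iostream>
#include <thread>

namespace sketch {

thread_local int current_device = 0;  // stand-in for the driver's per-thread device
int expected_device = 1;              // stand-in for Controller::GetExpectedPlace()

void SetDeviceId(int id) { current_device = id; }

// Analogue of EagerSetDeviceId(): align this thread's device with the
// expected place before running any kernel.
void EagerSetDeviceId() { SetDeviceId(expected_device); }

void RunKernel() {
  // A kernel launched here would land on the wrong device without the
  // preceding EagerSetDeviceId() call.
  assert(current_device == expected_device);
  std::cout << "kernel runs on device " << current_device << "\n";
}

}  // namespace sketch

int main() {
  std::thread worker([] {
    sketch::EagerSetDeviceId();  // the call this PR inserts at each entry point
    sketch::RunKernel();
  });
  worker.join();
  return 0;
}

Built with a plain C++17 compiler (e.g. g++ -std=c++17 -pthread), the assert only holds because EagerSetDeviceId() runs on the worker thread before the kernel, which mirrors where the diff places the call relative to the GIL-release guard.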
2 changes: 2 additions & 0 deletions paddle/fluid/pybind/eager_utils.h
@@ -449,5 +449,7 @@ std::tuple<std::vector<int64_t>,
paddle::flat_hash_map<int64_t, phi::ReduceType>>
CvtPlacements(phi::distributed::Placements placements, int ndim);

void EagerSetDeviceId();

} // namespace pybind
} // namespace paddle