34 changes: 0 additions & 34 deletions paddle/fluid/operators/custom_device_common_op_registry.cc
@@ -1316,40 +1316,6 @@ void FeedDenseTensorKernel(const Context& dev_ctx,

void RegisterCustomDeviceCommonKernel(const std::string& dev_type) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_type = dev_type.c_str();
REGISTER_OP_CUSTOM_DEVICE_KERNEL(
c_concat,
device_type,
paddle::operators::CConcatOpCustomDeviceKernel<phi::CustomContext, float>,
paddle::operators::CConcatOpCustomDeviceKernel<phi::CustomContext,
phi::dtype::float16>,
paddle::operators::CConcatOpCustomDeviceKernel<phi::CustomContext,
phi::dtype::bfloat16>);
REGISTER_OP_CUSTOM_DEVICE_KERNEL(
c_softmax_with_cross_entropy,
device_type,
paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel<
phi::CustomContext,
float>,
paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel<
phi::CustomContext,
double>,
paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel<
phi::CustomContext,
phi::dtype::float16>);

REGISTER_OP_CUSTOM_DEVICE_KERNEL(
c_softmax_with_cross_entropy_grad,
device_type,
paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel<
phi::CustomContext,
float>,
paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel<
phi::CustomContext,
double>,
paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel<
phi::CustomContext,
phi::dtype::float16>);

#endif
}
131 changes: 131 additions & 0 deletions paddle/phi/kernels/custom/c_concat_kernel.cc
@@ -0,0 +1,131 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/core/distributed/collective/process_group.h"
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/core/distributed/xccl_comm_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
namespace phi {

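// c_concat for custom devices: all-gathers the local shard `x` from every
// rank in the ring and concatenates the gathered shards along the last axis.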
template <typename T, typename Context>
void CConcatKernel(const Context& dev_ctx,
const DenseTensor& x_in,
int rank,
int nranks,
int ring_id UNUSED,
bool use_calc_stream UNUSED,
bool use_model_parallel UNUSED,
DenseTensor* out) {
auto x = &x_in;
int rid = ring_id;
auto place = dev_ctx.GetPlace();

PADDLE_ENFORCE_GE(rank,
0,
common::errors::PreconditionNotMet(
"The value of rank (%d) for c_concat must be "
"greater than or equal to 0.",
rank));
PADDLE_ENFORCE_GE(nranks,
2,
common::errors::PreconditionNotMet(
"The value of nranks (%d) for c_concat must be "
"greater than or equal to 2.",
nranks));
PADDLE_ENFORCE_LT(rank,
nranks,
common::errors::PreconditionNotMet(
"The value of rank (%d) for c_concat must be "
"less than that of nranks (%d).",
rank,
nranks));

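// Temporary buffer for the all-gather result: nranks copies of x stacked
// along dim 0.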
phi::DenseTensor temp_out;
phi::DDim temp_out_dims = x->dims();
temp_out_dims[0] *= nranks;
temp_out.Resize(temp_out_dims);
dev_ctx.template Alloc<T>(&temp_out);

auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(rid)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(rid);
std::vector<phi::DenseTensor> in_tensor;
std::vector<phi::DenseTensor> out_tensor;
in_tensor.push_back(*x);
out_tensor.push_back(temp_out);
auto task = pg->AllGather(in_tensor, out_tensor);
task->Wait();
} else {
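// No ProcessGroup is bound to this ring id; fall back to the XCCL comm
// context registered under the ring id string.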
auto comm = reinterpret_cast<phi::distributed::XCCLCommContext*>(
phi::distributed::CommContextManager::GetInstance().Get(
std::to_string(rid)));
PADDLE_ENFORCE_EQ(
nranks,
comm->GetSize(),
common::errors::InvalidArgument(
"nranks: %s should equal to %s", nranks, comm->GetSize()));

int64_t send_numel = x->numel();
const T* send_buff = x->data<T>();
T* recv_buff = temp_out.data<T>();
// Use the stream from the device context as the calc stream.
auto& stream = *dev_ctx.GetStream();
phi::DeviceManager::CCLAllGather(
place.GetDeviceType(),
reinterpret_cast<void*>(const_cast<T*>(send_buff)),
recv_buff,
send_numel,
x->dtype(),
comm->GetXcclComm(),
stream);
}
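// The all-gather stacks each rank's rows along dim 0; slice the per-rank
// chunks back out and concatenate them along the last axis.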
std::vector<phi::DenseTensor> inputs;
int axis = x->dims().size() - 1;
auto out_dims = x->dims();
out_dims[out_dims.size() - 1] *= nranks;
int rows_per_tensor = x->dims()[0];
int offset = 0;
for (int i = 0; i < nranks; i++) {
phi::DenseTensor temp = temp_out.Slice(offset, offset + rows_per_tensor);
inputs.emplace_back(temp);
offset += rows_per_tensor;
}

out->Resize(out_dims);
std::vector<paddle::Tensor> inputs_t(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
auto t = std::make_shared<phi::DenseTensor>();
t->ShareDataWith(inputs[i]);
inputs_t[i].set_impl(t);
}
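// Reuse the high-level concat API; the output shares the result tensor's
// storage rather than copying it.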
auto output = paddle::experimental::concat(inputs_t, axis);
out->ShareDataWith(*reinterpret_cast<phi::DenseTensor*>(output.impl().get()));
}
} // namespace phi

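// Register through the phi kernel registry for the Custom backend,
// superseding the fluid REGISTER_OP_CUSTOM_DEVICE_KERNEL entry removed from
// custom_device_common_op_registry.cc.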
PD_REGISTER_KERNEL(c_concat,
Custom,
ALL_LAYOUT,
phi::CConcatKernel,
float,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
111 changes: 111 additions & 0 deletions paddle/phi/kernels/custom/c_softmax_with_entropy_grad_kernel.cc
@@ -0,0 +1,111 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/core/distributed/collective/process_group.h"
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/core/distributed/xccl_comm_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
namespace phi {

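// Backward of c_softmax_with_cross_entropy for custom devices: computes the
// gradient w.r.t. the (sharded) logits from the saved softmax, the labels,
// and the incoming loss gradient.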
template <typename T, typename Context>
void CSoftmaxWithEntropyGradKernel(const Context& dev_ctx,
const DenseTensor& softmax_in,
const DenseTensor& label_in,
const DenseTensor& loss_grad_in,
int64_t ignore_index,
int ring_id,
int rank,
int nranks,
DenseTensor* logits_grad) {
const phi::DenseTensor* labels = &label_in;
const phi::DenseTensor* loss_grad = &loss_grad_in;
const phi::DenseTensor* softmax = &softmax_in;
phi::DenseTensor* logit_grad = logits_grad;

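// Initialize logits_grad with the saved softmax values (unless the two
// already share storage); the gradient is then built in place.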
if (logit_grad != softmax) {
phi::Copy(dev_ctx, *softmax, dev_ctx.GetPlace(), false, logit_grad);
}
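// N: rows flattened over all axes before the last; D: this rank's shard of
// the class dimension.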
const auto softmax_dims = softmax->dims();
const int axis = softmax_dims.size() - 1;
const int N = phi::funcs::SizeToAxis(axis, softmax_dims);
const int D = phi::funcs::SizeFromAxis(axis, softmax_dims);
const auto& label_type = labels->dtype();

if (label_type == phi::DataType::INT32 ||
label_type == phi::DataType::INT64) {
auto logit_grad_t = std::make_shared<phi::DenseTensor>();
logit_grad_t->ShareDataWith(*logit_grad).Resize({N, D});
auto loss_grad_t = std::make_shared<phi::DenseTensor>();
loss_grad_t->ShareDataWith(*loss_grad).Resize({N});
auto labels_1d = std::make_shared<phi::DenseTensor>();
labels_1d->ShareDataWith(*labels).Resize({N});
paddle::Tensor logits_grad_tensor(logit_grad_t),
loss_grad_tensor(loss_grad_t), labels_1d_tensor(labels_1d);

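// Sketch of the math below:
//   logits_grad = (softmax * (label != ignore_index)
//                  - one_hot(label - rank * D)) * loss_grad
// implemented with phi's functional ops (not_equal, one_hot, subtract,
// multiply, cast).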
auto labels_1d_not_equal_ignore = paddle::experimental::reshape(
paddle::experimental::not_equal(
labels_1d_tensor,
paddle::experimental::full_like(labels_1d_tensor,
ignore_index,
labels_1d_tensor.dtype(),
labels_1d_tensor.place())),
{N, 1});
auto start_index_tensor =
paddle::experimental::full_like(labels_1d_tensor,
rank * D,
labels_1d_tensor.dtype(),
labels_1d_tensor.place());

auto logits_grad_out_tensor1 = paddle::experimental::subtract(
paddle::experimental::multiply(
logits_grad_tensor,
paddle::experimental::cast(labels_1d_not_equal_ignore,
logits_grad_tensor.dtype())),
paddle::experimental::cast(
paddle::experimental::one_hot(
paddle::experimental::subtract(labels_1d_tensor,
start_index_tensor),
D),
logits_grad_tensor.dtype()));

auto logits_grad_out_tensor2 = paddle::experimental::multiply(
logits_grad_out_tensor1,
paddle::experimental::reshape(loss_grad_tensor, {N, 1}));
logit_grad
->ShareDataWith(*reinterpret_cast<phi::DenseTensor*>(
logits_grad_out_tensor2.impl().get()))
.Resize(softmax_dims);
} else {
PADDLE_THROW(common::errors::Unavailable(
"CustomDevice c_softmax_with_cross_entropy_grad "
"label_type only support int32/int64"));
}
}
} // namespace phi

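// Registered for the Custom backend with the same dtypes (float, double,
// float16) as the fluid registration removed above.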
PD_REGISTER_KERNEL(c_softmax_with_cross_entropy_grad,
Custom,
ALL_LAYOUT,
phi::CSoftmaxWithEntropyGradKernel,
float,
double,
phi::dtype::float16) {}
#endif