PaddlePaddle
diff --git a/‎paddle/fluid/operators/custom_device_common_op_registry.cc‎
Lines changed: 0 additions & 1521 deletions b/‎paddle/fluid/operators/custom_device_common_op_registry.cc‎
Lines changed: 0 additions & 1521 deletions
diff --git a/‎paddle/phi/kernels/custom/assign_pos_kernel.cc‎
Lines changed: 72 additions & 0 deletions b/‎paddle/phi/kernels/custom/assign_pos_kernel.cc‎
Lines changed: 72 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/custom/barrier_kernel.cc‎
Lines changed: 63 additions & 0 deletions b/‎paddle/phi/kernels/custom/barrier_kernel.cc‎
Lines changed: 63 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/custom/c_allreduce_kernel_impl.h‎
Lines changed: 102 additions & 0 deletions b/‎paddle/phi/kernels/custom/c_allreduce_kernel_impl.h‎
Lines changed: 102 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/custom/c_allreduce_max_kernel.cc‎
Lines changed: 38 additions & 0 deletions b/‎paddle/phi/kernels/custom/c_allreduce_max_kernel.cc‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/custom/c_allreduce_min_kernel.cc‎
Lines changed: 38 additions & 0 deletions b/‎paddle/phi/kernels/custom/c_allreduce_min_kernel.cc‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/custom/c_allreduce_prod_kernel.cc‎
Lines changed: 38 additions & 0 deletions b/‎paddle/phi/kernels/custom/c_allreduce_prod_kernel.cc‎
Lines changed: 38 additions & 0 deletions
diff --git a/‎paddle/phi/kernels/custom/c_allreduce_sum_kernel.cc‎
Lines changed: 38 additions & 0 deletions b/‎paddle/phi/kernels/custom/c_allreduce_sum_kernel.cc‎
Lines changed: 38 additions & 0 deletions
@@ -0,0 +1,72 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/assign_pos_kernel.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+
+// AssignPosKernel: host-side implementation of assign_pos for custom devices.
+//
+// Every non-negative entry of `x` is treated as a counter index. For element
+// i the kernel decrements that counter in a host copy of `cum_count_in` and
+// writes i at the resulting position of the output, so the elements that
+// share a counter end up grouped in the output.
+//
+// dev_ctx:        device context used for the host copies and the upload.
+// x:              counter index per element; entries below 0 are skipped.
+//                 (original note: shape (batch_size * seq_len, topk))
+// cum_count_in:   running totals per counter; only a host copy is modified.
+// eff_num_len_in: its first element is the number of output elements.
+// out:            resized to {eff_num_len_in[0]} and filled with positions.
+//
+// NOTE(review): counter indices and positions are not range-checked here;
+// inputs are assumed to be consistent with each other -- confirm upstream.
+template <typename T, typename Context>
+void AssignPosKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const DenseTensor& cum_count_in,
+                     const DenseTensor& eff_num_len_in,
+                     DenseTensor* out) {
+  // Fetch the output length on the host. The scalar is read in place when it
+  // already lives in CPU memory, otherwise it is copied to CPUPlace first.
+  int64_t cpu_eff_num_len_data = 0;
+  if (eff_num_len_in.place().GetType() == phi::AllocationType::CPU) {
+    cpu_eff_num_len_data = eff_num_len_in.data<T>()[0];
+  } else {
+    phi::DenseTensor cpu_eff_num_len;
+    phi::Copy(
+        dev_ctx, eff_num_len_in, phi::CPUPlace(), true, &cpu_eff_num_len);
+    cpu_eff_num_len_data = cpu_eff_num_len.data<T>()[0];
+  }
+
+  out->Resize({cpu_eff_num_len_data});
+  dev_ctx.template Alloc<T>(out);
+
+  // Work on host copies so the caller's cum_count tensor is left untouched
+  // while the counters are decremented below.
+  phi::DenseTensor numbers_cpu, cum_count_cpu;
+  phi::Copy(dev_ctx, x, phi::CPUPlace(), true, &numbers_cpu);
+  phi::Copy(dev_ctx, cum_count_in, phi::CPUPlace(), true, &cum_count_cpu);
+  const T* numbers_data = numbers_cpu.data<T>();
+  T* cum_count_data = cum_count_cpu.data<T>();
+
+  std::vector<T> out_data(cpu_eff_num_len_data);
+  const int64_t total = x.numel();
+  for (int64_t i = 0; i < total; ++i) {
+    // Keep the full 64-bit width: the tensor values are 64-bit and must not
+    // be truncated to int before they are used as indices.
+    const int64_t number_idx = numbers_data[i];
+    if (number_idx > -1) {
+      cum_count_data[number_idx] -= 1;
+      const int64_t p = cum_count_data[number_idx];
+      out_data[p] = static_cast<T>(i);
+    }
+  }
+  // Upload with the same element type the buffer and `out` were created with.
+  phi::TensorFromVector<T>(out_data, dev_ctx, out);
+}
+
+}  // namespace phi
+
+// Registers phi::AssignPosKernel under the name assign_pos for the Custom
+// backend with ALL_LAYOUT; int64_t is the only element type listed.
+PD_REGISTER_KERNEL(
+    assign_pos, Custom, ALL_LAYOUT, phi::AssignPosKernel, int64_t) {}
+#endif
@@ -0,0 +1,63 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/api/backward/backward_api.h"
+#include "paddle/phi/api/include/api.h"
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/device_manager.h"
+#include "paddle/phi/core/distributed/collective/process_group.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+#include "paddle/phi/core/distributed/xccl_comm_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+
+// BarrierKernel: runs a SUM all-reduce of `x_in` into `out` through the XCCL
+// communicator looked up by `ring_id`.
+//
+// dev_ctx:         supplies the place, the output allocation and, when
+//                  `use_calc_stream` is true, the stream.
+// x_in:            tensor whose buffer is sent.
+// ring_id:         converted to a string and used as the communicator key.
+// use_calc_stream: true  -> use the device context's stream,
+//                  false -> use the communicator's own stream.
+// out:             allocated as T and used as the receive buffer.
+//
+// NOTE(review): presumably the all-reduce only serves as a synchronization
+// point across ranks (going by the kernel name) -- confirm with callers.
+template <typename T, typename Context>
+void BarrierKernel(const Context& dev_ctx,
+                   const DenseTensor& x_in,
+                   int ring_id,
+                   bool use_calc_stream,
+                   DenseTensor* out) {
+  const auto device_place = dev_ctx.GetPlace();
+  const int64_t element_count = x_in.numel();
+  const void* send_ptr = x_in.data();
+  void* recv_ptr = dev_ctx.template Alloc<T>(out);
+
+  // The communicator context is stored under the stringified ring id.
+  auto* xccl_ctx = reinterpret_cast<phi::distributed::XCCLCommContext*>(
+      phi::distributed::CommContextManager::GetInstance().Get(
+          std::to_string(ring_id)));
+
+  std::shared_ptr<phi::stream::Stream> active_stream;
+  if (!use_calc_stream) {
+    active_stream = xccl_ctx->GetStream();
+  } else {
+    active_stream = dev_ctx.GetStream();
+  }
+
+  // CCLAllReduce takes a mutable send pointer, hence the const_cast.
+  phi::DeviceManager::CCLAllReduce(device_place.GetDeviceType(),
+                                   const_cast<void*>(send_ptr),
+                                   recv_ptr,
+                                   element_count,
+                                   x_in.dtype(),
+                                   phi::ccl::CCLReduceOp::SUM,
+                                   xccl_ctx->GetXcclComm(),
+                                   *active_stream);
+}
+}  // namespace phi
+
+// Registers phi::BarrierKernel under the name barrier for the Custom backend
+// with ALL_LAYOUT; int is the only element type listed.
+PD_REGISTER_KERNEL(barrier, Custom, ALL_LAYOUT, phi::BarrierKernel, int) {}
+#endif
@@ -0,0 +1,102 @@
+// Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/all_context.h"
+#include "paddle/phi/backends/device_manager.h"
+#include "paddle/phi/core/distributed/collective/process_group.h"
+#include "paddle/phi/core/distributed/comm_context_manager.h"
+#include "paddle/phi/core/distributed/xccl_comm_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/core/tensor_utils.h"
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+
+// CAllReduceKernel: shared all-reduce implementation for custom devices,
+// parameterized on the reduction `red_type`.
+//
+// `out` is resized to the dims of `x_in` and allocated as T. If no process
+// group is registered for `ring_id`, the reduction is issued directly through
+// phi::DeviceManager::CCLAllReduce on the XCCL communicator stored under the
+// stringified ring id. Otherwise it goes through that ProcessGroup and waits
+// for the returned task.
+//
+// dev_ctx:            supplies the place, the output allocation and, when
+//                     `use_calc_stream` is true, the stream (XCCL path).
+// x_in:               tensor to reduce.
+// ring_id:            process-group id / communicator key.
+// use_calc_stream:    XCCL path only; true selects the device context's
+//                     stream, false the communicator's stream.
+// use_model_parallel: not read by this function.
+// out:                receives the reduced result.
+template <typename T, typename Context, phi::ccl::CCLReduceOp red_type>
+void CAllReduceKernel(const Context& dev_ctx,
+                      const DenseTensor& x_in,
+                      int ring_id,
+                      bool use_calc_stream,
+                      bool use_model_parallel,
+                      DenseTensor* out) {
+  const auto device_place = dev_ctx.GetPlace();
+  const auto element_type = x_in.dtype();
+  const int64_t element_count = x_in.numel();
+  const void* send_ptr = x_in.data<T>();
+  out->Resize(x_in.dims());
+  void* recv_ptr = dev_ctx.template Alloc<T>(out);
+
+  auto group_map = phi::distributed::ProcessGroupMapFromGid::getInstance();
+  if (!group_map->has(ring_id)) {
+    // No process group for this ring: talk to the XCCL communicator directly.
+    auto* xccl_ctx = reinterpret_cast<phi::distributed::XCCLCommContext*>(
+        phi::distributed::CommContextManager::GetInstance().Get(
+            std::to_string(ring_id)));
+
+    std::shared_ptr<phi::stream::Stream> active_stream;
+    if (!use_calc_stream) {
+      active_stream = xccl_ctx->GetStream();
+    } else {
+      active_stream = dev_ctx.GetStream();
+    }
+
+    // CCLAllReduce takes a mutable send pointer, hence the const_cast.
+    phi::DeviceManager::CCLAllReduce(device_place.GetDeviceType(),
+                                     const_cast<void*>(send_ptr),
+                                     recv_ptr,
+                                     element_count,
+                                     element_type,
+                                     red_type,
+                                     xccl_ctx->GetXcclComm(),
+                                     *active_stream);
+    return;
+  }
+
+  // A process group exists for this ring: delegate the reduction to it.
+  phi::distributed::ProcessGroup* group = group_map->get(ring_id);
+  std::vector<phi::DenseTensor> inputs;
+  std::vector<phi::DenseTensor> outputs;
+  inputs.emplace_back(x_in);
+  outputs.emplace_back(*out);
+
+  // Translate the CCL reduce op into the process-group enum.
+  phi::distributed::AllreduceOptions options;
+  switch (red_type) {
+    case phi::ccl::CCLReduceOp::SUM:
+      options.reduce_op = phi::distributed::ReduceOp::SUM;
+      break;
+
+    case phi::ccl::CCLReduceOp::MAX:
+      options.reduce_op = phi::distributed::ReduceOp::MAX;
+      break;
+
+    case phi::ccl::CCLReduceOp::MIN:
+      options.reduce_op = phi::distributed::ReduceOp::MIN;
+      break;
+
+    case phi::ccl::CCLReduceOp::PRODUCT:
+      options.reduce_op = phi::distributed::ReduceOp::PRODUCT;
+      break;
+
+    default:
+      PADDLE_THROW(common::errors::InvalidArgument("Invalid reduce type: %d",
+                                                   red_type));
+  }
+
+  auto pending = group->AllReduce(inputs, outputs, options);
+  pending->Wait();
+}
+}  // namespace phi
+
+#endif
@@ -0,0 +1,38 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/kernels/custom/c_allreduce_kernel_impl.h"
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+// CAllReduceMaxKernel: forwards all arguments unchanged to CAllReduceKernel
+// instantiated with the MAX reduction.
+template <typename T, typename Context>
+void CAllReduceMaxKernel(const Context& dev_ctx,
+                         const DenseTensor& x_in,
+                         int ring_id,
+                         bool use_calc_stream,
+                         bool use_model_parallel,
+                         DenseTensor* out) {
+  constexpr auto kReduceOp = phi::ccl::CCLReduceOp::MAX;
+  CAllReduceKernel<T, Context, kReduceOp>(
+      dev_ctx, x_in, ring_id, use_calc_stream, use_model_parallel, out);
+}
+}  // namespace phi
+
+// Registers phi::CAllReduceMaxKernel under the name c_allreduce_max for the
+// Custom backend with ALL_LAYOUT, for the element types listed below.
+PD_REGISTER_KERNEL(c_allreduce_max,
+                   Custom,
+                   ALL_LAYOUT,
+                   phi::CAllReduceMaxKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   phi::dtype::float16) {}
+#endif
@@ -0,0 +1,38 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/kernels/custom/c_allreduce_kernel_impl.h"
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+// CAllReduceMinKernel: forwards all arguments unchanged to CAllReduceKernel
+// instantiated with the MIN reduction.
+template <typename T, typename Context>
+void CAllReduceMinKernel(const Context& dev_ctx,
+                         const DenseTensor& x_in,
+                         int ring_id,
+                         bool use_calc_stream,
+                         bool use_model_parallel,
+                         DenseTensor* out) {
+  constexpr auto kReduceOp = phi::ccl::CCLReduceOp::MIN;
+  CAllReduceKernel<T, Context, kReduceOp>(
+      dev_ctx, x_in, ring_id, use_calc_stream, use_model_parallel, out);
+}
+}  // namespace phi
+
+// Registers phi::CAllReduceMinKernel under the name c_allreduce_min for the
+// Custom backend with ALL_LAYOUT, for the element types listed below.
+PD_REGISTER_KERNEL(c_allreduce_min,
+                   Custom,
+                   ALL_LAYOUT,
+                   phi::CAllReduceMinKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   phi::dtype::float16) {}
+#endif
@@ -0,0 +1,38 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/kernels/custom/c_allreduce_kernel_impl.h"
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+// CAllReduceProdKernel: forwards all arguments unchanged to CAllReduceKernel
+// instantiated with the PRODUCT reduction.
+template <typename T, typename Context>
+void CAllReduceProdKernel(const Context& dev_ctx,
+                          const DenseTensor& x_in,
+                          int ring_id,
+                          bool use_calc_stream,
+                          bool use_model_parallel,
+                          DenseTensor* out) {
+  constexpr auto kReduceOp = phi::ccl::CCLReduceOp::PRODUCT;
+  CAllReduceKernel<T, Context, kReduceOp>(
+      dev_ctx, x_in, ring_id, use_calc_stream, use_model_parallel, out);
+}
+}  // namespace phi
+
+// Registers phi::CAllReduceProdKernel under the name c_allreduce_prod for the
+// Custom backend with ALL_LAYOUT, for the element types listed below.
+PD_REGISTER_KERNEL(c_allreduce_prod,
+                   Custom,
+                   ALL_LAYOUT,
+                   phi::CAllReduceProdKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   phi::dtype::float16) {}
+#endif
@@ -0,0 +1,38 @@
+/* Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/phi/kernels/custom/c_allreduce_kernel_impl.h"
+#ifdef PADDLE_WITH_CUSTOM_DEVICE
+namespace phi {
+// CAllReduceSumKernel: forwards all arguments unchanged to CAllReduceKernel
+// instantiated with the SUM reduction.
+template <typename T, typename Context>
+void CAllReduceSumKernel(const Context& dev_ctx,
+                         const DenseTensor& x_in,
+                         int ring_id,
+                         bool use_calc_stream,
+                         bool use_model_parallel,
+                         DenseTensor* out) {
+  constexpr auto kReduceOp = phi::ccl::CCLReduceOp::SUM;
+  CAllReduceKernel<T, Context, kReduceOp>(
+      dev_ctx, x_in, ring_id, use_calc_stream, use_model_parallel, out);
+}
+}  // namespace phi
+
+// Registers phi::CAllReduceSumKernel under the name c_allreduce_sum for the
+// Custom backend with ALL_LAYOUT, for the element types listed below.
+PD_REGISTER_KERNEL(c_allreduce_sum,
+                   Custom,
+                   ALL_LAYOUT,
+                   phi::CAllReduceSumKernel,
+                   float,
+                   double,
+                   int32_t,
+                   int64_t,
+                   phi::dtype::float16) {}
+#endif