34 changes: 0 additions & 34 deletions paddle/fluid/operators/custom_device_common_op_registry.cc
@@ -1316,40 +1316,6 @@ void FeedDenseTensorKernel(const Context& dev_ctx,

void RegisterCustomDeviceCommonKernel(const std::string& dev_type) {
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto device_type = dev_type.c_str();
REGISTER_OP_CUSTOM_DEVICE_KERNEL(
c_concat,
device_type,
paddle::operators::CConcatOpCustomDeviceKernel<phi::CustomContext, float>,
paddle::operators::CConcatOpCustomDeviceKernel<phi::CustomContext,
phi::dtype::float16>,
paddle::operators::CConcatOpCustomDeviceKernel<phi::CustomContext,
phi::dtype::bfloat16>);
REGISTER_OP_CUSTOM_DEVICE_KERNEL(
c_softmax_with_cross_entropy,
device_type,
paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel<
phi::CustomContext,
float>,
paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel<
phi::CustomContext,
double>,
paddle::operators::CSoftmaxWithCrossEntropyOpCustomDeviceKernel<
phi::CustomContext,
phi::dtype::float16>);

REGISTER_OP_CUSTOM_DEVICE_KERNEL(
c_softmax_with_cross_entropy_grad,
device_type,
paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel<
phi::CustomContext,
float>,
paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel<
phi::CustomContext,
double>,
paddle::operators::CSoftmaxWithCrossEntropyGradCustomDeviceKernel<
phi::CustomContext,
phi::dtype::float16>);

#endif
}
131 changes: 131 additions & 0 deletions paddle/phi/kernels/custom/c_concat_kernel.cc
@@ -0,0 +1,131 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/core/distributed/collective/process_group.h"
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/core/distributed/xccl_comm_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
namespace phi {

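// c_concat for custom devices: all-gathers the local shard `x` from every
// rank in the ring and concatenates the gathered shards along the last axis.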
template <typename T, typename Context>
void CConcatKernel(const Context& dev_ctx,
const DenseTensor& x_in,
int rank,
int nranks,
int ring_id UNUSED,
bool use_calc_stream UNUSED,
bool use_model_parallel UNUSED,
DenseTensor* out) {
auto x = &x_in;
int rid = ring_id;
auto place = dev_ctx.GetPlace();

PADDLE_ENFORCE_GE(rank,
0,
common::errors::PreconditionNotMet(
"The value of rank (%d) for c_concat must be "
"greater than or equal to 0.",
rank));
PADDLE_ENFORCE_GE(nranks,
2,
common::errors::PreconditionNotMet(
"The value of nranks (%d) for c_concat must be "
"greater than or equal to 2.",
nranks));
PADDLE_ENFORCE_LT(rank,
nranks,
common::errors::PreconditionNotMet(
"The value of rank (%d) for c_concat must be "
"less than that of nranks (%d).",
rank,
nranks));

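// Temporary buffer for the all-gather result: nranks copies of x stacked
// along dim 0.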
phi::DenseTensor temp_out;
phi::DDim temp_out_dims = x->dims();
temp_out_dims[0] *= nranks;
temp_out.Resize(temp_out_dims);
dev_ctx.template Alloc<T>(&temp_out);

auto map = distributed::ProcessGroupMapFromGid::getInstance();
if (map->has(rid)) {
// Use ProcessGroup
distributed::ProcessGroup* pg = map->get(rid);
std::vector<phi::DenseTensor> in_tensor;
std::vector<phi::DenseTensor> out_tensor;
in_tensor.push_back(*x);
out_tensor.push_back(temp_out);
auto task = pg->AllGather(in_tensor, out_tensor);
task->Wait();
} else {
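// No ProcessGroup is bound to this ring id; fall back to the XCCL comm
// context registered under the ring id string.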
auto comm = reinterpret_cast<phi::distributed::XCCLCommContext*>(
phi::distributed::CommContextManager::GetInstance().Get(
std::to_string(rid)));
PADDLE_ENFORCE_EQ(
nranks,
comm->GetSize(),
common::errors::InvalidArgument(
"nranks: %s should equal to %s", nranks, comm->GetSize()));

int64_t send_numel = x->numel();
const T* send_buff = x->data<T>();
T* recv_buff = temp_out.data<T>();
// Use the stream from the device context as the calc stream.
auto& stream = *dev_ctx.GetStream();
phi::DeviceManager::CCLAllGather(
place.GetDeviceType(),
reinterpret_cast<void*>(const_cast<T*>(send_buff)),
recv_buff,
send_numel,
x->dtype(),
comm->GetXcclComm(),
stream);
}
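// The all-gather stacks each rank's rows along dim 0; slice the per-rank
// chunks back out and concatenate them along the last axis.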
std::vector<phi::DenseTensor> inputs;
int axis = x->dims().size() - 1;
auto out_dims = x->dims();
out_dims[out_dims.size() - 1] *= nranks;
int rows_per_tensor = x->dims()[0];
int offset = 0;
for (int i = 0; i < nranks; i++) {
phi::DenseTensor temp = temp_out.Slice(offset, offset + rows_per_tensor);
inputs.emplace_back(temp);
offset += rows_per_tensor;
}

out->Resize(out_dims);
std::vector<paddle::Tensor> inputs_t(inputs.size());
for (size_t i = 0; i < inputs.size(); i++) {
auto t = std::make_shared<phi::DenseTensor>();
t->ShareDataWith(inputs[i]);
inputs_t[i].set_impl(t);
}
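// Reuse the high-level concat API; the output shares the result tensor's
// storage rather than copying it.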
auto output = paddle::experimental::concat(inputs_t, axis);
out->ShareDataWith(*reinterpret_cast<phi::DenseTensor*>(output.impl().get()));
}
} // namespace phi

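// Register through the phi kernel registry for the Custom backend,
// superseding the fluid REGISTER_OP_CUSTOM_DEVICE_KERNEL entry removed from
// custom_device_common_op_registry.cc.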
PD_REGISTER_KERNEL(c_concat,
Custom,
ALL_LAYOUT,
phi::CConcatKernel,
float,
phi::dtype::float16,
phi::dtype::bfloat16) {}
#endif
111 changes: 111 additions & 0 deletions paddle/phi/kernels/custom/c_softmax_with_entropy_grad_kernel.cc
@@ -0,0 +1,111 @@
// Copyright (c) 2025 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/api/backward/backward_api.h"
#include "paddle/phi/api/include/api.h"
#include "paddle/phi/backends/all_context.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/phi/core/distributed/collective/process_group.h"
#include "paddle/phi/core/distributed/comm_context_manager.h"
#include "paddle/phi/core/distributed/xccl_comm_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/core/tensor_utils.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#ifdef PADDLE_WITH_CUSTOM_DEVICE
namespace phi {

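// Backward of c_softmax_with_cross_entropy for custom devices: computes the
// gradient w.r.t. the (sharded) logits from the saved softmax, the labels,
// and the incoming loss gradient.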
template <typename T, typename Context>
void CSoftmaxWithEntropyGradKernel(const Context& dev_ctx,
const DenseTensor& softmax_in,
const DenseTensor& label_in,
const DenseTensor& loss_grad_in,
int64_t ignore_index,
int ring_id,
int rank,
int nranks,
DenseTensor* logits_grad) {
const phi::DenseTensor* labels = &label_in;
const phi::DenseTensor* loss_grad = &loss_grad_in;
const phi::DenseTensor* softmax = &softmax_in;
phi::DenseTensor* logit_grad = logits_grad;

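// Initialize logits_grad with the saved softmax values (unless the two
// already share storage); the gradient is then built in place.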
if (logit_grad != softmax) {
phi::Copy(dev_ctx, *softmax, dev_ctx.GetPlace(), false, logit_grad);
}
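// N: rows flattened over all axes before the last; D: this rank's shard of
// the class dimension.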
const auto softmax_dims = softmax->dims();
const int axis = softmax_dims.size() - 1;
const int N = phi::funcs::SizeToAxis(axis, softmax_dims);
const int D = phi::funcs::SizeFromAxis(axis, softmax_dims);
const auto& label_type = labels->dtype();

if (label_type == phi::DataType::INT32 ||
label_type == phi::DataType::INT64) {
auto logit_grad_t = std::make_shared<phi::DenseTensor>();
logit_grad_t->ShareDataWith(*logit_grad).Resize({N, D});
auto loss_grad_t = std::make_shared<phi::DenseTensor>();
loss_grad_t->ShareDataWith(*loss_grad).Resize({N});
auto labels_1d = std::make_shared<phi::DenseTensor>();
labels_1d->ShareDataWith(*labels).Resize({N});
paddle::Tensor logits_grad_tensor(logit_grad_t),
loss_grad_tensor(loss_grad_t), labels_1d_tensor(labels_1d);

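// Sketch of the math below:
//   logits_grad = (softmax * (label != ignore_index)
//                  - one_hot(label - rank * D)) * loss_grad
// implemented with phi's functional ops (not_equal, one_hot, subtract,
// multiply, cast).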
auto labels_1d_not_equal_ignore = paddle::experimental::reshape(
paddle::experimental::not_equal(
labels_1d_tensor,
paddle::experimental::full_like(labels_1d_tensor,
ignore_index,
labels_1d_tensor.dtype(),
labels_1d_tensor.place())),
{N, 1});
auto start_index_tensor =
paddle::experimental::full_like(labels_1d_tensor,
rank * D,
labels_1d_tensor.dtype(),
labels_1d_tensor.place());

auto logits_grad_out_tensor1 = paddle::experimental::subtract(
paddle::experimental::multiply(
logits_grad_tensor,
paddle::experimental::cast(labels_1d_not_equal_ignore,
logits_grad_tensor.dtype())),
paddle::experimental::cast(
paddle::experimental::one_hot(
paddle::experimental::subtract(labels_1d_tensor,
start_index_tensor),
D),
logits_grad_tensor.dtype()));

auto logits_grad_out_tensor2 = paddle::experimental::multiply(
logits_grad_out_tensor1,
paddle::experimental::reshape(loss_grad_tensor, {N, 1}));
logit_grad
->ShareDataWith(*reinterpret_cast<phi::DenseTensor*>(
logits_grad_out_tensor2.impl().get()))
.Resize(softmax_dims);
} else {
PADDLE_THROW(common::errors::Unavailable(
"CustomDevice c_softmax_with_cross_entropy_grad "
"label_type only support int32/int64"));
}
}
} // namespace phi

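// Registered for the Custom backend with the same dtypes (float, double,
// float16) as the fluid registration removed above.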
PD_REGISTER_KERNEL(c_softmax_with_cross_entropy_grad,
Custom,
ALL_LAYOUT,
phi::CSoftmaxWithEntropyGradKernel,
float,
double,
phi::dtype::float16) {}
#endif