Skip to content

Commit 751ce2a

Browse files
committed
fix conflic
1 parent b817edd commit 751ce2a

File tree

4 files changed

+75
-127
lines changed

4 files changed

+75
-127
lines changed

paddle/phi/kernels/funcs/broadcast_function.h

Lines changed: 49 additions & 52 deletions
Original file line numberDiff line numberDiff line change
@@ -189,45 +189,29 @@ struct BroadcastDataLoader<Index, VecSize, false, kElementwise> {
189189
}
190190
};
191191

192-
// Common broadcast data loader.
193-
template <int Index, int VecSize, bool IsBoundary>
194-
struct BroadcastDataLoader<Index, VecSize, IsBoundary, kBroadcast> {
195-
template <typename Array1, typename Array2, typename Array3, typename ArgsT>
196-
static __device__ __forceinline__ void Apply(const Array1 &ins,
197-
ArgsT *args,
198-
const Array2 &configs,
199-
const Array3 &use_broadcast,
200-
const int block_offset,
201-
const int num,
202-
const uint32_t numel) {
192+
template <int Index, int VecSize>
193+
struct BroadcastDataInit {
194+
template <typename ArgsT>
195+
static __device__ __forceinline__ void Apply(ArgsT *args) {
203196
using Type = std::tuple_element_t<Index, ArgsT>;
204-
uint32_t index_bc[VecSize];
205197
#pragma unroll
206198
for (int k = 0; k < VecSize; ++k) {
207-
index_bc[k] = 0;
208199
std::get<Index>(args[k]) = static_cast<Type>(1);
209200
}
201+
}
202+
};
210203

211-
uint32_t thread_offset = block_offset + threadIdx.x * VecSize;
212-
#pragma unroll
213-
for (int k = 0; k < VecSize; ++k) {
214-
uint32_t idx = thread_offset + k;
215-
if (IsBoundary && idx == numel) {
216-
break;
217-
}
218-
#pragma unroll
219-
for (int i = 0; i < phi::DDim::kMaxRank; ++i) {
220-
if (i == configs[0].rank) break;
221-
auto fast_divmoder = configs[0].divmoders[i].Divmod(idx);
222-
idx = fast_divmoder.val[0];
223-
index_bc[k] += fast_divmoder.val[1] * configs[Index].strides[i];
224-
}
225-
}
226-
204+
template <int Index, int VecSize>
205+
struct BroadcastDataSetter {
206+
template <typename Array, typename ArgsT>
207+
static __device__ __forceinline__ void Apply(const Array &ins,
208+
ArgsT *args,
209+
uint32_t index_bc[][VecSize]) {
210+
using Type = std::tuple_element_t<Index, ArgsT>;
227211
#pragma unroll
228212
for (int k = 0; k < VecSize; ++k) {
229213
std::get<Index>(args[k]) =
230-
reinterpret_cast<const _ptr_ Type *>(ins[Index])[index_bc[k]];
214+
reinterpret_cast<const _ptr_ Type *>(ins[Index])[index_bc[Index][k]];
231215
}
232216
}
233217
};
@@ -285,8 +269,30 @@ __device__ void VectorizedBroadcastKernelImpl(
285269
__simd__ ArgsT args[VecSize];
286270
__simd__ ConditionalT<OutT, NumOuts> result[VecSize];
287271

288-
BcUnroller<BroadcastDataLoader, IsBoundary, LoadType, VecSize, Arity>::step(
289-
ins, args, configs, use_broadcast, block_offset, num, numel);
272+
if (LoadType == kBroadcast) {
273+
uint32_t index_bc[Arity][VecSize] = {0};
274+
Unroller<BroadcastDataInit, VecSize, Arity>::step(args);
275+
uint32_t thread_offset = block_offset + threadIdx.x * VecSize;
276+
#pragma unroll
277+
for (int k = 0; k < VecSize; ++k) {
278+
uint32_t idx = thread_offset + k;
279+
if (IsBoundary && idx == numel) break;
280+
#pragma unroll
281+
for (int i = 0; i < phi::DDim::kMaxRank; ++i) {
282+
if (i == configs[0].rank) break;
283+
auto fast_divmoder = configs[0].divmoders[i].Divmod(idx);
284+
idx = fast_divmoder.val[0];
285+
#pragma unroll
286+
for (int j = 0; j < Arity; ++j) {
287+
index_bc[j][k] += fast_divmoder.val[1] * configs[j].strides[i];
288+
}
289+
}
290+
}
291+
Unroller<BroadcastDataSetter, VecSize, Arity>::step(ins, args, index_bc);
292+
} else {
293+
BcUnroller<BroadcastDataLoader, IsBoundary, LoadType, VecSize, Arity>::step(
294+
ins, args, configs, use_broadcast, block_offset, num, numel);
295+
}
290296

291297
SameDimsElementwisePrimitiveCaller<ConditionalT<OutT, NumOuts>,
292298
VecSize,
@@ -783,11 +789,7 @@ struct LaunchBroadcastKernelWithInt64IndexHelper<OutT,
783789
};
784790
#endif
785791

786-
template <ElementwiseType ET,
787-
typename OutT,
788-
typename Functor,
789-
int kArity,
790-
int NumOuts = 1>
792+
template <typename OutT, typename Functor, int kArity, int NumOuts = 1>
791793
void BroadcastKernelForDifferentVecSize(
792794
const KPDevice &ctx,
793795
const std::vector<const DenseTensor *> &ins,
@@ -922,16 +924,12 @@ void BroadcastKernelForDifferentVecSize(
922924
}
923925
}
924926

925-
template <ElementwiseType ET,
926-
typename InT,
927-
typename OutT,
928-
typename Functor,
929-
int NumOuts = 1>
927+
template <typename OutT, typename Functor, int NumOuts = 1>
930928
void BroadcastKernel(const KPDevice &ctx,
931929
const std::vector<const DenseTensor *> &ins,
932930
std::vector<DenseTensor *> *outs,
933-
int axis,
934-
Functor func) {
931+
Functor func,
932+
int axis = -1) {
935933
// When there are multiple inputs, the outputs's rank should be equal the
936934
// maximum rank of all inputs.
937935
using Traits = phi::funcs::FunctionTraits<Functor>;
@@ -968,23 +966,22 @@ void BroadcastKernel(const KPDevice &ctx,
968966
max_rank = std::max(max_rank, (*outs)[0]->dims().size());
969967
}
970968
axis = axis == -1 ? max_rank - min_rank : axis;
971-
BroadcastKernelForDifferentVecSize<ET, OutT, Functor, kArity, NumOuts>(
969+
BroadcastKernelForDifferentVecSize<OutT, Functor, kArity, NumOuts>(
972970
ctx, ins, outs, axis, func);
973971
}
974972

975973
template <typename Functor, typename T, typename OutType = T>
976974
void ElementwiseCompute(const GPUContext &dev_ctx,
977975
const DenseTensor &x,
978976
const DenseTensor &y,
979-
int axis,
980977
Functor func,
981-
DenseTensor *z) {
978+
DenseTensor *z,
979+
int axis = -1) {
982980
std::vector<const DenseTensor *> ins = {&x, &y};
983981
std::vector<DenseTensor *> outs = {z};
984982
dev_ctx.template Alloc<OutType>(z);
985983

986-
BroadcastKernel<ElementwiseType::kBinary, T, OutType, Functor, 1>(
987-
dev_ctx, ins, &outs, axis, func);
984+
BroadcastKernel<OutType, Functor, 1>(dev_ctx, ins, &outs, func, axis);
988985
}
989986

990987
template <typename DeviceContext,
@@ -999,7 +996,7 @@ void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
999996
auto x_dims = x.dims();
1000997
auto y_dims = y.dims();
1001998
dev_ctx.template Alloc<T>(z);
1002-
funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
999+
funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, Functor(), z, axis);
10031000
}
10041001

10051002
#else
@@ -1017,10 +1014,10 @@ void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
10171014
auto y_dims = y.dims();
10181015
dev_ctx.template Alloc<T>(z);
10191016
if (x_dims.size() >= y_dims.size()) {
1020-
funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
1017+
funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, Functor(), z, axis);
10211018
} else {
10221019
funcs::ElementwiseCompute<InverseFunctor, T>(
1023-
dev_ctx, x, y, axis, InverseFunctor(), z);
1020+
dev_ctx, x, y, InverseFunctor(), z, axis);
10241021
}
10251022
}
10261023
#endif

paddle/phi/kernels/funcs/dropout_impl.cu.h

Lines changed: 22 additions & 71 deletions
Original file line numberDiff line numberDiff line change
@@ -191,25 +191,19 @@ __global__ void VectorizedRandomGenerator(const size_t n,
191191
}
192192

193193
template <typename T>
194-
__global__ void DropOutNdForwardKernel(
195-
const size_t n,
196-
uint64_t seed,
197-
const float dropout_prob,
198-
const T* src,
199-
uint8_t* mask,
200-
uint64_t increment,
201-
size_t main_offset,
202-
DstFunctor<T> dst_functor,
203-
MaskFunctor<T> mask_functor,
204-
T* y,
205-
int64_t N,
206-
kps::details::BroadcastConfig broadcast_config,
207-
const uint64_t* seed_ptr) {
194+
__global__ void VectorizedGeneratorMask(const size_t n,
195+
uint64_t seed,
196+
const float dropout_prob,
197+
const T* src,
198+
uint8_t* mask,
199+
uint64_t increment,
200+
size_t main_offset,
201+
MaskFunctor<T> mask_functor,
202+
203+
const uint64_t* seed_ptr) {
208204
// Vectorized Generate Mask
209205
// kCount is 4 for curand_uniform4 is used
210-
if (seed_ptr) {
211-
seed = seed_ptr[0];
212-
}
206+
if (seed_ptr) seed = seed_ptr[0];
213207

214208
constexpr int kCount = phi::funcs::uniform_distribution<float>::kReturnsCount;
215209
size_t idx = static_cast<size_t>(BLOCK_ID_X * BLOCK_NUM_X);
@@ -259,22 +253,6 @@ __global__ void DropOutNdForwardKernel(
259253
kps::WriteData<uint8_t, kCount, 1, true>(
260254
mask + fix, &mask_result[0], remainder);
261255
}
262-
// Broadcast mask data and do elementwise operaiton with DstFunctor
263-
CUDA_KERNEL_LOOP(i, N) {
264-
uint32_t offset = 0u;
265-
uint32_t idx = i;
266-
// Use (j < phi::DDim::kMaxRank) conditiion rather than
267-
// (j < broadcast_config.rank) for (#pragma unroll)
268-
#pragma unroll
269-
for (int j = 0; j < phi::DDim::kMaxRank; ++j) {
270-
if (j == broadcast_config.rank) break;
271-
auto fast_divmoder = broadcast_config.divmoders[j].Divmod(idx);
272-
idx = fast_divmoder.val[0];
273-
offset += broadcast_config.strides[j] * fast_divmoder.val[1];
274-
}
275-
__syncthreads();
276-
y[i] = dst_functor(src[i], mask[offset]);
277-
}
278256
}
279257

280258
template <typename T, typename MT>
@@ -348,18 +326,6 @@ void DropoutFwGPUKernelDriver(
348326
size / (block_size * kVecSize) * (block_size * kVecSize);
349327

350328
if (is_dropout_nd) {
351-
auto dst_functor =
352-
DstFunctor<T>(1.0f - dropout_prob, upscale_in_train, x_numel);
353-
354-
std::vector<int64_t> out_dims =
355-
std::move(phi::vectorize<int64_t>(x.dims()));
356-
std::vector<int64_t> in_dims =
357-
std::move(phi::vectorize<int64_t>(mask->dims()));
358-
std::reverse(out_dims.begin(), out_dims.end());
359-
std::reverse(in_dims.begin(), in_dims.end());
360-
kps::details::BroadcastConfig broadcast_config(
361-
out_dims, in_dims, x.dims().size());
362-
363329
auto mask_functor = MaskFunctor<T>(1.0f - dropout_prob);
364330
bool copy_in_kernel = GetSeedDataAndIncrement(dev_ctx,
365331
seed,
@@ -372,20 +338,22 @@ void DropoutFwGPUKernelDriver(
372338
const uint64_t* seed_ptr =
373339
copy_in_kernel ? seed->data<uint64_t>() : nullptr;
374340

375-
DropOutNdForwardKernel<T>
341+
VectorizedGeneratorMask<T>
376342
<<<grid_size, block_size, 0, stream>>>(size,
377343
seed_data,
378344
dropout_prob,
379345
x_data,
380346
mask_data,
347+
381348
increment,
382349
main_offset,
383-
dst_functor,
384350
mask_functor,
385-
y_data,
386-
y->numel(),
387-
broadcast_config,
388351
seed_ptr);
352+
auto dst_functor =
353+
DstFunctor<T>(1.0f - dropout_prob, upscale_in_train, x_numel);
354+
std::vector<const phi::DenseTensor*> ins = {&x, mask};
355+
std::vector<phi::DenseTensor*> outs = {y};
356+
phi::funcs::BroadcastKernel<T>(dev_ctx, ins, &outs, dst_functor);
389357
} else {
390358
bool copy_in_kernel = GetSeedDataAndIncrement(
391359
dev_ctx, seed, is_fix_seed, seed_val, offset, &seed_data, &increment);
@@ -469,30 +437,13 @@ void DropoutGradGPUKernelDriver(const phi::GPUContext& dev_ctx,
469437
MT factor = upscale_in_train
470438
? static_cast<MT>(1.0f / (1.0f - dropout_prob))
471439
: static_cast<MT>(1.0f);
440+
441+
std::vector<const phi::DenseTensor*> ins = {&grad_y, &mask};
442+
std::vector<phi::DenseTensor*> outs = {grad_x};
472443
if (is_dropout_nd) {
473-
phi::DenseTensor broadcasted_mask;
474-
475-
broadcasted_mask.Resize(grad_y.dims());
476-
dev_ctx.template Alloc<uint8_t>(&broadcasted_mask);
477-
478-
std::vector<const phi::DenseTensor*> broadcast_ins = {&mask};
479-
std::vector<phi::DenseTensor*> broadcast_outs = {&broadcasted_mask};
480-
phi::funcs::BroadcastKernel<phi::ElementwiseType::kUnary,
481-
uint8_t,
482-
uint8_t>(dev_ctx,
483-
broadcast_ins,
484-
&broadcast_outs,
485-
-1,
486-
kps::IdentityFunctor<uint8_t>());
487-
488-
std::vector<const phi::DenseTensor*> ins = {&grad_y, &broadcasted_mask};
489-
std::vector<phi::DenseTensor*> outs = {grad_x};
490-
phi::funcs::ElementwiseKernel<T>(
444+
phi::funcs::BroadcastKernel<T>(
491445
dev_ctx, ins, &outs, CudaDropoutGradFunctor<T>(factor));
492-
493446
} else {
494-
std::vector<const phi::DenseTensor*> ins = {&grad_y, &mask};
495-
std::vector<phi::DenseTensor*> outs = {grad_x};
496447
phi::funcs::ElementwiseKernel<T>(
497448
dev_ctx, ins, &outs, CudaDropoutGradFunctor<T>(factor));
498449
}

paddle/phi/kernels/funcs/elementwise_base.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,6 @@ namespace kps = phi::kps;
3535

3636
namespace phi {
3737

38-
enum ElementwiseType { kUnary = 1, kBinary = 2, kTernary = 3 };
3938
/* Packing scalar type T(float, int etc.) into Array<T, NumOuts> type
4039
for supporting multiple-output feature in elementwise system.*/
4140
template <class T, int Num>
@@ -369,9 +368,9 @@ template <typename Functor, typename T, typename OutType = T>
369368
void ElementwiseCompute(const CPUContext &dev_ctx,
370369
const DenseTensor &x,
371370
const DenseTensor &y,
372-
int axis,
373371
Functor func,
374-
DenseTensor *z) {
372+
DenseTensor *z,
373+
int axis = -1) {
375374
dev_ctx.Alloc<OutType>(z);
376375
auto x_dims = x.dims();
377376
auto y_dims = y.dims();

paddle/phi/kernels/legacy/kps/elementwise_raw_kernel.cu

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -174,4 +174,5 @@ PD_REGISTER_KERNEL(elementwise_pow_raw,
174174
float16,
175175
int64_t,
176176
bfloat16) {}
177-
#endif
177+
178+
#endif

0 commit comments

Comments
 (0)