Skip to content

Commit 9adca1e

Browse files
authored
move "gpu_primitives.h" to phi (#48015)
1 parent e4ebf38 commit 9adca1e

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+895
-328
lines changed

paddle/phi/backends/gpu/gpu_primitives.h

Lines changed: 610 additions & 0 deletions
Large diffs are not rendered by default.

paddle/phi/kernels/funcs/detail/gru_gpu_kernel.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ limitations under the License. */
1515
#pragma once
1616
#include <type_traits>
1717

18-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
1918
#include "paddle/fluid/platform/device_context.h"
19+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2020
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
2121
#include "paddle/phi/kernels/funcs/gru_compute.h"
2222

paddle/phi/kernels/funcs/detail/lstm_gpu_kernel.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ limitations under the License. */
1515
#pragma once
1616
#include <type_traits>
1717

18-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
1918
#include "paddle/fluid/platform/device_context.h"
19+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2020
#include "paddle/phi/kernels/funcs/detail/activation_functions.h"
2121
#include "paddle/phi/kernels/funcs/lstm_compute.h"
2222

@@ -202,15 +202,12 @@ __global__ void KeLstmBackward(Op op,
202202
if (is_batch) {
203203
if (value.prev_state_value) {
204204
if (grad.check_ig_grad)
205-
paddle::platform::CudaAtomicAdd(grad.check_ig_grad + frame_idx,
206-
r_checkIGrad);
205+
phi::CudaAtomicAdd(grad.check_ig_grad + frame_idx, r_checkIGrad);
207206
if (grad.check_fg_grad)
208-
paddle::platform::CudaAtomicAdd(grad.check_fg_grad + frame_idx,
209-
r_checkFGrad);
207+
phi::CudaAtomicAdd(grad.check_fg_grad + frame_idx, r_checkFGrad);
210208
}
211209
if (grad.check_og_grad)
212-
paddle::platform::CudaAtomicAdd(grad.check_og_grad + frame_idx,
213-
r_checkOGrad);
210+
phi::CudaAtomicAdd(grad.check_og_grad + frame_idx, r_checkOGrad);
214211
} else {
215212
if (value.prev_state_value) {
216213
if (grad.check_ig_grad) grad.check_ig_grad[frame_idx] += r_checkIGrad;

paddle/phi/kernels/funcs/gather.cu.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ limitations under the License. */
1818

1919
#include "paddle/fluid/memory/memcpy.h"
2020
// TODO(paddle-dev): move gpu_primitives.h to phi
21-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
2221
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
22+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2323
#include "paddle/phi/common/place.h"
2424
#include "paddle/phi/core/dense_tensor.h"
2525
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -217,7 +217,7 @@ __global__ void GatherGradGPUKernel(const T* input,
217217
int64_t out_index =
218218
inner_dim_index * (outer_dim_size * out_index_dim_size) +
219219
index[index_dim_index] * outer_dim_size + out_dim_index;
220-
paddle::platform::CudaAtomicAdd(out + out_index, *(input + idx));
220+
phi::CudaAtomicAdd(out + out_index, *(input + idx));
221221
}
222222
}
223223

paddle/phi/kernels/funcs/pooling.cu

Lines changed: 4 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ limitations under the License. */
1515
#include <algorithm>
1616
#include <vector>
1717

18-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
1918
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
19+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2020
#include "paddle/phi/kernels/funcs/pooling.h"
2121
#include "paddle/phi/kernels/funcs/reduce_function.h"
2222
#include "paddle/phi/kernels/primitive/datamover_primitives.h"
@@ -428,8 +428,7 @@ __global__ void KernelMaxPool2DGrad(const int nthreads,
428428

429429
if (maxIndex != -1) {
430430
// atomic add
431-
paddle::platform::CudaAtomicAdd(input_grad + maxIndex,
432-
output_grad[index]);
431+
phi::CudaAtomicAdd(input_grad + maxIndex, output_grad[index]);
433432
}
434433
}
435434
}
@@ -1330,7 +1329,7 @@ __global__ void KernelMaxPool3DGrad(const int nthreads,
13301329
}
13311330
if (maxIdx != -1) {
13321331
// atomic add
1333-
paddle::platform::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
1332+
phi::CudaAtomicAdd(input_grad + maxIdx, output_grad[index]);
13341333
}
13351334
}
13361335
}
@@ -2359,7 +2358,7 @@ __global__ void KernelMaxPool3DWithIdxGrad(
23592358
w_offset;
23602359
int max_index = mask[output_index];
23612360
if (max_index != -1) {
2362-
paddle::platform::CudaAtomicAdd(
2361+
phi::CudaAtomicAdd(
23632362
&input_grad[nc_offset * input_depth * input_height * input_width +
23642363
max_index],
23652364
output_grad[output_index]);

paddle/phi/kernels/funcs/scatter.cu.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,8 @@ limitations under the License. */
1616
#include <unordered_set>
1717
#include <vector>
1818

19-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
2019
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
20+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2121
#include "paddle/phi/common/place.h"
2222
#include "paddle/phi/core/dense_tensor.h"
2323
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -70,7 +70,7 @@ __global__ void ScatterCUDAKernel(const T* params,
7070
if (overwrite) {
7171
*(output + out_i) = *(params + i);
7272
} else {
73-
paddle::platform::CudaAtomicAdd(output + out_i, *(params + i));
73+
phi::CudaAtomicAdd(output + out_i, *(params + i));
7474
}
7575
}
7676
}
@@ -104,7 +104,7 @@ __global__ void ScatterNdCUDAKernel(const T* update,
104104
temp *= output_dims[j];
105105
}
106106
int64_t output_i = gather_i + slice_i;
107-
paddle::platform::CudaAtomicAdd(output + output_i, *(update + i));
107+
phi::CudaAtomicAdd(output + output_i, *(update + i));
108108
}
109109
}
110110

paddle/phi/kernels/funcs/segment_pooling.cu

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -14,9 +14,9 @@ limitations under the License. */
1414

1515
#include <algorithm>
1616

17-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
1817
#include "paddle/phi/backends/gpu/gpu_context.h"
1918
#include "paddle/phi/backends/gpu/gpu_launch_config.h"
19+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2020
#include "paddle/phi/kernels/funcs/gather.cu.h"
2121
#include "paddle/phi/kernels/funcs/math_function.h"
2222
#include "paddle/phi/kernels/funcs/segment_pooling.h"
@@ -60,7 +60,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids,
6060
}
6161
if (j > 0) {
6262
if (last_segment_id == first_segment_id) {
63-
paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
63+
phi::CudaAtomicAdd(summed_ids + last_segment_id, sum);
6464
} else {
6565
*(summed_ids + last_segment_id) = sum;
6666
}
@@ -70,7 +70,7 @@ __global__ void SegmentSumIdsKernel(const Index* segment_ids,
7070
sum += T(1);
7171
last_segment_id = current_segment_id;
7272
}
73-
paddle::platform::CudaAtomicAdd(summed_ids + last_segment_id, sum);
73+
phi::CudaAtomicAdd(summed_ids + last_segment_id, sum);
7474
}
7575
}
7676

@@ -111,8 +111,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids,
111111
last_segment_id * inner_dim_size + segment_offset;
112112

113113
if (last_segment_id == first_segment_id) {
114-
paddle::platform::CudaAtomicAdd(
115-
output + output_index, sum / *(summed_ids + last_segment_id));
114+
phi::CudaAtomicAdd(output + output_index,
115+
sum / *(summed_ids + last_segment_id));
116116
} else {
117117
*(output + output_index) = sum / *(summed_ids + last_segment_id);
118118
}
@@ -123,8 +123,8 @@ __global__ void SegmentMeanKernel(const Index* segment_ids,
123123
last_segment_id = current_segment_id;
124124
}
125125
Index output_index = last_segment_id * inner_dim_size + segment_offset;
126-
paddle::platform::CudaAtomicAdd(output + output_index,
127-
sum / *(summed_ids + last_segment_id));
126+
phi::CudaAtomicAdd(output + output_index,
127+
sum / *(summed_ids + last_segment_id));
128128
}
129129
}
130130

@@ -215,7 +215,7 @@ class MaxPool {
215215
DEVICE inline T initial() { return static_cast<T>(-FLT_MAX); }
216216
DEVICE inline void compute(const T& x, T* y) { *y = *y > x ? *y : x; }
217217
DEVICE inline T atomic(T* address, const T val) {
218-
return paddle::platform::CudaAtomicMax(address, val);
218+
return phi::CudaAtomicMax(address, val);
219219
}
220220
};
221221

@@ -225,7 +225,7 @@ class MinPool {
225225
DEVICE inline T initial() { return static_cast<T>(FLT_MAX); }
226226
DEVICE inline void compute(const T& x, T* y) { *y = *y < x ? *y : x; }
227227
DEVICE inline T atomic(T* address, const T val) {
228-
return paddle::platform::CudaAtomicMin(address, val);
228+
return phi::CudaAtomicMin(address, val);
229229
}
230230
};
231231

@@ -235,7 +235,7 @@ class SumPool {
235235
DEVICE inline T initial() { return static_cast<T>(0); }
236236
DEVICE inline void compute(const T& x, T* y) { *y = *y + x; }
237237
DEVICE inline T atomic(T* address, const T val) {
238-
return paddle::platform::CudaAtomicAdd(address, val);
238+
return phi::CudaAtomicAdd(address, val);
239239
}
240240
};
241241

paddle/phi/kernels/funcs/selected_rows_functor.cu

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ limitations under the License. */
1515
#include <set>
1616
#include <vector>
1717

18-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
18+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
1919
#include "paddle/phi/common/bfloat16.h"
2020
#include "paddle/phi/common/float16.h"
2121
#include "paddle/phi/kernels/funcs/math_function.h"
@@ -127,7 +127,7 @@ __global__ void SelectedRowsAddTensorKernel(const T* selected_rows,
127127
// Since index in rows of SelectedRows can be duplicate, we can not use
128128
// tensor_out[index] += selected_rows[index]; Instead, we have to use
129129
// AtomicAdd to avoid concurrent write error.
130-
paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
130+
phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
131131
}
132132
}
133133
} // namespace
@@ -279,7 +279,7 @@ __global__ void SelectedRowsAddToTensorKernel(const T* selected_rows,
279279
for (int index = tid; index < row_numel; index += block_size) {
280280
// Since index in rows of SelectedRows can be duplicate, we have to use
281281
// Atomic Operation to avoid concurrent write error.
282-
paddle::platform::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
282+
phi::CudaAtomicAdd(tensor_out + index, selected_rows[index]);
283283
}
284284
}
285285
} // namespace
@@ -360,7 +360,7 @@ __global__ void MergeAddKernel(const T* input,
360360
input += ty * row_numel;
361361
out += out_idx * row_numel;
362362
for (int index = tid; index < row_numel; index += block_size) {
363-
paddle::platform::CudaAtomicAdd(out + index, input[index]);
363+
phi::CudaAtomicAdd(out + index, input[index]);
364364
}
365365
}
366366

@@ -623,9 +623,9 @@ struct UpdateToTensor<phi::GPUContext, T> {
623623
auto* in1_data = in1_value.template data<T>();
624624
auto* in2_data = input2->data<T>();
625625

626-
dim3 threads(paddle::platform::PADDLE_CUDA_NUM_THREADS, 1);
626+
dim3 threads(phi::PADDLE_CUDA_NUM_THREADS, 1);
627627
dim3 grid(in1_rows.size(), 1);
628-
UpdateToTensorKernel<T, paddle::platform::PADDLE_CUDA_NUM_THREADS>
628+
UpdateToTensorKernel<T, phi::PADDLE_CUDA_NUM_THREADS>
629629
<<<grid, threads, 0, context.stream()>>>(
630630
in1_data, in1_rows.cuda_data(), op, in2_data, in1_row_numel);
631631
}

paddle/phi/kernels/gpu/accuracy_kernel.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,14 @@
1717
#include <thrust/execution_policy.h>
1818
#include <thrust/reduce.h>
1919

20-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
2120
#include "paddle/phi/backends/gpu/gpu_context.h"
2221
#include "paddle/phi/backends/gpu/gpu_info.h"
22+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
2323
#include "paddle/phi/common/float16.h"
2424
#include "paddle/phi/core/kernel_registry.h"
2525

2626
namespace phi {
27-
using paddle::platform::PADDLE_CUDA_NUM_THREADS;
27+
using phi::PADDLE_CUDA_NUM_THREADS;
2828

2929
template <int BlockSize>
3030
__global__ void AccuracyCudaKernel(const int N,

paddle/phi/kernels/gpu/adagrad_kernel.cu

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,8 @@
1414

1515
#include "paddle/phi/kernels/adagrad_kernel.h"
1616

17-
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
1817
#include "paddle/phi/backends/gpu/gpu_context.h"
18+
#include "paddle/phi/backends/gpu/gpu_primitives.h"
1919
#include "paddle/phi/core/kernel_registry.h"
2020
#include "paddle/phi/kernels/funcs/math_function.h"
2121
#include "paddle/phi/kernels/funcs/selected_rows_functor.h"
@@ -47,7 +47,7 @@ __global__ void MergeGradKernel(const T* grad,
4747
grad += ty * row_numel;
4848
grad_merge += grad_merge_idx * row_numel;
4949
for (int index = tid; index < row_numel; index += block_size) {
50-
paddle::platform::CudaAtomicAdd(grad_merge + index, grad[index]);
50+
phi::CudaAtomicAdd(grad_merge + index, grad[index]);
5151
}
5252
}
5353

@@ -69,9 +69,9 @@ __global__ void SparseAdagradFunctorKernel(const T* grad,
6969
for (int index = tid; index < row_numel; index += block_size) {
7070
// Since index in rows of SelectedRows can be duplicate, we have to use
7171
// Atomic Operation to avoid concurrent write error.
72-
paddle::platform::CudaAtomicAdd(param + index,
73-
-1.0 * learning_rate[0] * grad[index] /
74-
(sqrt(moment[index]) + epsilon));
72+
phi::CudaAtomicAdd(param + index,
73+
-1.0 * learning_rate[0] * grad[index] /
74+
(sqrt(moment[index]) + epsilon));
7575
}
7676
}
7777

0 commit comments

Comments
 (0)