
Commit 890638c

optimize flip op, removing duplicated computation when dim size is one (#37825)

1 parent 18aca3f
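Note on the change: in the old code, when exactly one dimension was flipped and it was either the first or the last one, a specialized fast path launched kernel_pointwise_flip_apply but did not return early, so the general flip_cuda_kernel path further down still ran and computed the same result a second time. Deleting the special case leaves the general stride-based kernel as the single code path.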

File tree: 1 file changed (+0, -41 lines)

paddle/fluid/operators/flip_op.cu

Lines changed: 0 additions & 41 deletions
@@ -24,24 +24,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using CUDADeviceContext = paddle::platform::CUDADeviceContext;
 
-template <typename T>
-__global__ void kernel_pointwise_flip_apply(const int N, const T* in_data,
-                                            T* out_data, int dim0, int stride0,
-                                            int dim1, int flip_dim) {
-  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N;
-       idx += gridDim.x * blockDim.x) {
-    int dst_offset = 0;
-    if (flip_dim == 0) {
-      // flip 1st dim
-      dst_offset = (dim0 - 1 - idx / stride0) * stride0 + idx % stride0;
-    } else {
-      // flip last dim
-      dst_offset = idx / stride0 * stride0 + (dim1 - 1 - idx % stride0);
-    }
-    out_data[dst_offset] = in_data[idx];
-  }
-}
-
 template <typename T>
 __global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data,
                                  int64_t* x_shape, int64_t* x_stride,
@@ -103,29 +85,6 @@ class FlipKernel<platform::CUDADeviceContext, T>
     std::vector<int64_t> x_dims_v = framework::vectorize(x_dims);
     std::vector<int64_t> x_stride_v = framework::vectorize(x_stride);
 
-    // wrap high-dims to 2-dims
-    if (flip_dims_size == 1 &&
-        (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) {
-      int dim0 = 1, dim1 = 1;
-      int stride0 = 1;
-      if (flip_dims[0] == 0) {
-        dim0 = x_dims_v[0];
-        stride0 = x_stride_v[0];
-        for (size_t i = 1; i < total_dims; ++i) {
-          dim1 *= x_dims_v[i];
-        }
-      } else {
-        dim1 = x_dims_v[total_dims - 1];
-        for (size_t i = 0; i < total_dims - 1; ++i) {
-          dim0 *= x_dims_v[i];
-        }
-        stride0 *= x_dims_v[total_dims - 1];
-      }
-      kernel_pointwise_flip_apply<
-          T><<<dim_grid, dim_block, 0, ctx.cuda_device_context().stream()>>>(
-          N, in_data, out_data, dim0, stride0, dim1, flip_dims[0]);
-    }
-
     int bytes = total_dims * sizeof(int64_t);
     auto x_strides_array_tmp = memory::Alloc(dev_ctx, bytes);
     int64_t* x_strides_array_gpu =
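For context, here is a minimal, self-contained CUDA sketch of the surviving code path: a general stride-based flip kernel in the spirit of the retained flip_cuda_kernel. Everything in it (the flip_kernel name, the host-side setup, the 2x3 example) is an illustrative assumption, not the Paddle source.

// Hypothetical sketch, not Paddle code: a general stride-based flip kernel.
// It decomposes each linear index into per-dimension coordinates, mirrors the
// coordinate along every flipped axis, and writes to the mirrored offset.
#include <cstdio>
#include <cuda_runtime.h>

__global__ void flip_kernel(int n, const float* in, float* out,
                            const int64_t* shape, const int64_t* stride,
                            const int* flip_dims, int flip_dims_size,
                            int total_dims) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx >= n) return;
  int64_t rem = idx;  // remainder of the linear index while decomposing
  int64_t dst = 0;    // mirrored destination offset
  for (int d = 0; d < total_dims; ++d) {
    int64_t coord = rem / stride[d];
    rem %= stride[d];
    for (int f = 0; f < flip_dims_size; ++f) {
      if (flip_dims[f] == d) coord = shape[d] - 1 - coord;  // mirror this axis
    }
    dst += coord * stride[d];
  }
  out[dst] = in[idx];
}

int main() {
  // Flip a row-major 2x3 tensor along dim 0: {1,2,3,4,5,6} -> {4,5,6,1,2,3}.
  const int n = 6, total_dims = 2, flip_dims_size = 1;
  float h_in[n] = {1, 2, 3, 4, 5, 6}, h_out[n];
  int64_t h_shape[] = {2, 3}, h_stride[] = {3, 1};
  int h_flip[] = {0};

  float *d_in, *d_out;
  int64_t *d_shape, *d_stride;
  int* d_flip;
  cudaMalloc(&d_in, sizeof(h_in));
  cudaMalloc(&d_out, sizeof(h_out));
  cudaMalloc(&d_shape, sizeof(h_shape));
  cudaMalloc(&d_stride, sizeof(h_stride));
  cudaMalloc(&d_flip, sizeof(h_flip));
  cudaMemcpy(d_in, h_in, sizeof(h_in), cudaMemcpyHostToDevice);
  cudaMemcpy(d_shape, h_shape, sizeof(h_shape), cudaMemcpyHostToDevice);
  cudaMemcpy(d_stride, h_stride, sizeof(h_stride), cudaMemcpyHostToDevice);
  cudaMemcpy(d_flip, h_flip, sizeof(h_flip), cudaMemcpyHostToDevice);

  flip_kernel<<<1, 64>>>(n, d_in, d_out, d_shape, d_stride, d_flip,
                         flip_dims_size, total_dims);
  cudaMemcpy(h_out, d_out, sizeof(h_out), cudaMemcpyDeviceToHost);
  for (int i = 0; i < n; ++i) printf("%g ", h_out[i]);  // prints 4 5 6 1 2 3
  printf("\n");
  cudaFree(d_in); cudaFree(d_out); cudaFree(d_shape);
  cudaFree(d_stride); cudaFree(d_flip);
  return 0;
}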
