@@ -24,24 +24,6 @@ namespace operators {
 using Tensor = framework::Tensor;
 using CUDADeviceContext = paddle::platform::CUDADeviceContext;
 
-template <typename T>
-__global__ void kernel_pointwise_flip_apply(const int N, const T* in_data,
-                                            T* out_data, int dim0, int stride0,
-                                            int dim1, int flip_dim) {
-  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N;
-       idx += gridDim.x * blockDim.x) {
-    int dst_offset = 0;
-    if (flip_dim == 0) {
-      // flip 1st dim
-      dst_offset = (dim0 - 1 - idx / stride0) * stride0 + idx % stride0;
-    } else {
-      // flip last dim
-      dst_offset = idx / stride0 * stride0 + (dim1 - 1 - idx % stride0);
-    }
-    out_data[dst_offset] = in_data[idx];
-  }
-}
-
 template <typename T>
 __global__ void flip_cuda_kernel(const int N, const T* in_data, T* out_data,
                                  int64_t* x_shape, int64_t* x_stride,
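For context, the fast path deleted above treats the tensor as a two-dimensional `[dim0, dim1]` view and mirrors one of the two axes with plain index arithmetic. A minimal host-side reference of that mapping (a hypothetical helper mirroring the removed kernel, not code from this PR):

```cpp
// Hypothetical host-side reference of the removed fast path: the tensor is
// viewed as dim0 rows of stride0 contiguous elements (dim1 columns in the
// last-dim case), and exactly one of the two axes is mirrored.
template <typename T>
void pointwise_flip_reference(int N, const T* in_data, T* out_data, int dim0,
                              int stride0, int dim1, int flip_dim) {
  for (int idx = 0; idx < N; ++idx) {
    int dst_offset;
    if (flip_dim == 0) {
      // flip 1st dim: mirror the row index, keep the in-row offset
      dst_offset = (dim0 - 1 - idx / stride0) * stride0 + idx % stride0;
    } else {
      // flip last dim: keep the row, mirror the in-row offset
      dst_offset = idx / stride0 * stride0 + (dim1 - 1 - idx % stride0);
    }
    out_data[dst_offset] = in_data[idx];
  }
}
```

For example, for a `[3, 4]` tensor flipped along dim 0 (`stride0 = 4`), index 1 (row 0, column 1) maps to `(3 - 1 - 0) * 4 + 1 = 9`, i.e. row 2, column 1.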
@@ -103,29 +85,6 @@ class FlipKernel<platform::CUDADeviceContext, T>
     std::vector<int64_t> x_dims_v = framework::vectorize(x_dims);
     std::vector<int64_t> x_stride_v = framework::vectorize(x_stride);
 
-    // wrap high-dims to 2-dims
-    if (flip_dims_size == 1 &&
-        (flip_dims[0] == 0 || flip_dims[0] == total_dims - 1)) {
-      int dim0 = 1, dim1 = 1;
-      int stride0 = 1;
-      if (flip_dims[0] == 0) {
-        dim0 = x_dims_v[0];
-        stride0 = x_stride_v[0];
-        for (size_t i = 1; i < total_dims; ++i) {
-          dim1 *= x_dims_v[i];
-        }
-      } else {
-        dim1 = x_dims_v[total_dims - 1];
-        for (size_t i = 0; i < total_dims - 1; ++i) {
-          dim0 *= x_dims_v[i];
-        }
-        stride0 *= x_dims_v[total_dims - 1];
-      }
-      kernel_pointwise_flip_apply<
-          T><<<dim_grid, dim_block, 0, ctx.cuda_device_context().stream()>>>(
-          N, in_data, out_data, dim0, stride0, dim1, flip_dims[0]);
-    }
-
     int bytes = total_dims * sizeof(int64_t);
     auto x_strides_array_tmp = memory::Alloc(dev_ctx, bytes);
     int64_t* x_strides_array_gpu =
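The generic `flip_cuda_kernel` that remains appears only by its signature in these hunks. A minimal sketch of the usual stride-based approach (an assumption, since the body is elided here; the name `flip_kernel_sketch` is hypothetical): decompose each linear index into per-axis coordinates using the strides, mirror the coordinate of every flipped axis, then recompose the destination offset.

```cpp
// Sketch of a generic stride-based flip kernel, assuming contiguous
// row-major strides in x_stride; not the PR's actual implementation.
template <typename T>
__global__ void flip_kernel_sketch(const int N, const T* in_data, T* out_data,
                                   const int64_t* x_shape,
                                   const int64_t* x_stride,
                                   const int* flip_dims, int flip_dims_size,
                                   int total_dims) {
  for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N;
       idx += gridDim.x * blockDim.x) {
    int64_t rem = idx;
    int64_t dst_offset = 0;
    for (int d = 0; d < total_dims; ++d) {
      int64_t coord = rem / x_stride[d];  // coordinate along axis d
      rem = rem % x_stride[d];
      // mirror this coordinate if axis d is one of the flipped axes
      for (int f = 0; f < flip_dims_size; ++f) {
        if (flip_dims[f] == d) {
          coord = x_shape[d] - 1 - coord;
          break;
        }
      }
      dst_offset += coord * x_stride[d];
    }
    out_data[dst_offset] = in_data[idx];
  }
}
```

Compared with the removed fast path, this form pays an O(total_dims) loop per element but covers any combination of flipped axes with a single kernel.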