@@ -58,82 +58,6 @@ static void InitMultiGPUOpVarMap() {
   multi_op_var2gpu_str_mutex().swap(tmp_multi_mutex);
 }
 
-template <typename T>
-__device__ __forceinline__ void PrintNanInfKernel(const T* value,
-                                                  const size_t numel,
-                                                  int print_num,
-                                                  char* debug_info) {
-  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-
-  __shared__ unsigned int nan_count, inf_count, num_count;
-  if (threadIdx.x == 0) nan_count = inf_count = num_count = 0;
-  __syncthreads();
-
-  for (size_t i = tid; i < numel; i += blockDim.x * gridDim.x) {
-    unsigned int count = 0;
-    if (isnan(value[i])) {
-      count = atomicAdd(&nan_count, 1);
-    } else if (isinf(value[i])) {
-      count = atomicAdd(&inf_count, 1);
-    } else {
-      count = atomicAdd(&num_count, 1);
-    }
-    // for cuda, print in every block
-    if (count < print_num) {
-      printf("numel:%lu idx:%lu value:%f\n",
-             static_cast<uint64_t>(numel),
-             static_cast<uint64_t>(i),
-             static_cast<float>(value[i]));
-    }
-  }
-  __syncthreads();
-
-#ifdef __HIPCC__
-  if (true && hipThreadIdx_x == 0) {
-    printf("In block %d, there has %u,%u,%u nan,inf,num\n",
-           hipBlockIdx_x,
-           nan_count,
-           inf_count,
-           num_count);
-#else
-  if (true && threadIdx.x == 0) {
-    printf("In block %d, there has %u,%u,%u nan,inf,num\n",
-           blockIdx.x,
-           nan_count,
-           inf_count,
-           num_count);
-#endif
-    PADDLE_ENFORCE(false, "===ERROR: in %s find nan or inf===", debug_info);
-  }
-}
-
-// Resnet 2gpus speed test, no check 270 images/s, this check 229 images/s
-template <typename T>
-__global__ void CheckNanInfKernel(const T* value,
-                                  const size_t numel,
-                                  int print_num,
-                                  char* debug_info) {
-  /// step 1, judge whether there is nan or inf
-  __shared__ volatile int has_nan_inf;
-  if (threadIdx.x == 0) has_nan_inf = false;
-  __syncthreads();
-
-  const size_t tid = threadIdx.x + blockIdx.x * blockDim.x;
-  T sum = static_cast<T>(0.0);
-  // Todo(wangxi). simd speed up
-  for (size_t i = tid; i < numel; i += blockDim.x * gridDim.x) {
-    sum += (value[i] - value[i]);
-  }
-
-  if (isnan(sum) || isinf(sum)) has_nan_inf = true;
-  __syncthreads();
-
-  /// Note. different blocks may behave differently
-  if (!has_nan_inf) return;
-
-  PrintNanInfKernel(value, numel, print_num, debug_info);
-}
-
 template <typename T, int ReduceType>
 __device__ T BlockReduce(T value) {
   __shared__ T shared_mem[1024];
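For context, the CheckNanInfKernel removed above detects non-finite values through the identity `x - x`: the expression is 0 for every finite `x` but NaN when `x` is NaN or +/-Inf, so the grid-stride sum over `value[i] - value[i]` is non-finite exactly when the tensor contains a NaN or Inf; only then does it fall back to the slower PrintNanInfKernel, which counts and prints per-block totals with shared-memory counters and atomicAdd. A minimal host-side sketch of the detection idea (plain C++, illustrative names, not part of the Paddle sources):

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

// Sketch of the trick used by the removed kernel: (x - x) contributes 0 for
// finite x and NaN for NaN or +/-Inf, so the running sum is non-finite iff
// the input contains at least one non-finite element.
bool HasNanInf(const std::vector<float>& values) {
  float sum = 0.0f;
  for (float v : values) {
    sum += (v - v);
  }
  return std::isnan(sum) || std::isinf(sum);
}

int main() {
  std::printf("%d\n", HasNanInf({1.0f, 2.0f, 3.0f}));      // 0
  std::printf("%d\n", HasNanInf({1.0f, INFINITY, 3.0f}));  // 1
  std::printf("%d\n", HasNanInf({1.0f, NAN, 3.0f}));       // 1
  return 0;
}
```

The device kernel applies the same idea with a grid-stride loop per block, so blocks whose slice is clean return early and pay almost nothing for the check.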
@@ -509,19 +433,7 @@ void CheckNumericsKernel(const Context& ctx,
   size_t blocks =
       std::min(static_cast<size_t>(128),
                static_cast<size_t>((tensor.numel() + threads - 1) / threads));
-#ifdef __HIPCC__
-  int print_num = 3;
-
-  hipLaunchKernelGGL(CheckNanInfKernel,
-                     dim3(blocks),
-                     dim3(threads),
-                     0,
-                     ctx.stream(),
-                     tensor.data<T>(),
-                     tensor.numel(),
-                     print_num,
-                     gpu_str_ptr);
-#else
+
   using MT = typename phi::dtype::MPTypeTrait<T>::Type;
 
   int64_t numel_max_min = blocks;
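The launch configuration kept in this hunk caps the grid at 128 blocks and relies on the kernels' grid-stride loops to cover any remaining elements. A small stand-alone illustration of that arithmetic (the block size of 1024 threads is an assumption for the example; the real `threads` value is defined outside the lines shown here):

```cpp
#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
  // Assumed block size; the actual value comes from code outside this hunk.
  const size_t threads = 1024;
  const size_t sizes[] = {1000, 1 << 20, 1 << 28};
  for (size_t numel : sizes) {
    // Same clamping as in the kernel launch: at most 128 blocks, so very
    // large tensors are covered by the grid-stride loop inside the kernel.
    size_t blocks = std::min(static_cast<size_t>(128),
                             (numel + threads - 1) / threads);
    size_t stride = blocks * threads;
    std::printf("numel=%zu blocks=%zu iterations_per_thread=%zu\n",
                numel, blocks, (numel + stride - 1) / stride);
  }
  return 0;
}
```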
@@ -586,7 +498,6 @@ void CheckNumericsKernel(const Context& ctx,
   if (check_nan_inf_level == 0 && stack_height_limit > 0) {
     PrintStack<T>(ctx, *stats, op_type, var_name, dev_id);
   }
-#endif
 }
 
 }  // namespace phi