2020#include < c10/cuda/CUDAStream.h>
2121#include < c10/util/irange.h>
2222
23+ #include < cmath>
2324#include < fstream>
2425
2526namespace torch {
@@ -29,6 +30,16 @@ namespace cuda {
2930
3031int FusionExecutor::fusion_id_counter_ = 0 ; // NOLINT
3132
// Global debugging flag: when enabled, every tensor the fusion executor
// allocates (kernel outputs and intermediate global buffers) is immediately
// filled with NaN (or a sentinel value for non-floating-point dtypes) so
// that any read of uninitialized memory becomes loudly visible.
bool fill_allocation_with_nan_ = false;

// Query whether freshly-allocated tensors should be poisoned with NaN.
bool shouldFillAllocationWithNan() {
  return fill_allocation_with_nan_;
}

// Enable or disable NaN-poisoning of freshly-allocated tensors.
void setFillAllocationWithNan(bool value) {
  fill_allocation_with_nan_ = value;
}
3243namespace {
3344
3445static const char * defineIndexMode (KernelIndexMode index_mode) {
@@ -280,6 +291,42 @@ void FusionExecutor::compileFusion(
280291
281292namespace {
282293
294+ void fillTensorWithNan (at::Tensor& t) {
295+ switch (t.scalar_type ()) {
296+ case at::ScalarType::Byte:
297+ t.fill_ (0xFF );
298+ break ;
299+ case at::ScalarType::Char:
300+ t.fill_ (0x7F );
301+ break ;
302+ case at::ScalarType::Short:
303+ t.fill_ (0x7FFF );
304+ break ;
305+ case at::ScalarType::Int:
306+ t.fill_ (0x7FFFFFFF );
307+ break ;
308+ case at::ScalarType::Long:
309+ t.fill_ (0x7FFFFFFFFFFFFFFFL );
310+ break ;
311+ case at::ScalarType::Bool:
312+ t.fill_ (true );
313+ break ;
314+ case at::ScalarType::Half:
315+ case at::ScalarType::Float:
316+ case at::ScalarType::Double:
317+ case at::ScalarType::BFloat16:
318+ t.fill_ (std::nan (" " ));
319+ break ;
320+ case at::ScalarType::ComplexHalf:
321+ case at::ScalarType::ComplexFloat:
322+ case at::ScalarType::ComplexDouble:
323+ t.fill_ (c10::complex <double >(std::nan (" " ), std::nan (" " )));
324+ break ;
325+ default :
326+ TORCH_INTERNAL_ASSERT (false , " Unknown dtype" );
327+ }
328+ }
329+
283330at::Tensor inferAndAlloc (
284331 const TensorView* tv,
285332 const std::vector<Val*>& sizes,
@@ -349,6 +396,9 @@ at::Tensor inferAndAlloc(
349396 // Non Variable type guard for empty_cuda call
350397 at::AutoDispatchBelowADInplaceOrView non_variable_type_mode;
351398 auto empty = at::empty (isizes, tensor_options);
399+ if (shouldFillAllocationWithNan ()) {
400+ fillTensorWithNan (empty);
401+ }
352402 if (expanded_dim) {
353403 return empty.expand (expanded_sizes);
354404 }
@@ -892,6 +942,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
892942 c10::nullopt ,
893943 options_.device ,
894944 c10::nullopt ));
945+ if (shouldFillAllocationWithNan ()) {
946+ fillTensorWithNan (allocated_outputs.back ());
947+ }
895948 }
896949 // Note: aliased output is not returned as output. But we still need it
897950 // for kernel execution, so would need to push them to args
@@ -932,6 +985,9 @@ std::vector<at::Tensor> FusionExecutor::runFusion(
932985 c10::nullopt ,
933986 options_.device ,
934987 c10::nullopt ));
988+ if (shouldFillAllocationWithNan ()) {
989+ fillTensorWithNan (global_buffers.buffers .back ());
990+ }
935991 global_buffers.zero_init .push_back (false );
936992 }
937993 }
0 commit comments