Commit e4b6585
Trivial forwarding (pytorch#1995)
Enables trivial forwarding (forwarding a fusion input directly to an output).

Background: nvfuser codegen doesn't handle aliases at all. When a fusion forwards an input to an output without any operations on it, this is a no-op for codegen and the output tensor is never written to. However, codegen cannot "forward" an input to an output, since all outputs are allocated in integration. Without a special case, we would end up allocating a "fresh" tensor for the forwarded input.

Approach: There are two aspects to the support:
Step 1. Codegen handles forwarding implicitly. Forwarded inputs don't have any producer in the IR, so the output argument is not used in the generated code. The kernel still needs a placeholder argument so that all arguments map correctly.
Step 2. Integration handles the trivial forwarding of inputs. When we put together fusion_outputs for a given fusion and an output is just a fusion input, we directly return the input tensor.
1 parent 1a0e355 commit e4b6585
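For context, below is a stripped-down sketch of a fusion that exercises this path, adapted from the FusionTrivialInputForwarding_CUDA test added in this commit (the committed test additionally creates a second input and an unused add op to guard against an older codegen sorting assertion; this sketch omits them and is not the exact committed code):

// Stripped-down sketch of trivial input forwarding (adapted from the test
// added in this commit; not the exact committed code).
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

// tv0 is both a fusion input and a fusion output with no ops in between:
// codegen treats it as a no-op, and integration returns the input tensor
// directly instead of allocating a fresh output.
TensorView* tv0 = makeConcreteTensor({-1, -1});
fusion->addInput(tv0);
fusion->addOutput(tv0);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({10, 4}, options);

FusionExecutorCache fec(std::move(fusion_ptr));
// cg_outputs[0] is expected to be the original t0, not a freshly allocated copy.
auto cg_outputs = fec.runFusionWithInputs({t0});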

File tree: 5 files changed, +73 -23 lines

torch/csrc/jit/codegen/cuda/executor.cpp

Lines changed: 13 additions & 17 deletions
@@ -741,29 +741,24 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
 }

 std::vector<at::Tensor> FusionExecutor::allocOutputs(
+    const KernelArgumentHolder& args,
     kir::ExpressionEvaluator& expr_eval,
     const std::unordered_set<int>& alias_indices) {
   FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
   const auto kernel = lowered_->kernel();
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<at::Tensor> outputs;
+  TORCH_INTERNAL_ASSERT(
+      args.size() == kernel->inputs().size(),
+      "kernel arguments length does not match runtime arguments.");
   for (const auto out_i : c10::irange(kernel->outputs().size())) {
-    // TODO: FIX this short-cut where we trivially forward inputs to outputs
     if (kernel->outputs()[out_i]->isFusionInput()) {
-      TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED");
-      // for (auto inp_i : c10::irange(kernel->inputs().size())) {
-      //   if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
-      //     TORCH_INTERNAL_ASSERT(
-      //         inp_i < inputs.size(),
-      //         "Issue with an input showing up as output, couldn't find
-      //         input.");
-      //     TORCH_INTERNAL_ASSERT(
-      //         inputs[inp_i].isTensor(),
-      //         "Cannot register a scalar as an output in a fusion.");
-      //     outputs.push_back(inputs[inp_i].toTensor());
-      //     break;
-      //   }
-      // }
+      // pushing empty tensor for trivial forwarding. Since we handle this in
+      // integration, see step 1 - note [trivial forwarding]
+      c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
+      const auto tensor_options =
+          at::TensorOptions().dtype(at::kFloat).device(device);
+      outputs.emplace_back(at::empty({0}, tensor_options));
     } else {
       TORCH_INTERNAL_ASSERT(
           kernel->outputs()[out_i]->isA<TensorView>(),

@@ -803,7 +798,8 @@ KernelArgumentHolder FusionExecutor::evaluateOutputSizes(
   meta_options.device = c10::Device(DeviceType::Meta, 0);

   for (const auto out_i : c10::irange(kernel->outputs().size())) {
-    // If the output is just trivially the input, just "copy" it over.
+    // If the output is just trivially the input, just "copy" it over, see note
+    // [trivial forwarding]
     if (kernel->outputs()[out_i]->isFusionInput()) {
       for (auto inp_i : c10::irange(kernel->inputs().size())) {
         if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {

@@ -1124,7 +1120,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(

     auto& output_alias_indices = output_alias_indices_entry.get();

-    allocated_outputs = allocOutputs(expr_eval, output_alias_indices);
+    allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

     for (const auto& entry : alias_indices) {
       auto aliased_output_index = entry.first;

torch/csrc/jit/codegen/cuda/executor.h

Lines changed: 1 addition & 0 deletions
@@ -220,6 +220,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
   // skip allocating real storage for those, but still maintain its spot to
   // maintain the indexing from output aliases to inputs
   std::vector<at::Tensor> allocOutputs(
+      const KernelArgumentHolder& args,
       kir::ExpressionEvaluator& expr_eval,
       const std::unordered_set<int>& alias_indices = {});

torch/csrc/jit/codegen/cuda/kernel_cache.cpp

Lines changed: 29 additions & 4 deletions
@@ -649,11 +649,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
         group_outputs.size() == group_runtime_outputs.size(),
         "output size does not match");
     for (const size_t group_out_i : c10::irange(group_outputs.size())) {
-      output_holder[group_outputs[group_out_i]] =
-          group_runtime_outputs[group_out_i];
+      // Trivial forwarding outputs an empty tensor to save bandwidth; skip the
+      // tensor_map update on those, since all future uses should see the
+      // original input tensor. See note [trivial forwarding].
+      if (!group_outputs[group_out_i]->isFusionInput()) {
+        output_holder[group_outputs[group_out_i]] =
+            group_runtime_outputs[group_out_i];

-      args.push(group_runtime_outputs[group_out_i]);
-      tensor_map.emplace(group_outputs[group_out_i], args.back());
+        args.push(group_runtime_outputs[group_out_i]);
+        tensor_map.emplace(group_outputs[group_out_i], args.back());
+      }
     }
   }

@@ -669,10 +674,30 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
     if (iter != output_holder.end()) {
       fusion_outputs.push_back(iter->second);
     } else if (output->isFusionInput()) {
+      // Note [ trivial forwarding ]
+      //
+      // Background:
+      // nvfuser codegen doesn't handle aliases at all. When a fusion forwards
+      // an input to an output without any operations on it, this is a no-op
+      // for codegen and the output tensor is never written to. However,
+      // codegen cannot "forward" an input to an output, since all outputs are
+      // allocated in integration. Without a special case, a "fresh" tensor
+      // would be allocated for the forwarded input.
+      //
+      // Approach:
+      // There are two aspects to the support:
+      // Step 1. Codegen handles forwarding implicitly. Forwarded inputs don't
+      // have any producer in the IR, so the output argument is not used in
+      // the generated code. It still needs a placeholder argument in the
+      // kernel so that all arguments map correctly.
+      // Step 2. Integration handles the trivial forwarding of inputs. When we
+      // put together `fusion_outputs` for a given fusion and an output is just
+      // a fusion input, we directly return the input tensor.
       const auto iter = tensor_map.find(output);
       TORCH_INTERNAL_ASSERT(
           iter != tensor_map.end(), "Can not find output as aliased intput");
       auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
+      // See step 2 of note [ trivial forwarding ].
       fusion_outputs.push_back(arg->getTensor());
     } else {
       bool empty_type_check = output->getDataType().has_value() &&

torch/csrc/jit/codegen/cuda/ops/normalization.cpp

Lines changed: 2 additions & 2 deletions
@@ -589,7 +589,7 @@ ForwardNormResult batch_norm(
     // During inference, mean/invstd output are empty tensors
     // on CPU, but not on CUDA. We need to make sure we have the same
     // behavior as with eager mode on CUDA.
-    mean = set(running_mean);
+    mean = running_mean;
     invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }

@@ -844,7 +844,7 @@ ForwardNormResult instance_norm(
     // During inference, mean/invstd output are empty tensors
     // on CPU, but not on CUDA. We need to make sure we have the same
     // behavior as with eager mode on CUDA.
-    mean = set(running_mean);
+    mean = running_mean;
     invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }

torch/csrc/jit/codegen/cuda/test/test_gpu.cpp

Lines changed: 28 additions & 0 deletions
@@ -26026,6 +26026,34 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
   testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
 }

+TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({-1, -1});
+  TensorView* tv1 = makeConcreteTensor({-1, -1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  // Note: tv2 is not needed. Kept it here since previously there was an
+  // assertion from sorting in codegen.
+  auto tv2 = add(tv1, IrBuilder::create<Double>(3.141));
+  fusion->addOutput(tv0);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 4}, options);
+  at::Tensor t1 = at::randn({10, 4}, options);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({t0, t1});
+
+  testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);
+
+  // Second run to ensure cache hit handles trivial forwarding properly
+  auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
+  testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
