Commit e4b6585
Trivial forwarding (pytorch#1995)
Enables trivial forwarding (forwarding a fusion input directly to an output).

Background: nvfuser codegen doesn't handle aliases at all. When a fusion forwards an input to an output without any operations on it, this is a no-op for codegen and the output tensor is never written to. However, codegen cannot "forward" an input to an output, since all outputs are allocated in integration. Without a special case, we would end up allocating a "fresh" tensor for the forwarded input.

Approach: There are two aspects to the support:
Step 1. Codegen handles forwarding implicitly. Forwarded inputs don't have any producer in the IR, so the output argument is not used in the generated code. The kernel still needs a placeholder argument so that all arguments map correctly.
Step 2. Integration handles the trivial forwarding of inputs. When we put together fusion_outputs for a given fusion and an output is just a fusion input, we directly return the input tensor.
1 parent 1a0e355 commit e4b6585
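For context, below is a stripped-down sketch of a fusion that exercises this path, adapted from the FusionTrivialInputForwarding_CUDA test added in this commit (the committed test additionally creates a second input and an unused add op to guard against an older codegen sorting assertion; this sketch omits them and is not the exact committed code):

// Stripped-down sketch of trivial input forwarding (adapted from the test
// added in this commit; not the exact committed code).
std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
auto fusion = fusion_ptr.get();
FusionGuard fg(fusion);

// tv0 is both a fusion input and a fusion output with no ops in between:
// codegen treats it as a no-op, and integration returns the input tensor
// directly instead of allocating a fresh output.
TensorView* tv0 = makeConcreteTensor({-1, -1});
fusion->addInput(tv0);
fusion->addOutput(tv0);

auto options = at::TensorOptions().dtype(at::kFloat).device(at::kCUDA, 0);
at::Tensor t0 = at::randn({10, 4}, options);

FusionExecutorCache fec(std::move(fusion_ptr));
// cg_outputs[0] is expected to be the original t0, not a freshly allocated copy.
auto cg_outputs = fec.runFusionWithInputs({t0});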

File tree: 5 files changed, +73 -23 lines

torch/csrc/jit/codegen/cuda/executor.cpp

Lines changed: 13 additions & 17 deletions
@@ -741,29 +741,24 @@ FusionExecutor::GlobalBuffers FusionExecutor::allocGlobalVals(
 }

 std::vector<at::Tensor> FusionExecutor::allocOutputs(
+    const KernelArgumentHolder& args,
     kir::ExpressionEvaluator& expr_eval,
     const std::unordered_set<int>& alias_indices) {
   FUSER_PERF_SCOPE("FusionExecutor::AllocOutputs");
   const auto kernel = lowered_->kernel();
   // NOLINTNEXTLINE(cppcoreguidelines-init-variables)
   std::vector<at::Tensor> outputs;
+  TORCH_INTERNAL_ASSERT(
+      args.size() == kernel->inputs().size(),
+      "kernel arguments length does not match runtime arguments.");
   for (const auto out_i : c10::irange(kernel->outputs().size())) {
-    // TODO: FIX this short-cut where we trivially forward inputs to outputs
     if (kernel->outputs()[out_i]->isFusionInput()) {
-      TORCH_INTERNAL_ASSERT(false, "trivial input forwarding NOT IMPLEMENTED");
-      // for (auto inp_i : c10::irange(kernel->inputs().size())) {
-      //   if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {
-      //     TORCH_INTERNAL_ASSERT(
-      //         inp_i < inputs.size(),
-      //         "Issue with an input showing up as output, couldn't find
-      //         input.");
-      //     TORCH_INTERNAL_ASSERT(
-      //         inputs[inp_i].isTensor(),
-      //         "Cannot register a scalar as an output in a fusion.");
-      //     outputs.push_back(inputs[inp_i].toTensor());
-      //     break;
-      //   }
-      // }
+      // pushing empty tensor for trivial forwarding. Since we handle this in
+      // integration, see step 1 - note [trivial forwarding]
+      c10::Device device(c10::DeviceType::CUDA, args.getDeviceIndex());
+      const auto tensor_options =
+          at::TensorOptions().dtype(at::kFloat).device(device);
+      outputs.emplace_back(at::empty({0}, tensor_options));
     } else {
       TORCH_INTERNAL_ASSERT(
           kernel->outputs()[out_i]->isA<TensorView>(),

@@ -803,7 +798,8 @@ KernelArgumentHolder FusionExecutor::evaluateOutputSizes(
   meta_options.device = c10::Device(DeviceType::Meta, 0);

   for (const auto out_i : c10::irange(kernel->outputs().size())) {
-    // If the output is just trivially the input, just "copy" it over.
+    // If the output is just trivially the input, just "copy" it over, see note
+    // [trivial forwarding]
     if (kernel->outputs()[out_i]->isFusionInput()) {
       for (auto inp_i : c10::irange(kernel->inputs().size())) {
         if (kernel->inputs()[inp_i] == kernel->outputs()[out_i]) {

@@ -1124,7 +1120,7 @@ std::vector<at::Tensor> FusionExecutor::runFusion(

     auto& output_alias_indices = output_alias_indices_entry.get();

-    allocated_outputs = allocOutputs(expr_eval, output_alias_indices);
+    allocated_outputs = allocOutputs(args, expr_eval, output_alias_indices);

     for (const auto& entry : alias_indices) {
       auto aliased_output_index = entry.first;

torch/csrc/jit/codegen/cuda/executor.h

Lines changed: 1 addition & 0 deletions
@@ -220,6 +220,7 @@ class TORCH_CUDA_CU_API FusionExecutor : public NonCopyable {
   // skip allocating real storage for those, but still maintain its spot to
   // maintain the indexing from output aliases to inputs
   std::vector<at::Tensor> allocOutputs(
+      const KernelArgumentHolder& args,
       kir::ExpressionEvaluator& expr_eval,
       const std::unordered_set<int>& alias_indices = {});

torch/csrc/jit/codegen/cuda/kernel_cache.cpp

Lines changed: 29 additions & 4 deletions
@@ -649,11 +649,16 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
         group_outputs.size() == group_runtime_outputs.size(),
         "output size does not match");
     for (const size_t group_out_i : c10::irange(group_outputs.size())) {
-      output_holder[group_outputs[group_out_i]] =
-          group_runtime_outputs[group_out_i];
+      // Trivial forwarding outputs an empty tensor to save bandwidth; skip the
+      // tensor_map update on those, since all future uses should see the
+      // original input tensor. See note [trivial forwarding].
+      if (!group_outputs[group_out_i]->isFusionInput()) {
+        output_holder[group_outputs[group_out_i]] =
+            group_runtime_outputs[group_out_i];

-      args.push(group_runtime_outputs[group_out_i]);
-      tensor_map.emplace(group_outputs[group_out_i], args.back());
+        args.push(group_runtime_outputs[group_out_i]);
+        tensor_map.emplace(group_outputs[group_out_i], args.back());
+      }
     }
   }

@@ -669,10 +674,30 @@ std::vector<at::Tensor> FusionKernelRuntime::runWithInput(
     if (iter != output_holder.end()) {
       fusion_outputs.push_back(iter->second);
     } else if (output->isFusionInput()) {
+      // Note [ trivial forwarding ]
+      //
+      // Background:
+      // nvfuser codegen doesn't handle aliases at all. When a fusion forwards
+      // an input to an output without any operations on it, this is a no-op
+      // for codegen and the output tensor is never written to. However,
+      // codegen cannot "forward" an input to an output, since all outputs are
+      // allocated in integration. Without a special case, a "fresh" tensor
+      // would be allocated for the forwarded input.
+      //
+      // Approach:
+      // There are two aspects to the support:
+      // Step 1. Codegen handles forwarding implicitly. Forwarded inputs don't
+      // have any producer in the IR, so the output argument is not used in
+      // the generated code. It still needs a placeholder argument in the
+      // kernel so that all arguments map correctly.
+      // Step 2. Integration handles the trivial forwarding of inputs. When we
+      // put together `fusion_outputs` for a given fusion and an output is just
+      // a fusion input, we directly return the input tensor.
       const auto iter = tensor_map.find(output);
       TORCH_INTERNAL_ASSERT(
           iter != tensor_map.end(), "Can not find output as aliased intput");
       auto arg = dynamic_cast<const TensorArgAbstract*>(iter->second);
+      // See step 2 of note [ trivial forwarding ].
       fusion_outputs.push_back(arg->getTensor());
     } else {
       bool empty_type_check = output->getDataType().has_value() &&

torch/csrc/jit/codegen/cuda/ops/normalization.cpp

Lines changed: 2 additions & 2 deletions
@@ -589,7 +589,7 @@ ForwardNormResult batch_norm(
     // During inference, mean/invstd output are empty tensors
     // on CPU, but not on CUDA. We need to make sure we have the same
     // behavior as with eager mode on CUDA.
-    mean = set(running_mean);
+    mean = running_mean;
     invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }

@@ -844,7 +844,7 @@ ForwardNormResult instance_norm(
     // During inference, mean/invstd output are empty tensors
     // on CPU, but not on CUDA. We need to make sure we have the same
     // behavior as with eager mode on CUDA.
-    mean = set(running_mean);
+    mean = running_mean;
     invstd = unbiased_invstd;
     y = mul(x_sub_mean, invstd_bcast);
   }

torch/csrc/jit/codegen/cuda/test/test_gpu.cpp

Lines changed: 28 additions & 0 deletions
@@ -26026,6 +26026,34 @@ TEST_F(NVFuserTest, FusionInlineAt_CUDA) {
   testValidate(fusion, {out}, {t0}, {t0.sin().cos()}, __LINE__, __FILE__);
 }

+TEST_F(NVFuserTest, FusionTrivialInputForwarding_CUDA) {
+  std::unique_ptr<Fusion> fusion_ptr = std::make_unique<Fusion>();
+  auto fusion = fusion_ptr.get();
+  FusionGuard fg(fusion);
+
+  TensorView* tv0 = makeConcreteTensor({-1, -1});
+  TensorView* tv1 = makeConcreteTensor({-1, -1});
+  fusion->addInput(tv0);
+  fusion->addInput(tv1);
+  // Note: tv2 is not needed. Kept it here since previously there was an
+  // assertion from sorting in codegen.
+  auto tv2 = add(tv1, IrBuilder::create<Double>(3.141));
+  fusion->addOutput(tv0);
+
+  auto options = at::TensorOptions().dtype(kFloat).device(at::kCUDA, 0);
+  at::Tensor t0 = at::randn({10, 4}, options);
+  at::Tensor t1 = at::randn({10, 4}, options);
+
+  FusionExecutorCache fec(std::move(fusion_ptr));
+  auto cg_outputs = fec.runFusionWithInputs({t0, t1});
+
+  testValidate(fusion, cg_outputs, {t0, t1}, {t0}, __LINE__, __FILE__);
+
+  // Second run to ensure cache hit handles trivial forwarding properly
+  auto cg_outputs2 = fec.runFusionWithInputs({t0, t1});
+  testValidate(fusion, cg_outputs2, {t0, t1}, {t0}, __LINE__, __FILE__);
+}
+
 } // namespace jit
 } // namespace torch
 #endif // #if defined(USE_CUDA)
