Commit 6310948

Add full, full_like, zeros, zeros_like, ones, ones_like (pytorch#1943)
1 parent 4c254c0 commit 6310948

27 files changed, +479 -58 lines

benchmarks/cpp/nvfuser/timm.cpp

Lines changed: 4 additions & 4 deletions

@@ -115,7 +115,7 @@ static void setup_vit_base_patch16_224_bcast5(Fusion* fusion, void* null) {
   auto t6 = set(t5);
   auto t7 = broadcast(t6, bcast_pattern0);
   auto t8 = add(t4, t7);
-  auto t9 = randlike(t8);
+  auto t9 = rand_like(t8);
   auto d34 =
       sub(IrBuilder::create<Double>(1.0), IrBuilder::create<Double>(0.0));
   auto t10 = lt(t9, d34);
@@ -289,7 +289,7 @@ static void setup_vit_base_patch16_224_norm_inner3(Fusion* fusion, void* null) {
   auto t10 = broadcast(t9, {false, false, false, true});
   auto t11 = reciprocal(t10);
   auto t12 = mul(t8, t11);
-  auto t13 = randlike(t12);
+  auto t13 = rand_like(t12);
   auto d79 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
   auto t14 = lt(t13, d79);
   auto t15 = castOp(DataType::Float, t14);
@@ -367,7 +367,7 @@ static void setup_vit_base_patch16_224_bcast_outer6(
   auto t9 = add(IrBuilder::create<Double>(1), t8);
   auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
   auto t11 = mul(t6, t10);
-  auto t12 = randlike(t11);
+  auto t12 = rand_like(t11);
   auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
   auto t13 = lt(t12, d66);
   auto t14 = castOp(DataType::Float, t13);
@@ -456,7 +456,7 @@ static void setup_vit_base_patch16_224_bcast_inner6(
   auto t9 = add(IrBuilder::create<Double>(1), t8);
   auto t10 = mul(IrBuilder::create<Double>(0.5), t9);
   auto t11 = mul(t6, t10);
-  auto t12 = randlike(t11);
+  auto t12 = rand_like(t11);
   auto d66 = sub(IrBuilder::create<Double>(1), IrBuilder::create<Double>(0));
   auto t13 = lt(t12, d66);
   auto t14 = castOp(DataType::Float, t13);

torch/csrc/jit/codegen/cuda/arith.cpp

Lines changed: 71 additions & 19 deletions

@@ -449,10 +449,79 @@ TensorView* rand(const std::vector<Val*>& shape, DataType dtype) {
                  .contiguity(std::vector<bool>(n, true))
                  .shape(shape)
                  .build();
-  IrBuilder::create<RNGOp>(RNGOpType::Uniform, out);
+  IrBuilder::create<RNGOp>(RNGOpType::Uniform, out, dtype);
   return out;
 }
 
+TensorView* rand_like(TensorView* v) {
+  TORCH_CHECK(
+      isFloatingPointType(v->dtype()),
+      "input must have floating point type, but got ",
+      v->dtype());
+  std::vector<Val*> shape;
+  shape.reserve(v->getMaybeRFactorDomain().size());
+  for (auto id : v->getMaybeRFactorDomain()) {
+    shape.emplace_back(id->getMaybeExpandedExtent());
+  }
+  return rand(shape, v->dtype());
+}
+
+Val* rand_like(Val* v) {
+  return rand_like(v->as<TensorView>());
+}
+
+TensorView* full(
+    const std::vector<Val*>& shape,
+    Val* fill_value,
+    DataType dtype) {
+  auto n = shape.size();
+  auto out = TensorViewBuilder()
+                 .ndims(n)
+                 .dtype(dtype)
+                 .contiguity(std::vector<bool>(n, true))
+                 .shape(shape)
+                 .build();
+  IrBuilder::create<FullOp>(out, fill_value, dtype);
+  return out;
+}
+
+TensorView* full_like(TensorView* tv, Val* fill_value) {
+  std::vector<Val*> shape;
+  shape.reserve(tv->getMaybeRFactorDomain().size());
+  for (auto id : tv->getMaybeRFactorDomain()) {
+    shape.emplace_back(id->getMaybeExpandedExtent());
+  }
+  return full(shape, fill_value, tv->dtype());
+}
+
+Val* full_like(Val* v, Val* fill_value) {
+  return full_like(v->as<TensorView>(), fill_value);
+}
+
+TensorView* zeros(const std::vector<Val*>& shape, DataType dtype) {
+  return full(shape, FusionGuard::getCurFusion()->zeroVal(), dtype);
+}
+
+TensorView* zeros_like(TensorView* tv) {
+  return full_like(tv, FusionGuard::getCurFusion()->zeroVal());
+}
+
+Val* zeros_like(Val* v) {
+  return zeros_like(v->as<TensorView>());
+}
+
+TensorView* ones(const std::vector<Val*>& shape, DataType dtype) {
+  return full(shape, FusionGuard::getCurFusion()->oneVal(), dtype);
+}
+
+TensorView* ones_like(TensorView* tv) {
+  return full_like(tv, FusionGuard::getCurFusion()->oneVal());
+}
+
+Val* ones_like(Val* v) {
+  return ones_like(v->as<TensorView>());
+}
+
 TensorView* arange(Val* end, DataType dtype) {
   return arange(FusionGuard::getCurFusion()->zeroVal(), end, dtype);
 }
@@ -480,7 +549,7 @@ TensorView* arange(Val* start, Val* end, Val* step, DataType dtype) {
                  .contiguity({true})
                  .shape({size})
                  .build();
-  IrBuilder::create<ARangeOp>(out, start, end, step);
+  IrBuilder::create<ARangeOp>(out, start, end, step, dtype);
   return out;
 }
 
@@ -506,23 +575,6 @@ NVFUSER_DEFINE_UNARY_OP(trunc, Trunc)
 NVFUSER_DEFINE_UNARY_OP(print, Print)
 #undef NVFUSER_DEFINE_UNARY_OP
 
-TensorView* randlike(TensorView* v) {
-  TORCH_CHECK(
-      isFloatingPointType(v->dtype()),
-      "input must have floating point type, but got ",
-      v->dtype());
-  std::vector<Val*> shape;
-  shape.reserve(v->getMaybeRFactorDomain().size());
-  for (auto id : v->getMaybeRFactorDomain()) {
-    shape.emplace_back(id->getMaybeExpandedExtent());
-  }
-  return rand(shape, v->dtype());
-}
-
-Val* randlike(Val* v) {
-  return randlike(v->as<TensorView>());
-}
-
 Val* bitwise_not(Val* v) {
   TORCH_CHECK(
       isIntegralType(v->dtype()) || v->dtype() == DataType::Bool,

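Taken together, the functions added above give nvFuser a factory-op family mirroring torch.full / torch.zeros / torch.ones, plus a snake_case rename of randlike to rand_like. A minimal usage sketch under the usual Fusion/FusionGuard setup; the extents, fill value, and tensor names below are illustrative, not taken from this commit:

// Hedged usage sketch of the new factory ops; shapes and values are made up.
Fusion fusion;
FusionGuard fg(&fusion);

// full(): a 4x8 float tensor filled with 7.0.
std::vector<Val*> shape = {
    IrBuilder::create<Int>(4), IrBuilder::create<Int>(8)};
TensorView* tv0 = full(shape, IrBuilder::create<Double>(7.0), DataType::Float);

// zeros()/ones() are thin wrappers that pass the fusion's cached
// zeroVal()/oneVal() to full().
TensorView* tv1 = zeros(shape, DataType::Float);

// The *_like variants copy shape (using expanded extents where present)
// and dtype from an existing TensorView.
TensorView* tv2 = ones_like(tv0);
fusion.addOutput(tv2);
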
torch/csrc/jit/codegen/cuda/arith.h

Lines changed: 18 additions & 4 deletions

@@ -125,7 +125,24 @@ TORCH_CUDA_CU_API WelfordResult Welford(
 TORCH_CUDA_CU_API TensorView* rand(
     const std::vector<Val*>& shape,
     DataType dtype);
-
+TORCH_CUDA_CU_API Val* rand_like(Val*);
+TORCH_CUDA_CU_API TensorView* rand_like(TensorView*);
+TORCH_CUDA_CU_API TensorView* full(
+    const std::vector<Val*>& shape,
+    Val* fill_value,
+    DataType dtype);
+TORCH_CUDA_CU_API TensorView* full_like(TensorView* tv, Val* fill_value);
+TORCH_CUDA_CU_API Val* full_like(Val* tv, Val* fill_value);
+TORCH_CUDA_CU_API TensorView* zeros(
+    const std::vector<Val*>& shape,
+    DataType dtype);
+TORCH_CUDA_CU_API TensorView* zeros_like(TensorView*);
+TORCH_CUDA_CU_API Val* zeros_like(Val*);
+TORCH_CUDA_CU_API TensorView* ones(
+    const std::vector<Val*>& shape,
+    DataType dtype);
+TORCH_CUDA_CU_API TensorView* ones_like(TensorView*);
+TORCH_CUDA_CU_API Val* ones_like(Val*);
 //! WARNING: giving invalid combinations of the start, end and step
 //! arguments can result in undefined behavior. Specifically, the
 //! signs of `end - start` and step must be the same.
@@ -204,9 +221,6 @@ TORCH_CUDA_CU_API TensorView* log2(TensorView*);
 // neg
 TORCH_CUDA_CU_API Val* neg(Val*);
 TORCH_CUDA_CU_API TensorView* neg(TensorView*);
-// randlike
-TORCH_CUDA_CU_API Val* randlike(Val*);
-TORCH_CUDA_CU_API TensorView* randlike(TensorView*);
 // real
 TORCH_CUDA_CU_API Val* real(Val*);
 TORCH_CUDA_CU_API TensorView* real(TensorView*);

torch/csrc/jit/codegen/cuda/codegen.cpp

Lines changed: 8 additions & 6 deletions

@@ -560,10 +560,14 @@ class CudaKernelGenerator : private OptOutConstDispatch {
              << "&" << gen(ldst->in()) << ");\n";
   }
 
+  void handle(const FullOp* fop) final {
+    indent() << gen(fop->output(0)) << " = (" << fop->dtype() << ")"
+             << gen(fop->getFillValue()) << ";\n";
+  }
+
   void handle(const ARangeOp* aop) final {
     auto index = genTensorIndex(aop->getLinearIndex()->as<kir::TensorIndex>());
-    indent() << gen(aop->output(0)) << " = arange<" << aop->output(0)->dtype()
-             << ">";
+    indent() << gen(aop->output(0)) << " = arange<" << aop->dtype() << ">";
     code_ << "(" << index << ", " << gen(aop->start()) << ", "
           << gen(aop->step()) << ");\n";
   }
@@ -759,9 +763,8 @@ class CudaKernelGenerator : private OptOutConstDispatch {
   void handle(const RNGOp* rop) final {
     // TODO: TORCH_INTERNAL_ASSERT that the scheduler correctly creates an
     // innermost ID of size 4 (float) or size 2 (double)?
-    auto out_tv = rop->output(0)->as<kir::TensorIndex>()->view();
     auto index = genTensorIndex(rop->getPhiloxIndex()->as<kir::TensorIndex>());
-    int multiple = out_tv->getDataType() == DataType::Double ? 2 : 4;
+    int multiple = rop->dtype() == DataType::Double ? 2 : 4;
     indent() << "nvfuser_index_t linear_index" << rop->name() << " = " << index
              << ";\n";
     indent() << "nvfuser_index_t rng_subseq" << rop->name() << " = linear_index"
@@ -780,8 +783,7 @@ class CudaKernelGenerator : private OptOutConstDispatch {
     indent() << "}\n";
     auto op_type = rop->getRNGOpType();
     indent() << gen(rop->output(0)) << " = " << op_type;
-    if (needFloatSuffix(op_type) &&
-        rop->output(0)->dtype() == DataType::Float) {
+    if (needFloatSuffix(op_type) && rop->dtype() == DataType::Float) {
       code_ << "f";
     }
     code_ << "(rng_result, rng_component" << rop->name() << ");\n";

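Since the FullOp handler above lowers to a single cast-and-assign per element, the generated kernel line is trivial. A hedged sketch of plausible output for a float zeros() tensor; the tensor and index names, and the exact literal formatting produced by gen(), are guesses rather than output captured from this commit:

// Hypothetical generated CUDA line for a FullOp filling a float tensor with 0;
// T2 and i21 stand in for whatever names the kernel generator assigns.
T2[i21] = (float)0;
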
torch/csrc/jit/codegen/cuda/dispatch.cpp

Lines changed: 15 additions & 0 deletions

@@ -95,6 +95,9 @@ void Val::dispatch(T handler, Val* val) {
 template <typename T>
 void Expr::dispatch(T handler, Expr* expr) {
   switch (*(expr->getExprType())) {
+    case ExprType::FullOp:
+      ptr(handler)->handle(expr->as<FullOp>());
+      return;
     case ExprType::ARangeOp:
       ptr(handler)->handle(expr->as<ARangeOp>());
       return;
@@ -281,6 +284,9 @@ void Val::constDispatch(T handler, const Val* val) {
 template <typename T>
 void Expr::constDispatch(T handler, const Expr* expr) {
   switch (*(expr->getExprType())) {
+    case ExprType::FullOp:
+      ptr(handler)->handle(expr->as<FullOp>());
+      return;
     case ExprType::ARangeOp:
       ptr(handler)->handle(expr->as<ARangeOp>());
       return;
@@ -475,6 +481,9 @@ void Val::mutatorDispatch(T mutator, Val* val) {
 template <typename T>
 void Expr::mutatorDispatch(T mutator, Expr* expr) {
   switch (*(expr->getExprType())) {
+    case ExprType::FullOp:
+      ptr(mutator)->mutate(expr->as<FullOp>());
+      return;
     case ExprType::ARangeOp:
       ptr(mutator)->mutate(expr->as<ARangeOp>());
       return;
@@ -734,6 +743,9 @@ void OptOutConstDispatch::handle(const kir::IntPair* stmt) {
 }
 
 // Exprs
+void OptOutConstDispatch::handle(const FullOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutConstDispatch::handle(const ARangeOp* stmt) {
   unhandled(stmt);
 }
@@ -890,6 +902,9 @@ void OptOutDispatch::handle(kir::IntPair* stmt) {
 }
 
 // Exprs
+void OptOutDispatch::handle(FullOp* stmt) {
+  unhandled(stmt);
+}
 void OptOutDispatch::handle(ARangeOp* stmt) {
   unhandled(stmt);
 }

torch/csrc/jit/codegen/cuda/dispatch.h

Lines changed: 4 additions & 0 deletions

@@ -68,6 +68,7 @@ class ComplexDouble;
 class NamedScalar;
 
 // Exprs
+class FullOp;
 class ARangeOp;
 class UnaryOp;
 class BinaryOp;
@@ -144,6 +145,7 @@ class TORCH_CUDA_CU_API OptOutConstDispatch : public PolymorphicBase {
   virtual void handle(const kir::IntPair*);
 
   // Exprs
+  virtual void handle(const FullOp* stmt);
   virtual void handle(const ARangeOp* stmt);
   virtual void handle(const UnaryOp* stmt);
   virtual void handle(const BinaryOp* stmt);
@@ -211,6 +213,7 @@ class TORCH_CUDA_CU_API OptOutDispatch : public PolymorphicBase {
   virtual void handle(kir::IntPair*);
 
   // Exprs
+  virtual void handle(FullOp* stmt);
   virtual void handle(ARangeOp* stmt);
   virtual void handle(UnaryOp* stmt);
   virtual void handle(BinaryOp* stmt);
@@ -319,6 +322,7 @@ class TORCH_CUDA_CU_API OptOutMutator : public PolymorphicBase {
   virtual void mutate(kir::IntPair*);
 
   // Exprs
+  virtual void mutate(FullOp*);
   virtual void mutate(ARangeOp*);
   virtual void mutate(UnaryOp*);
   virtual void mutate(BinaryOp*);

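As the two files above show, every new Expr type must be registered in each dispatch switch and given a default handler; otherwise passes built on these visitor base classes would never see it. A hedged sketch of how a downstream pass could then opt in to the new node (the CountFullOps class is hypothetical, not part of this commit):

// Hypothetical const visitor that only overrides the FullOp handler;
// all other exprs keep the OptOutConstDispatch default of unhandled().
class CountFullOps : public OptOutConstDispatch {
 public:
  using OptOutConstDispatch::handle; // keep the other overloads visible
  void handle(const FullOp* fop) override {
    ++count_;
  }
  int count_ = 0;
};
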
torch/csrc/jit/codegen/cuda/ir_builder.cpp

Lines changed: 1 addition & 0 deletions

@@ -60,6 +60,7 @@ IR_BUILDER_INSTANTIATE(ShiftOp)
 IR_BUILDER_INSTANTIATE(GatherOp)
 IR_BUILDER_INSTANTIATE(ViewAsScalar)
 IR_BUILDER_INSTANTIATE(ViewOp)
+IR_BUILDER_INSTANTIATE(FullOp)
 IR_BUILDER_INSTANTIATE(ARangeOp)
 IR_BUILDER_INSTANTIATE(UnaryOp)
 IR_BUILDER_INSTANTIATE(BinaryOp)

torch/csrc/jit/codegen/cuda/ir_cloner.cpp

Lines changed: 4 additions & 0 deletions

@@ -88,6 +88,10 @@ void IrCloner::handle(const TensorView* tv) {
   clone_ = IrBuilder::clone(tv, this);
 }
 
+void IrCloner::handle(const FullOp* op) {
+  clone_ = IrBuilder::clone(op, this);
+}
+
 void IrCloner::handle(const ARangeOp* op) {
   clone_ = IrBuilder::clone(op, this);
 }

torch/csrc/jit/codegen/cuda/ir_cloner.h

Lines changed: 1 addition & 0 deletions

@@ -68,6 +68,7 @@ class TORCH_CUDA_CU_API IrCloner : private OptInConstDispatch {
   void handle(const ComplexDouble*) override;
   void handle(const NamedScalar*) override;
 
+  void handle(const FullOp*) override;
   void handle(const ARangeOp*) override;
   void handle(const UnaryOp*) override;
   void handle(const BinaryOp*) override;

torch/csrc/jit/codegen/cuda/ir_graphviz.cpp

Lines changed: 15 additions & 6 deletions

@@ -407,15 +407,24 @@ void IrGraphGenerator::handle(const TensorView* tv) {
   tensor_views_.push_back(tv);
 }
 
-void IrGraphGenerator::handle(const ARangeOp* uop) {
+void IrGraphGenerator::handle(const FullOp* fop) {
   // node
-  printExpr(uop, "arange");
+  printExpr(fop, "full");
 
   // inputs & outputs
-  addArc(uop->start(), uop);
-  addArc(uop->end(), uop);
-  addArc(uop->step(), uop);
-  addArc(uop, uop->output(0));
+  addArc(fop->getFillValue(), fop);
+  addArc(fop, fop->output(0));
+}
+
+void IrGraphGenerator::handle(const ARangeOp* aop) {
+  // node
+  printExpr(aop, "arange");
+
+  // inputs & outputs
+  addArc(aop->start(), aop);
+  addArc(aop->end(), aop);
+  addArc(aop->step(), aop);
+  addArc(aop, aop->output(0));
 }
 
 void IrGraphGenerator::handle(const UnaryOp* uop) {
