
Commit 6360145

Commits in this PR from the devel branch:
7117a7e patching nvfuser conv cudnn test numerics mismatch (#2048)
65af1a4 Inserting sync for redundant parallel types is already done at the (#2023)
6ac74d1 Fix sync map (#2047)
f5bca33 Bank conflict checker improvements (#2032)
d2ca7e3 Minor update on cp.async code generation. (#1901)
d36cf61 Test file cleanup (#2040)
0b8e83f Allow non-root trivial reductions (#2037)
a2dfe40 Fix vectorize size calculation (#2035)
e040676 Use withPredicate to replace setPredicate to maintain Exprs immutable (#2025)
197221b removing ci workflow (#2034)
40e2703 Reduction rand like patch (#2031)
bc77266 Add utility for checking bank conflict of shared memory (#2029)
ddd1cf7 Add back FusionReductionWithTrivialReduction_CUDA (#2030)
fbd97e5 Revert "Cleanup trivial reduction workarounds (#2006)" (#2024)
bca20c1 Cleanup trivial reduction workarounds (#2006)
e4b6585 Trivial forwarding (#1995)
1a0e355 Fix contiguity analysis of predicates to match updated contiguity. (#1991)
a4effa6 Enable output allocation cache (#2010)
35440b7 Patching bn inference (#2016)
0f9f0b4 Add matmul benchmark (#2007)
45045cd Enable tests previously disabled due to an aliasing bug (#2005)
967aa77 Contiguous indexing for View operations (#1990)
a43cb20 Make inlining even more modular (#2004)
dc45835 Test util cleanup (#2003)
3ca21eb More strict validation (#2000)
a7a7d57 Fix build problem (#1999)
fc235b0 Just fixes comments (#1998)
482386c cleanup (#1997)
4cbe0db Improve divisible split detection (#1970)
42ccc52 Minor build fix. (#1996)
fcf8c09 Cleanup of lower_utils.cpp: Isolate out GpuLower usage (#1989)
15f2f6d Move ConcretizedBroadcastDomains to shared_ptr in GpuLower. (#1988)
8f1c7f5 Minor cleanup lower_unroll.cpp (#1994)
1d9858c Minor cleanup (#1992)
f262d9c Add support for uniform RNG (#1986)
eb1dad1 Remove non-const functions, remove GpuLower instance on build, pass in ca_map. (#1987)
634820c Add support for some empty fusion (#1981)
eabe8d8 Segment self mapping fusions (#1954)
e96aacf Enable Transpose operation (#1882)
425dce2 Add a null scheduler that helps segmenting away no-op schedules (#1835)
306d4a6 Fix canScheduleCompileTime check of transpose scheduler (#1969)
b1bd32c Minor fix (#1967)
bd93578 Enable transpose scheduler (#1927)
b7a206e Move scheduler vectorize utilities into their own file (#1959)
d9420e4 View scheduling (#1928)
c668e13 Upstream push ci fixes (#1965)
c40202b Fix dump effective bandwidth (#1962)
93505bc WAR on index mapping when exact and permissive maps differ (#1960)
45e95fd Allow splitting inner-most ID to create virtual innermost ID in transpose scheduler (#1930)
a3ecb33 Improve the comments at the beginning of index_compute.h (#1946)
f7bc341 Remove unused variables (#1955)
df3393a Some cleanup (#1957)
7d1d7c8 TVDomainGuard factory (#1953)
357ba22 Fill allocation with nan on tests (#1956)
8eafc54 Fix detection of unmappable root domains (#1952)
90a51f2 Some indexing cleanups, Add eye support (#1940)
ddc01e4 Exclude unsupported data types (#1951)
992e17c test the groups the same order as they are merged (#1949)
208262b Move detection of self mapping IDs to IterDomainGraph from (#1941)
ac4de38 Merge pull request #1945 from csarofeen/master_merge_0828
6310948 Add full, full_like, zeros, zeros_like, ones, ones_like (#1943)
aab10bc Merge remote-tracking branch 'upstream/viable/strict' into HEAD
4c254c0 Fix arange when step is negative (#1942)
89330aa Tensor factories must set the output shape as its input (#1939)
[ghstack-poisoned]
Parent: 5308886 · Commit: 6360145

152 files changed: +35103 −28327 lines

aten/src/ATen/core/interned_strings.h

Lines changed: 3 additions & 0 deletions
@@ -50,8 +50,11 @@ namespace c10 {
   _(prim, FunctionalGraph) \
   _(prim, add_optional) \
   _(prim, view_copy) \
+  _(prim, permute_copy) \
   _(prim, reshape_copy) \
   _(prim, squeeze_copy) \
+  _(prim, t_copy) \
+  _(prim, transpose_copy) \
   _(prim, unsqueeze_copy) \
   _(prim, flatten_copy) \
   _(prim, expand_copy) \
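
For context, each _(prim, ...) entry feeds an X-macro: the symbol list is defined once and re-expanded with different definitions of _ to generate enums, name tables, and so on, which is why adding permute_copy, t_copy, and transpose_copy to the list is enough for every expansion site to pick up the new symbols. A minimal sketch of the pattern, with illustrative names rather than the actual interned_strings.h macros:

// X-macro sketch (illustrative; not the real symbol list in interned_strings.h).
#include <cstdio>

#define FORALL_COPY_SYMBOLS(_) \
  _(prim, view_copy)           \
  _(prim, permute_copy)        \
  _(prim, transpose_copy)

// Expansion 1: declare one enumerator per symbol.
#define DEFINE_ENUM(ns, s) k_##ns##_##s,
enum class SymbolId { FORALL_COPY_SYMBOLS(DEFINE_ENUM) };
#undef DEFINE_ENUM

// Expansion 2: print the qualified name of every symbol.
#define PRINT_NAME(ns, s) std::printf("%s::%s\n", #ns, #s);
int main() {
  FORALL_COPY_SYMBOLS(PRINT_NAME)
}
#undef PRINT_NAME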

benchmarks/cpp/nvfuser/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ if(USE_CUDA)
     softmax_backward.cpp
     scale_bias_relu.cpp
     transpose.cpp
+    matmul.cpp
     timm.cpp
     utils.cpp
     main.cpp)

benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp

Lines changed: 0 additions & 4 deletions
@@ -73,10 +73,6 @@ static void NvFuserScheduler_BatchNorm(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(1),
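
These benchmark bodies take their shapes from Google Benchmark's state rather than from local constants, which is why the unused kTraining/kMomentum/kEps definitions here (and in the sibling files below) could be deleted outright: benchmark_state.range(n) returns the n-th argument registered for the run. A self-contained sketch of that pattern; the benchmark body is illustrative, not the nvfuser one:

// Minimal Google Benchmark skeleton showing state.range() parameterization.
#include <benchmark/benchmark.h>
#include <cstdint>
#include <vector>

static void BM_FillShape(benchmark::State& state) {
  // Each ->Args({...}) registration below shows up here as state.range(i).
  std::vector<int64_t> input_shape{state.range(0), state.range(1)};
  for (auto _ : state) {
    std::vector<float> buf(
        static_cast<size_t>(input_shape[0] * input_shape[1]), 1.0f);
    benchmark::DoNotOptimize(buf.data()); // keep the work from being elided
  }
}
// Run the same benchmark at two (dim0, dim1) points.
BENCHMARK(BM_FillShape)->Args({64, 128})->Args({256, 512});
BENCHMARK_MAIN();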

benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp

Lines changed: 0 additions & 4 deletions
@@ -25,7 +25,6 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
   const bool kTraining = true;
-  const float kMomentum = 0.1;
   const float kEps = 1e-5;
 
   // setup fusion
@@ -85,9 +84,6 @@ static void NvFuserScheduler_BatchNorm_BWD(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(1),

benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp

Lines changed: 0 additions & 4 deletions
@@ -74,10 +74,6 @@ static void NvFuserScheduler_BatchNorm_nhwc(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(2),

benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp

Lines changed: 0 additions & 4 deletions
@@ -25,7 +25,6 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
   const bool kTraining = true;
-  const float kMomentum = 0.1;
   const float kEps = 1e-5;
 
   // setup fusion
@@ -86,9 +85,6 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(2),

benchmarks/cpp/nvfuser/gelu_backward.cpp

Lines changed: 0 additions & 3 deletions
@@ -113,9 +113,6 @@ BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
 //------------------------------------------------------------------------------
 
 static void GeluBackward_Lower(benchmark::State& benchmark_state) {
-  constexpr int kHiddenFeatures = 512;
-  constexpr int kBatchSize = 64;
-
   Fusion fusion;
 
   // setup fusion

benchmarks/cpp/nvfuser/layer_norm.cpp

Lines changed: 0 additions & 2 deletions
@@ -22,7 +22,6 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) {
 
   FusionGuard fg(fusion);
 
-  const int kReductionAxis = 1;
   const float kEps = 1e-5;
 
   Double* eps_ptr = IrBuilder::create<Double>(kEps);
@@ -61,7 +60,6 @@ static void NvFuserScheduler_LayerNorm(
 
   std::vector<int64_t> input_shape{
       benchmark_state.range(0), benchmark_state.range(1)};
-  const float kEps = 1e-5;
 
   // inputs
   at::manual_seed(0);
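
As the hunks above show, the setup functions build fusion IR under a FusionGuard, and scalars such as eps enter the fusion as IR nodes via IrBuilder::create<Double>(...), so constants that never reach the IR (kReductionAxis, the duplicate kEps) are dead code. A rough sketch of the shape of such a setup function, assuming only the helpers visible in this diff (FusionGuard, IrBuilder, makeContigTensor); exact signatures may differ from the real benchmark sources:

// Hedged sketch of an nvfuser fusion-setup function; illustrative only.
static void setupLayerNormSketch(Fusion* fusion, DataType dtype) {
  FusionGuard fg(fusion); // IR created below is attached to *fusion

  // Scalars participate in the fusion as IR nodes, not C++ constants:
  Double* eps_ptr = IrBuilder::create<Double>(1e-5);

  auto input = makeContigTensor(2, dtype); // 2-D contiguous input tensor
  fusion->addInput(input);

  // ... build the normalization expressions from input and eps_ptr ...
}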

benchmarks/cpp/nvfuser/layer_norm_backward.cpp

Lines changed: 0 additions & 3 deletions
@@ -22,9 +22,6 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
 
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const int kReductionAxis = 1;
-  Double* eps_ptr = IrBuilder::create<Double>(1e-5);
-
   // setup fusion
   auto grad_out = makeContigTensor(2, dtype);
   auto input = makeContigTensor(2, dtype);
