
Commit 6360145

Commits in this PR from the devel branch:
7117a7e patching nvfuser conv cudnn test numerics mismatch (#2048)
65af1a4 Inserting sync for redundant parallel types is already done at the (#2023)
6ac74d1 Fix sync map (#2047)
f5bca33 Bank conflict checker improvements (#2032)
d2ca7e3 Minor update on cp.async code generation. (#1901)
d36cf61 Test file cleanup (#2040)
0b8e83f Allow non-root trivial reductions (#2037)
a2dfe40 Fix vectorize size calculation (#2035)
e040676 Use withPredicate to replace setPredicate to maintain Exprs immutable (#2025)
197221b removing ci workflow (#2034)
40e2703 Reduction rand like patch (#2031)
bc77266 Add utility for checking bank conflict of shared memory (#2029)
ddd1cf7 Add back FusionReductionWithTrivialReduction_CUDA (#2030)
fbd97e5 Revert "Cleanup trivial reduction workarounds (#2006)" (#2024)
bca20c1 Cleanup trivial reduction workarounds (#2006)
e4b6585 Trivial forwarding (#1995)
1a0e355 Fix contiguity analysis of predicates to match updated contiguity. (#1991)
a4effa6 Enable output allocation cache (#2010)
35440b7 Patching bn inference (#2016)
0f9f0b4 Add matmul benchmark (#2007)
45045cd Enable tests previously disabled due to an aliasing bug (#2005)
967aa77 Contiguous indexing for View operations (#1990)
a43cb20 Make inlining even more modular (#2004)
dc45835 Test util cleanup (#2003)
3ca21eb More strict validation (#2000)
a7a7d57 Fix build problem (#1999)
fc235b0 Just fixes comments (#1998)
482386c cleanup (#1997)
4cbe0db Improve divisible split detection (#1970)
42ccc52 Minor build fix. (#1996)
fcf8c09 Cleanup of lower_utils.cpp: Isolate out GpuLower usage (#1989)
15f2f6d Move ConcretizedBroadcastDomains to shared_ptr in GpuLower. (#1988)
8f1c7f5 Minor cleanup lower_unroll.cpp (#1994)
1d9858c Minor cleanup (#1992)
f262d9c Add support for uniform RNG (#1986)
eb1dad1 Remove non-const functions, remove GpuLower instance on build, pass in ca_map. (#1987)
634820c Add support for some empty fusion (#1981)
eabe8d8 Segment self mapping fusions (#1954)
e96aacf Enable Transpose operation (#1882)
425dce2 Add a null scheduler that helps segmenting away no-op schedules (#1835)
306d4a6 Fix canScheduleCompileTime check of transpose scheduler (#1969)
b1bd32c Minor fix (#1967)
bd93578 Enable transpose scheduler (#1927)
b7a206e Move scheduler vectorize utilities into their own file (#1959)
d9420e4 View scheduling (#1928)
c668e13 Upstream push ci fixes (#1965)
c40202b Fix dump effective bandwidth (#1962)
93505bc WAR on index mapping when exact and permissive maps differ (#1960)
45e95fd Allow splitting inner-most ID to create virtual innermost ID in transpose scheduler (#1930)
a3ecb33 Improve the comments at the beginning of index_compute.h (#1946)
f7bc341 Remove unused variables (#1955)
df3393a Some cleanup (#1957)
7d1d7c8 TVDomainGuard factory (#1953)
357ba22 Fill allocation with nan on tests (#1956)
8eafc54 Fix detection of unmappable root domains (#1952)
90a51f2 Some indexing cleanups, Add eye support (#1940)
ddc01e4 Exclude unsupported data types (#1951)
992e17c test the groups the same order as they are merged (#1949)
208262b Move detection of self mapping IDs to IterDomainGraph from (#1941)
ac4de38 Merge pull request #1945 from csarofeen/master_merge_0828
6310948 Add full, full_like, zeros, zeros_like, ones, ones_like (#1943)
aab10bc Merge remote-tracking branch 'upstream/viable/strict' into HEAD
4c254c0 Fix arange when step is negative (#1942)
89330aa Tensor factories must set the output shape as its input (#1939)
[ghstack-poisoned]
Parent: 5308886 · Commit: 6360145

152 files changed: +35103 −28327 lines

aten/src/ATen/core/interned_strings.h

Lines changed: 3 additions & 0 deletions
@@ -50,8 +50,11 @@ namespace c10 {
   _(prim, FunctionalGraph) \
   _(prim, add_optional) \
   _(prim, view_copy) \
+  _(prim, permute_copy) \
   _(prim, reshape_copy) \
   _(prim, squeeze_copy) \
+  _(prim, t_copy) \
+  _(prim, transpose_copy) \
   _(prim, unsqueeze_copy) \
   _(prim, flatten_copy) \
   _(prim, expand_copy) \
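
For context, each _(prim, ...) entry feeds an X-macro: the symbol list is defined once and re-expanded with different definitions of _ to generate enums, name tables, and so on, which is why adding permute_copy, t_copy, and transpose_copy to the list is enough for every expansion site to pick up the new symbols. A minimal sketch of the pattern, with illustrative names rather than the actual interned_strings.h macros:

// X-macro sketch (illustrative; not the real symbol list in interned_strings.h).
#include <cstdio>

#define FORALL_COPY_SYMBOLS(_) \
  _(prim, view_copy)           \
  _(prim, permute_copy)        \
  _(prim, transpose_copy)

// Expansion 1: declare one enumerator per symbol.
#define DEFINE_ENUM(ns, s) k_##ns##_##s,
enum class SymbolId { FORALL_COPY_SYMBOLS(DEFINE_ENUM) };
#undef DEFINE_ENUM

// Expansion 2: print the qualified name of every symbol.
#define PRINT_NAME(ns, s) std::printf("%s::%s\n", #ns, #s);
int main() {
  FORALL_COPY_SYMBOLS(PRINT_NAME)
}
#undef PRINT_NAME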

benchmarks/cpp/nvfuser/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -20,6 +20,7 @@ if(USE_CUDA)
     softmax_backward.cpp
     scale_bias_relu.cpp
     transpose.cpp
+    matmul.cpp
     timm.cpp
     utils.cpp
     main.cpp)

benchmarks/cpp/nvfuser/batch_norm_channels_first.cpp

Lines changed: 0 additions & 4 deletions
@@ -73,10 +73,6 @@ static void NvFuserScheduler_BatchNorm(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(1),
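
These benchmark bodies take their shapes from Google Benchmark's state rather than from local constants, which is why the unused kTraining/kMomentum/kEps definitions here (and in the sibling files below) could be deleted outright: benchmark_state.range(n) returns the n-th argument registered for the run. A self-contained sketch of that pattern; the benchmark body is illustrative, not the nvfuser one:

// Minimal Google Benchmark skeleton showing state.range() parameterization.
#include <benchmark/benchmark.h>
#include <cstdint>
#include <vector>

static void BM_FillShape(benchmark::State& state) {
  // Each ->Args({...}) registration below shows up here as state.range(i).
  std::vector<int64_t> input_shape{state.range(0), state.range(1)};
  for (auto _ : state) {
    std::vector<float> buf(
        static_cast<size_t>(input_shape[0] * input_shape[1]), 1.0f);
    benchmark::DoNotOptimize(buf.data()); // keep the work from being elided
  }
}
// Run the same benchmark at two (dim0, dim1) points.
BENCHMARK(BM_FillShape)->Args({64, 128})->Args({256, 512});
BENCHMARK_MAIN();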

benchmarks/cpp/nvfuser/batch_norm_channels_first_backward.cpp

Lines changed: 0 additions & 4 deletions
@@ -25,7 +25,6 @@ static void setupBatchNorm_BWD(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
   const bool kTraining = true;
-  const float kMomentum = 0.1;
   const float kEps = 1e-5;
 
   // setup fusion
@@ -85,9 +84,6 @@ static void NvFuserScheduler_BatchNorm_BWD(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(1),

benchmarks/cpp/nvfuser/batch_norm_channels_last.cpp

Lines changed: 0 additions & 4 deletions
@@ -74,10 +74,6 @@ static void NvFuserScheduler_BatchNorm_nhwc(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kMomentum = 0.1;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(2),

benchmarks/cpp/nvfuser/batch_norm_channels_last_backward.cpp

Lines changed: 0 additions & 4 deletions
@@ -25,7 +25,6 @@ static void setupBatchNorm_nhwc_BWD(Fusion* fusion, DataType dtype) {
   FusionGuard fg(fusion);
 
   const bool kTraining = true;
-  const float kMomentum = 0.1;
   const float kEps = 1e-5;
 
   // setup fusion
@@ -86,9 +85,6 @@ static void NvFuserScheduler_BatchNorm_nhwc_BWD(
     DataType dtype) {
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const bool kTraining = true;
-  const float kEps = 1e-5;
-
   std::vector<int64_t> input_shape{
       benchmark_state.range(0),
       benchmark_state.range(2),

benchmarks/cpp/nvfuser/gelu_backward.cpp

Lines changed: 0 additions & 3 deletions
@@ -113,9 +113,6 @@ BENCHMARK(GeluBackward_AutoSchedule)->Unit(benchmark::kMicrosecond);
 //------------------------------------------------------------------------------
 
 static void GeluBackward_Lower(benchmark::State& benchmark_state) {
-  constexpr int kHiddenFeatures = 512;
-  constexpr int kBatchSize = 64;
-
   Fusion fusion;
 
   // setup fusion

benchmarks/cpp/nvfuser/layer_norm.cpp

Lines changed: 0 additions & 2 deletions
@@ -22,7 +22,6 @@ static void setupLayerNorm(Fusion* fusion, DataType dtype) {
 
   FusionGuard fg(fusion);
 
-  const int kReductionAxis = 1;
   const float kEps = 1e-5;
 
   Double* eps_ptr = IrBuilder::create<Double>(kEps);
@@ -61,7 +60,6 @@ static void NvFuserScheduler_LayerNorm(
 
   std::vector<int64_t> input_shape{
       benchmark_state.range(0), benchmark_state.range(1)};
-  const float kEps = 1e-5;
 
   // inputs
   at::manual_seed(0);
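
As the hunks above show, the setup functions build fusion IR under a FusionGuard, and scalars such as eps enter the fusion as IR nodes via IrBuilder::create<Double>(...), so constants that never reach the IR (kReductionAxis, the duplicate kEps) are dead code. A rough sketch of the shape of such a setup function, assuming only the helpers visible in this diff (FusionGuard, IrBuilder, makeContigTensor); exact signatures may differ from the real benchmark sources:

// Hedged sketch of an nvfuser fusion-setup function; illustrative only.
static void setupLayerNormSketch(Fusion* fusion, DataType dtype) {
  FusionGuard fg(fusion); // IR created below is attached to *fusion

  // Scalars participate in the fusion as IR nodes, not C++ constants:
  Double* eps_ptr = IrBuilder::create<Double>(1e-5);

  auto input = makeContigTensor(2, dtype); // 2-D contiguous input tensor
  fusion->addInput(input);

  // ... build the normalization expressions from input and eps_ptr ...
}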

benchmarks/cpp/nvfuser/layer_norm_backward.cpp

Lines changed: 0 additions & 3 deletions
@@ -22,9 +22,6 @@ static void setupLayerNorm_BWD(Fusion* fusion, DataType dtype) {
 
   TORCH_INTERNAL_ASSERT(dtype == DataType::Float || dtype == DataType::Half);
 
-  const int kReductionAxis = 1;
-  Double* eps_ptr = IrBuilder::create<Double>(1e-5);
-
   // setup fusion
   auto grad_out = makeContigTensor(2, dtype);
   auto input = makeContigTensor(2, dtype);
