|
1 | | -#include <torch/csrc/jit/codegen/cuda/executor.h> |
| 1 | +#pragma once |
| 2 | + |
2 | 3 | #include <torch/csrc/jit/codegen/cuda/executor_utils.h> |
3 | 4 | #include <torch/csrc/jit/codegen/cuda/expr_evaluator.h> |
4 | 5 | #include <torch/csrc/jit/codegen/cuda/fusion.h> |
5 | 6 | #include <torch/csrc/jit/codegen/cuda/ir_iostream.h> |
6 | 7 | #include <torch/csrc/jit/codegen/cuda/lower_utils.h> |
7 | 8 |
|
8 | 9 | #include <ATen/cuda/CUDAContext.h> |
9 | | -#include <c10/cuda/CUDACachingAllocator.h> |
10 | | -#include <torch/torch.h> |
11 | 10 |
|
12 | 11 | #include <unordered_map> |
13 | 12 |
|
| 13 | +// Tests go in torch::jit |
14 | 14 | namespace torch { |
15 | 15 | namespace jit { |
16 | | -namespace fuser { |
17 | | -namespace cuda { |
18 | | - |
19 | | -inline bool deviceMajorMinorCheck(int major, int minor = 0) { |
20 | | - auto dev_prop = at::cuda::getCurrentDeviceProperties(); |
21 | | - if (dev_prop->major < major || |
22 | | - (dev_prop->major == major && dev_prop->minor < minor)) { |
23 | | - return false; |
24 | | - } |
25 | | - return true; |
26 | | -} |
27 | 16 |
|
28 | | -inline int deviceSMCount() { |
29 | | - int sm_count = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; |
30 | | - return sm_count; |
31 | | -} |
| 17 | +using namespace torch::jit::fuser::cuda; |
32 | 18 |
|
33 | | -class NVFuserTest : public ::testing::Test { |
34 | | - protected: |
35 | | - void SetUp() override { |
36 | | - // requires PASCAL or newer |
37 | | - if (!deviceMajorMinorCheck(6)) { |
38 | | - GTEST_SKIP() << "skipping tests on pre-PASCAL GPUs"; |
39 | | - } |
40 | | - setFillAllocationWithNan(true); |
41 | | - } |
42 | | - |
43 | | - void TearDown() override { |
44 | | - c10::cuda::CUDACachingAllocator::emptyCache(); |
45 | | - } |
46 | | -}; |
| 19 | +namespace { |
47 | 20 |
|
48 | 21 | struct ValidationConstants { |
49 | 22 | // Tolerances generated from randn + add + sum fusion |
@@ -74,8 +47,6 @@ struct ValidationConstants { |
74 | 47 | double base_float_rel_tol = -1; |
75 | 48 | }; |
76 | 49 |
|
77 | | -namespace { |
78 | | - |
79 | 50 | // Returns abs and relative values to use for validation |
80 | 51 | std::pair<double, double> getTolerance( |
81 | 52 | DataType dtype, |
@@ -338,15 +309,13 @@ ExpressionEvaluator bindInputsAndLaunchParams( |
338 | 309 | return expr_eval; |
339 | 310 | } |
340 | 311 |
|
341 | | -} // namespace |
342 | | - |
343 | 312 | // Validation will look through the fusion and figure out how many elements were |
344 | 313 | // reduced to create each output. It will then compute a tolerance to use for
345 | 314 | // allclose based on experimental results. The experimental results were based |
346 | 315 | // on adding two tensors then summing them. This of course has an assumption |
347 | 316 | // that we're always summing values between -2 and 2. If we start summing values |
348 | 317 | // larger than that this approach might not hold. |
349 | | -inline void testValidate( |
| 318 | +void testValidate( |
350 | 319 | Fusion* fusion, |
351 | 320 | const std::vector<at::Tensor>& fusion_outputs, |
352 | 321 | const at::ArrayRef<IValue>& aten_inputs, |
@@ -466,18 +435,6 @@ inline void testValidate( |
466 | 435 | } |
467 | 436 | } |
468 | 437 |
|
469 | | -inline void clearL2Cache() { |
470 | | - torch::NoGradGuard no_grad; |
471 | | - auto l2_cache_size = at::cuda::getCurrentDeviceProperties()->l2CacheSize; |
472 | | - auto options = |
473 | | - torch::TensorOptions().dtype(torch::kFloat32).device(at::kCUDA, 0); |
474 | | - |
475 | | - auto l2_elems = l2_cache_size / 4; |
476 | | - torch::Tensor t0 = torch::empty(l2_elems, options); |
477 | | - torch::Tensor t1 = torch::clone(t0); |
478 | | -}; |
479 | | - |
480 | | -} // namespace cuda |
481 | | -} // namespace fuser |
| 438 | +} // namespace |
482 | 439 | } // namespace jit |
483 | 440 | } // namespace torch |
0 commit comments