
Commit 2ba4e05

Improve device auto-detection (#7787)
1 parent eef7bb4 commit 2ba4e05

13 files changed: +19 -119 lines changed

test/cpp/cpp_test_util.cpp

Lines changed: 0 additions & 6 deletions
@@ -437,12 +437,6 @@ torch::lazy::NodePtr CreateNonZeroNode2d(int64_t num_non_zero_element,
   return nonzero_node;
 }
 
-bool UsingPjRt() {
-  static bool using_pjrt =
-      !torch_xla::runtime::sys_util::GetEnvString("PJRT_DEVICE", "").empty();
-  return using_pjrt;
-}
-
 bool UsingTpu() {
   static bool using_tpu =
       absl::StartsWith(

test/cpp/cpp_test_util.h

Lines changed: 0 additions & 2 deletions
@@ -116,8 +116,6 @@ void TestBackward(
 torch::lazy::NodePtr CreateNonZeroNode2d(int64_t num_non_zero_element,
                                          int64_t num_row, int64_t num_col);
 
-bool UsingPjRt();
-
 bool UsingTpu();
 
 }  // namespace cpp_test

test/pjrt/test_profiler.py

Lines changed: 0 additions & 2 deletions
@@ -31,8 +31,6 @@ def _profile(logdir: str, port: int = 9012):
 class TestPjRtProfiler(absltest.TestCase):
 
   def setUp(self):
-    assert xr.using_pjrt()
-
     # HACK: ensure libtpu is loaded if using TPU
     xm.xla_device()
 
test/pjrt/test_runtime.py

Lines changed: 12 additions & 25 deletions
@@ -15,8 +15,6 @@
 class TestExperimentalPjrt(parameterized.TestCase):
 
   def setUp(self):
-    global xr
-    reload(xr)
     xr.set_device_type('CPU')
 
   @parameterized.parameters(('CPU', 'CPU'), ('CUDA', 'CUDA'), ('TPU', 'TPU'))
@@ -44,12 +42,6 @@ def test_set_device_type_same_device(self):
         torch_xla._XLAC, '_xla_runtime_is_initialized', return_value=True):
       xr.set_device_type('CPU')
 
-  def test_requires_pjrt(self):
-    with mock.patch.dict(
-        os.environ, {'PJRT_SELECT_DEFAULT_DEVICE': '0'}, clear=True):
-      with self.assertRaises(NotImplementedError):
-        xr.xla_device()
-
   def test_default_ordinals(self):
     global_ordinal = xr.global_ordinal()
     self.assertEqual(global_ordinal, 0)
@@ -65,9 +57,6 @@ def test_num_global_devices(self):
     self.assertLen(torch_xla._XLAC._xla_get_all_devices(),
                    xr.global_device_count())
 
-  def test_world_size(self):
-    self.assertEqual(xr.world_size(), xr.world_size())
-
   def test_xla_device_error(self):
     with self.assertRaises(IndexError):
       xm.xla_device(10)
@@ -87,21 +76,19 @@ def test_xla_device_error(self):
           'GPU_NUM_DEVICES': '4'
       }, True))
   def test_pjrt_default_device(self, env_vars, expect_using_pjrt):
-    with mock.patch.dict(os.environ, env_vars, clear=True):
-      # Print a warning if we had to select a default runtime
-      if 'PJRT_DEVICE' not in os.environ and expect_using_pjrt:
-        logs_context = self.assertLogs(level=logging.WARNING)
-      else:
+    # Prevent flag checking during reinitialization of PJRT backend.
+    # Without the patch, the test will be impacted by other tests when torch_xla reloads.
+    with mock.patch(
+        'torch_xla._XLAC._xla_runtime_is_initialized', return_value=False):
+      with mock.patch.dict(os.environ, env_vars, clear=True):
+        # We need to reload the torch_xla module because clear=True will clear all os.environ.
+        global torch_xla
+        reload(torch_xla)
         logs_context = contextlib.nullcontext()
-
-      with logs_context:
-        # Configure default device
-        xr.using_pjrt()
-
-        if expect_using_pjrt:
-          self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU'])
-        else:
-          self.assertIsNone(xr.device_type())
+        if expect_using_pjrt:
+          self.assertIn(xr.device_type(), ['CPU', 'CUDA', 'TPU'])
+        else:
+          self.assertIsNone(xr.device_type())
 
   def test_host_index(self):
     self.assertEqual(xr.host_index(), 0)
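
With auto-detection now happening at import time, the updated test only inspects the outcome through xr.device_type(). A minimal sketch of what this looks like from a user script (illustrative only; the detected device depends on the machine):

    import os

    # Leave PJRT_DEVICE unset so torch_xla has to pick a default device on import.
    os.environ.pop('PJRT_DEVICE', None)

    import torch_xla.runtime as xr

    # Auto-detection has already run during import; device_type() reports the choice.
    print(xr.device_type())  # e.g. 'CPU' on a machine without TPU/GPU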

test/test_operations.py

Lines changed: 1 addition & 3 deletions
@@ -1659,9 +1659,7 @@ def test_fn(t):
       t += torch.tensor(i, dtype=torch.float, device=t.device)
       return t
 
-    # This test is for PjRT only
-    if xr.using_pjrt():
-      self.runAtenTest([torch.tensor(20.0)], test_fn)
+    self.runAtenTest([torch.tensor(20.0)], test_fn)
 
   def test_view_and_copy_(self):
     xla_device = xm.xla_device()

test/test_train_mp_imagenet.py

Lines changed: 1 addition & 2 deletions
@@ -255,8 +255,7 @@ def train_imagenet():
 
   # Initialization is nondeterministic with multiple threads in PjRt.
   # Synchronize model parameters across replicas manually.
-  if xr.using_pjrt():
-    xm.broadcast_master_param(model)
+  xm.broadcast_master_param(model)
 
   if FLAGS.ddp:
     model = DDP(model, gradient_as_bucket_view=True, broadcast_buffers=False)

test/test_train_mp_mnist.py

Lines changed: 1 addition & 2 deletions
@@ -135,8 +135,7 @@ def train_mnist(flags, **kwargs):
 
   # Initialization is nondeterministic with multiple threads in PjRt.
   # Synchronize model parameters across replicas manually.
-  if xr.using_pjrt():
-    xm.broadcast_master_param(model)
+  xm.broadcast_master_param(model)
 
   if flags.ddp:
     model = DDP(model, gradient_as_bucket_view=True)
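
Both training scripts now call xm.broadcast_master_param unconditionally, since PjRt is the only runtime. A minimal usage sketch (the toy model and device setup below are illustrative, not taken from the scripts):

    import torch.nn as nn
    import torch_xla.core.xla_model as xm

    device = xm.xla_device()
    model = nn.Linear(784, 10).to(device)

    # Initialization is nondeterministic with multiple threads in PjRt,
    # so synchronize the master replica's parameters to all other replicas.
    xm.broadcast_master_param(model)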

torch_xla/__init__.py

Lines changed: 3 additions & 0 deletions
@@ -251,3 +251,6 @@ def _init_xla_lazy_backend():
 
 # register all custom kenels and decomp by default
 from ._internal import custom_kernel, decomp_registration, c10d_registration
+
+# select default PJRT_DEVICE before any execution
+runtime._maybe_select_default_device()
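
The body of runtime._maybe_select_default_device is not part of this diff. As a rough, hypothetical sketch only (not the torch_xla implementation), an env-driven default selection of this kind amounts to respecting an explicit PJRT_DEVICE and otherwise warning and falling back to a default; the real helper presumably also checks for available TPU/GPU accelerators before settling on CPU:

    import logging
    import os

    def _maybe_select_default_device_sketch():
      # Hypothetical illustration; the real logic lives in torch_xla.runtime.
      if os.environ.get('PJRT_DEVICE'):
        return  # The user already picked a device explicitly.
      # Warn about the implicit choice and fall back to a default.
      logging.warning('PJRT_DEVICE is not set; defaulting to CPU.')
      os.environ['PJRT_DEVICE'] = 'CPU'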

torch_xla/_internal/pjrt.py

Lines changed: 0 additions & 6 deletions
@@ -41,7 +41,6 @@ def _merge_replica_results(
   return dict(replica_results)
 
 
-@runtime.requires_pjrt
 def _run_thread_per_device(
     local_rank: int, local_world_size: int, fn: Callable[[], R],
     initializer_fn: Callable[[int, int], None]) -> Dict[int, R]:
@@ -81,7 +80,6 @@ def _thread_fn(device: torch.device):
   return _merge_replica_results(replica_results)
 
 
-@runtime.requires_pjrt
 def _run_singleprocess(fn: Callable[..., R], *args, **kwargs) -> Dict[int, R]:
   """Runs `fn` on a single device core.
 
@@ -99,7 +97,6 @@ def _run_singleprocess(fn: Callable[..., R], *args, **kwargs) -> Dict[int, R]:
   return fn(*args, **kwargs)
 
 
-@runtime.requires_pjrt
 def initialize_singleprocess():
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_COUNT, '1')
 
@@ -110,7 +107,6 @@ def initialize_singleprocess():
   xm.set_replication(xm.xla_device(), [])
 
 
-@runtime.requires_pjrt
 def initialize_multiprocess(local_rank: int, local_world_size: int):
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_RANK, str(local_rank))
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_COUNT, str(local_world_size))
@@ -126,7 +122,6 @@ def initialize_multiprocess(local_rank: int, local_world_size: int):
   xm.set_replication(xm.xla_device(), devices)
 
 
-@runtime.requires_pjrt
 def run_multiprocess(fn: Callable[..., R],
                      *args,
                      start_method: str = 'spawn',
@@ -214,7 +209,6 @@ def spawn(fn: Callable,
   run_multiprocess(spawn_fn, start_method=start_method)
 
 
-@runtime.requires_pjrt
 def _initialize_single_process(local_rank: int, local_world_size: int):
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_RANK, str(local_rank))
   os.environ.setdefault(xenv.PJRT_LOCAL_PROCESS_COUNT, str(local_world_size))
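
The removed runtime.requires_pjrt decorator is defined elsewhere and not shown in this diff; it becomes redundant once a PJRT device is always selected at import. Purely as a hypothetical reconstruction of such a guard (names and the exact error message are assumptions, not the torch_xla code, though the deleted test_requires_pjrt test expected NotImplementedError):

    import functools
    import os

    def requires_pjrt(fn):
      # Hypothetical guard: refuse to run unless a PJRT device has been configured.
      @functools.wraps(fn)
      def wrapper(*args, **kwargs):
        if not os.environ.get('PJRT_DEVICE'):
          raise NotImplementedError(
              f'{fn.__name__} requires PJRT_DEVICE to be set')
        return fn(*args, **kwargs)

      return wrapper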

torch_xla/_internal/utils.py

Lines changed: 0 additions & 14 deletions
@@ -1,21 +1,7 @@
-import functools
 import re
 
 
 def parse_xla_device(device: str):
   m = re.match(r'([A-Z]+):(\d+)$', device)
   if m:
     return (m.group(1), int(m.group(2)))
-
-
-def run_once(func):
-  result = None
-
-  @functools.wraps(func)
-  def wrapper(*args, **kwargs):
-    nonlocal result
-    if result is None:
-      result = func(*args, **kwargs)
-    return result
-
-  return wrapper
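
The deleted run_once helper memoized a function's first result so repeated calls were cheap and idempotent. Reproduced below from the deleted lines, with a small usage example to show the caching behaviour (the pick_device function is illustrative only):

    import functools

    def run_once(func):
      # Deleted helper: cache the first non-None result and reuse it on later calls.
      result = None

      @functools.wraps(func)
      def wrapper(*args, **kwargs):
        nonlocal result
        if result is None:
          result = func(*args, **kwargs)
        return result

      return wrapper

    @run_once
    def pick_device():
      print('selecting a device...')
      return 'CPU'

    pick_device()  # runs the body and prints once
    pick_device()  # returns the cached 'CPU' without re-running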
