|
10 | 10 |
|
11 | 11 | # for some reason importing functional collectives after dynamo breaks collectives handling! |
12 | 12 | import torch.distributed._functional_collectives as _functional_collectives |
| 13 | +import torch.fx as fx |
13 | 14 | from torch._C import FileCheck |
14 | 15 | from torch._dynamo.utils import counters, same |
15 | 16 | from torch._inductor.utils import run_and_get_code, run_and_get_triton_code |
@@ -888,6 +889,168 @@ def func(a, b, c, d, *, ranks): |
888 | 889 | self.assertTrue(same(test_out, correct)) |
889 | 890 |
|
890 | 891 |
|
| 892 | +def get_toy_model(device_type: str): |
| 893 | + """ |
| 894 | +    Helper to construct a small multi-layer ToyModel.
| 895 | + """ |
| 896 | + |
| 897 | + class ToyBlock(torch.nn.Module): |
| 898 | + def __init__(self): |
| 899 | + super().__init__() |
| 900 | + self.wq = torch.nn.Linear(4, 4) |
| 901 | + self.wk = torch.nn.Linear(4, 4) |
| 902 | + self.proj = torch.nn.Linear(4, 4) |
| 903 | + |
| 904 | + def forward(self, x): |
| 905 | + attn = self.wq(x) + self.wk(x) |
| 906 | + return self.proj(torch.nn.functional.relu(attn)) |
| 907 | + |
| 908 | + class ToyModel(torch.nn.Module): |
| 909 | + def __init__(self): |
| 910 | + super().__init__() |
| 911 | + self.layers = torch.nn.ModuleList([ToyBlock() for _ in range(2)]) |
| 912 | + self.norm = torch.nn.LayerNorm(4) |
| 913 | + |
| 914 | + def forward(self, x): |
| 915 | + for blk in self.layers: |
| 916 | + x = blk(x) |
| 917 | + return self.norm(x) |
| 918 | + |
| 919 | + model = ToyModel().to(device_type) |
| 920 | + return model |
| 921 | + |
| 922 | + |
| 923 | +def apply_manual_reordering_and_get_graph(graph, module_bucket_plans, out_li) -> None: |
| 924 | + gm = graph.owning_module |
| 925 | + from torch._inductor.fx_passes.overlap_manual_scheduling import ( |
| 926 | + ManualOverlapScheduler, |
| 927 | + ) |
| 928 | + |
| 929 | +    # Tag the all-gather and wait nodes with a synthetic nn_module_stack so
| 930 | +    # the scheduler can assign them to the "module_1" / "module_2" buckets.
| 931 | + for node in list(gm.graph.nodes): |
| 932 | +        if node.name in (
| 933 | +            "all_gather_into_tensor",
| 934 | +            "all_gather_into_tensor_1",
| 935 | +            "wait_tensor",
| 936 | +            "wait_tensor_1",
| 937 | +        ):
| 938 | +            node.meta["nn_module_stack"] = {"test": ["module_1", ""]}
| 939 | +        if node.name in (
| 940 | +            "all_gather_into_tensor_2",
| 941 | +            "all_gather_into_tensor_3",
| 942 | +            "wait_tensor_2",
| 943 | +            "wait_tensor_3",
| 944 | +        ):
| 945 | +            node.meta["nn_module_stack"] = {"test": ["module_2", ""]}
| 946 | + |
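|  | +    # Run the scheduler over the requested module buckets, then lint and
|  | +    # record the reordered graph so the test can inspect it.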
| 947 | + overlapped_gm = ManualOverlapScheduler(gm, module_bucket_plans).run() |
| 948 | + overlapped_gm.graph.lint() |
| 949 | + out_li.append(overlapped_gm.graph) |
| 950 | + |
| 951 | + |
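|  | +# Compile and run `fn` with the manual reordering pass installed as Inductor's
|  | +# post_grad_custom_post_pass; returns the output and the captured aten graph.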
| 952 | +def run_and_get_manual_aten_graph(fn, *inputs): |
| 953 | + li = [] |
| 954 | + apply = functools.partial( |
| 955 | + apply_manual_reordering_and_get_graph, |
| 956 | + module_bucket_plans=["module_1", "module_2"], |
| 957 | + out_li=li, |
| 958 | + ) |
| 959 | + with torch._inductor.config.patch(post_grad_custom_post_pass=apply): |
| 960 | + out = fn(*inputs) |
| 961 | + |
| 962 | + return out, li[0] |
| 963 | + |
| 964 | + |
| 965 | +class TestManualOverlapBucketing(TestComputeCommReorderingMultiProc): |
| 966 | + """ |
| 967 | + Tests for manual overlap scheduling and subgraph utilities. |
| 968 | + """ |
| 969 | + |
| 970 | + @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") |
| 971 | + def test_make_graph_view_and_get_subgraph_by_path(self): |
| 972 | + from torch._inductor.fx_passes.overlap_manual_scheduling import ( |
| 973 | + get_subgraph_by_path, |
| 974 | + make_graph_view, |
| 975 | + ) |
| 976 | + |
| 977 | + model = get_toy_model(device_type) |
| 978 | + gm = fx.symbolic_trace(model) |
| 979 | + graph_view = make_graph_view(gm.graph) |
| 980 | + # Fetch subgraph for first transformer layer |
| 981 | + sub_nodes = get_subgraph_by_path(graph_view, "layers.0.wq") |
| 982 | + self.assertEqual([n.name for n in sub_nodes], ["layers_0_wq"]) |
| 983 | + |
| 984 | + # Fetch multiple paths at once |
| 985 | + multi_nodes = get_subgraph_by_path(graph_view, ["layers.0.wq", "layers.0.proj"]) |
| 986 | + self.assertEqual( |
| 987 | + [n.name for n in multi_nodes], ["layers_0_wq", "layers_0_proj"] |
| 988 | + ) |
| 989 | + |
| 990 | + @unittest.skipIf(not HAS_GPU, "Inductor+gpu needs triton and recent GPU arch") |
| 991 | + def test_manual_reordering_bucketing_pass( |
| 992 | + self, |
| 993 | + ): |
| 994 | + def func(a, b, c, d, *, ranks): |
| 995 | + # All 4 all-gathers are independent - COULD be bucketed together |
| 996 | + ag1 = _functional_collectives.all_gather_tensor(a, 0, ranks) |
| 997 | + ag2 = _functional_collectives.all_gather_tensor(b, 0, ranks) |
| 998 | + ag3 = _functional_collectives.all_gather_tensor(c[:4], 0, ranks) |
| 999 | + ag4 = _functional_collectives.all_gather_tensor(d[:4], 0, ranks) |
| 1000 | + |
| 1001 | + # First compute - can hide ag1 and ag2 |
| 1002 | + e = a * 5 # Use a to avoid fusion |
| 1003 | + mm1 = torch.matmul(e, e.T) |
| 1004 | + |
| 1005 | + # Force ag1/ag2 to complete before mm2 (but ag3/ag4 can still be deferred) |
| 1006 | + # Use first 8x8 elements to match mm1's shape |
| 1007 | + intermediate = ag1[:8, :8] + ag2[:8, :8] |
| 1008 | + |
| 1009 | + # Second compute - depends on ag1/ag2 through intermediate, can hide ag3/ag4 |
| 1010 | + mm2 = torch.matmul(mm1 + intermediate, c[:8]) |
| 1011 | + |
| 1012 | + # Use all results |
| 1013 | + result = ( |
| 1014 | + ag1.sum() * 1.1 |
| 1015 | + + ag2.sum() * 1.2 |
| 1016 | + + ag3.sum() * 1.3 |
| 1017 | + + ag4.sum() * 1.4 |
| 1018 | + + mm1.sum() |
| 1019 | + + mm2.sum() |
| 1020 | + ) |
| 1021 | + return result |
| 1022 | + |
| 1023 | + with _dynamo_dist_per_rank_init( |
| 1024 | + self.rank, |
| 1025 | + self.world_size, |
| 1026 | + self.backend(device_type), |
| 1027 | + fake_pg=not at_least_x_gpu(2), |
| 1028 | + ): |
| 1029 | + a = torch.ones(8, 8, dtype=torch.float, device=device_type) |
| 1030 | + b = torch.ones(8, 8, dtype=torch.float, device=device_type) * 2 |
| 1031 | + c = torch.ones(8, 8, dtype=torch.float, device=device_type) * 3 |
| 1032 | + d = torch.ones(8, 8, dtype=torch.float, device=device_type) * 4 |
| 1033 | + ranks = list(range(self.world_size)) |
| 1034 | + |
| 1035 | + func_c = functools.partial(func, ranks=ranks) |
| 1036 | + compiled = torch.compile(func_c) |
| 1037 | + out, aten_graph = run_and_get_manual_aten_graph(compiled, a, b, c, d) |
| 1038 | + |
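|  | +            # The reordered graph should contain two bucketed all-gathers,
|  | +            # each preceded by its pre-bucket op, followed by their waits.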
| 1039 | + ( |
| 1040 | + FileCheck() |
| 1041 | + .check("_pre_bucket_all_gather") |
| 1042 | + .check("all_gather_into_tensor_out") |
| 1043 | + .check("_pre_bucket_all_gather_1") |
| 1044 | + .check("all_gather_into_tensor_out_1") |
| 1045 | + .check("wait_tensor_4") |
| 1046 | + .check("wait_tensor_5") |
| 1047 | + .run(str(aten_graph)) |
| 1048 | + ) |
| 1049 | + |
| 1050 | + correct = func(a, b, c, d, ranks=ranks) |
| 1051 | + self.assertTrue(same(out, correct)) |
| 1052 | + |
| 1053 | + |
891 | 1054 | if __name__ == "__main__": |
892 | 1055 | from torch._dynamo.test_case import run_tests |
893 | 1056 |
|
|