Skip to content

Commit caac838

Browse files
authored
Merge branch 'main' into tde/golden_print_regression
2 parents 64e8bf1 + 8954e04 commit caac838

File tree

21 files changed

+1145
-519
lines changed

21 files changed

+1145
-519
lines changed

.github/workflows/check_api_backwards_compatibility_workflow.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ jobs:
   # Default baseline for automatic PR checks
   # Can be: branch name (e.g., 'main'), commit hash, or tag
   # Will be resolved to commit hash during execution
-  DEFAULT_BASELINE: '29a810e644d079a91955c0ab98afb0798b10ab52'
+  DEFAULT_BASELINE: '53bbf7a23d7194de1fbe991ba120a0d49bd5b097'
   # Tag pattern for auto-detection (e.g., 'core_r*', 'core_v*')
   TAG_PATTERN: 'core_v*'
   # Tag regex filter (e.g., '^core_v[0-9]+\.[0-9]+\.[0-9]+$' for stable versions only)

examples/inference/gpt/gpt_dynamic_inference_with_coordinator.py

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -43,7 +43,6 @@ async def main(
     engine: DynamicInferenceEngine,
     requests: List[Request],
     port: int,
-    mp_port: int,
     sampling_params: SamplingParams | None = None,
 ):
     if sampling_params is not None:
@@ -58,7 +57,6 @@ async def main(

     await engine.start_listening_to_data_parallel_coordinator(
         inference_coordinator_port=port,
-        inference_mp_coordinator_port=mp_port,
         launch_inference_coordinator=True,
         verbose=True,
     )
@@ -258,6 +256,5 @@ async def main(
             engine,
             requests,
             args.inference_coordinator_port,
-            args.inference_mp_coordinator_port
         )
     )

megatron/core/inference/contexts/dynamic_context.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1079,7 +1079,6 @@ def initialize_attention_state(
         self.padded_active_token_count = min(
             self.padded_active_token_count, self.max_active_requests
         )
-        self.padding_slice = slice(active_token_count, self.padded_active_token_count)

         # How are we calculating the padded active request count?
         # Case 1: Using cuda graphs:

megatron/core/inference/data_parallel_inference_coordinator.py

Lines changed: 48 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -109,6 +109,8 @@ def __init__(self, inference_coordinator_port: int, data_parallel_size: int):
             self.identities_of_data_parallel_ranks.append(identity)
         logging.info("Inference Coordinator: Connected with data parallel ranks...")
         self.data_parallel_rank_iterator = cycle(self.identities_of_data_parallel_ranks)
+        self.data_parallel_pause_acks = set()
+        self.data_parallel_stop_acks = set()

         self.request_id_to_client_id = {}
         self.request_id_to_client_request_id = {}
@@ -151,7 +153,7 @@ def start(self):
                 # print(f"New client connected: {sender_identity}")
                 known_clients.add(sender_identity)
                 self.router_socket.send_multipart(
-                    [sender_identity, msgpack.packb([Headers.ACK.value], use_bin_type=True)]
+                    [sender_identity, msgpack.packb([Headers.CONNECT_ACK.value], use_bin_type=True)]
                 )

             elif header == Headers.SUBMIT_REQUEST:
@@ -208,6 +210,50 @@ def start(self):
                 self.router_socket.send_multipart(
                     [data_parallel_rank_id, msgpack.packb([header.value], use_bin_type=True)]
                 )
+                if header == Headers.UNPAUSE:
+                    self.data_parallel_pause_acks = set()
+            elif header == Headers.PAUSE_ACK:
+                # control signal ack from the engine
+                assert sender_identity in self.identities_of_data_parallel_ranks
+                assert sender_identity not in self.data_parallel_pause_acks
+                self.data_parallel_pause_acks.add(sender_identity)
+                # route to all clients only once we have gotten an ack from all data parallel ranks
+                if len(self.data_parallel_pause_acks) == self.data_parallel_size:
+                    for client_id in known_clients:
+                        self.router_socket.send_multipart(
+                            [
+                                client_id,
+                                msgpack.packb([header.value, sender_identity], use_bin_type=True),
+                            ]
+                        )
+                    for data_parallel_rank_id in self.identities_of_data_parallel_ranks:
+                        self.router_socket.send_multipart(
+                            [
+                                data_parallel_rank_id,
+                                msgpack.packb([Headers.PAUSE_ACK.value], use_bin_type=True),
+                            ]
+                        )
+            elif header == Headers.STOP_ACK:
+                # control signal ack from the engine
+                assert sender_identity in self.identities_of_data_parallel_ranks
+                assert sender_identity not in self.data_parallel_stop_acks
+                self.data_parallel_stop_acks.add(sender_identity)
+                # route to all clients only once we have gotten an ack from all data parallel ranks
+                if len(self.data_parallel_stop_acks) == self.data_parallel_size:
+                    for client_id in known_clients:
+                        self.router_socket.send_multipart(
+                            [
+                                client_id,
+                                msgpack.packb([header.value, sender_identity], use_bin_type=True),
+                            ]
+                        )
+                    for data_parallel_rank_id in self.identities_of_data_parallel_ranks:
+                        self.router_socket.send_multipart(
+                            [
+                                data_parallel_rank_id,
+                                msgpack.packb([Headers.STOP_ACK.value], use_bin_type=True),
+                            ]
+                        )
             elif header == Headers.ENGINE_REPLY:
                 # This is the output of a single engine step on some data parallel rank.
                 assert sender_identity in self.identities_of_data_parallel_ranks
@@ -224,7 +270,7 @@ def start(self):
                     [
                         client_identity,
                         msgpack.packb(
-                            [client_request_identity, finished_request_record],
+                            [header.value, client_request_identity, finished_request_record],
                             use_bin_type=True,
                         ),
                     ]

0 commit comments

Comments (0)