@@ -302,8 +302,8 @@ def __init__(
             assert (
                 mamba_ssm_states_shape is not None
             ), "`mamba_ssm_states_shape` must be specified for hybrid models"
-            assert (
-                not use_cuda_graphs_for_non_decode_steps
+            assert not (
+                num_cuda_graphs is not None and use_cuda_graphs_for_non_decode_steps
             ), "Non-decode CUDA graphs not yet supported for hybrid models"

             # For hybrid models, the layer map converts the global layer index to the
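As an aside (not part of the patch above): the reworked guard only rejects hybrid-model configurations when CUDA graphs are actually enabled. A minimal standalone sketch of the old versus new condition, using hypothetical helper names and flag values:

# Illustrative sketch only; old_guard/new_guard are hypothetical helpers that
# mirror the assert conditions above, not functions from the codebase.
def old_guard(num_cuda_graphs, use_cuda_graphs_for_non_decode_steps):
    # Old assert: fails whenever non-decode CUDA graphs are requested,
    # even if CUDA graphs are disabled entirely.
    return not use_cuda_graphs_for_non_decode_steps

def new_guard(num_cuda_graphs, use_cuda_graphs_for_non_decode_steps):
    # New assert: only fails when CUDA graphs are enabled (num_cuda_graphs set)
    # AND non-decode steps are to be graphed.
    return not (num_cuda_graphs is not None and use_cuda_graphs_for_non_decode_steps)

assert old_guard(None, True) is False   # old check would trip the assert here
assert new_guard(None, True) is True    # new check allows it when graphs are off
assert new_guard(4, True) is False      # still rejected when graphs are enabled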
@@ -1079,6 +1079,7 @@ def initialize_attention_state(
         self.padded_active_token_count = min(
             self.padded_active_token_count, self.max_active_requests
         )
+        self.padding_slice = slice(active_token_count, self.padded_active_token_count)

         # How are we calculating the padded active request count?
         # Case 1: Using cuda graphs:
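As an aside (not part of the patch): slice(active_token_count, padded_active_token_count) is an ordinary Python slice object, so the padded tail of a token-sized buffer can be addressed in one indexing operation. A small sketch with made-up counts:

import torch

# Hypothetical counts, just to show what the stored slice covers.
active_token_count = 5
padded_active_token_count = 8
padding_slice = slice(active_token_count, padded_active_token_count)

tokens = torch.arange(padded_active_token_count)
tokens[padding_slice] = 0   # e.g. zero out the padding positions in one shot
print(tokens)               # tensor([0, 1, 2, 3, 4, 0, 0, 0])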
@@ -1427,6 +1428,14 @@ def _swap_book_keeping_tensors(self, src_idxs, dst_idxs, next_tokens):
         if self.is_hybrid_model:
             tensor_swap(self.mamba_metadata.request_to_mamba_state_idx, src_idxs, dst_idxs)

+    def get_index_of_chunked_prefill_request(self) -> int:
+        """Get the index of the chunked prefill request in the context.
+
+        Return:
+            (int) Index of the chunked prefill request, or -1 if none exists.
+        """
+        return torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0]
+
     # TODO: see if we can compile this function
     def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> Tensor:
         """Update context state after calling engine.step().
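As an aside (not part of the patch): a standalone sketch of what the torch.where(...)[0][0] lookup in the new helper returns, with made-up request ids, plus a caveat about the empty case:

import torch

# Hypothetical ids; in the context these live in self.request_ids.
request_ids = torch.tensor([7, 3, 12, 9])
chunked_prefill_request_id = 12

idx = torch.where(request_ids == chunked_prefill_request_id)[0][0]
print(idx)   # tensor(2) -- a 0-dim tensor, usable directly as an index
# Caveat: if the id is absent, the trailing [0] raises IndexError rather than
# returning -1, so callers check `chunked_prefill_request_id != -1` first.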
@@ -1583,8 +1592,9 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T

         if self.chunked_prefill_request_id != -1:
             # find the id in request_ids that is the chunked_prefill_request_id. Only one request should be chunked.
-            pos = torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0]
-            active_requests_requiring_new_block[pos] = 0  # chunked prefill should not be paused
+            active_requests_requiring_new_block[self.get_index_of_chunked_prefill_request()] = (
+                0  # chunked prefill should not be paused
+            )

         active_requests_requiring_new_block_count = (
             (active_requests_requiring_new_block == 1).sum().item()
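As an aside (not part of the patch): zeroing one position in the mask before summing is what keeps the chunked-prefill request out of the "needs a new block" count. A tiny sketch with a made-up mask and index:

import torch

# Hypothetical mask and position of the chunked-prefill request.
active_requests_requiring_new_block = torch.tensor([1, 0, 1, 1])
chunked_prefill_index = 2

active_requests_requiring_new_block[chunked_prefill_index] = 0   # do not pause it
count = (active_requests_requiring_new_block == 1).sum().item()
print(count)   # 2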
@@ -1651,11 +1661,10 @@ def update_requests(self, active_requests_mask: Tensor, new_tokens: Tensor) -> T
         active_request_count += resume_request_count
         assert active_request_count > 0, "active_request_count == %d." % active_request_count

-        # finally, swap the chunked prefill to the end of the active requests to obey the invariant
+        # finally, swap the chunked prefill to the end of the active requests to obey the invariance
         if self.chunked_prefill_request_id != -1:
-            pos = torch.where(self.request_ids == self.chunked_prefill_request_id)[0][0]
             self._swap_book_keeping_tensors(
-                src_idxs=torch.tensor([pos]),
+                src_idxs=torch.tensor([self.get_index_of_chunked_prefill_request()]),
                 dst_idxs=torch.tensor([active_request_count + self.paused_request_count - 1]),
                 next_tokens=next_tokens,
             )
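As an aside (not part of the patch): the swap-to-end step relies on an index-based tensor swap. Below is a minimal, hypothetical stand-in for tensor_swap / _swap_book_keeping_tensors (not the project's implementation) showing how exchanging entries at src_idxs and dst_idxs moves the chunked-prefill request to the last active slot:

import torch

def tensor_swap(t: torch.Tensor, src_idxs: torch.Tensor, dst_idxs: torch.Tensor) -> None:
    # Exchange the entries at src_idxs and dst_idxs in place.
    tmp = t[src_idxs].clone()
    t[src_idxs] = t[dst_idxs]
    t[dst_idxs] = tmp

request_ids = torch.tensor([7, 12, 3, 9])   # hypothetical: chunked prefill at index 1
tensor_swap(request_ids, torch.tensor([1]), torch.tensor([3]))
print(request_ids)                          # tensor([ 7,  9,  3, 12]) -- now last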