Commit a9d76c8

Revert "[None][feat] Optimize CUDA graph memory usage for spec decode cases (#6718)"
This reverts commit 8df7a26.
1 parent 8861b56 · commit a9d76c8

2 files changed: 4 additions & 12 deletions

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 2 additions & 5 deletions
@@ -726,11 +726,8 @@ def disable_optimization(backend: Backend):
         # For non-draft model, we also capture the CUDA graph instance for draft length 0,
         # so that when we disable spec decode at runtime, we can still run the captured graph.
         # Note that for one engine mode, we are not able to turn off spec decode at runtime.
-        if (not self.is_draft_model and self.max_draft_len > 0
-                and not self.spec_config.spec_dec_mode.use_one_engine()
-                # Assume that speculation is always on if the user didn't give us a max_concurrency
-                # value. This will save on memory.
-                and self.spec_config.max_concurrency is not None):
+        if not self.is_draft_model and self.max_draft_len > 0 and not self.spec_config.spec_dec_mode.use_one_engine(
+        ):
             draft_lengths.append(0)
 
         for bs in cuda_graph_batch_sizes:
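
The change above restores the pre-#6718 condition: the draft-length-0 CUDA graph is captured whenever spec decode can be disabled at runtime (any mode other than one-engine), rather than only when the user set max_concurrency. Below is a minimal, hypothetical sketch of that restored condition; the SpecConfig stand-in, the collect_draft_lengths wrapper, and the initial contents of draft_lengths are assumptions for illustration, and only the predicate mirrors the diff.

# Hypothetical sketch; names mirror the diff, scaffolding is assumed.
from dataclasses import dataclass
from typing import List


@dataclass
class SpecConfig:
    one_engine: bool = False

    def use_one_engine(self) -> bool:
        return self.one_engine


def collect_draft_lengths(is_draft_model: bool, max_draft_len: int,
                          spec_config: SpecConfig) -> List[int]:
    draft_lengths = [max_draft_len]  # assumed starting point
    # Restored behavior: for the target (non-draft) model, also capture a
    # CUDA graph for draft length 0 whenever spec decode can be disabled at
    # runtime, i.e. in any mode other than one-engine. The reverted commit
    # additionally required spec_config.max_concurrency to be set, which
    # skipped this graph (and saved memory) when speculation was always on.
    if (not is_draft_model and max_draft_len > 0
            and not spec_config.use_one_engine()):
        draft_lengths.append(0)
    return draft_lengths


# With draft length 3 in two-engine mode, graphs are captured for [3, 0];
# in one-engine mode spec decode cannot be turned off, so only [3].
assert collect_draft_lengths(False, 3, SpecConfig()) == [3, 0]
assert collect_draft_lengths(False, 3, SpecConfig(one_engine=True)) == [3]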

tensorrt_llm/_torch/speculative/drafter.py

Lines changed: 2 additions & 7 deletions
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Optional, final
+from typing import List, Optional
 
 from ..pyexecutor.llm_request import LlmRequest
 from ..pyexecutor.resource_manager import ResourceManager
@@ -26,13 +26,8 @@ def prepare_draft_tokens(
         """
         raise NotImplementedError
 
-    @final
     def should_use_spec_decode(self, requests: List[LlmRequest]) -> bool:
-        """
-        You probably don't want to override this. ModelEngine
-        assumes that speculation is always on if max_concurrency
-        is not specified by the user's spec config.
-        """
+        """Check if spec decode should be used for the current iteration."""
         if self.max_concurrency is not None:
             return len(requests) <= self.max_concurrency
         return True
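
The restored should_use_spec_decode gates speculation purely on batch size: with a max_concurrency cap, speculate only while the active batch fits under it; with no cap, speculation is always on. A self-contained sketch under assumed scaffolding follows; the Drafter constructor here is hypothetical, and only the method body tracks the post-revert diff.

# Self-contained sketch; constructor is assumed, method body follows the diff.
from typing import List, Optional


class Drafter:
    def __init__(self, max_concurrency: Optional[int] = None):
        self.max_concurrency = max_concurrency

    def should_use_spec_decode(self, requests: List[object]) -> bool:
        """Check if spec decode should be used for the current iteration."""
        # Speculate only while the active batch fits under the configured
        # concurrency cap; with no cap, speculation is always on.
        if self.max_concurrency is not None:
            return len(requests) <= self.max_concurrency
        return True


# With a cap of 4, a batch of 8 requests falls back to normal decoding.
drafter = Drafter(max_concurrency=4)
assert drafter.should_use_spec_decode([object()] * 2)
assert not drafter.should_use_spec_decode([object()] * 8)

Note that the revert also drops the @final decorator, so subclasses are again free to override this check.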
