2 files changed: +4 −12 lines changed
```diff
@@ -726,11 +726,8 @@ def disable_optimization(backend: Backend):
     # For non-draft model, we also capture the CUDA graph instance for draft length 0,
     # so that when we disable spec decode at runtime, we can still run the captured graph.
     # Note that for one engine mode, we are not able to turn off spec decode at runtime.
-    if (not self.is_draft_model and self.max_draft_len > 0
-            and not self.spec_config.spec_dec_mode.use_one_engine()
-            # Assume that speculation is always on if the user didn't give us a max_concurrency
-            # value. This will save on memory.
-            and self.spec_config.max_concurrency is not None):
+    if not self.is_draft_model and self.max_draft_len > 0 and not self.spec_config.spec_dec_mode.use_one_engine(
+    ):
         draft_lengths.append(0)

     for bs in cuda_graph_batch_sizes:
```
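The net effect: in two-engine mode, the target model now captures the draft-length-0 CUDA graph unconditionally, rather than only when the user supplied a `max_concurrency` value. Below is a minimal, self-contained sketch of the resulting capture-list logic; the `EngineSketch` class and the `draft_lengths_to_capture` helper are hypothetical stand-ins, and only the attribute names visible in the diff (`is_draft_model`, `max_draft_len`, `spec_config.spec_dec_mode.use_one_engine()`) come from the source.

```python
# Illustrative sketch only: EngineSketch and draft_lengths_to_capture are
# hypothetical stand-ins for the real engine; attribute names mirror the diff.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class SpecDecMode:
    one_engine: bool = False

    def use_one_engine(self) -> bool:
        return self.one_engine


@dataclass
class SpecConfig:
    spec_dec_mode: SpecDecMode = field(default_factory=SpecDecMode)
    max_concurrency: Optional[int] = None


@dataclass
class EngineSketch:
    is_draft_model: bool
    max_draft_len: int
    spec_config: SpecConfig

    def draft_lengths_to_capture(self) -> List[int]:
        # Assumed: the engine normally captures graphs for its full draft length.
        draft_lengths = [self.max_draft_len] if self.max_draft_len > 0 else []
        # After this PR, draft length 0 is captured whenever spec decode could
        # be disabled at runtime (i.e. two-engine mode), with no dependence on
        # whether the user set max_concurrency.
        if (not self.is_draft_model and self.max_draft_len > 0
                and not self.spec_config.spec_dec_mode.use_one_engine()):
            draft_lengths.append(0)
        return draft_lengths


# A target model with speculation enabled captures graphs for both lengths:
engine = EngineSketch(is_draft_model=False, max_draft_len=4,
                      spec_config=SpecConfig(max_concurrency=None))
assert engine.draft_lengths_to_capture() == [4, 0]
```

Previously, the removed `max_concurrency is not None` guard skipped the extra graph to save memory, on the assumption that speculation always stays on when no concurrency limit is given; that assumption no longer holds once `should_use_spec_decode` can be overridden (see the second file).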
```diff
@@ -1,5 +1,5 @@
 from abc import ABC, abstractmethod
-from typing import List, Optional, final
+from typing import List, Optional

 from ..pyexecutor.llm_request import LlmRequest
 from ..pyexecutor.resource_manager import ResourceManager
```
```diff
@@ -26,13 +26,8 @@ def prepare_draft_tokens(
         """
         raise NotImplementedError

-    @final
     def should_use_spec_decode(self, requests: List[LlmRequest]) -> bool:
-        """
-        You probably don't want to override this. ModelEngine
-        assumes that speculation is always on if max_concurrency
-        is not specified by the user's spec config.
-        """
+        """Check if spec decode should be used for the current iteration."""
        if self.max_concurrency is not None:
             return len(requests) <= self.max_concurrency
         return True
```
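Dropping `@final` (along with the now-unused `typing.final` import) means `Drafter` subclasses may override `should_use_spec_decode` with their own gating policy. A hedged sketch of what this enables follows; the `AdaptiveDrafter` subclass, its threshold, and the reduced base class are hypothetical, with only `should_use_spec_decode` and its default body taken from the diff.

```python
from abc import ABC, abstractmethod
from typing import List, Optional


class LlmRequest:  # stand-in for ..pyexecutor.llm_request.LlmRequest
    pass


class Drafter(ABC):
    """Reduced to the pieces relevant to this diff; signatures simplified."""

    def __init__(self, max_concurrency: Optional[int] = None):
        self.max_concurrency = max_concurrency

    @abstractmethod
    def prepare_draft_tokens(self, requests: List[LlmRequest]) -> None:
        raise NotImplementedError

    def should_use_spec_decode(self, requests: List[LlmRequest]) -> bool:
        """Check if spec decode should be used for the current iteration."""
        if self.max_concurrency is not None:
            return len(requests) <= self.max_concurrency
        return True


class AdaptiveDrafter(Drafter):
    """Hypothetical subclass: only speculate under light load."""

    def prepare_draft_tokens(self, requests: List[LlmRequest]) -> None:
        pass  # draft-token generation elided

    def should_use_spec_decode(self, requests: List[LlmRequest]) -> bool:
        # Overriding is legal only now that the base method is no longer @final.
        return len(requests) <= 2


drafter = AdaptiveDrafter()
print(drafter.should_use_spec_decode([LlmRequest()]))                   # True
print(drafter.should_use_spec_decode([LlmRequest() for _ in range(3)]))  # False
```

This also motivates the first file's change: an override can now turn speculation off at runtime even when `max_concurrency` is unset, so the target engine must have the draft-length-0 CUDA graph captured up front.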