
Commit 89e790c

Author: Varun Sundar Rabindranath
Commit message: review comments
1 parent e37b9db commit 89e790c

3 files changed: +9 -10 lines changed

vllm/core/scheduler.py

Lines changed: 3 additions & 4 deletions
@@ -913,10 +913,9 @@ def _schedule_prefills(
                 break
             elif can_allocate == AllocStatus.NEVER:
                 logger.warning(
-                    "Input prompt (%d tokens) + lookahead slots "
-                    "({num_lookahead_slots}) is too long"
-                    " and exceeds the capacity of block_manager",
-                    num_new_tokens)
+                    "Input prompt (%d tokens) + lookahead slots (%d) is "
+                    "too long and exceeds the capacity of block_manager",
+                    num_new_tokens, num_lookahead_slots)
                 for seq in waiting_seqs:
                     seq.status = SequenceStatus.FINISHED_IGNORED
                 ignored_seq_groups.append(seq_group)
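
The old format string mixed an f-string-style placeholder ({num_lookahead_slots}) into a %-style logging call, so the variable name was logged literally and the slot count was lost. A minimal standalone sketch of the before/after behaviour, with the surrounding scheduler state assumed away and num_new_tokens / num_lookahead_slots as stand-in values:

import logging

logging.basicConfig(level=logging.WARNING)
logger = logging.getLogger("sketch")

num_new_tokens, num_lookahead_slots = 4096, 8

# Before: "{num_lookahead_slots}" is not a %-style placeholder, so it is
# emitted verbatim instead of being replaced by the slot count.
logger.warning(
    "Input prompt (%d tokens) + lookahead slots "
    "({num_lookahead_slots}) is too long"
    " and exceeds the capacity of block_manager",
    num_new_tokens)

# After: both values flow through %d placeholders, following the standard
# lazy %-formatting pattern, so formatting is deferred to the logging
# framework and only happens if the record is actually emitted.
logger.warning(
    "Input prompt (%d tokens) + lookahead slots (%d) is "
    "too long and exceeds the capacity of block_manager",
    num_new_tokens, num_lookahead_slots)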

vllm/engine/llm_engine.py

Lines changed: 5 additions & 5 deletions
@@ -974,6 +974,11 @@ def update_prefill_num_computed_tokens(
         seq_group_meta: SequenceGroupMetadata, num_outputs: int,
         is_first_step_output: Optional[bool]) -> None:
     """
+    When multi-step and chunked-prefill are enabled together, the
+    prefill sequence scheduled for multi-step execution turn into
+    decodes in the first step itself. This function accounts
+    for that conversion.
+
     seq_group: SequenceGroup - A prefill seq_group
     seq_group_meta: SequenceGroupMetadata - Metadata of the given
         prefill seq_group
@@ -987,11 +992,6 @@ def update_prefill_num_computed_tokens(
         must be None, as num_outputs > 1 indicates that outputs from
         all the steps in multi-step are submitted in a single burst.
         When multi-step is disabled, this value is always True.
-
-    When multi-step and chunked-prefill are enabled together, the
-    prefill sequence scheduled for multi-step execution turn into
-    decodes in the first step itself. This function accounts
-    for that conversion.
     """
 
     assert seq_group_meta.is_prompt
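
For context, a minimal sketch of the conversion the relocated docstring describes. This is not the actual vLLM implementation, and the names num_computed_tokens, num_scheduled_tokens, and prompt_len are hypothetical stand-ins: a chunked prefill that consumes the rest of its prompt in the first multi-step iteration runs as a decode for the remaining steps.

def turns_into_decode_on_first_step(num_computed_tokens: int,
                                    num_scheduled_tokens: int,
                                    prompt_len: int) -> bool:
    # If the first multi-step iteration reaches the end of the prompt, the
    # sequence samples its first token there; each remaining step in the
    # multi-step burst then processes one token at a time, i.e. as a decode.
    return num_computed_tokens + num_scheduled_tokens >= prompt_len

# A 100-token prompt with 60 tokens already computed and a 40-token chunk
# scheduled completes its prefill on step 1 and decodes thereafter.
assert turns_into_decode_on_first_step(60, 40, 100)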

vllm/model_executor/sampling_metadata.py

Lines changed: 1 addition & 1 deletion
@@ -160,7 +160,7 @@ def prepare_multistep_tensors(self, num_queries: int, device: str,
 
         Example:
         Let 2 prompts and 2 decodes be scheduled together. Let the
-        num-tokens to process for the 2 prompts be 5 and 8 resply.
+        num-tokens to process for the 2 prompts be 5 and 8 respectively.
 
         In that case, self.sampled_token_indices will be,
         [4, 12, 13, 14] as it is constructed for the first-step in
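
The docstring's example indices can be checked with a small sketch. Again, this is not the vLLM code; num_tokens_per_seq is a hypothetical input listing the tokens to process per scheduled sequence, prompts first, followed by the 1-token decodes.

def sampled_token_indices(num_tokens_per_seq: list[int]) -> list[int]:
    # Flatten all scheduled tokens into one index space and record the
    # index of each sequence's final token, which is where sampling
    # happens on the first step.
    indices, offset = [], 0
    for num_tokens in num_tokens_per_seq:
        offset += num_tokens
        indices.append(offset - 1)
    return indices

# 2 prompts processing 5 and 8 tokens, then 2 decodes (1 token each):
assert sampled_token_indices([5, 8, 1, 1]) == [4, 12, 13, 14]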
