@@ -624,12 +624,12 @@ def _kill_proc():

    # Shared variables between tasks
    last_running_req, last_queue_req = 0, 0
-    prev_running_req_at_release = 0  # Track running requests at last semaphore release
+    running_reqs_decreased = False
    server_printed_ready_message = False
    last_semaphore_release = time.time()

    async def process_line(line):
-        nonlocal last_running_req, last_queue_req, last_semaphore_release, server_printed_ready_message
+        nonlocal last_running_req, last_queue_req, running_reqs_decreased, last_semaphore_release, server_printed_ready_message
        server_logger.info(line)

        if "Detected errors during sampling" in line:
@@ -640,12 +640,15 @@ async def process_line(line):
            server_printed_ready_message = True
            last_semaphore_release = time.time()

-        match = re.search(r"Running: (\d+)", line)
-        if match:
-            last_running_req = int(match.group(1))
+        if match := re.search(r"Running: (\d+)", line):
+            current_running = int(match.group(1))
+            # Check for negative derivative (decrease in running requests), to not overload VLLM
+            if current_running < last_running_req:
+                running_reqs_decreased = True
+                logger.info(f"Running requests decreased: {last_running_req} -> {current_running}")
+            last_running_req = current_running

-        match = re.search(r"(?:Waiting|Pending):\s*(\d+)", line)
-        if match:
+        if match := re.search(r"(?:Waiting|Pending):\s*(\d+)", line):
            last_queue_req = int(match.group(1))
            logger.info(f"vllm running req: {last_running_req} queue req: {last_queue_req}")

@@ -661,25 +664,25 @@ async def read_stream(stream):
                logger.warning(f"Got {ex} when reading log line from inference server, skipping")

    async def timeout_task():
-        nonlocal last_running_req, last_queue_req, last_semaphore_release, prev_running_req_at_release
+        nonlocal last_running_req, last_queue_req, last_semaphore_release, running_reqs_decreased
        try:
            while True:
                await asyncio.sleep(1)
-
+
                # Check if we should release the semaphore
                should_release = (
                    server_printed_ready_message
                    and last_queue_req == 0
                    and time.time() - last_semaphore_release > 30
                    and semaphore.locked()
-                    and (last_running_req == 0 or last_running_req < prev_running_req_at_release)
+                    and (last_running_req == 0 or running_reqs_decreased)
                )
-
+
                if should_release:
                    semaphore.release()
-                    prev_running_req_at_release = last_running_req
+                    running_reqs_decreased = False  # Reset flag after release
                    last_semaphore_release = time.time()
-                    logger.info(f"Semaphore released, allowing a worker to proceed. Running requests: {last_running_req} (prev: {prev_running_req_at_release})")
+                    logger.info(f"Semaphore released, allowing a worker to proceed. Running requests: {last_running_req}")
        except asyncio.CancelledError:
            pass  # Clean up if the task is cancelled

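For readers skimming the diff: the change replaces the `prev_running_req_at_release` bookkeeping with a single `running_reqs_decreased` flag. The flag is set whenever a parsed `Running:` count drops below its previous value, and the watchdog releases one semaphore slot only when the queue is empty, the cooldown has elapsed, and either nothing is running or such a drop has been observed; the flag is then cleared so the next release requires a fresh drop. The sketch below is a minimal, self-contained illustration of that pattern, not the project's actual code: the `LoadMonitor` class, the `feed_line`/`watchdog` names, and the shortened 2-second cooldown are hypothetical stand-ins for the nested closures and 30-second threshold used in the real server task.

```python
# Illustrative sketch of the release-on-decrease pattern; names and timings are hypothetical.
import asyncio
import re
import time


class LoadMonitor:
    """Tracks running/queued request counts parsed from server log lines."""

    def __init__(self, semaphore: asyncio.Semaphore, cooldown: float = 2.0):
        self.semaphore = semaphore
        self.cooldown = cooldown
        self.last_running = 0
        self.last_queued = 0
        self.running_decreased = False
        self.last_release = time.time()

    def feed_line(self, line: str) -> None:
        """Parse one log line and record any drop in the running-request count."""
        if match := re.search(r"Running: (\d+)", line):
            current = int(match.group(1))
            if current < self.last_running:
                self.running_decreased = True  # negative derivative observed
            self.last_running = current
        if match := re.search(r"(?:Waiting|Pending):\s*(\d+)", line):
            self.last_queued = int(match.group(1))

    async def watchdog(self) -> None:
        """Release one semaphore slot at a time, only when the server looks idle or draining."""
        while True:
            await asyncio.sleep(0.1)
            should_release = (
                self.last_queued == 0
                and time.time() - self.last_release > self.cooldown
                and self.semaphore.locked()
                and (self.last_running == 0 or self.running_decreased)
            )
            if should_release:
                self.semaphore.release()
                self.running_decreased = False  # require a fresh drop before the next release
                self.last_release = time.time()


async def main() -> None:
    semaphore = asyncio.Semaphore(0)  # workers block until the monitor admits one
    monitor = LoadMonitor(semaphore)
    watchdog = asyncio.create_task(monitor.watchdog())

    # Simulated log lines: the second one shows a drop from 4 to 3 running requests.
    monitor.feed_line("Running: 4 reqs, Waiting: 0 reqs")
    monitor.feed_line("Running: 3 reqs, Waiting: 0 reqs")

    await asyncio.wait_for(semaphore.acquire(), timeout=10)  # admitted after the cooldown
    print("worker admitted")
    watchdog.cancel()


if __name__ == "__main__":
    asyncio.run(main())
```

One behavioral difference worth noting: the flag approach admits a new worker as soon as any decrease has been observed since the last release, even if the running count climbs back up before the next check, whereas the old comparison required the current count to still be below its value at the previous release.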