@@ -530,285 +530,6 @@ def test_two_sequences_finish_same_time_as_new_arrive(
530
530
hf_results = hf_outputs )
531
531
532
532
533
- @pytest .mark .cb
534
- @pytest .mark .parametrize ("model" , get_spyre_model_list ())
535
- @pytest .mark .parametrize ("backend" , get_spyre_backend_list ())
536
- def test_new_sequence_joins_during_decode (model : str , backend : str ,
537
- monkeypatch : pytest .MonkeyPatch ):
538
- """ Scenario where a new sequence joins while decoding other sequences
539
-
540
- Configuration:
541
- * max_num_seqs: 4
542
- * number of prompts: 4
543
- * 1: len = 49, max tokens = 119, step joining = 0
544
- * 2: len = 14, max tokens = 52, step joining = 0
545
- * 3: len = 89, max tokens = 104, step joining = 32
546
- * 4: len = 9, max tokens = 64, step joining = 131
547
- """
548
- # TODO change to 65 max_tokens for last prompt if ever possible
549
-
550
- seqs_max_tokens = [119 , 52 , 104 , 64 ]
551
- prompts_lengths = [49 , 14 , 89 , 9 ]
552
- steps_add_reqs = [0 , 0 , 32 , 131 ]
553
- available_blocks = - 1 # no restriction
554
- max_num_seqs = 4
555
- max_model_len = 256
556
- # check_output = backend == "sendnn"
557
- check_output = True
558
-
559
- checked_steps = [
560
- {
561
- "step" : 0 ,
562
- "tkv" : 0 ,
563
- "waiting" : ["0" , "1" ],
564
- "running" : [],
565
- "request_outputs" : [],
566
- "n_reserved_blocks" : 0 ,
567
- "n_used_blocks" : 0
568
- },
569
- {
570
- # Prefill sequence 0
571
- "step" : 1 ,
572
- "tkv" : 64 ,
573
- "waiting" : ["1" ],
574
- "running" : ["0" ],
575
- "request_outputs" : ["0" ],
576
- "n_reserved_blocks" : 3 , # prefill (1 block) + 119 decode (2 block)
577
- "n_used_blocks" : 1
578
- },
579
- {
580
- # Prefill sequence 1
581
- "step" : 2 ,
582
- "tkv" : 64 ,
583
- "waiting" : [],
584
- "running" : ["1" , "0" ],
585
- "request_outputs" : ["1" ],
586
- "n_reserved_blocks" : 5 , # prefill (1 block) + 51 decodes (1 block)
587
- "n_used_blocks" : 2
588
- },
589
- {
590
- # Decode sequences 0 and 1
591
- "step" : 3 ,
592
- "tkv" : 65 ,
593
- "waiting" : [],
594
- "running" : ["1" , "0" ],
595
- "request_outputs" : ["1" , "0" ],
596
- "n_reserved_blocks" : 5 ,
597
- "n_used_blocks" : 4 # 2 blocks extended, one for each sequence
598
- },
599
- {
600
- # Sequence 2 joins: one iteration in waiting queue
601
- "step" : 32 ,
602
- "tkv" : 94 ,
603
- "waiting" : ["2" ],
604
- "running" : ["1" , "0" ],
605
- "request_outputs" : ["1" , "0" ],
606
- "n_reserved_blocks" : 5 ,
607
- "n_used_blocks" : 4
608
- },
609
- {
610
- # Prefill sequence 2
611
- "step" : 33 ,
612
- "tkv" : 94 ,
613
- "waiting" : [],
614
- "running" : ["2" , "1" , "0" ],
615
- "request_outputs" : ["2" ],
616
- "n_reserved_blocks" : 9 , # prefill (2 block) + 103 decode (2 block)
617
- "n_used_blocks" : 6
618
- },
619
- {
620
- # Decode sequences 0, 1, and 2
621
- "step" : 34 ,
622
- "tkv" : 95 ,
623
- "waiting" : [],
624
- "running" : ["2" , "1" , "0" ],
625
- "request_outputs" : ["2" , "1" , "0" ],
626
- "n_reserved_blocks" : 9 ,
627
- "n_used_blocks" : 6
628
- },
629
- {
630
- # Sequence 1 finishes at step 54
631
- # (start step + 2 prefills + 51 decodes - 1) = 2 + 2 + 51 - 1 = 54
632
- "step" : 54 ,
633
- "tkv" : 115 ,
634
- "waiting" : [],
635
- "running" : ["2" , "0" ],
636
- "request_outputs" : ["2" , "1" , "0" ],
637
- "finished_requests" : ["1" ],
638
- "n_reserved_blocks" : 9 ,
639
- "n_used_blocks" : 6
640
- },
641
- {
642
- # Decode sequences 0 and 2
643
- "step" : 55 ,
644
- "tkv" : 116 ,
645
- "waiting" : [],
646
- "running" : ["2" , "0" ],
647
- "request_outputs" : ["2" , "0" ],
648
- "n_reserved_blocks" : 7 , # two blocks released
649
- "n_used_blocks" : 4 # two blocks released
650
- },
651
- {
652
- # Decode sequences 0 and 2, tkv arrives to new block
653
- "step" : 68 ,
654
- "tkv" : 129 ,
655
- "waiting" : [],
656
- "running" : ["2" , "0" ],
657
- "request_outputs" : ["2" , "0" ],
658
- "n_reserved_blocks" : 7 ,
659
- "n_used_blocks" : 6 # 2 blocks extended, one for each sequence
660
- },
661
- {
662
- # Sequence 0 finishes at step 121
663
- # (start step + 3 prefills + 118 decode - 1) = 1 + 3 + 118 - 1 = 121
664
- "step" : 121 ,
665
- "tkv" : 182 ,
666
- "waiting" : [],
667
- "running" : ["2" ],
668
- "request_outputs" : ["2" , "0" ],
669
- "finished_requests" : ["0" ],
670
- "n_reserved_blocks" : 7 ,
671
- "n_used_blocks" : 6
672
- },
673
- {
674
- # Decode sequence 2
675
- "step" : 122 ,
676
- "tkv" : 183 ,
677
- "waiting" : [],
678
- "running" : ["2" ],
679
- "request_outputs" : ["2" ],
680
- "n_reserved_blocks" : 4 , # 3 blocks released
681
- "n_used_blocks" : 3 # 3 blocks released
682
- },
683
- {
684
- # Sequence 3 joins: one iteration in waiting queue
685
- "step" : 131 ,
686
- "tkv" : 192 ,
687
- "waiting" : ["3" ],
688
- "running" : ["2" ],
689
- "request_outputs" : ["2" ],
690
- "n_reserved_blocks" : 4 ,
691
- "n_used_blocks" : 3
692
- },
693
- {
694
- # Prefill sequence 3
695
- "step" : 132 ,
696
- "tkv" : 192 ,
697
- "waiting" : [],
698
- "running" : ["3" , "2" ],
699
- "request_outputs" : ["3" ],
700
- "n_reserved_blocks" : 8 , # prefill (3 blocks) + 63 decode (1 block)
701
- "n_used_blocks" : 6 # prefill (3 block)
702
- },
703
- {
704
- # Decode sequences 2 and 3
705
- "step" : 133 ,
706
- "tkv" : 193 ,
707
- "waiting" : [],
708
- "running" : ["3" , "2" ],
709
- "request_outputs" : ["3" , "2" ],
710
- "n_reserved_blocks" : 8 ,
711
- "n_used_blocks" : 8 # 2 blocks extended, one for each sequence
712
- },
713
- {
714
- # Sequence 2 finishes at step 137
715
- # (start step + 2 prefills + 103 decodes - 1) = 33 + 2 + 103 - 1 = 137
716
- "step" : 137 ,
717
- "tkv" : 197 ,
718
- "waiting" : [],
719
- "running" : ["3" ],
720
- "request_outputs" : ["3" , "2" ],
721
- "finished_requests" : ["2" ],
722
- "n_reserved_blocks" : 8 ,
723
- "n_used_blocks" : 8
724
- },
725
- {
726
- # Decode sequence 3
727
- "step" : 138 ,
728
- "tkv" : 70 ,
729
- "waiting" : [],
730
- "running" : ["3" ],
731
- "request_outputs" : ["3" ],
732
- # 6 blocks freed: finished sequence (4) + left padding stripping (2)
733
- "n_reserved_blocks" : 2 ,
734
- "n_used_blocks" : 2
735
- },
736
- {
737
- # Sequence 3 finishes at step 195
737
- # (start step + 1 prefill + 63 decodes - 1) = 132 + 1 + 63 - 1 = 195
739
- "step" : 195 ,
740
- "tkv" : 127 ,
741
- "waiting" : [],
742
- "running" : [],
743
- "request_outputs" : ["3" ],
744
- "finished_requests" : ["3" ],
745
- "n_reserved_blocks" : 2 ,
746
- "n_used_blocks" : 2
747
- },
748
- {
749
- # Tkv should be cleared one step later
750
- "step" : 196 ,
751
- "tkv" : 0 ,
752
- "waiting" : [],
753
- "running" : [],
754
- "request_outputs" : [],
755
- "n_reserved_blocks" : 0 ,
756
- "n_used_blocks" : 0
757
- },
758
- # TODO this is when max_tokens = 65 for last prompt
759
- # {
760
- # # Sequence 3 finishes at step 196
761
- # # (start step + 1 prefill + 64 decodes - 1) = 132 + 1 + 64 - 1 = 196
762
- # "step": 196,
763
- # "tkv": 128,
764
- # "waiting": [],
765
- # "running": [],
766
- # "request_outputs": ["3"],
767
- # "finished_requests": ["3"],
768
- # "n_reserved_blocks": 2,
769
- # "n_used_blocks": 2
770
- # },
771
- # {
772
- # # Tkv should be cleared one step later
773
- # "step": 197,
774
- # "tkv": 0,
775
- # "waiting": [],
776
- # "running": [],
777
- # "request_outputs": [],
778
- # "n_reserved_blocks": 0,
779
- # "n_used_blocks": 0
780
- # },
781
- ]
782
-
783
- cb_outputs , prompts = check_scheduler_inference_steps (
784
- model = model ,
785
- backend = backend ,
786
- monkeypatch = monkeypatch ,
787
- seqs_max_tokens = seqs_max_tokens ,
788
- prompts_lengths = prompts_lengths ,
789
- steps_add_reqs = steps_add_reqs ,
790
- checked_steps = checked_steps ,
791
- max_num_seqs = max_num_seqs ,
792
- max_model_len = max_model_len ,
793
- available_blocks = available_blocks ,
794
- use_cb = True ,
795
- collect_outputs = check_output ,
796
- )
797
-
798
- if check_output :
799
- hf_outputs = generate_hf_output (
800
- model = model ,
801
- prompts = prompts ,
802
- max_new_tokens = seqs_max_tokens ,
803
- ignore_eos = True ,
804
- )
805
- compare_results (model = model ,
806
- tensor_parallel_size = 1 ,
807
- backend = backend ,
808
- vllm_results = cb_outputs ,
809
- hf_results = hf_outputs )
810
-
811
-
812
533
@pytest .mark .cb
813
534
@pytest .mark .parametrize ("model" , get_spyre_model_list ())
814
535
@pytest .mark .parametrize ("backend" , get_spyre_backend_list ())
0 commit comments