vllm-project
diff --git a/tests/conftest.py b/tests/conftest.py
Lines changed: 11 additions & 0 deletions
diff --git a/tests/e2e/test_spyre_basic.py b/tests/e2e/test_spyre_basic.py
Lines changed: 16 additions & 39 deletions
diff --git a/tests/e2e/test_spyre_cb_scheduler_steps.py b/tests/e2e/test_spyre_cb_scheduler_steps.py
Lines changed: 52 additions & 22 deletions
@@ -13,6 +13,9 @@
     del os.environ["VLLM_USE_V1"]
 # 🌶️🌶️🌶️ end hack
 
+import hashlib
+import random
+
 import pytest
 import torch
 from spyre_util import RemoteOpenAIServer, skip_unsupported_tp_size
@@ -139,3 +142,11 @@ def remote_openai_server(request):
             yield server
     except Exception as e:
         pytest.fail(f"Failed to setup server: {e}")
+
+
+@pytest.fixture
+def set_random_seed(request):
+    """Seed Python's ``random`` module deterministically for one test.
+
+    The seed is derived from the SHA-256 digest of the requesting test's
+    ``originalname`` (the function name without the parametrization
+    suffix), so all parametrizations of a test draw the same random
+    sequence on every run.
+
+    The generator state that was active before the test is restored on
+    teardown, so the fixed seed does not leak into tests that did not
+    request this fixture.
+    """
+    previous_state = random.getstate()
+    func_hash = hashlib.sha256(request.node.originalname.encode('utf-8'))
+    # random.seed() accepts arbitrarily large ints, so the full 256-bit
+    # digest is used as the seed without truncation.
+    seed = int(func_hash.hexdigest(), 16)
+    random.seed(seed)
+    yield
+    # Teardown: undo the global side effect of seeding.
+    random.setstate(previous_state)
@@ -4,10 +4,10 @@
 """
 
 import pytest
-from spyre_util import (compare_results, create_random_request,
-                        generate_hf_output, generate_spyre_vllm_output,
-                        get_chicken_soup_prompts, get_spyre_backend_list,
-                        get_spyre_model_list, skip_unsupported_tp_size)
+from spyre_util import (check_output_against_hf, create_random_request,
+                        generate_spyre_vllm_output, get_chicken_soup_prompts,
+                        get_spyre_backend_list, get_spyre_model_list,
+                        skip_unsupported_tp_size)
 from vllm import EngineArgs, SamplingParams
 from vllm.v1.engine.core import EngineCore
 from vllm.v1.executor.abstract import Executor
@@ -85,17 +85,8 @@ def test_output(
         backend=backend,
         monkeypatch=monkeypatch,
         **kwargs)
-
-    hf_results = generate_hf_output(model=model,
-                                    prompts=prompts,
-                                    max_new_tokens=max_new_tokens)
-
-    compare_results(model=model,
-                    prompts=prompts,
-                    tensor_parallel_size=tp_size,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    check_output_against_hf(model, backend, max_new_tokens, vllm_results,
+                            prompts)
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -137,16 +128,8 @@ def test_output_sendnn_decoder(
         backend=backend,
         monkeypatch=monkeypatch)
 
-    hf_results = generate_hf_output(model=model,
-                                    prompts=prompts,
-                                    max_new_tokens=max_new_tokens)
-
-    compare_results(model=model,
-                    prompts=prompts,
-                    tensor_parallel_size=1,
-                    backend=backend,
-                    vllm_results=vllm_results,
-                    hf_results=hf_results)
+    check_output_against_hf(model, backend, max_new_tokens, vllm_results,
+                            prompts)
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -194,18 +177,9 @@ def test_batch_handling(model: str, backend: str, cb: int,
         backend=backend,
         monkeypatch=monkeypatch,
         **kwargs)
-    hf_results = generate_hf_output(model=model,
-                                    prompts=prompts,
-                                    max_new_tokens=max_new_tokens)
 
-    compare_results(
-        model=model,
-        prompts=prompts,
-        tensor_parallel_size=1,
-        backend=backend,
-        vllm_results=vllm_results,
-        hf_results=hf_results,
-    )
+    check_output_against_hf(model, backend, max_new_tokens, vllm_results,
+                            prompts)
 
 
 @pytest.mark.parametrize("model", get_spyre_model_list())
@@ -251,9 +225,12 @@ def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
                                           logprobs=0)
     for i in range(batch_size):
         engine_core.add_request(
-            create_random_request(request_id=i,
-                                  num_tokens=max_batched_tokens,
-                                  sampling_params=vllm_sampling_params))
+            create_random_request(
+                request_id=i,
+                num_tokens=max_batched_tokens,
+                sampling_params=vllm_sampling_params,
+                model=model,
+            ))
     schedule = scheduler.schedule()
 
     assert len(schedule.scheduled_new_reqs) == batch_size
@@ -8,14 +8,16 @@
 
 import pytest
 from scheduling_utils import check_scheduler_inference_steps
-from spyre_util import get_spyre_backend_list, get_spyre_model_list
+from spyre_util import (check_output_against_hf, get_spyre_backend_list,
+                        get_spyre_model_list)
 
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
-                                             monkeypatch: pytest.MonkeyPatch):
+                                             monkeypatch: pytest.MonkeyPatch,
+                                             set_random_seed: None):
     """ Scenario where it happens that all the sequences get scheduled in a 
     fashion where they are aligned with the block boundaries (i.e. tkv multiple 
     of 64 at the time of prefilling).
@@ -162,7 +164,7 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -176,12 +178,16 @@ def test_prompts_aligned_with_tkv_boundaries(model: str, backend: str,
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_prompts_misaligned_with_tkv_boundaries(
-        model: str, backend: str, monkeypatch: pytest.MonkeyPatch):
+        model: str, backend: str, monkeypatch: pytest.MonkeyPatch,
+        set_random_seed: None):
     """ Scenario where it happens that some sequence gets scheduled in a way 
     that it is misaligned with the block boundary (i.e. tkv is not a multiple 
     of 64 at the time of prefilling).
@@ -193,7 +199,6 @@ def test_prompts_misaligned_with_tkv_boundaries(
             * 2: len = 41, max tokens = 67, step joining = 0
             * 3: len = 47, max tokens = 9, step joining = 0
     """
-
     seqs_max_tokens = [57, 67, 9]
     prompts_lengths = [49, 41, 47]
     steps_add_reqs = [0, 0, 0]  # add all requests in the beginning
@@ -326,7 +331,7 @@ def test_prompts_misaligned_with_tkv_boundaries(
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -340,12 +345,16 @@ def test_prompts_misaligned_with_tkv_boundaries(
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_two_sequences_finish_same_time_as_new_arrive(
-        model: str, backend: str, monkeypatch: pytest.MonkeyPatch):
+        model: str, backend: str, monkeypatch: pytest.MonkeyPatch,
+        set_random_seed):
     """ 2-cases-in-1: (1) Two sequences finish at the same time and (2) a new
     request arrives when another finishes.
 
@@ -356,7 +365,6 @@ def test_two_sequences_finish_same_time_as_new_arrive(
             * 2: len = 30, max tokens = 30, step joining = 0
             * 3: len = 20, max tokens = 10, step joining = 31
     """
-
     seqs_max_tokens = [30, 30, 10]
     prompts_lengths = [49, 30, 20]
     steps_add_reqs = [0, 0, 31]
@@ -466,7 +474,7 @@ def test_two_sequences_finish_same_time_as_new_arrive(
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -480,12 +488,16 @@ def test_two_sequences_finish_same_time_as_new_arrive(
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_new_sequence_joins_during_decode(model: str, backend: str,
-                                          monkeypatch: pytest.MonkeyPatch):
+                                          monkeypatch: pytest.MonkeyPatch,
+                                          set_random_seed):
     """ Scenario where a new sequence joins while decoding other sequences
 
     Configuration:
@@ -731,7 +743,7 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
         # },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -745,12 +757,16 @@ def test_new_sequence_joins_during_decode(model: str, backend: str,
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_prompt_too_long_for_current_tkv(model: str, backend: str,
-                                         monkeypatch: pytest.MonkeyPatch):
+                                         monkeypatch: pytest.MonkeyPatch,
+                                         set_random_seed):
     """ Scenario where the requested prompt is too long for current tkv value
 
     Configuration:
@@ -880,7 +896,7 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -894,13 +910,18 @@ def test_prompt_too_long_for_current_tkv(model: str, backend: str,
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_requested_tokens_not_fitting_remaining_space(
-        model: str, backend: str, monkeypatch: pytest.MonkeyPatch):
-    """ Scenario where the request goes beyond max_model_len 
+        model: str, backend: str, monkeypatch: pytest.MonkeyPatch,
+        set_random_seed):
+    """ Scenario where the request goes beyond max_model_len and needs to wait
+    for a new batch.
 
     Configuration:
         * max_num_seqs: 2
@@ -909,7 +930,6 @@ def test_requested_tokens_not_fitting_remaining_space(
             * 2: len = 49, max tokens = 57, step joining = 0
             * 3: len = 41, max tokens = 80, step joining = 0
     """
-
     seqs_max_tokens = [67, 57, 80]
     prompts_lengths = [70, 49, 41]
     steps_add_reqs = [0, 0, 0]
@@ -1067,7 +1087,7 @@ def test_requested_tokens_not_fitting_remaining_space(
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -1081,12 +1101,16 @@ def test_requested_tokens_not_fitting_remaining_space(
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_requests_use_all_available_blocks(model: str, backend: str,
-                                           monkeypatch: pytest.MonkeyPatch):
+                                           monkeypatch: pytest.MonkeyPatch,
+                                           set_random_seed):
     """ Scenario where the requests use all of the available blocks 
     
     Configuration:
@@ -1098,7 +1122,6 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
             * 4: len = 10, max tokens = 3, step joining = 0
         * available_blocks: 8
     """
-
     seqs_max_tokens = [3, 3, 3, 3]  # 2 decodes into a new block per sequence
     prompts_lengths = [10, 10, 10, 10]  # 1 block for prefill per sequence
     steps_add_reqs = [0, 0, 0, 0]
@@ -1201,7 +1224,7 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -1215,12 +1238,16 @@ def test_requests_use_all_available_blocks(model: str, backend: str,
         use_cb=True,
     )
 
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)
+
 
 @pytest.mark.cb
 @pytest.mark.parametrize("model", get_spyre_model_list())
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 def test_requests_use_more_than_available_blocks(
-        model: str, backend: str, monkeypatch: pytest.MonkeyPatch):
+        model: str, backend: str, monkeypatch: pytest.MonkeyPatch,
+        set_random_seed):
     """ Scenario where some requests need to wait because of the number of 
     available blocks. 
     
@@ -1361,7 +1388,7 @@ def test_requests_use_more_than_available_blocks(
         },
     ]
 
-    check_scheduler_inference_steps(
+    cb_outputs, prompts = check_scheduler_inference_steps(
         model=model,
         backend=backend,
         monkeypatch=monkeypatch,
@@ -1374,3 +1401,6 @@ def test_requests_use_more_than_available_blocks(
         available_blocks=available_blocks,
         use_cb=True,
     )
+
+    check_output_against_hf(model, backend, seqs_max_tokens, cb_outputs,
+                            prompts)