
Commit 8e7d565

[v1] remove v0 code (#344)
### [v1] remove v0 code

Now that we have v1 support for embedding models (#277), we can finally delete the v0 code. Note: for decoder models, v0 support was deprecated some time ago.

---------

Signed-off-by: Yannick Schnider <[email protected]>
1 parent f0b3ead commit 8e7d565

Showing 21 changed files with 21 additions and 2,843 deletions.
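
Before the per-file diffs, a quick illustration of what the cleanup means in practice. This is a hedged sketch rather than code from the commit: the model name is a placeholder, and `eager` stands in for `sendnn` when no Spyre accelerator is attached.

```python
# Hedged sketch (not part of the commit): offline inference after this change.
# Only the Spyre-specific variables remain; VLLM_USE_V1 is no longer set
# because the v1 engine is now the only engine.
import os

# Set the Spyre env vars before vLLM spins up the engine, mirroring the examples.
os.environ.setdefault("VLLM_SPYRE_DYNAMO_BACKEND", "eager")  # "sendnn" on Spyre hardware
os.environ["VLLM_SPYRE_USE_CB"] = "1"  # continuous batching

from vllm import LLM, SamplingParams

# Placeholder model; the real example scripts take the model from CLI args.
llm = LLM(model="ibm-granite/granite-3.3-8b-instruct")
params = SamplingParams(max_tokens=20, temperature=0.0)
for out in llm.generate(["Tell me a story about chicken soup."], params):
    print(out.outputs[0].text)
```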

docs/user_guide/configuration.md

Lines changed: 2 additions & 2 deletions
@@ -13,8 +13,8 @@ To run inference on IBM Spyre Accelerators, the backend should be set as:
 | --- | --- | --- | --- |
 | Decoder | v0 | sendnn | V0 support for decoder models is deprecated |
 | Decoder | v1 | sendnn | |
-| Embedding | v0 | sendnn | |
-| Embedding | v1 | N/A | Embedding models are not yet supported on V1 |
+| Embedding | v0 | sendnn | V0 support for embedding models is deprecated |
+| Embedding | v1 | sendnn | |
 
 ## Batching Modes
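
To illustrate the updated rows, here is a hedged sketch of running an embedding model on the v1 engine. It assumes vLLM's `task="embed"` pooling entry point; the model name is a placeholder, and `eager` again stands in for `sendnn` off-hardware.

```python
# Hedged sketch: embedding (pooling) on the v1 engine with the Spyre plugin.
# Backend values come from the table above. The model name is a placeholder.
import os

os.environ.setdefault("VLLM_SPYRE_DYNAMO_BACKEND", "eager")  # "sendnn" on Spyre hardware

from vllm import LLM

llm = LLM(model="sentence-transformers/all-roberta-large-v1", task="embed")
outputs = llm.embed(["What is the capital of France?"])
print(len(outputs[0].outputs.embedding))  # embedding dimensionality
```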

docs/user_guide/supported_features.md

Lines changed: 1 addition & 1 deletion
@@ -10,7 +10,7 @@ This table summarize the status of features on Spyre. By default, those features
 | Prompt Adapter || Being deprecated in vLLM [vllm#13981](https://github.com/vllm-project/vllm/issues/13981) |
 | Speculative Decoding | 🗓️ | |
 | Guided Decoding | 🗓️ | |
-| Pooling | ⚠️ | Works with V0. V1 still being developed in vLLM [vllm#18052](https://github.com/vllm-project/vllm/issues/18052) |
+| Pooling | | |
 | Enc-dec || No plans for now |
 | Multi Modality | 🗓️ | |
 | LogProbs || |

examples/offline_inference/cb_spyre_inference.py

Lines changed: 0 additions & 1 deletion
@@ -42,7 +42,6 @@
 if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
     os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
 os.environ['VLLM_SPYRE_USE_CB'] = '1'
-os.environ['VLLM_USE_V1'] = '1'
 
 template = (
     "Below is an instruction that describes a task. Write a response that "

examples/offline_inference/long_context.py

Lines changed: 0 additions & 1 deletion
@@ -68,7 +68,6 @@
 if "VLLM_SPYRE_DYNAMO_BACKEND" not in os.environ:
     os.environ['VLLM_SPYRE_DYNAMO_BACKEND'] = 'eager'
 os.environ['VLLM_SPYRE_USE_CB'] = '1'
-os.environ['VLLM_USE_V1'] = '1'
 
 template = ("Summarize the following code: \n\n{}")

tests/conftest.py

Lines changed: 2 additions & 19 deletions
@@ -1,19 +1,5 @@
-# 🌶️🌶️🌶️ Hack to allow testing of both engines
-import os
-
-# If `VLLM_USE_V1=1` is set upon first vLLM import, then there is a side effect
-# that will cause the V1 engine to always be selected. This is intentionally
-# done for backwards-compatibility of code that was using the AsyncLLMEngine
-# constructor directly, instead of using the `.from_engine_args` construction
-# methods that will select the appropriate v0 or v1 engine. See:
-# https://github.com/vllm-project/vllm/blob/v0.8.4/vllm/engine/llm_engine.py#L2169-L2171
-# Deleting VLLM_USE_V1 here before importing vLLM allows us to continue testing
-# both engines.
-if "VLLM_USE_V1" in os.environ:
-    del os.environ["VLLM_USE_V1"]
-# 🌶️🌶️🌶️ end hack
-
 import hashlib
+import os
 import random
 
 import pytest
@@ -98,8 +84,7 @@ def remote_openai_server(request):
     max_num_seqs = params["max_num_seqs"]
     env_dict = {
         "VLLM_SPYRE_USE_CB": "1",
-        "VLLM_SPYRE_DYNAMO_BACKEND": backend,
-        "VLLM_USE_V1": "1"
+        "VLLM_SPYRE_DYNAMO_BACKEND": backend
     }
     server_args = [
         "--max_num_seqs",
@@ -121,8 +106,6 @@ def remote_openai_server(request):
             ','.join(map(str, warmup_batch_size)),
             "VLLM_SPYRE_DYNAMO_BACKEND":
             backend,
-            "VLLM_USE_V1":
-            "1"
         }
 
     # Default to None if not present
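
For the server-based tests above, a hedged sketch of what the simplified environment amounts to when launching the OpenAI-compatible server by hand. The model name and port are placeholders; the flag spelling follows the `server_args` in the diff, and `eager` stands in for `sendnn` off-hardware.

```python
# Hedged sketch: start `vllm serve` with only the Spyre variables kept by the
# simplified env_dict above; VLLM_USE_V1 is no longer needed.
import os
import subprocess

env = dict(
    os.environ,
    VLLM_SPYRE_USE_CB="1",              # continuous batching
    VLLM_SPYRE_DYNAMO_BACKEND="eager",  # "sendnn" on Spyre hardware
)

# Placeholder model and port.
subprocess.run(
    ["vllm", "serve", "ibm-granite/granite-3.3-8b-instruct",
     "--max_num_seqs", "4", "--port", "8000"],
    env=env,
    check=True,
)
```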

tests/e2e/test_spyre_async_llm.py

Lines changed: 0 additions & 1 deletion
@@ -65,7 +65,6 @@ async def test_abort(
 ):
     """Test handling of cancelled requests"""
     with monkeypatch.context() as m, ExitStack() as after:
-        m.setenv("VLLM_USE_V1", "1")
         m.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
         if cb == 1:
             m.setenv("VLLM_SPYRE_USE_CB", "1")

tests/e2e/test_spyre_basic.py

Lines changed: 0 additions & 1 deletion
@@ -205,7 +205,6 @@ def test_full_batch_scheduling(model: str, backend: str, monkeypatch):
                        f"{max_batched_tokens}")
     monkeypatch.setenv("VLLM_SPYRE_WARMUP_NEW_TOKENS", "20")
 
-    monkeypatch.setenv("VLLM_USE_V1", "1")
     monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
 
     # Setup the engine

tests/e2e/test_spyre_embeddings.py

Lines changed: 1 addition & 8 deletions
@@ -18,12 +18,10 @@
     [(64, 4), (64, 8), (128, 4),
      (128, 8)])  # (prompt_length/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
 def test_output(
     model: str,
     warmup_shape: tuple[int, int],
     backend: str,
-    vllm_version: str,
     monkeypatch,
 ) -> None:
     '''
@@ -34,7 +32,6 @@ def test_output(
     '''
 
     monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
-    monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
     patch_warmup_shapes([warmup_shape], monkeypatch)
 
     prompts = get_chicken_soup_prompts(1)
@@ -44,8 +41,7 @@ def test_output(
         max_model_len=256,
         block_size=256,
         tensor_parallel_size=1,
-        backend=backend,
-        vllm_version=vllm_version)
+        backend=backend)
 
     hf_results = st_embeddings(model=model, prompts=prompts)
 
@@ -65,12 +61,10 @@
     ])  # (prompt_length/batch_size)
 @pytest.mark.parametrize("backend", get_spyre_backend_list())
 @pytest.mark.parametrize("model", get_spyre_model_list(isEmbeddings=True))
-@pytest.mark.parametrize("vllm_version", ["V0", "V1"])
 def test_scheduling_invariance(
     model,
     backend,
     warmup_shape: tuple[int, int],
-    vllm_version,
     monkeypatch,
 ) -> None:
     '''
@@ -83,7 +77,6 @@ def test_scheduling_invariance(
     '''
 
     monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
-    monkeypatch.setenv("VLLM_USE_V1", "1" if vllm_version == "V1" else "0")
     patch_warmup_shapes([warmup_shape], monkeypatch)
 
     prompts = get_chicken_soup_prompts(4)

tests/e2e/test_spyre_prompt_logprobs.py

Lines changed: 0 additions & 1 deletion
@@ -40,7 +40,6 @@ def test_prompt_logprobs(
 
     prompts = get_chicken_soup_prompts(4)
 
-    monkeypatch.setenv("VLLM_USE_V1", "1")
     monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
     monkeypatch.setenv("VLLM_SPYRE_ENABLE_PROMPT_LOGPROBS", "1")
     llm = LLM(model, tensor_parallel_size=tp_size, tokenizer=model)

tests/e2e/test_spyre_static_batching_limits.py

Lines changed: 0 additions & 1 deletion
@@ -30,7 +30,6 @@ def test_max_prompt_len_and_new_tokens(model: str,
     '''
     monkeypatch.setenv("VLLM_SPYRE_DYNAMO_BACKEND", backend)
     patch_warmup_shapes(warmup_shapes, monkeypatch)
-    monkeypatch.setenv("VLLM_USE_V1", "1")
 
     max_prompt_length = max([t[0] for t in warmup_shapes])
     max_new_tokens = max([t[1] for t in warmup_shapes])
