48 changes: 45 additions & 3 deletions tests/integration/defs/stress_test/stress_test.py
@@ -56,7 +56,7 @@
 # [sys.executable, "-m", "pip", "install", "-r", requirements_file])

 # Define a constant for process termination timeouts
-GRACEFUL_TERMINATION_TIMEOUT = 10  # seconds - set longer when stress large model
+GRACEFUL_TERMINATION_TIMEOUT = 300  # seconds - set longer when stressing large models


 @dataclass(frozen=True)
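
Note: the raise from 10 s to 300 s gives large models time to flush work and release GPU memory before the harness escalates. A minimal sketch of how such a grace period is typically enforced around a server subprocess (illustrative only; `shutdown_server` and the `Popen` handle are assumptions, not code from this diff):

```python
# Illustrative sketch only, not from this PR. Assumes `proc` is a
# subprocess.Popen handle for the server under test.
import subprocess

GRACEFUL_TERMINATION_TIMEOUT = 300  # seconds; large models need longer to shut down


def shutdown_server(proc: subprocess.Popen) -> None:
    """Ask the server to exit, then force-kill if it exceeds the grace period."""
    proc.terminate()  # send SIGTERM so the server can clean up
    try:
        proc.wait(timeout=GRACEFUL_TERMINATION_TIMEOUT)
    except subprocess.TimeoutExpired:
        proc.kill()  # escalate to SIGKILL after the grace period
        proc.wait()
```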
@@ -384,7 +384,34 @@ def stress_test(config, test_mode, server_config=None):
     )

     # Define test configurations
-    performance_config = PerformanceParams() if run_performance else None
+    performance_config = None
+    if run_performance:
+        performance_config = PerformanceParams()
+
+        # DeepSeek-V3 specific performance parameters
+        if "DeepSeek-V3" in config.model_dir:
+            performance_config = PerformanceParams(
+                test_timeout=36000  # 10 hours for DeepSeek-V3, change this value if needed
+            )
+
+    # DeepSeek-V3 specific server parameters
+    if "DeepSeek-V3" in config.model_dir:
+        test_server_config = ServerConfig(
+            port=test_server_config.port,
+            host=test_server_config.host,
+            pp_size=test_server_config.pp_size,
+            ep_size=8,  # DeepSeek-V3 specific ep_size
+            max_batch_size=161,  # DeepSeek-V3 specific max_batch_size
+            max_num_tokens=1160,  # DeepSeek-V3 specific max_num_tokens
+            kv_cache_free_gpu_memory_fraction=0.7,  # DeepSeek-V3 specific KV-cache fraction
+            capacity_scheduler_policy=test_server_config.capacity_scheduler_policy,
+            wait_interval=test_server_config.wait_interval,
+            max_wait_seconds=7200,  # DeepSeek-V3 specific wait time (2 hours)
+            health_check_timeout=test_server_config.health_check_timeout)
+
     stress_config = StressTestConfig(
         model_config=config,
         server_config=test_server_config) if run_stress else None
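
Note: since the config classes here are frozen dataclasses (see the `@dataclass(frozen=True)` above), the field-by-field rebuild of `test_server_config` could also be written with `dataclasses.replace`, which carries unchanged fields over automatically and keeps the override list down to exactly the DeepSeek-V3 specific values. A self-contained sketch under that assumption (the reduced `ServerConfig` shape and its defaults are placeholders; only the override values come from the diff):

```python
# Sketch only: assumes ServerConfig is a frozen dataclass. The fields and
# defaults below are illustrative placeholders, not the real class.
from dataclasses import dataclass, replace


@dataclass(frozen=True)
class ServerConfig:
    port: int = 8000
    host: str = "localhost"
    ep_size: int = 1
    max_batch_size: int = 2048
    max_num_tokens: int = 8192
    kv_cache_free_gpu_memory_fraction: float = 0.9
    max_wait_seconds: int = 300


base = ServerConfig()
# Only the DeepSeek-V3 overrides need to be spelled out; everything else
# (port, host, ...) is copied from the base config unchanged.
ds_v3 = replace(
    base,
    ep_size=8,
    max_batch_size=161,
    max_num_tokens=1160,
    kv_cache_free_gpu_memory_fraction=0.7,
    max_wait_seconds=7200,
)
```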
@@ -405,14 +432,29 @@ def stress_test(config, test_mode, server_config=None):
     if not os.path.exists(model_path):
         raise RuntimeError(f"Model path does not exist: {model_path}")

-    # Create a temporary YAML file for 'capacity_scheduler_policy'
+    # Create a temporary YAML file for extra_llm_options
     extra_llm_options = {
         "scheduler_config": {
             "capacity_scheduler_policy":
             test_server_config.capacity_scheduler_policy
         }
     }

+    # Add DeepSeek-V3 specific configuration
+    if "DeepSeek-V3" in config.model_dir:
+        extra_llm_options["enable_attention_dp"] = True
+
+        if config.backend == "pytorch":
+            extra_llm_options["pytorch_backend_config"] = {
+                "use_cuda_graph": True,
+                "cuda_graph_padding_enabled": True,
+                "cuda_graph_batch_sizes":
+                [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
+                "print_iter_log": True,
+                "enable_overlap_scheduler": True
+            }
+
     with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml',
                                      delete=False) as temp_file:
         yaml.dump(extra_llm_options, temp_file)
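
Note: a self-contained sketch of the options-file round trip for the DeepSeek-V3 + pytorch path. The dict contents are copied from the diff; the `capacity_scheduler_policy` value is an example stand-in for the runtime value. Since `NamedTemporaryFile` is opened with `delete=False`, cleanup is the caller's responsibility:

```python
# Sketch reproducing the options-file round trip from the diff above.
# Requires only PyYAML (already used by the test); policy value is an example.
import os
import tempfile

import yaml

extra_llm_options = {
    "scheduler_config": {
        "capacity_scheduler_policy": "GUARANTEED_NO_EVICT",  # example value
    },
    "enable_attention_dp": True,
    "pytorch_backend_config": {
        "use_cuda_graph": True,
        "cuda_graph_padding_enabled": True,
        "cuda_graph_batch_sizes": [1, 2, 4, 8, 16, 32, 64, 128, 256, 384],
        "print_iter_log": True,
        "enable_overlap_scheduler": True,
    },
}

with tempfile.NamedTemporaryFile(mode="w", suffix=".yaml",
                                 delete=False) as temp_file:
    yaml.dump(extra_llm_options, temp_file)
    path = temp_file.name

# Round-trip check: the loaded dict matches what was dumped.
with open(path) as f:
    assert yaml.safe_load(f) == extra_llm_options

os.unlink(path)  # delete=False means cleanup is manual
```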
4 changes: 2 additions & 2 deletions tests/integration/test_lists/test-db/l0_a10.yml
@@ -19,7 +19,7 @@ l0_a10:
   - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_overlap[TinyLlama-1.1B-Chat-v1.0]
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-pytorch-stress-test]
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-stage-alone]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-pytorch-stress-test]
 - condition:
     ranges:
       system_gpu_count:
@@ -111,7 +111,7 @@
   - examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
   - examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] # 3 mins
   - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-MAX_UTILIZATION-trt-stress-test]
-  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-stage-alone]
+  - stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-GUARANTEED_NO_EVICT-trt-stress-test]
 - condition:
     ranges:
       system_gpu_count: