Merged · Changes from 2 commits
tests/integration/defs/accuracy/test_llm_api_pytorch.py (117 additions, 43 deletions)

@@ -329,126 +329,200 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
task.evaluate(llm)

@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
-   def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
+   def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+   if fp8kv:
+       quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+       pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+   if fp8kv:
+       assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
-       task = CnnDailymail(self.MODEL_NAME)
-       task.evaluate(llm)
-       task = MMLU(self.MODEL_NAME)
-       task.evaluate(llm)
-       if attention_dp and cuda_graph and overlap_scheduler:
+       # No need to run these tests for fp8kv
+       if not fp8kv:
+           task = CnnDailymail(self.MODEL_NAME)
+           task.evaluate(llm)
+           task = MMLU(self.MODEL_NAME)
+           task.evaluate(llm)
+       # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+       if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1), (1, 4, 1)],
ids=["tp4", "ep4", "tp2pp2", "pp4"])
-   def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
-                                   attention_dp, cuda_graph,
-                                   overlap_scheduler):
+   def test_fp8_block_scales_4gpus(self, mtp_nextn, fp8kv, attention_dp,
+                                   cuda_graph, overlap_scheduler, tp_size,
+                                   pp_size, ep_size):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
+   if fp8kv:
+       quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+       pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+   if fp8kv:
+       assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
-       task = CnnDailymail(self.MODEL_NAME)
-       task.evaluate(llm)
-       task = MMLU(self.MODEL_NAME)
-       task.evaluate(llm)
-       if attention_dp and cuda_graph and overlap_scheduler:
+       # No need to run these tests for fp8kv
+       if not fp8kv:
+           task = CnnDailymail(self.MODEL_NAME)
+           task.evaluate(llm)
+           task = MMLU(self.MODEL_NAME)
+           task.evaluate(llm)
+       # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+       if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
+   if fp8kv:
+       quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+       pytorch_config.kv_cache_dtype = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp)

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+   if fp8kv:
+       assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
-       task = CnnDailymail(self.MODEL_NAME)
-       task.evaluate(llm)
-       task = MMLU(self.MODEL_NAME)
-       task.evaluate(llm)
-       if attention_dp and cuda_graph and overlap_scheduler:
+       # No need to run these tests for fp8kv
+       if not fp8kv:
+           task = CnnDailymail(self.MODEL_NAME)
+           task.evaluate(llm)
+           task = MMLU(self.MODEL_NAME)
+           task.evaluate(llm)
+       # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+       if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1), (1, 4, 1)],
ids=["tp4", "ep4", "tp2pp2", "pp4"])
-   def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
-                        cuda_graph, overlap_scheduler):
+   def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
+                        overlap_scheduler, tp_size, pp_size, ep_size):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
+   if fp8kv:
+       quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+       pytorch_config.kv_cache_dtype = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp)

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+   if fp8kv:
+       assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
-       task = CnnDailymail(self.MODEL_NAME)
-       task.evaluate(llm)
-       task = MMLU(self.MODEL_NAME)
-       task.evaluate(llm)
-       if attention_dp and cuda_graph and overlap_scheduler:
+       # No need to run these tests for fp8kv
+       if not fp8kv:
+           task = CnnDailymail(self.MODEL_NAME)
+           task.evaluate(llm)
+           task = MMLU(self.MODEL_NAME)
+           task.evaluate(llm)
+       # Run GSM8K for fp8kv, or if all the other optimizations are enabled
+       if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
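
All four tests introduce the same fp8kv pattern: keep the checkpoint's weight quantization, additionally quantize the KV cache to FP8, and keep the PyTorch backend's KV-cache dtype in sync. The sketch below distills that pattern outside pytest; it is a minimal reconstruction, not code from this PR, and the import paths and checkpoint path are assumptions about the TensorRT-LLM LLM API at this revision.

# Minimal sketch of the fp8kv pattern above. Import paths and the model
# path are assumptions; only the calls visible in the diff are taken as given.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig

pytorch_config = PyTorchConfig(enable_overlap_scheduler=True,
                               use_cuda_graph=True)

# Weights stay FP8 block-scaled; fp8kv additionally quantizes the KV cache.
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8_BLOCK_SCALES,
                           kv_cache_quant_algo=QuantAlgo.FP8)
pytorch_config.kv_cache_dtype = "fp8"  # keep the backend dtype in sync

llm = LLM("/path/to/DeepSeek-V3-Lite/fp8",  # placeholder checkpoint path
          # the tests lower this from the 0.9 default to avoid OOM on H100
          kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
          pytorch_backend_config=pytorch_config,
          quant_config=quant_config)
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

Everything else in the diff is plumbing: threading fp8kv through the parametrization and gating which accuracy tasks run (CnnDailymail and MMLU are skipped for fp8kv variants, while GSM8K runs for fp8kv or for the fully optimized combination).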
