13 changes: 13 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -19,11 +19,24 @@ deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 64.74
  - quant_algo: NVFP4
    accuracy: 63.71
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.71
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 64.74
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 64.74
  - spec_dec_algo: MTP
    accuracy: 64.44
  - spec_dec_algo: MTP
    kv_cache_quant_algo: FP8
    accuracy: 64.44
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 64.14
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 64.14
deepseek-ai/DeepSeek-R1:
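Each new row in gsm8k.yaml pairs an existing quant_algo/spec_dec_algo combination with a `kv_cache_quant_algo: FP8` variant, so a run that enables the FP8 KV cache has its own reference score. A hypothetical lookup over these entries, purely to illustrate how the keys discriminate the variants (the helper name and matching rule are assumptions, not the accuracy harness's actual code):

```python
import yaml

def find_reference(entries, **wanted):
    """Illustrative helper: return the accuracy of the entry whose keys match exactly."""
    wanted = {k: v for k, v in wanted.items() if v is not None}
    for entry in entries:
        if {k: v for k, v in entry.items() if k != "accuracy"} == wanted:
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {wanted}")

with open("tests/integration/defs/accuracy/references/gsm8k.yaml") as f:
    refs = yaml.safe_load(f)

entries = refs["deepseek-ai/DeepSeek-V3-Lite"]
assert find_reference(entries, quant_algo="FP8_BLOCK_SCALES",
                      kv_cache_quant_algo="FP8") == 64.74
assert find_reference(entries, spec_dec_algo="MTP",
                      kv_cache_quant_algo="FP8") == 64.44
```

As added here, each FP8-KV row carries the same reference accuracy as its non-FP8-KV parent configuration.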
190 changes: 144 additions & 46 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -371,126 +371,200 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
task.evaluate(llm)

@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(False, True, True, True), (True, True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1), (1, 4, 1)],
ids=["tp4", "ep4", "tp2pp2", "pp4"])
def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph,
fp8kv, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp)

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1), (1, 4, 1)],
ids=["tp4", "ep4", "tp2pp2", "pp4"])
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
cuda_graph, overlap_scheduler):
def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
overlap_scheduler, tp_size, pp_size, ep_size):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp)

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -504,16 +578,24 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
@parametrize_with_ids("overlap_scheduler", [False, True])
@parametrize_with_ids("cuda_graph", [False, True])
@parametrize_with_ids("attention_dp", [False, True])
@parametrize_with_ids("fp8kv", [False, True])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
attention_dp, cuda_graph, overlap_scheduler):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
@@ -524,9 +606,13 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@@ -539,17 +625,24 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
@pytest.mark.skip_less_device(8)
@skip_pre_hopper
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,mtp_nextn,attention_dp,cuda_graph,overlap_scheduler,batch_size",
[(8, 1, 4, 3, False, True, True, 1),
(8, 1, 8, 0, True, True, True, 24)],
"tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,batch_size",
[(8, 1, 4, 3, False, False, True, True, 1),
(8, 1, 8, 0, True, True, True, True, 24)],
ids=["latency", "throughput"])
def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
attention_dp, cuda_graph, overlap_scheduler,
batch_size):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
@@ -561,8 +654,13 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
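All of the DeepSeek test methods above gain the same plumbing: when `fp8kv` is parametrized on, the FP8 KV-cache algorithm is recorded on the `QuantConfig`, the PyTorch backend is told to keep the KV cache in fp8, and the evaluation sweep is trimmed so FP8-KV variants run GSM8K only. A condensed sketch of that pattern follows; the import paths and the free-standing helper names are assumptions for readability, not the test file's literal structure:

```python
# NOTE: the import paths below are assumptions; the real test module already
# has all of these names in scope through its own imports.
from tensorrt_llm.llmapi import LLM, KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.quantization import QuantAlgo
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from .accuracy_core import CnnDailymail, GSM8K, MMLU  # assumed location of the task classes


def make_llm(model_dir, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
             mtp_nextn=None):
    """Build the LLM the same way the new fp8kv-aware tests do."""
    pytorch_config = PyTorchConfig(
        enable_overlap_scheduler=overlap_scheduler,
        use_cuda_graph=cuda_graph)

    quant_config = QuantConfig()
    quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
    if fp8kv:
        # The two halves of the fp8kv switch: quantization metadata checked
        # below, plus the backend's KV-cache dtype for the runtime.
        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
        pytorch_config.kv_cache_dtype = "fp8"

    mtp_config = (MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
                  if mtp_nextn else None)

    llm = LLM(model_dir,
              # OOM on H100 with the default free_gpu_memory_fraction=0.9
              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
              pytorch_backend_config=pytorch_config,
              quant_config=quant_config,
              enable_attention_dp=attention_dp,
              speculative_config=mtp_config)

    assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
    if fp8kv:
        assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
    return llm


def run_accuracy(llm, model_name, fp8kv, attention_dp, cuda_graph,
                 overlap_scheduler):
    """Mirror the trimmed evaluation sweep from the diff."""
    with llm:
        if not fp8kv:
            CnnDailymail(model_name).evaluate(llm)  # skipped for FP8-KV runs
            MMLU(model_name).evaluate(llm)
        # GSM8K runs for every FP8-KV variant, and otherwise only when
        # attention DP, CUDA graphs and the overlap scheduler are all on.
        if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
            GSM8K(model_name).evaluate(llm)
```

The NVFP4 tests differ only in using `QuantAlgo.NVFP4` and the `nvfp4_moe_only` checkpoint, and the multi-GPU variants additionally pass tensor/pipeline/expert parallel sizes; the `fp8kv` branch itself is identical everywhere.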
12 changes: 6 additions & 6 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -439,12 +439,12 @@ accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8---cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8-mtp_nextn=2--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4---cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4-mtp_nextn=2--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8---cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8-mtp_nextn=2--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8--fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4--fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8--fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8B::test_auto_dtype[False]
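The renamed QA-list entries simply gain an `fp8kv` slot between `mtp_nextn` and `attention_dp`; a parameter that is `False`/`None` leaves its segment empty, which is where the double dashes come from. A small stand-alone illustration of that id scheme, inferred from the ids above (the project's `parametrize_with_ids` helper may build ids differently in detail):

```python
# Illustrative only: how each parameter becomes an id fragment in the names above.
def id_fragment(name, value):
    if value in (None, False):
        return ""                       # empty slot -> the double dashes
    if value is True:
        return name                     # e.g. "fp8kv", "cuda_graph"
    return f"{name}={value}"            # e.g. "mtp_nextn=2"


parts = ["tp8",
         id_fragment("mtp_nextn", 2),
         id_fragment("fp8kv", True),
         id_fragment("attention_dp", False),
         id_fragment("cuda_graph", True),
         id_fragment("overlap_scheduler", True)]
print("-".join(parts))  # -> tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler
```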
3 changes: 2 additions & 1 deletion tests/integration/test_lists/test-db/l0_b200.yml
@@ -26,10 +26,11 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv-attention_dp-cuda_graph-overlap_scheduler]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
- test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]