13 changes: 13 additions & 0 deletions tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -19,11 +19,24 @@ deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 64.74
  - quant_algo: NVFP4
    accuracy: 63.71
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.71
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 64.74
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 64.74
  - spec_dec_algo: MTP
    accuracy: 64.44
  - spec_dec_algo: MTP
    kv_cache_quant_algo: FP8
    accuracy: 64.44
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 64.14
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 64.14
deepseek-ai/DeepSeek-R1:
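Each new row in gsm8k.yaml pairs an existing quant_algo/spec_dec_algo combination with a `kv_cache_quant_algo: FP8` variant, so a run that enables the FP8 KV cache has its own reference score. A hypothetical lookup over these entries, purely to illustrate how the keys discriminate the variants (the helper name and matching rule are assumptions, not the accuracy harness's actual code):

```python
import yaml

def find_reference(entries, **wanted):
    """Illustrative helper: return the accuracy of the entry whose keys match exactly."""
    wanted = {k: v for k, v in wanted.items() if v is not None}
    for entry in entries:
        if {k: v for k, v in entry.items() if k != "accuracy"} == wanted:
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {wanted}")

with open("tests/integration/defs/accuracy/references/gsm8k.yaml") as f:
    refs = yaml.safe_load(f)

entries = refs["deepseek-ai/DeepSeek-V3-Lite"]
assert find_reference(entries, quant_algo="FP8_BLOCK_SCALES",
                      kv_cache_quant_algo="FP8") == 64.74
assert find_reference(entries, spec_dec_algo="MTP",
                      kv_cache_quant_algo="FP8") == 64.44
```

As added here, each FP8-KV row carries the same reference accuracy as its non-FP8-KV parent configuration.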
190 changes: 144 additions & 46 deletions tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -371,126 +371,200 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
task.evaluate(llm)

@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
def test_fp8_block_scales(self, mtp_nextn, fp8kv, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(False, True, True, True), (True, True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1), (1, 4, 1)],
ids=["tp4", "ep4", "tp2pp2", "pp4"])
def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph,
fp8kv, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
def test_nvfp4(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp)

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.skip_less_device(4)
@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("fp8kv,attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False, False),
(True, False, False, False),
(False, True, False, False),
(False, False, True, False),
(False, False, False, True),
(True, True, True, True)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1), (1, 4, 1)],
ids=["tp4", "ep4", "tp2pp2", "pp4"])
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
cuda_graph, overlap_scheduler):
def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph,
overlap_scheduler, tp_size, pp_size, ep_size):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp)

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
if attention_dp and cuda_graph and overlap_scheduler:
# No need to run these tests for fp8kv
if not fp8kv:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
# Run GSM8K for fp8kv, or if all the other optimizations are enabled
if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -504,16 +578,24 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
@parametrize_with_ids("overlap_scheduler", [False, True])
@parametrize_with_ids("cuda_graph", [False, True])
@parametrize_with_ids("attention_dp", [False, True])
@parametrize_with_ids("fp8kv", [False, True])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 1), (8, 1, 4),
(8, 1, 8)],
ids=["tp8", "tp8ep4", "tp8ep8"])
def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
attention_dp, cuda_graph, overlap_scheduler):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.NVFP4
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
@@ -524,9 +606,13 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@@ -539,17 +625,24 @@ def test_nvfp4_8gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
@pytest.mark.skip_less_device(8)
@skip_pre_hopper
@pytest.mark.parametrize(
"tp_size,pp_size,ep_size,mtp_nextn,attention_dp,cuda_graph,overlap_scheduler,batch_size",
[(8, 1, 4, 3, False, True, True, 1),
(8, 1, 8, 0, True, True, True, 24)],
"tp_size,pp_size,ep_size,mtp_nextn,fp8kv,attention_dp,cuda_graph,overlap_scheduler,batch_size",
[(8, 1, 4, 3, False, False, True, True, 1),
(8, 1, 8, 0, True, True, True, True, 24)],
ids=["latency", "throughput"])
def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn, fp8kv,
attention_dp, cuda_graph, overlap_scheduler,
batch_size):
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.4)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)

quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config.kv_cache_dtype = "fp8"

if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
@@ -561,8 +654,13 @@ def test_fp8_blockscale(self, tp_size, pp_size, ep_size, mtp_nextn,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
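All of the DeepSeek test methods above gain the same plumbing: when `fp8kv` is parametrized on, the FP8 KV-cache algorithm is recorded on the `QuantConfig`, the PyTorch backend is told to keep the KV cache in fp8, and the evaluation sweep is trimmed so FP8-KV variants run GSM8K only. A condensed sketch of that pattern follows; the import paths and the free-standing helper names are assumptions for readability, not the test file's literal structure:

```python
# NOTE: the import paths below are assumptions; the real test module already
# has all of these names in scope through its own imports.
from tensorrt_llm.llmapi import LLM, KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.quantization import QuantAlgo
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from .accuracy_core import CnnDailymail, GSM8K, MMLU  # assumed location of the task classes


def make_llm(model_dir, fp8kv, attention_dp, cuda_graph, overlap_scheduler,
             mtp_nextn=None):
    """Build the LLM the same way the new fp8kv-aware tests do."""
    pytorch_config = PyTorchConfig(
        enable_overlap_scheduler=overlap_scheduler,
        use_cuda_graph=cuda_graph)

    quant_config = QuantConfig()
    quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES
    if fp8kv:
        # The two halves of the fp8kv switch: quantization metadata checked
        # below, plus the backend's KV-cache dtype for the runtime.
        quant_config.kv_cache_quant_algo = QuantAlgo.FP8
        pytorch_config.kv_cache_dtype = "fp8"

    mtp_config = (MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
                  if mtp_nextn else None)

    llm = LLM(model_dir,
              # OOM on H100 with the default free_gpu_memory_fraction=0.9
              kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.8),
              pytorch_backend_config=pytorch_config,
              quant_config=quant_config,
              enable_attention_dp=attention_dp,
              speculative_config=mtp_config)

    assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
    if fp8kv:
        assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
    return llm


def run_accuracy(llm, model_name, fp8kv, attention_dp, cuda_graph,
                 overlap_scheduler):
    """Mirror the trimmed evaluation sweep from the diff."""
    with llm:
        if not fp8kv:
            CnnDailymail(model_name).evaluate(llm)  # skipped for FP8-KV runs
            MMLU(model_name).evaluate(llm)
        # GSM8K runs for every FP8-KV variant, and otherwise only when
        # attention DP, CUDA graphs and the overlap scheduler are all on.
        if fp8kv or (attention_dp and cuda_graph and overlap_scheduler):
            GSM8K(model_name).evaluate(llm)
```

The NVFP4 tests differ only in using `QuantAlgo.NVFP4` and the `nvfp4_moe_only` checkpoint, and the multi-GPU variants additionally pass tensor/pipeline/expert parallel sizes; the `fp8kv` branch itself is identical everywhere.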
12 changes: 6 additions & 6 deletions tests/integration/test_lists/qa/examples_test_list.txt
@@ -439,12 +439,12 @@ accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8---cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8-mtp_nextn=2--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4---cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4-mtp_nextn=2--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8---cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8-mtp_nextn=2--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8--fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4--fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep4-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8--fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[tp8ep8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8B::test_auto_dtype[False]
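The renamed QA-list entries simply gain an `fp8kv` slot between `mtp_nextn` and `attention_dp`; a parameter that is `False`/`None` leaves its segment empty, which is where the double dashes come from. A small stand-alone illustration of that id scheme, inferred from the ids above (the project's `parametrize_with_ids` helper may build ids differently in detail):

```python
# Illustrative only: how each parameter becomes an id fragment in the names above.
def id_fragment(name, value):
    if value in (None, False):
        return ""                       # empty slot -> the double dashes
    if value is True:
        return name                     # e.g. "fp8kv", "cuda_graph"
    return f"{name}={value}"            # e.g. "mtp_nextn=2"


parts = ["tp8",
         id_fragment("mtp_nextn", 2),
         id_fragment("fp8kv", True),
         id_fragment("attention_dp", False),
         id_fragment("cuda_graph", True),
         id_fragment("overlap_scheduler", True)]
print("-".join(parts))  # -> tp8-mtp_nextn=2-fp8kv--cuda_graph-overlap_scheduler
```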
3 changes: 2 additions & 1 deletion tests/integration/test_lists/test-db/l0_b200.yml
@@ -26,10 +26,11 @@ l0_b200:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv-attention_dp-cuda_graph-overlap_scheduler]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
- test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]