test: Accuracy test improvement (Part 3.3): Move DeepSeek tests (#3260)

add skip

fix

fix

update

update test list

fix QA list

move bf16 to postmerge

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Enwei Zhu 2025-04-08 07:19:04 +08:00 committed by GitHub
parent f3237e52ed
commit ba019a43d6
14 changed files with 371 additions and 294 deletions

View File

@ -1201,6 +1201,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
"L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
"L40S-TensorRT-[Post-Merge]-2": ["l40s", "l0_l40s", 2, 2],
"H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 2],
"H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 2],

View File

@ -23,8 +23,10 @@ import yaml
import tensorrt_llm.evaluate
from tensorrt_llm._torch import LLM as PyTorchLLM
from tensorrt_llm._torch.speculative import SpecConfig
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo
@ -138,9 +140,16 @@ class AccuracyTask:
extra_evaluator_kwargs: Optional[dict] = None):
assert self.EVALUATOR_CLS is not None
spec_dec_algo = None
if llm.args.speculative_config is not None:
if llm.args.speculative_config is None:
spec_dec_algo = None
elif isinstance(llm.args.speculative_config, DecodingBaseConfig):
spec_dec_algo = llm.args.speculative_config.decoding_type
elif isinstance(llm.args.speculative_config, SpecConfig):
spec_dec_algo = llm.args.speculative_config.spec_dec_name
else:
raise ValueError(
f"Not recognized speculative_config: {llm.args.speculative_config}."
)
num_samples, threshold = self.get_num_samples_and_threshold(
dtype=llm.args.dtype,

View File

@ -229,8 +229,13 @@ nvidia/Nemotron-Mini-4B-Instruct:
kv_cache_quant_algo: FP8
accuracy: 25.72
deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 25.682
- accuracy: 26.465
- quant_algo: NVFP4
accuracy: 25.243
accuracy: 26.629
- quant_algo: FP8_BLOCK_SCALES
accuracy: 25.546
accuracy: 26.103
- spec_dec_algo: MTP
accuracy: 26.479
- quant_algo: FP8_BLOCK_SCALES
spec_dec_algo: MTP
accuracy: 26.230

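Note: the spec_dec_algo resolved in accuracy_core.py above is what allows the new spec_dec_algo: MTP entries to be selected as reference values. As a hedged, standalone sketch (select_reference is a hypothetical helper, not the project's actual API; the real harness may also match on dtype or kv_cache_quant_algo), the lookup behaves roughly like:

# Hypothetical sketch -- select_reference is NOT the project's API; it only
# illustrates how a resolved spec_dec_algo could narrow the reference entry
# that supplies the expected accuracy threshold.
from typing import Optional


def select_reference(entries: list[dict],
                     quant_algo: Optional[str] = None,
                     spec_dec_algo: Optional[str] = None) -> Optional[float]:
    """Return the accuracy of the first entry matching the run's algorithms."""
    for entry in entries:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("spec_dec_algo") == spec_dec_algo):
            return entry["accuracy"]
    return None


# Entries mirroring the updated CNN/DailyMail references above.
references = [
    {"accuracy": 26.465},
    {"quant_algo": "NVFP4", "accuracy": 26.629},
    {"quant_algo": "FP8_BLOCK_SCALES", "accuracy": 26.103},
    {"spec_dec_algo": "MTP", "accuracy": 26.479},
    {"quant_algo": "FP8_BLOCK_SCALES", "spec_dec_algo": "MTP", "accuracy": 26.230},
]

print(select_reference(references, spec_dec_algo="MTP"))  # 26.479
print(select_reference(references, quant_algo="FP8_BLOCK_SCALES",
                       spec_dec_algo="MTP"))  # 26.23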
View File

@ -51,8 +51,13 @@ Qwen/Qwen2.5-7B-Instruct:
- quant_algo: FP8
accuracy: 75.32
deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 71.47
- accuracy: 71.40
- quant_algo: NVFP4
accuracy: 70.66
accuracy: 70.60
- quant_algo: FP8_BLOCK_SCALES
accuracy: 71.32
accuracy: 71.27
- spec_dec_algo: MTP
accuracy: 71.39
- quant_algo: FP8_BLOCK_SCALES
spec_dec_algo: MTP
accuracy: 71.29

View File

@ -18,8 +18,8 @@ from tensorrt_llm.llmapi import (EagleDecodingConfig, LookaheadDecodingConfig,
MedusaDecodingConfig)
from tensorrt_llm.quantization import QuantAlgo
from ..conftest import (llm_models_root, skip_no_nvls, skip_pre_ada,
skip_pre_blackwell, skip_pre_hopper)
from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
skip_pre_ada, skip_pre_blackwell, skip_pre_hopper)
from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
Humaneval, PassKeyRetrieval64k,
PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)
@ -57,9 +57,8 @@ class TestGpt2(CliFlowAccuracyTestHarness):
def test_int8_kv_cache(self):
self.run(kv_cache_quant_algo=QuantAlgo.INT8)
@pytest.mark.parametrize("per_token,per_channel", [(False, False),
(True, True)],
ids=["", "per_token-per_channel"])
@parametrize_with_ids("per_token,per_channel", [(False, False),
(True, True)])
def test_smooth_quant(self, per_token: bool, per_channel: bool):
if per_token:
if per_channel:
@ -297,8 +296,7 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
],
extra_summarize_args=["--lookahead_config=[7,7,7]"])
@pytest.mark.parametrize("cuda_graph", [False, True],
ids=["", "cuda_graph"])
@parametrize_with_ids("cuda_graph", [False, True])
def test_medusa(self, cuda_graph, mocker):
mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa")
mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
@ -318,13 +316,9 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
extra_build_args=["--speculative_decoding_mode=medusa"],
extra_summarize_args=extra_summarize_args)
@pytest.mark.parametrize("cuda_graph,chunked_context,typical_acceptance",
[(False, False, False), (True, False, False),
(True, True, False), (True, False, True)],
ids=[
"", "cuda_graph", "cuda_graph-chunked_context",
"cuda_graph-typical_acceptance"
])
@parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance",
[(False, False, False), (True, False, False),
(True, True, False), (True, False, True)])
def test_eagle(self, cuda_graph, chunked_context, typical_acceptance,
mocker):
mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "eagle")

View File

@ -15,10 +15,12 @@
import pytest
from tensorrt_llm._torch import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.quantization import QuantAlgo
from ..conftest import llm_models_root, skip_pre_blackwell
from ..conftest import (llm_models_root, parametrize_with_ids,
skip_pre_blackwell, skip_pre_hopper)
from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness
@ -113,33 +115,178 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
@pytest.mark.skip_less_device_memory(80000)
def test_auto_dtype(self):
# https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@pytest.mark.skip_less_device_memory(60000)
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
# Only the Hopper and Blackwell MLA kernels support MTP
@parametrize_with_ids("mtp_nextn",
[None, pytest.param(2, marks=skip_pre_hopper)])
def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.skip_less_device(4)
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
# Only the Hopper and Blackwell MLA kernels support MTP
@parametrize_with_ids("mtp_nextn",
[None, pytest.param(2, marks=skip_pre_hopper)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1)],
ids=["tp4", "ep4", "tp2pp2"])
def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph, overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.skip_device_not_contain(["H100"])
def test_fp8_block_scales(self):
model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/fp8"
# https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
with LLM(model_path, kv_cache_config=kv_cache_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1)],
ids=["tp4", "ep4", "tp2pp2"])
def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@skip_pre_blackwell
def test_nvfp4(self):
model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
with LLM(model_path) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp)
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1)],
ids=["tp4", "ep4", "tp2pp2"])
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp)
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)

View File

@ -25,11 +25,13 @@ import time
import urllib.request
from functools import wraps
from pathlib import Path
from typing import Iterable, Sequence
import defs.ci_profiler
import psutil
import pytest
import yaml
from _pytest.mark import ParameterSet
from tensorrt_llm.bindings import ipc_nvls_supported
@ -1830,6 +1832,37 @@ def star_attention_input_root(llm_root):
return star_attention_input_root
def parametrize_with_ids(argnames: str | Sequence[str],
argvalues: Iterable[ParameterSet | Sequence[object]
| object], **kwargs):
if isinstance(argnames, str):
argname_list = [n.strip() for n in argnames.split(",")]
else:
argname_list = argnames
case_ids = []
for case_argvalues in argvalues:
if isinstance(case_argvalues, ParameterSet):
case_argvalues = case_argvalues.values
elif case_argvalues is None or isinstance(case_argvalues,
(str, float, int, bool)):
case_argvalues = (case_argvalues, )
assert len(case_argvalues) == len(argname_list)
case_id = []
for name, value in zip(argname_list, case_argvalues):
if value is None:
pass
elif isinstance(value, bool):
if value:
case_id.append(name)
else:
case_id.append(f"{name}={value}")
case_ids.append("-".join(case_id))
return pytest.mark.parametrize(argnames, argvalues, ids=case_ids, **kwargs)
@pytest.fixture(autouse=True)
def skip_by_device_count(request):
"fixture for skip less device count"

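The id scheme produced by parametrize_with_ids is what yields the bracketed test names used throughout the updated test lists (for example test_bfloat16[mtp_nextn=2-cuda_graph]). A minimal standalone sketch of that id construction follows; build_case_id is a hypothetical stand-in for the loop above, not part of the codebase:

# Hypothetical stand-in: replicates the id-building rules of
# parametrize_with_ids so the resulting ids can be inspected in isolation.
def build_case_id(argnames, case_argvalues):
    parts = []
    for name, value in zip(argnames, case_argvalues):
        if value is None:
            continue  # None contributes nothing to the id
        if isinstance(value, bool):
            if value:
                parts.append(name)  # True contributes the bare argument name
        else:
            parts.append(f"{name}={value}")  # other values render as name=value
    return "-".join(parts)


argnames = ["mtp_nextn", "attention_dp", "cuda_graph", "overlap_scheduler"]
print(repr(build_case_id(argnames, (None, False, False, False))))  # ''
print(repr(build_case_id(argnames, (None, True, True, True))))
# 'attention_dp-cuda_graph-overlap_scheduler'
print(repr(build_case_id(argnames, (2, False, True, False))))  # 'mtp_nextn=2-cuda_graph'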
View File

@ -425,9 +425,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
test_e2e.py::test_benchmark_sanity[bert_base] # 127.18s
test_e2e.py::test_benchmark_sanity[gpt_350m] # 64.06s

View File

@ -130,9 +130,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
# Pivot to Pytorch test cases.
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]

View File

@ -15,8 +15,21 @@ l0_b200:
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
- test_e2e.py::test_ptq_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
@ -25,8 +38,6 @@ l0_b200:
- unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
- unittest/_torch -k "modeling_llama"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and not nextn0"
- unittest/_torch/auto_deploy/unit/singlegpu
- unittest/_torch/speculative/test_eagle3.py
- condition:

View File

@ -15,15 +15,38 @@ l0_dgx_h100:
tests:
# ------------- PyTorch tests ---------------
- unittest/_torch/multi_gpu
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn0"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn2"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn0"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn2"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_disable_dp"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_enable_dp"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn2"
- unittest/_torch/multi_gpu_modeling -k "llama and not (tp1 and pp1)"
- unittest/_torch/auto_deploy/unit/multigpu
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
@ -108,6 +131,36 @@ l0_dgx_h100:
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- unittest/_torch/auto_deploy/integration/test_ad_build.py
- unittest/_torch/auto_deploy/integration/test_lm_eval.py
- condition:

View File

@ -18,11 +18,20 @@ l0_h100:
- unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
- unittest/_torch -k "modeling_llama"
- unittest/_torch/multi_gpu_modeling -k "llama and tp1 and pp1"
- unittest/_torch/multi_gpu_modeling -k "deepseek"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/modeling -k "modeling_nemotron"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
- test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]
- condition:
@ -110,6 +119,30 @@ l0_h100:
- examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
- examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
- examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*h100*'
linux_distribution_name: ubuntu*
terms:
stage: post_merge
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- condition:
ranges:
system_gpu_count:

View File

@ -418,6 +418,28 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
unittest/_torch/auto_deploy/integration/test_lm_eval.py SKIP (https://nvbugs/5144854)
examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)

View File

@ -10,251 +10,15 @@ from utils.util import getSMVersion
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
# Test combinations for different scenarios
# Each tuple contains: (tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, test_id)
TEST_COMBINATIONS = [
# single-gpu test
# basic test
(1, 1, 1, 0, False, False, False,
"tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 0, True, False, False,
"tp1_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 0, False, True, False,
"tp1_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 0, False, False, True,
"tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 0, True, True, True,
"tp1_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
# mtp test
(1, 1, 1, 2, False, False, False,
"tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 2, False, False, True,
"tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 2, False, True, True,
"tp1_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 2, True, False, True,
"tp1_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 2, True, True, True,
"tp1_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
# multi-gpu test
# tp4
(4, 1, 1, 0, False, False, False,
"tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 0, True, False, False,
"tp4_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 0, False, True, False,
"tp4_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 0, False, False, True,
"tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 0, True, True, True,
"tp4_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp4, mtp2
(4, 1, 1, 2, False, False, False,
"tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 2, False, False, True,
"tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 2, False, True, True,
"tp4_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 2, True, False, True,
"tp4_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 2, True, True, True,
"tp4_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
# tp4, ep4
(4, 1, 4, 0, False, False, False,
"tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 0, True, False, False,
"tp4_pp1_ep4_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 0, False, True, False,
"tp4_pp1_ep4_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 0, False, False, True,
"tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 0, True, True, True,
"tp4_pp1_ep4_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp4, ep4, mtp2
(4, 1, 4, 2, False, False, False,
"tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 2, False, False, True,
"tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 2, False, True, True,
"tp4_pp1_ep4_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 2, True, False, True,
"tp4_pp1_ep4_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 2, True, True, True,
"tp4_pp1_ep4_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp2, pp2
(2, 2, 1, 0, False, False, False,
"tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 0, True, False, False,
"tp2_pp2_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 0, False, True, False,
"tp2_pp2_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 0, False, False, True,
"tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 0, True, True, True,
"tp2_pp2_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp2, pp2, mtp2
(2, 2, 1, 2, False, False, False,
"tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 2, False, False, True,
"tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 2, False, True, True,
"tp2_pp2_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 2, True, False, True,
"tp2_pp2_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 2, True, True, True,
"tp2_pp2_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
]
def similar(a, b, threshold=0.9):
"similar compare a and b "
return SequenceMatcher(None, a, b).ratio() >= threshold
@pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
ids=["deepseekv3_lite"])
@pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])
@pytest.mark.parametrize("quant", ["bf16", "fp8", "fp4"],
ids=["bf16", "fp8", "fp4"])
@pytest.mark.parametrize("test_config", TEST_COMBINATIONS, ids=lambda x: x[-1])
def test_deepseek(model_name, backend, quant, test_config):
tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, _ = test_config
model_path = {
"bf16": "bf16",
"fp8": "fp8",
"fp4": "nvfp4_moe_only",
}
assert quant in model_path.keys()
is_fp8 = quant == "fp8"
is_fp4 = quant == "fp4"
if (not enable_overlap_scheduler and enable_cuda_graph and not enable_dp
and mtp_nextn == 0 and ep_size == 1 and pp_size == 4
and tp_size == 1 and is_fp8):
pytest.skip("https://nvbugspro.nvidia.com/bug/5189673")
if ep_size > tp_size:
pytest.skip(
f"Expert parallel size {ep_size} must be less than or equal to tensor parallel size {tp_size}"
)
if torch.cuda.device_count() < tp_size * pp_size:
pytest.skip(f"Not enough GPUs available, need {tp_size * pp_size} "
f"but only have {torch.cuda.device_count()}")
if is_fp8 and getSMVersion() != 90:
pytest.skip(f"FP8 is not supported in this SM version {getSMVersion()}")
if is_fp4 and getSMVersion() < 100:
pytest.skip(f"FP4 is not supported in this SM version {getSMVersion()}")
if is_fp4 and mtp_nextn > 0:
pytest.skip(f"FP4 checkpoint has no MTP weights")
if mtp_nextn > 0 and getSMVersion() < 90:
pytest.skip(f"Only Hopper and Blackwell MLA kernel can support MTP now")
if pp_size > 1 and mtp_nextn > 0:
pytest.skip(
"PP + MTP is not supported: https://nvbugspro.nvidia.com/bug/5170160"
)
if pp_size > 2 and enable_cuda_graph and enable_overlap_scheduler:
pytest.skip(
"Race condition causes incorrect output for some requests: https://nvbugspro.nvidia.com/bug/5177565"
)
if get_total_gpu_memory(0) < 60 * 1024**3:
pytest.skip(f"Not enough GPU memory to run. {get_total_gpu_memory(0)}")
prompts = [
"The president of the United States is",
] * 32
expected_outputs = [
" the head of state and head of government of the",
] * 32
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=enable_overlap_scheduler,
use_cuda_graph=enable_cuda_graph,
kv_cache_dtype="auto",
attn_backend=backend,
)
mtp_config = MTPDecodingConfig(
num_nextn_predict_layers=mtp_nextn) if mtp_nextn > 0 else None
model_dir = str(llm_models_root() / model_name / model_path[quant])
assert Path(model_dir).exists()
llm = LLM(model=model_dir,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
enable_chunked_prefill=False,
pytorch_backend_config=pytorch_config,
moe_expert_parallel_size=ep_size,
moe_tensor_parallel_size=-1,
enable_attention_dp=enable_dp,
kv_cache_config=KvCacheConfig(enable_block_reuse=False),
speculative_config=mtp_config)
with llm:
outputs = llm.generate(
prompts,
sampling_params=SamplingParams(max_tokens=10),
)
assert len(outputs) == len(expected_outputs), "Output length mismatch"
for output, expected in zip(outputs, expected_outputs):
output_text = output.outputs[0].text
# print(output_text)
# print(output.outputs[0].token_ids)
# Limited by the kv cache length, the output length of MTP maybe
# a little smaller than original model.
expected = expected[0:len(output_text)] if mtp_nextn > 0 else expected
assert similar(output_text, expected,
1.0), f"Expected '{expected}' but get '{output_text}'"
@pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
ids=["deepseekv3_lite"])
@pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])