test: Accuracy test improvement (Part 3.3): Move DeepSeek tests (#3260)

add skip

fix

fix

update

update test list

fix QA list

move bf16 to postmerge

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Enwei Zhu 2025-04-08 07:19:04 +08:00 committed by GitHub
parent f3237e52ed
commit ba019a43d6
14 changed files with 371 additions and 294 deletions

View File

@ -1201,6 +1201,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
"L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
"L40S-TensorRT-[Post-Merge]-2": ["l40s", "l0_l40s", 2, 2],
"H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
"H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 2],
"H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 2],

View File

@ -23,8 +23,10 @@ import yaml
import tensorrt_llm.evaluate
from tensorrt_llm._torch import LLM as PyTorchLLM
from tensorrt_llm._torch.speculative import SpecConfig
from tensorrt_llm.builder import BuildConfig
from tensorrt_llm.llmapi import LLM, SamplingParams
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
from tensorrt_llm.logger import logger
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo
@ -138,9 +140,16 @@ class AccuracyTask:
extra_evaluator_kwargs: Optional[dict] = None):
assert self.EVALUATOR_CLS is not None
spec_dec_algo = None
if llm.args.speculative_config is not None:
if llm.args.speculative_config is None:
spec_dec_algo = None
elif isinstance(llm.args.speculative_config, DecodingBaseConfig):
spec_dec_algo = llm.args.speculative_config.decoding_type
elif isinstance(llm.args.speculative_config, SpecConfig):
spec_dec_algo = llm.args.speculative_config.spec_dec_name
else:
raise ValueError(
f"Not recognized speculative_config: {llm.args.speculative_config}."
)
num_samples, threshold = self.get_num_samples_and_threshold(
dtype=llm.args.dtype,

View File

@ -229,8 +229,13 @@ nvidia/Nemotron-Mini-4B-Instruct:
kv_cache_quant_algo: FP8
accuracy: 25.72
deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 25.682
- accuracy: 26.465
- quant_algo: NVFP4
accuracy: 25.243
accuracy: 26.629
- quant_algo: FP8_BLOCK_SCALES
accuracy: 25.546
accuracy: 26.103
- spec_dec_algo: MTP
accuracy: 26.479
- quant_algo: FP8_BLOCK_SCALES
spec_dec_algo: MTP
accuracy: 26.230

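Note: the spec_dec_algo resolved in accuracy_core.py above is what allows the new spec_dec_algo: MTP entries to be selected as reference values. As a hedged, standalone sketch (select_reference is a hypothetical helper, not the project's actual API; the real harness may also match on dtype or kv_cache_quant_algo), the lookup behaves roughly like:

# Hypothetical sketch -- select_reference is NOT the project's API; it only
# illustrates how a resolved spec_dec_algo could narrow the reference entry
# that supplies the expected accuracy threshold.
from typing import Optional


def select_reference(entries: list[dict],
                     quant_algo: Optional[str] = None,
                     spec_dec_algo: Optional[str] = None) -> Optional[float]:
    """Return the accuracy of the first entry matching the run's algorithms."""
    for entry in entries:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("spec_dec_algo") == spec_dec_algo):
            return entry["accuracy"]
    return None


# Entries mirroring the updated CNN/DailyMail references above.
references = [
    {"accuracy": 26.465},
    {"quant_algo": "NVFP4", "accuracy": 26.629},
    {"quant_algo": "FP8_BLOCK_SCALES", "accuracy": 26.103},
    {"spec_dec_algo": "MTP", "accuracy": 26.479},
    {"quant_algo": "FP8_BLOCK_SCALES", "spec_dec_algo": "MTP", "accuracy": 26.230},
]

print(select_reference(references, spec_dec_algo="MTP"))  # 26.479
print(select_reference(references, quant_algo="FP8_BLOCK_SCALES",
                       spec_dec_algo="MTP"))  # 26.23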
View File

@ -51,8 +51,13 @@ Qwen/Qwen2.5-7B-Instruct:
- quant_algo: FP8
accuracy: 75.32
deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 71.47
- accuracy: 71.40
- quant_algo: NVFP4
accuracy: 70.66
accuracy: 70.60
- quant_algo: FP8_BLOCK_SCALES
accuracy: 71.32
accuracy: 71.27
- spec_dec_algo: MTP
accuracy: 71.39
- quant_algo: FP8_BLOCK_SCALES
spec_dec_algo: MTP
accuracy: 71.29

View File

@ -18,8 +18,8 @@ from tensorrt_llm.llmapi import (EagleDecodingConfig, LookaheadDecodingConfig,
MedusaDecodingConfig)
from tensorrt_llm.quantization import QuantAlgo
from ..conftest import (llm_models_root, skip_no_nvls, skip_pre_ada,
skip_pre_blackwell, skip_pre_hopper)
from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
skip_pre_ada, skip_pre_blackwell, skip_pre_hopper)
from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
Humaneval, PassKeyRetrieval64k,
PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)
@ -57,9 +57,8 @@ class TestGpt2(CliFlowAccuracyTestHarness):
def test_int8_kv_cache(self):
self.run(kv_cache_quant_algo=QuantAlgo.INT8)
@pytest.mark.parametrize("per_token,per_channel", [(False, False),
(True, True)],
ids=["", "per_token-per_channel"])
@parametrize_with_ids("per_token,per_channel", [(False, False),
(True, True)])
def test_smooth_quant(self, per_token: bool, per_channel: bool):
if per_token:
if per_channel:
@ -297,8 +296,7 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
],
extra_summarize_args=["--lookahead_config=[7,7,7]"])
@pytest.mark.parametrize("cuda_graph", [False, True],
ids=["", "cuda_graph"])
@parametrize_with_ids("cuda_graph", [False, True])
def test_medusa(self, cuda_graph, mocker):
mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa")
mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
@ -318,13 +316,9 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
extra_build_args=["--speculative_decoding_mode=medusa"],
extra_summarize_args=extra_summarize_args)
@pytest.mark.parametrize("cuda_graph,chunked_context,typical_acceptance",
[(False, False, False), (True, False, False),
(True, True, False), (True, False, True)],
ids=[
"", "cuda_graph", "cuda_graph-chunked_context",
"cuda_graph-typical_acceptance"
])
@parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance",
[(False, False, False), (True, False, False),
(True, True, False), (True, False, True)])
def test_eagle(self, cuda_graph, chunked_context, typical_acceptance,
mocker):
mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "eagle")

View File

@ -15,10 +15,12 @@
import pytest
from tensorrt_llm._torch import LLM
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.quantization import QuantAlgo
from ..conftest import llm_models_root, skip_pre_blackwell
from ..conftest import (llm_models_root, parametrize_with_ids,
skip_pre_blackwell, skip_pre_hopper)
from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness
@ -113,33 +115,178 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
@pytest.mark.skip_less_device_memory(80000)
def test_auto_dtype(self):
# https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
@pytest.mark.skip_less_device_memory(60000)
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
# Only the Hopper and Blackwell MLA kernels support MTP
@parametrize_with_ids("mtp_nextn",
[None, pytest.param(2, marks=skip_pre_hopper)])
def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.skip_less_device(4)
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
# Only the Hopper and Blackwell MLA kernels support MTP
@parametrize_with_ids("mtp_nextn",
[None, pytest.param(2, marks=skip_pre_hopper)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1)],
ids=["tp4", "ep4", "tp2pp2"])
def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph, overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.skip_device_not_contain(["H100"])
def test_fp8_block_scales(self):
model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/fp8"
# https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
with LLM(model_path, kv_cache_config=kv_cache_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_device_not_contain(["H100"])
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@parametrize_with_ids("mtp_nextn", [None, 2])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1)],
ids=["tp4", "ep4", "tp2pp2"])
def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
attention_dp, cuda_graph,
overlap_scheduler):
# OOM on H100 with default free_gpu_memory_fraction=0.9
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
if mtp_nextn is not None and mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
else:
mtp_config = None
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@skip_pre_blackwell
def test_nvfp4(self):
model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
with LLM(model_path) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp)
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
@skip_pre_blackwell
@parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
[(False, False, False), (True, False, False),
(False, True, False), (False, False, True),
(True, True, True)])
@pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
(2, 2, 1)],
ids=["tp4", "ep4", "tp2pp2"])
def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
cuda_graph, overlap_scheduler):
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=overlap_scheduler,
use_cuda_graph=cuda_graph)
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
pytorch_backend_config=pytorch_config,
enable_attention_dp=attention_dp)
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
with llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)

View File

@ -25,11 +25,13 @@ import time
import urllib.request
from functools import wraps
from pathlib import Path
from typing import Iterable, Sequence
import defs.ci_profiler
import psutil
import pytest
import yaml
from _pytest.mark import ParameterSet
from tensorrt_llm.bindings import ipc_nvls_supported
@ -1830,6 +1832,37 @@ def star_attention_input_root(llm_root):
return star_attention_input_root
def parametrize_with_ids(argnames: str | Sequence[str],
argvalues: Iterable[ParameterSet | Sequence[object]
| object], **kwargs):
if isinstance(argnames, str):
argname_list = [n.strip() for n in argnames.split(",")]
else:
argname_list = argnames
case_ids = []
for case_argvalues in argvalues:
if isinstance(case_argvalues, ParameterSet):
case_argvalues = case_argvalues.values
elif case_argvalues is None or isinstance(case_argvalues,
(str, float, int, bool)):
case_argvalues = (case_argvalues, )
assert len(case_argvalues) == len(argname_list)
case_id = []
for name, value in zip(argname_list, case_argvalues):
if value is None:
pass
elif isinstance(value, bool):
if value:
case_id.append(name)
else:
case_id.append(f"{name}={value}")
case_ids.append("-".join(case_id))
return pytest.mark.parametrize(argnames, argvalues, ids=case_ids, **kwargs)
@pytest.fixture(autouse=True)
def skip_by_device_count(request):
"fixture for skip less device count"

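The id scheme produced by parametrize_with_ids is what yields the bracketed test names used throughout the updated test lists (for example test_bfloat16[mtp_nextn=2-cuda_graph]). A minimal standalone sketch of that id construction follows; build_case_id is a hypothetical stand-in for the loop above, not part of the codebase:

# Hypothetical stand-in: replicates the id-building rules of
# parametrize_with_ids so the resulting ids can be inspected in isolation.
def build_case_id(argnames, case_argvalues):
    parts = []
    for name, value in zip(argnames, case_argvalues):
        if value is None:
            continue  # None contributes nothing to the id
        if isinstance(value, bool):
            if value:
                parts.append(name)  # True contributes the bare argument name
        else:
            parts.append(f"{name}={value}")  # other values render as name=value
    return "-".join(parts)


argnames = ["mtp_nextn", "attention_dp", "cuda_graph", "overlap_scheduler"]
print(repr(build_case_id(argnames, (None, False, False, False))))  # ''
print(repr(build_case_id(argnames, (None, True, True, True))))
# 'attention_dp-cuda_graph-overlap_scheduler'
print(repr(build_case_id(argnames, (2, False, True, False))))  # 'mtp_nextn=2-cuda_graph'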
View File

@ -425,9 +425,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
test_e2e.py::test_benchmark_sanity[bert_base] # 127.18s
test_e2e.py::test_benchmark_sanity[gpt_350m] # 64.06s

View File

@ -130,9 +130,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
# Pivot to Pytorch test cases.
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]

View File

@ -15,8 +15,21 @@ l0_b200:
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
- test_e2e.py::test_ptq_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
@ -25,8 +38,6 @@ l0_b200:
- unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
- unittest/_torch -k "modeling_llama"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and not nextn0"
- unittest/_torch/auto_deploy/unit/singlegpu
- unittest/_torch/speculative/test_eagle3.py
- condition:

View File

@ -15,15 +15,38 @@ l0_dgx_h100:
tests:
# ------------- PyTorch tests ---------------
- unittest/_torch/multi_gpu
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn0"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn2"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn0"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn2"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_disable_dp"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_enable_dp"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn2"
- unittest/_torch/multi_gpu_modeling -k "llama and not (tp1 and pp1)"
- unittest/_torch/auto_deploy/unit/multigpu
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
- disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
@ -108,6 +131,36 @@ l0_dgx_h100:
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- unittest/_torch/auto_deploy/integration/test_ad_build.py
- unittest/_torch/auto_deploy/integration/test_lm_eval.py
- condition:

View File

@ -18,11 +18,20 @@ l0_h100:
- unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
- unittest/_torch -k "modeling_llama"
- unittest/_torch/multi_gpu_modeling -k "llama and tp1 and pp1"
- unittest/_torch/multi_gpu_modeling -k "deepseek"
- unittest/_torch/modeling -k "modeling_mixtral"
- unittest/_torch/modeling -k "modeling_nemotron"
- unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
- test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]
- condition:
@ -110,6 +119,30 @@ l0_h100:
- examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
- examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
- examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
- condition:
ranges:
system_gpu_count:
gte: 1
lte: 1
wildcards:
gpu:
- '*h100*'
linux_distribution_name: ubuntu*
terms:
stage: post_merge
backend: pytorch
tests:
# ------------- PyTorch tests ---------------
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
- condition:
ranges:
system_gpu_count:

View File

@ -418,6 +418,28 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
unittest/_torch/auto_deploy/integration/test_lm_eval.py SKIP (https://nvbugs/5144854)
examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)

View File

@ -10,251 +10,15 @@ from utils.util import getSMVersion
from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
from tensorrt_llm.llmapi import KvCacheConfig
from tensorrt_llm.llmapi.utils import get_total_gpu_memory
# Test combinations for different scenarios
# Each tuple contains: (tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, test_id)
TEST_COMBINATIONS = [
# single-gpu test
# basic test
(1, 1, 1, 0, False, False, False,
"tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 0, True, False, False,
"tp1_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 0, False, True, False,
"tp1_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 0, False, False, True,
"tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 0, True, True, True,
"tp1_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
# mtp test
(1, 1, 1, 2, False, False, False,
"tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(1, 1, 1, 2, False, False, True,
"tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 2, False, True, True,
"tp1_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 2, True, False, True,
"tp1_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(1, 1, 1, 2, True, True, True,
"tp1_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
# multi-gpu test
# tp4
(4, 1, 1, 0, False, False, False,
"tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 0, True, False, False,
"tp4_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 0, False, True, False,
"tp4_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 0, False, False, True,
"tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 0, True, True, True,
"tp4_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp4, mtp2
(4, 1, 1, 2, False, False, False,
"tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 1, 2, False, False, True,
"tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 2, False, True, True,
"tp4_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 2, True, False, True,
"tp4_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 1, 2, True, True, True,
"tp4_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
# tp4, ep4
(4, 1, 4, 0, False, False, False,
"tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 0, True, False, False,
"tp4_pp1_ep4_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 0, False, True, False,
"tp4_pp1_ep4_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 0, False, False, True,
"tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 0, True, True, True,
"tp4_pp1_ep4_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp4, ep4, mtp2
(4, 1, 4, 2, False, False, False,
"tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(4, 1, 4, 2, False, False, True,
"tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 2, False, True, True,
"tp4_pp1_ep4_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 2, True, False, True,
"tp4_pp1_ep4_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(4, 1, 4, 2, True, True, True,
"tp4_pp1_ep4_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp2, pp2
(2, 2, 1, 0, False, False, False,
"tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 0, True, False, False,
"tp2_pp2_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 0, False, True, False,
"tp2_pp2_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 0, False, False, True,
"tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 0, True, True, True,
"tp2_pp2_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
#tp2, pp2, mtp2
(2, 2, 1, 2, False, False, False,
"tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
),
(2, 2, 1, 2, False, False, True,
"tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 2, False, True, True,
"tp2_pp2_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 2, True, False, True,
"tp2_pp2_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
),
(2, 2, 1, 2, True, True, True,
"tp2_pp2_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
]
def similar(a, b, threshold=0.9):
"similar compare a and b "
return SequenceMatcher(None, a, b).ratio() >= threshold
@pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
ids=["deepseekv3_lite"])
@pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])
@pytest.mark.parametrize("quant", ["bf16", "fp8", "fp4"],
ids=["bf16", "fp8", "fp4"])
@pytest.mark.parametrize("test_config", TEST_COMBINATIONS, ids=lambda x: x[-1])
def test_deepseek(model_name, backend, quant, test_config):
tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, _ = test_config
model_path = {
"bf16": "bf16",
"fp8": "fp8",
"fp4": "nvfp4_moe_only",
}
assert quant in model_path.keys()
is_fp8 = quant == "fp8"
is_fp4 = quant == "fp4"
if (not enable_overlap_scheduler and enable_cuda_graph and not enable_dp
and mtp_nextn == 0 and ep_size == 1 and pp_size == 4
and tp_size == 1 and is_fp8):
pytest.skip("https://nvbugspro.nvidia.com/bug/5189673")
if ep_size > tp_size:
pytest.skip(
f"Expert parallel size {ep_size} must be less than or equal to tensor parallel size {tp_size}"
)
if torch.cuda.device_count() < tp_size * pp_size:
pytest.skip(f"Not enough GPUs available, need {tp_size * pp_size} "
f"but only have {torch.cuda.device_count()}")
if is_fp8 and getSMVersion() != 90:
pytest.skip(f"FP8 is not supported in this SM version {getSMVersion()}")
if is_fp4 and getSMVersion() < 100:
pytest.skip(f"FP4 is not supported in this SM version {getSMVersion()}")
if is_fp4 and mtp_nextn > 0:
pytest.skip(f"FP4 checkpoint has no MTP weights")
if mtp_nextn > 0 and getSMVersion() < 90:
pytest.skip(f"Only Hopper and Blackwell MLA kernel can support MTP now")
if pp_size > 1 and mtp_nextn > 0:
pytest.skip(
"PP + MTP is not supported: https://nvbugspro.nvidia.com/bug/5170160"
)
if pp_size > 2 and enable_cuda_graph and enable_overlap_scheduler:
pytest.skip(
"Race condition causes incorrect output for some requests: https://nvbugspro.nvidia.com/bug/5177565"
)
if get_total_gpu_memory(0) < 60 * 1024**3:
pytest.skip(f"Not enough GPU memory to run. {get_total_gpu_memory(0)}")
prompts = [
"The president of the United States is",
] * 32
expected_outputs = [
" the head of state and head of government of the",
] * 32
pytorch_config = PyTorchConfig(
enable_overlap_scheduler=enable_overlap_scheduler,
use_cuda_graph=enable_cuda_graph,
kv_cache_dtype="auto",
attn_backend=backend,
)
mtp_config = MTPDecodingConfig(
num_nextn_predict_layers=mtp_nextn) if mtp_nextn > 0 else None
model_dir = str(llm_models_root() / model_name / model_path[quant])
assert Path(model_dir).exists()
llm = LLM(model=model_dir,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
enable_chunked_prefill=False,
pytorch_backend_config=pytorch_config,
moe_expert_parallel_size=ep_size,
moe_tensor_parallel_size=-1,
enable_attention_dp=enable_dp,
kv_cache_config=KvCacheConfig(enable_block_reuse=False),
speculative_config=mtp_config)
with llm:
outputs = llm.generate(
prompts,
sampling_params=SamplingParams(max_tokens=10),
)
assert len(outputs) == len(expected_outputs), "Output length mismatch"
for output, expected in zip(outputs, expected_outputs):
output_text = output.outputs[0].text
# print(output_text)
# print(output.outputs[0].token_ids)
# Limited by the kv cache length, the output length of MTP maybe
# a little smaller than original model.
expected = expected[0:len(output_text)] if mtp_nextn > 0 else expected
assert similar(output_text, expected,
1.0), f"Expected '{expected}' but get '{output_text}'"
@pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
ids=["deepseekv3_lite"])
@pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])