test: Accuracy test improvement (Part 3.3): Move DeepSeek tests (#3260)

- add skip
- fix
- fix
- update
- update test list
- fix qa list
- move bf16 to postmerge

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>

Parent: f3237e52ed
Commit: ba019a43d6
@@ -1201,6 +1201,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
         "L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
         "L40S-TensorRT-[Post-Merge]-2": ["l40s", "l0_l40s", 2, 2],
+        "H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 2],
         "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 2],
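Each stage entry appears to map a CI stage name to a four-element spec: an agent/platform label, a test-list key (matching the l0_*.yml files edited below), a shard index, and a shard count. The sketch below is a hypothetical reading of that convention, not the actual Jenkins logic; `select_shard` and the round-robin split are illustrative assumptions.

```python
# Hypothetical illustration of the [platform, test_list, shard_id, shard_count]
# convention seen above; the real scheduling lives in the Jenkinsfile.
def select_shard(tests: list[str], shard_id: int, shard_count: int) -> list[str]:
    # shard_id is 1-based in the stage names ("...[Post-Merge]-1", "-2").
    return [t for i, t in enumerate(tests) if i % shard_count == shard_id - 1]

tests = ["test_a", "test_b", "test_c", "test_d"]
assert select_shard(tests, 1, 2) == ["test_a", "test_c"]
assert select_shard(tests, 2, 2) == ["test_b", "test_d"]
```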
@@ -23,8 +23,10 @@ import yaml
 import tensorrt_llm.evaluate
 from tensorrt_llm._torch import LLM as PyTorchLLM
+from tensorrt_llm._torch.speculative import SpecConfig
 from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.llmapi import LLM, SamplingParams
+from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
 from tensorrt_llm.logger import logger
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
@@ -138,9 +140,16 @@ class AccuracyTask:
                  extra_evaluator_kwargs: Optional[dict] = None):
         assert self.EVALUATOR_CLS is not None

-        spec_dec_algo = None
-        if llm.args.speculative_config is not None:
-            spec_dec_algo = llm.args.speculative_config.spec_dec_name
+        if llm.args.speculative_config is None:
+            spec_dec_algo = None
+        elif isinstance(llm.args.speculative_config, DecodingBaseConfig):
+            spec_dec_algo = llm.args.speculative_config.decoding_type
+        elif isinstance(llm.args.speculative_config, SpecConfig):
+            spec_dec_algo = llm.args.speculative_config.spec_dec_name
+        else:
+            raise ValueError(
+                f"Not recognized speculative_config: {llm.args.speculative_config}."
+            )

         num_samples, threshold = self.get_num_samples_and_threshold(
             dtype=llm.args.dtype,
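The harness now accepts both speculative-config flavors: the public llmapi `DecodingBaseConfig` subclasses (which carry a `decoding_type`) and the internal `_torch` `SpecConfig` (which carries a `spec_dec_name`). A minimal standalone sketch of the same dispatch, with stand-in dataclasses so it runs without TensorRT-LLM installed:

```python
from dataclasses import dataclass


# Stand-ins for tensorrt_llm.llmapi.llm_args.DecodingBaseConfig and
# tensorrt_llm._torch.speculative.SpecConfig, just to exercise the dispatch.
@dataclass
class DecodingBaseConfig:
    decoding_type: str


@dataclass
class SpecConfig:
    spec_dec_name: str


def resolve_spec_dec_algo(speculative_config) -> str | None:
    if speculative_config is None:
        return None
    elif isinstance(speculative_config, DecodingBaseConfig):
        return speculative_config.decoding_type
    elif isinstance(speculative_config, SpecConfig):
        return speculative_config.spec_dec_name
    raise ValueError(f"Not recognized speculative_config: {speculative_config}.")


assert resolve_spec_dec_algo(None) is None
assert resolve_spec_dec_algo(SpecConfig(spec_dec_name="MTP")) == "MTP"
```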
@@ -229,8 +229,13 @@ nvidia/Nemotron-Mini-4B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 25.72
 deepseek-ai/DeepSeek-V3-Lite:
-  - accuracy: 25.682
+  - accuracy: 26.465
   - quant_algo: NVFP4
-    accuracy: 25.243
+    accuracy: 26.629
   - quant_algo: FP8_BLOCK_SCALES
-    accuracy: 25.546
+    accuracy: 26.103
+  - spec_dec_algo: MTP
+    accuracy: 26.479
+  - quant_algo: FP8_BLOCK_SCALES
+    spec_dec_algo: MTP
+    accuracy: 26.230
@@ -51,8 +51,13 @@ Qwen/Qwen2.5-7B-Instruct:
   - quant_algo: FP8
     accuracy: 75.32
 deepseek-ai/DeepSeek-V3-Lite:
-  - accuracy: 71.47
+  - accuracy: 71.40
   - quant_algo: NVFP4
-    accuracy: 70.66
+    accuracy: 70.60
   - quant_algo: FP8_BLOCK_SCALES
-    accuracy: 71.32
+    accuracy: 71.27
+  - spec_dec_algo: MTP
+    accuracy: 71.39
+  - quant_algo: FP8_BLOCK_SCALES
+    spec_dec_algo: MTP
+    accuracy: 71.29
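These reference files key each model's expected score on the quantization and speculative-decoding algorithm under test; an entry with neither key is the unquantized baseline. A sketch of how a harness could look up the matching entry follows; the real lookup lives in accuracy_core.py and may differ, so treat `lookup` as illustrative.

```python
import yaml

REFERENCE = yaml.safe_load("""
deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 71.40
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 71.29
""")


def lookup(model: str, quant_algo=None, spec_dec_algo=None) -> float:
    # Pick the entry whose (quant_algo, spec_dec_algo) pair matches exactly;
    # absent keys mean "not quantized" / "no speculative decoding".
    for entry in REFERENCE[model]:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("spec_dec_algo") == spec_dec_algo):
            return entry["accuracy"]
    raise KeyError((model, quant_algo, spec_dec_algo))


assert lookup("deepseek-ai/DeepSeek-V3-Lite") == 71.40
assert lookup("deepseek-ai/DeepSeek-V3-Lite", "FP8_BLOCK_SCALES", "MTP") == 71.29
```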
@@ -18,8 +18,8 @@ from tensorrt_llm.llmapi import (EagleDecodingConfig, LookaheadDecodingConfig,
                                  MedusaDecodingConfig)
 from tensorrt_llm.quantization import QuantAlgo

-from ..conftest import (llm_models_root, skip_no_nvls, skip_pre_ada,
-                        skip_pre_blackwell, skip_pre_hopper)
+from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
+                        skip_pre_ada, skip_pre_blackwell, skip_pre_hopper)
 from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
                             Humaneval, PassKeyRetrieval64k,
                             PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)
@@ -57,9 +57,8 @@ class TestGpt2(CliFlowAccuracyTestHarness):
     def test_int8_kv_cache(self):
         self.run(kv_cache_quant_algo=QuantAlgo.INT8)

-    @pytest.mark.parametrize("per_token,per_channel", [(False, False),
-                                                       (True, True)],
-                             ids=["", "per_token-per_channel"])
+    @parametrize_with_ids("per_token,per_channel", [(False, False),
+                                                    (True, True)])
     def test_smooth_quant(self, per_token: bool, per_channel: bool):
         if per_token:
             if per_channel:
@@ -297,8 +296,7 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
                  ],
                  extra_summarize_args=["--lookahead_config=[7,7,7]"])

-    @pytest.mark.parametrize("cuda_graph", [False, True],
-                             ids=["", "cuda_graph"])
+    @parametrize_with_ids("cuda_graph", [False, True])
     def test_medusa(self, cuda_graph, mocker):
         mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa")
         mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
@@ -318,13 +316,9 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
             extra_build_args=["--speculative_decoding_mode=medusa"],
             extra_summarize_args=extra_summarize_args)

-    @pytest.mark.parametrize("cuda_graph,chunked_context,typical_acceptance",
-                             [(False, False, False), (True, False, False),
-                              (True, True, False), (True, False, True)],
-                             ids=[
-                                 "", "cuda_graph", "cuda_graph-chunked_context",
-                                 "cuda_graph-typical_acceptance"
-                             ])
+    @parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance",
+                          [(False, False, False), (True, False, False),
+                           (True, True, False), (True, False, True)])
     def test_eagle(self, cuda_graph, chunked_context, typical_acceptance,
                    mocker):
         mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "eagle")

@@ -15,10 +15,12 @@
 import pytest

 from tensorrt_llm._torch import LLM
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
 from tensorrt_llm.quantization import QuantAlgo

-from ..conftest import llm_models_root, skip_pre_blackwell
+from ..conftest import (llm_models_root, parametrize_with_ids,
+                        skip_pre_blackwell, skip_pre_hopper)
 from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness

@@ -113,33 +115,178 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"

-    @pytest.mark.skip_less_device_memory(80000)
-    def test_auto_dtype(self):
-        # https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
-        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+    @pytest.mark.skip_less_device_memory(60000)
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    # Only Hopper and Blackwell MLA kernel supports MTP
+    @parametrize_with_ids("mtp_nextn",
+                          [None, pytest.param(2, marks=skip_pre_hopper)])
+    def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
+                      overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(self.MODEL_PATH,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)

+    @pytest.mark.skip_less_device(4)
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    # Only Hopper and Blackwell MLA kernel supports MTP
+    @parametrize_with_ids("mtp_nextn",
+                          [None, pytest.param(2, marks=skip_pre_hopper)])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
+                                                         (2, 2, 1)],
+                             ids=["tp4", "ep4", "tp2pp2"])
+    def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
+                            attention_dp, cuda_graph, overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @pytest.mark.skip_device_not_contain(["H100"])
-    def test_fp8_block_scales(self):
-        model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/fp8"
-        # https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    @parametrize_with_ids("mtp_nextn", [None, 2])
+    def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
+                              overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
-        with LLM(model_path, kv_cache_config=kv_cache_config) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)

+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_device_not_contain(["H100"])
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    @parametrize_with_ids("mtp_nextn", [None, 2])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
+                                                         (2, 2, 1)],
+                             ids=["tp4", "ep4", "tp2pp2"])
+    def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
+                                    attention_dp, cuda_graph,
+                                    overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
     @skip_pre_blackwell
-    def test_nvfp4(self):
-        model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
-        with LLM(model_path) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)

+    @skip_pre_blackwell
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
+                                                         (2, 2, 1)],
+                             ids=["tp4", "ep4", "tp2pp2"])
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler):
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
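Every new test above repeats the same backend-config setup. The helper below is a hypothetical refactor of that repeated block, not part of the commit; it assumes a TensorRT-LLM build matching this revision (the `PyTorchConfig` and `MTPDecodingConfig` signatures are taken verbatim from the diff).

```python
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import MTPDecodingConfig


def build_backend_configs(cuda_graph: bool, overlap_scheduler: bool,
                          mtp_nextn: int | None):
    """Hypothetical factoring of the setup repeated in each test above."""
    pytorch_config = PyTorchConfig(
        enable_overlap_scheduler=overlap_scheduler,
        use_cuda_graph=cuda_graph)
    mtp_config = (MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
                  if mtp_nextn else None)
    return pytorch_config, mtp_config
```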
@@ -25,11 +25,13 @@ import time
 import urllib.request
 from functools import wraps
 from pathlib import Path
+from typing import Iterable, Sequence

 import defs.ci_profiler
 import psutil
 import pytest
 import yaml
+from _pytest.mark import ParameterSet

 from tensorrt_llm.bindings import ipc_nvls_supported
@@ -1830,6 +1832,37 @@ def star_attention_input_root(llm_root):
     return star_attention_input_root


+def parametrize_with_ids(argnames: str | Sequence[str],
+                         argvalues: Iterable[ParameterSet | Sequence[object]
+                                             | object], **kwargs):
+    if isinstance(argnames, str):
+        argname_list = [n.strip() for n in argnames.split(",")]
+    else:
+        argname_list = argnames
+
+    case_ids = []
+    for case_argvalues in argvalues:
+        if isinstance(case_argvalues, ParameterSet):
+            case_argvalues = case_argvalues.values
+        elif case_argvalues is None or isinstance(case_argvalues,
+                                                  (str, float, int, bool)):
+            case_argvalues = (case_argvalues, )
+        assert len(case_argvalues) == len(argname_list)
+
+        case_id = []
+        for name, value in zip(argname_list, case_argvalues):
+            if value is None:
+                pass
+            elif isinstance(value, bool):
+                if value:
+                    case_id.append(name)
+            else:
+                case_id.append(f"{name}={value}")
+        case_ids.append("-".join(case_id))
+
+    return pytest.mark.parametrize(argnames, argvalues, ids=case_ids, **kwargs)
+
+
 @pytest.fixture(autouse=True)
 def skip_by_device_count(request):
     "fixture for skip less device count"
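`parametrize_with_ids` derives each case ID by keeping only the informative parameters: `None` and `False` contribute nothing, `True` contributes the bare parameter name, and any other value renders as `name=value`; stacked decorators then join their IDs with `-`. That is why the test lists below contain IDs such as `test_bfloat16[]` and `test_bfloat16[mtp_nextn=2-attention_dp]`. A standalone re-implementation of just the ID computation, for illustration (it skips the `ParameterSet` handling above):

```python
def make_case_id(argnames: str, values) -> str:
    # Mirrors the ID logic of parametrize_with_ids above.
    parts = []
    for name, value in zip(argnames.split(","), values):
        if value is None or value is False:
            continue  # uninformative parameters are dropped from the ID
        parts.append(name if value is True else f"{name}={value}")
    return "-".join(parts)


names = "mtp_nextn,attention_dp,cuda_graph,overlap_scheduler"
assert make_case_id(names, (None, False, False, False)) == ""  # -> test[...]
assert make_case_id(names, (2, True, False, False)) == "mtp_nextn=2-attention_dp"
```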
@@ -425,9 +425,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]

 test_e2e.py::test_benchmark_sanity[bert_base] # 127.18s
 test_e2e.py::test_benchmark_sanity[gpt_350m] # 64.06s
@@ -130,9 +130,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]

 # Pivot to Pytorch test cases.
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
@@ -15,8 +15,21 @@ l0_b200:
   tests:
   # ------------- PyTorch tests ---------------
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
   - test_e2e.py::test_ptq_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
@@ -25,8 +38,6 @@ l0_b200:
   - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
   - unittest/_torch -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and not nextn0"
   - unittest/_torch/auto_deploy/unit/singlegpu
   - unittest/_torch/speculative/test_eagle3.py
 - condition:
@@ -15,15 +15,38 @@ l0_dgx_h100:
   tests:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/multi_gpu
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn0"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn2"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn0"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn2"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_disable_dp"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_enable_dp"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn2"
   - unittest/_torch/multi_gpu_modeling -k "llama and not (tp1 and pp1)"
   - unittest/_torch/auto_deploy/unit/multigpu
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
@@ -108,6 +131,36 @@ l0_dgx_h100:
       backend: pytorch
   tests:
   # ------------- PyTorch tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - unittest/_torch/auto_deploy/integration/test_ad_build.py
   - unittest/_torch/auto_deploy/integration/test_lm_eval.py
 - condition:
@@ -18,11 +18,20 @@ l0_h100:
   - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
   - unittest/_torch -k "modeling_llama"
   - unittest/_torch/multi_gpu_modeling -k "llama and tp1 and pp1"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek"
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_nemotron"
+  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]
 - condition:
@@ -110,6 +119,30 @@ l0_h100:
   - examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
   - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
   - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+    terms:
+      stage: post_merge
+      backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
 - condition:
     ranges:
       system_gpu_count:
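This is the new post-merge PyTorch bucket that the commit message's "move bf16 to postmerge" refers to. Reading the condition schema above: `ranges` bounds numeric facts (here, exactly one GPU), `wildcards` glob-matches strings, and `terms` are exact matches. The matcher below is an illustrative reading of that schema, not the test-db tooling's actual implementation.

```python
from fnmatch import fnmatch


# Illustrative reading of the condition schema above (ranges / wildcards /
# terms); the authoritative matcher lives in the test-db tooling.
def matches(cond: dict, env: dict) -> bool:
    for key, bounds in cond.get("ranges", {}).items():
        v = env[key]
        if not (bounds.get("gte", v) <= v <= bounds.get("lte", v)):
            return False
    for key, pattern in cond.get("wildcards", {}).items():
        patterns = pattern if isinstance(pattern, list) else [pattern]
        if not any(fnmatch(str(env[key]), p) for p in patterns):
            return False
    return all(env.get(k) == v for k, v in cond.get("terms", {}).items())


cond = {"ranges": {"system_gpu_count": {"gte": 1, "lte": 1}},
        "wildcards": {"gpu": ["*h100*"], "linux_distribution_name": "ubuntu*"},
        "terms": {"stage": "post_merge", "backend": "pytorch"}}
env = {"system_gpu_count": 1, "gpu": "h100-pcie",
       "linux_distribution_name": "ubuntu22.04", "stage": "post_merge",
       "backend": "pytorch"}
assert matches(cond, env)
```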
@@ -418,6 +418,28 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
 unittest/_torch/auto_deploy/integration/test_lm_eval.py SKIP (https://nvbugs/5144854)
 examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141)

+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
 full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
 full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
 full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)
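Each waive-file line pairs a fully qualified test ID with `SKIP` and a tracking-bug URL. A small illustrative parser for that format (the real consumer is the TensorRT-LLM test harness; the regex here is an assumption about the format shown above, nothing more):

```python
import re

# Sketch of parsing the waive-list format "<test id> SKIP (<reason>)".
WAIVE_RE = re.compile(r"^(?P<test>\S+)\s+SKIP\s+\((?P<reason>[^)]+)\)$")

line = ("accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::"
        "test_bfloat16_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)")
m = WAIVE_RE.match(line)
assert m and m.group("reason") == "https://nvbugs/5170160"
```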
@@ -10,251 +10,15 @@ from utils.util import getSMVersion
 from tensorrt_llm import SamplingParams
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory

-# Test combinations for different scenarios
-# Each tuple contains: (tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, test_id)
-TEST_COMBINATIONS = [
-    # single-gpu test
-    # basic test
-    (1, 1, 1, 0, False, False, False,
-     "tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, True, False, False,
-     "tp1_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, False, True, False,
-     "tp1_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, False, False, True,
-     "tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, True, True, True,
-     "tp1_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    # mtp test
-    (1, 1, 1, 2, False, False, False,
-     "tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, False, False, True,
-     "tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, False, True, True,
-     "tp1_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, True, False, True,
-     "tp1_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, True, True, True,
-     "tp1_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    # multi-gpu test
-    # tp4
-    (4, 1, 1, 0, False, False, False,
-     "tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, True, False, False,
-     "tp4_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, False, True, False,
-     "tp4_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, False, False, True,
-     "tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, True, True, True,
-     "tp4_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp4, mtp2
-    (4, 1, 1, 2, False, False, False,
-     "tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, False, False, True,
-     "tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, False, True, True,
-     "tp4_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, True, False, True,
-     "tp4_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, True, True, True,
-     "tp4_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    # tp4, ep4
-    (4, 1, 4, 0, False, False, False,
-     "tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, True, False, False,
-     "tp4_pp1_ep4_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, False, True, False,
-     "tp4_pp1_ep4_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, False, False, True,
-     "tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, True, True, True,
-     "tp4_pp1_ep4_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp4, ep4, mtp2
-    (4, 1, 4, 2, False, False, False,
-     "tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, False, False, True,
-     "tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, False, True, True,
-     "tp4_pp1_ep4_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, True, False, True,
-     "tp4_pp1_ep4_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, True, True, True,
-     "tp4_pp1_ep4_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp2, pp2
-    (2, 2, 1, 0, False, False, False,
-     "tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, True, False, False,
-     "tp2_pp2_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, False, True, False,
-     "tp2_pp2_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, False, False, True,
-     "tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, True, True, True,
-     "tp2_pp2_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp2, pp2, mtp2
-    (2, 2, 1, 2, False, False, False,
-     "tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, False, False, True,
-     "tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, False, True, True,
-     "tp2_pp2_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, True, False, True,
-     "tp2_pp2_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, True, True, True,
-     "tp2_pp2_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-]
-
-
 def similar(a, b, threshold=0.9):
     "similar compare a and b "
     return SequenceMatcher(None, a, b).ratio() >= threshold


-@pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
-                         ids=["deepseekv3_lite"])
-@pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])
-@pytest.mark.parametrize("quant", ["bf16", "fp8", "fp4"],
-                         ids=["bf16", "fp8", "fp4"])
-@pytest.mark.parametrize("test_config", TEST_COMBINATIONS, ids=lambda x: x[-1])
-def test_deepseek(model_name, backend, quant, test_config):
-    tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, _ = test_config
-
-    model_path = {
-        "bf16": "bf16",
-        "fp8": "fp8",
-        "fp4": "nvfp4_moe_only",
-    }
-    assert quant in model_path.keys()
-
-    is_fp8 = quant == "fp8"
-    is_fp4 = quant == "fp4"
-
-    if (not enable_overlap_scheduler and enable_cuda_graph and not enable_dp
-            and mtp_nextn == 0 and ep_size == 1 and pp_size == 4
-            and tp_size == 1 and is_fp8):
-        pytest.skip("https://nvbugspro.nvidia.com/bug/5189673")
-
-    if ep_size > tp_size:
-        pytest.skip(
-            f"Expert parallel size {ep_size} must be less than or equal to tensor parallel size {tp_size}"
-        )
-
-    if torch.cuda.device_count() < tp_size * pp_size:
-        pytest.skip(f"Not enough GPUs available, need {tp_size * pp_size} "
-                    f"but only have {torch.cuda.device_count()}")
-
-    if is_fp8 and getSMVersion() != 90:
-        pytest.skip(f"FP8 is not supported in this SM version {getSMVersion()}")
-
-    if is_fp4 and getSMVersion() < 100:
-        pytest.skip(f"FP4 is not supported in this SM version {getSMVersion()}")
-
-    if is_fp4 and mtp_nextn > 0:
-        pytest.skip(f"FP4 checkpoint has no MTP weights")
-
-    if mtp_nextn > 0 and getSMVersion() < 90:
-        pytest.skip(f"Only Hopper and Blackwell MLA kernel can support MTP now")
-
-    if pp_size > 1 and mtp_nextn > 0:
-        pytest.skip(
-            "PP + MTP is not supported: https://nvbugspro.nvidia.com/bug/5170160"
-        )
-    if pp_size > 2 and enable_cuda_graph and enable_overlap_scheduler:
-        pytest.skip(
-            "Race condition causes incorrect output for some requests: https://nvbugspro.nvidia.com/bug/5177565"
-        )
-
-    if get_total_gpu_memory(0) < 60 * 1024**3:
-        pytest.skip(f"Not enough GPU memory to run. {get_total_gpu_memory(0)}")
-
-    prompts = [
-        "The president of the United States is",
-    ] * 32
-
-    expected_outputs = [
-        " the head of state and head of government of the",
-    ] * 32
-
-    pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=enable_overlap_scheduler,
-        use_cuda_graph=enable_cuda_graph,
-        kv_cache_dtype="auto",
-        attn_backend=backend,
-    )
-
-    mtp_config = MTPDecodingConfig(
-        num_nextn_predict_layers=mtp_nextn) if mtp_nextn > 0 else None
-
-    model_dir = str(llm_models_root() / model_name / model_path[quant])
-
-    assert Path(model_dir).exists()
-
-    llm = LLM(model=model_dir,
-              tensor_parallel_size=tp_size,
-              pipeline_parallel_size=pp_size,
-              enable_chunked_prefill=False,
-              pytorch_backend_config=pytorch_config,
-              moe_expert_parallel_size=ep_size,
-              moe_tensor_parallel_size=-1,
-              enable_attention_dp=enable_dp,
-              kv_cache_config=KvCacheConfig(enable_block_reuse=False),
-              speculative_config=mtp_config)
-
-    with llm:
-        outputs = llm.generate(
-            prompts,
-            sampling_params=SamplingParams(max_tokens=10),
-        )
-
-    assert len(outputs) == len(expected_outputs), "Output length mismatch"
-    for output, expected in zip(outputs, expected_outputs):
-        output_text = output.outputs[0].text
-        # print(output_text)
-        # print(output.outputs[0].token_ids)
-        # Limited by the kv cache length, the output length of MTP maybe
-        # a little smaller than original model.
-        expected = expected[0:len(output_text)] if mtp_nextn > 0 else expected
-        assert similar(output_text, expected,
-                       1.0), f"Expected '{expected}' but get '{output_text}'"
-
-
 @pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
                          ids=["deepseekv3_lite"])
 @pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])
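For reference when porting waive lists: the deleted test_deepseek encoded every option into one long underscore ID (e.g. tp4_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler), while the replacement accuracy tests express the same combination through stacked parametrizations (e.g. test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]). The translator below is hypothetical, built only from the two naming schemes visible in this commit:

```python
# Hypothetical translator from the removed TEST_COMBINATIONS tuples to the
# new accuracy-suite IDs; the layout names follow the ids= of the new
# parametrize ("tp4", "ep4", "tp2pp2"), and (1, 1, 1) has no layout part.
LAYOUTS = {(4, 1, 1): "tp4", (4, 1, 4): "ep4", (2, 2, 1): "tp2pp2",
           (1, 1, 1): None}


def new_case_id(tp, pp, ep, nextn, dp, graph, overlap) -> str:
    layout = LAYOUTS[(tp, pp, ep)]
    parts = [layout] if layout else []
    if nextn:
        parts.append(f"mtp_nextn={nextn}")
    parts += [name for name, on in (("attention_dp", dp),
                                    ("cuda_graph", graph),
                                    ("overlap_scheduler", overlap)) if on]
    return "-".join(parts)


assert new_case_id(4, 1, 1, 2, True, True, True) == \
    "tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler"
assert new_case_id(1, 1, 1, 0, False, True, False) == "cuda_graph"
```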