From ba019a43d66b961582a6062f3c7f7e4722b9a30c Mon Sep 17 00:00:00 2001
From: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
Date: Tue, 8 Apr 2025 07:19:04 +0800
Subject: [PATCH] test: Accuracy test improvement (Part 3.3): Move DeepSeek
 tests (#3260)

add skip

fix

fix

update

update test list

fixqa list

move bf16 to postmerge

Signed-off-by: Enwei Zhu <21126786+syuoni@users.noreply.github.com>
---
 jenkins/L0_Test.groovy                        |   1 +
 .../defs/accuracy/accuracy_core.py            |  13 +-
 .../accuracy/references/cnn_dailymail.yaml    |  11 +-
 .../defs/accuracy/references/mmlu.yaml        |  11 +-
 .../defs/accuracy/test_cli_flow.py            |  22 +-
 .../defs/accuracy/test_llm_api_pytorch.py     | 179 +++++++++++--
 tests/integration/defs/conftest.py            |  33 +++
 .../test_lists/qa/examples_test_list.txt      |   6 +-
 .../test_lists/qa/llm_sanity_test.txt         |   6 +-
 .../test_lists/test-db/l0_b200.yml            |  19 +-
 .../test_lists/test-db/l0_dgx_h100.yml        |  67 ++++-
 .../test_lists/test-db/l0_h100.yml            |  37 ++-
 tests/integration/test_lists/waives.txt       |  22 ++
 .../multi_gpu_modeling/test_deepseek.py       | 238 +----------------
 14 files changed, 371 insertions(+), 294 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index caf754d76b..0d69018ade 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1201,6 +1201,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "A100X-TensorRT-[Post-Merge]-2": ["a100x", "l0_a100", 2, 2],
         "L40S-TensorRT-[Post-Merge]-1": ["l40s", "l0_l40s", 1, 2],
         "L40S-TensorRT-[Post-Merge]-2": ["l40s", "l0_l40s", 2, 2],
+        "H100_PCIe-PyTorch-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-CPP-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 1],
         "H100_PCIe-TensorRT-[Post-Merge]-1": ["h100-cr", "l0_h100", 1, 2],
         "H100_PCIe-TensorRT-[Post-Merge]-2": ["h100-cr", "l0_h100", 2, 2],
diff --git a/tests/integration/defs/accuracy/accuracy_core.py b/tests/integration/defs/accuracy/accuracy_core.py
index a92b1b99c4..1a3e74c8cb 100644
--- a/tests/integration/defs/accuracy/accuracy_core.py
+++ b/tests/integration/defs/accuracy/accuracy_core.py
@@ -23,8 +23,10 @@
 import yaml
 
 import tensorrt_llm.evaluate
 from tensorrt_llm._torch import LLM as PyTorchLLM
+from tensorrt_llm._torch.speculative import SpecConfig
 from tensorrt_llm.builder import BuildConfig
 from tensorrt_llm.llmapi import LLM, SamplingParams
+from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig
 from tensorrt_llm.logger import logger
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
@@ -138,9 +140,16 @@ class AccuracyTask:
                  extra_evaluator_kwargs: Optional[dict] = None):
         assert self.EVALUATOR_CLS is not None
 
-        spec_dec_algo = None
-        if llm.args.speculative_config is not None:
+        if llm.args.speculative_config is None:
+            spec_dec_algo = None
+        elif isinstance(llm.args.speculative_config, DecodingBaseConfig):
             spec_dec_algo = llm.args.speculative_config.decoding_type
+        elif isinstance(llm.args.speculative_config, SpecConfig):
+            spec_dec_algo = llm.args.speculative_config.spec_dec_name
+        else:
+            raise ValueError(
+                f"Not recognized speculative_config: {llm.args.speculative_config}."
+            )
 
         num_samples, threshold = self.get_num_samples_and_threshold(
             dtype=llm.args.dtype,
diff --git a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
index af183961fe..ef23f17969 100644
--- a/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
+++ b/tests/integration/defs/accuracy/references/cnn_dailymail.yaml
@@ -229,8 +229,13 @@ nvidia/Nemotron-Mini-4B-Instruct:
     kv_cache_quant_algo: FP8
     accuracy: 25.72
 deepseek-ai/DeepSeek-V3-Lite:
-  - accuracy: 25.682
+  - accuracy: 26.465
   - quant_algo: NVFP4
-    accuracy: 25.243
+    accuracy: 26.629
   - quant_algo: FP8_BLOCK_SCALES
-    accuracy: 25.546
+    accuracy: 26.103
+  - spec_dec_algo: MTP
+    accuracy: 26.479
+  - quant_algo: FP8_BLOCK_SCALES
+    spec_dec_algo: MTP
+    accuracy: 26.230
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index effdb3576f..fe4dd3bb59 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -51,8 +51,13 @@ Qwen/Qwen2.5-7B-Instruct:
   - quant_algo: FP8
     accuracy: 75.32
 deepseek-ai/DeepSeek-V3-Lite:
-  - accuracy: 71.47
+  - accuracy: 71.40
   - quant_algo: NVFP4
-    accuracy: 70.66
+    accuracy: 70.60
   - quant_algo: FP8_BLOCK_SCALES
-    accuracy: 71.32
+    accuracy: 71.27
+  - spec_dec_algo: MTP
+    accuracy: 71.39
+  - quant_algo: FP8_BLOCK_SCALES
+    spec_dec_algo: MTP
+    accuracy: 71.29
diff --git a/tests/integration/defs/accuracy/test_cli_flow.py b/tests/integration/defs/accuracy/test_cli_flow.py
index 0c2016a62e..54e68396a1 100644
--- a/tests/integration/defs/accuracy/test_cli_flow.py
+++ b/tests/integration/defs/accuracy/test_cli_flow.py
@@ -18,8 +18,8 @@ from tensorrt_llm.llmapi import (EagleDecodingConfig, LookaheadDecodingConfig,
                                  MedusaDecodingConfig)
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, skip_no_nvls, skip_pre_ada,
-                        skip_pre_blackwell, skip_pre_hopper)
+from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
+                        skip_pre_ada, skip_pre_blackwell, skip_pre_hopper)
 from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
                             Humaneval, PassKeyRetrieval64k,
                             PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)
@@ -57,9 +57,8 @@ class TestGpt2(CliFlowAccuracyTestHarness):
     def test_int8_kv_cache(self):
         self.run(kv_cache_quant_algo=QuantAlgo.INT8)
 
-    @pytest.mark.parametrize("per_token,per_channel", [(False, False),
-                                                       (True, True)],
-                             ids=["", "per_token-per_channel"])
+    @parametrize_with_ids("per_token,per_channel", [(False, False),
+                                                    (True, True)])
     def test_smooth_quant(self, per_token: bool, per_channel: bool):
         if per_token:
             if per_channel:
@@ -297,8 +296,7 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
             ],
             extra_summarize_args=["--lookahead_config=[7,7,7]"])
 
-    @pytest.mark.parametrize("cuda_graph", [False, True],
-                             ids=["", "cuda_graph"])
+    @parametrize_with_ids("cuda_graph", [False, True])
     def test_medusa(self, cuda_graph, mocker):
         mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa")
         mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
@@ -318,13 +316,9 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
             extra_build_args=["--speculative_decoding_mode=medusa"],
             extra_summarize_args=extra_summarize_args)
 
-    @pytest.mark.parametrize("cuda_graph,chunked_context,typical_acceptance",
-                             [(False, False, False), (True, False, False),
-                              (True, True, False), (True, False, True)],
-                             ids=[
-                                 "", "cuda_graph", "cuda_graph-chunked_context",
-                                 "cuda_graph-typical_acceptance"
-                             ])
+    @parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance",
+                          [(False, False, False), (True, False, False),
+                           (True, True, False), (True, False, True)])
     def test_eagle(self, cuda_graph, chunked_context, typical_acceptance,
                    mocker):
         mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "eagle")
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 3d111ca6cd..025d025b4c 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -15,10 +15,12 @@
 import pytest
 
 from tensorrt_llm._torch import LLM
-from tensorrt_llm.llmapi import KvCacheConfig
+from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
+from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import llm_models_root, skip_pre_blackwell
+from ..conftest import (llm_models_root, parametrize_with_ids,
+                        skip_pre_blackwell, skip_pre_hopper)
 from .accuracy_core import MMLU, CnnDailymail, LlmapiAccuracyTestHarness
 
 
@@ -113,33 +115,178 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"
 
-    @pytest.mark.skip_less_device_memory(80000)
-    def test_auto_dtype(self):
-        # https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
-        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
-        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+    @pytest.mark.skip_less_device_memory(60000)
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    # Only Hopper and Blackwell MLA kernel supports MTP
+    @parametrize_with_ids("mtp_nextn",
+                          [None, pytest.param(2, marks=skip_pre_hopper)])
+    def test_bfloat16(self, mtp_nextn, attention_dp, cuda_graph,
+                      overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(self.MODEL_PATH,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(4)
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    # Only Hopper and Blackwell MLA kernel supports MTP
+    @parametrize_with_ids("mtp_nextn",
+                          [None, pytest.param(2, marks=skip_pre_hopper)])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
+                                                         (2, 2, 1)],
+                             ids=["tp4", "ep4", "tp2pp2"])
+    def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
+                            attention_dp, cuda_graph, overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
     @pytest.mark.skip_device_not_contain(["H100"])
-    def test_fp8_block_scales(self):
-        model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/fp8"
-        # https://nvbugs/5141289: OOM on H100 with default free_gpu_memory_fraction=0.9
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    @parametrize_with_ids("mtp_nextn", [None, 2])
+    def test_fp8_block_scales(self, mtp_nextn, attention_dp, cuda_graph,
+                              overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
         kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
-        with LLM(model_path, kv_cache_config=kv_cache_config) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.skip_device_not_contain(["H100"])
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    @parametrize_with_ids("mtp_nextn", [None, 2])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
+                                                         (2, 2, 1)],
+                             ids=["tp4", "ep4", "tp2pp2"])
+    def test_fp8_block_scales_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn,
+                                    attention_dp, cuda_graph,
+                                    overlap_scheduler):
+        # OOM on H100 with default free_gpu_memory_fraction=0.9
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        if mtp_nextn is not None and mtp_nextn > 0:
+            mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
+        else:
+            mtp_config = None
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  kv_cache_config=kv_cache_config,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  speculative_config=mtp_config)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
+        with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
     @skip_pre_blackwell
-    def test_nvfp4(self):
-        model_path = f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only"
-        with LLM(model_path) as llm:
-            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    def test_nvfp4(self, attention_dp, cuda_graph, overlap_scheduler):
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        with llm:
+            task = CnnDailymail(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler",
+                          [(False, False, False), (True, False, False),
+                           (False, True, False), (False, False, True),
+                           (True, True, True)])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(4, 1, 1), (4, 1, 4),
+                                                         (2, 2, 1)],
+                             ids=["tp4", "ep4", "tp2pp2"])
+    def test_nvfp4_4gpus(self, tp_size, pp_size, ep_size, attention_dp,
+                         cuda_graph, overlap_scheduler):
+        pytorch_config = PyTorchConfig(
+            enable_overlap_scheduler=overlap_scheduler,
+            use_cuda_graph=cuda_graph)
+        llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  pytorch_backend_config=pytorch_config,
+                  enable_attention_dp=attention_dp)
+        assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+        with llm:
             task = CnnDailymail(self.MODEL_NAME)
             task.evaluate(llm)
             task = MMLU(self.MODEL_NAME)
diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index c1c6eb7100..6cfcfce3b0 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -25,11 +25,13 @@
 import time
 import urllib.request
 from functools import wraps
 from pathlib import Path
+from typing import Iterable, Sequence
 
 import defs.ci_profiler
 import psutil
 import pytest
 import yaml
+from _pytest.mark import ParameterSet
 
 from tensorrt_llm.bindings import ipc_nvls_supported
@@ -1830,6 +1832,37 @@ def star_attention_input_root(llm_root):
     return star_attention_input_root
 
 
+def parametrize_with_ids(argnames: str | Sequence[str],
+                         argvalues: Iterable[ParameterSet | Sequence[object]
+                                             | object], **kwargs):
+    if isinstance(argnames, str):
+        argname_list = [n.strip() for n in argnames.split(",")]
+    else:
+        argname_list = argnames
+
+    case_ids = []
+    for case_argvalues in argvalues:
+        if isinstance(case_argvalues, ParameterSet):
+            case_argvalues = case_argvalues.values
+        elif case_argvalues is None or isinstance(case_argvalues,
+                                                  (str, float, int, bool)):
+            case_argvalues = (case_argvalues, )
+        assert len(case_argvalues) == len(argname_list)
+
+        case_id = []
+        for name, value in zip(argname_list, case_argvalues):
+            if value is None:
+                pass
+            elif isinstance(value, bool):
+                if value:
+                    case_id.append(name)
+            else:
+                case_id.append(f"{name}={value}")
+        case_ids.append("-".join(case_id))
+
+    return pytest.mark.parametrize(argnames, argvalues, ids=case_ids, **kwargs)
+
+
 @pytest.fixture(autouse=True)
 def skip_by_device_count(request):
     "fixture for skip less device count"
diff --git a/tests/integration/test_lists/qa/examples_test_list.txt b/tests/integration/test_lists/qa/examples_test_list.txt
index 1ca4aae1a7..a49c4f4e8f 100644
--- a/tests/integration/test_lists/qa/examples_test_list.txt
+++ b/tests/integration/test_lists/qa/examples_test_list.txt
@@ -425,9 +425,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
 
 test_e2e.py::test_benchmark_sanity[bert_base] # 127.18s
 test_e2e.py::test_benchmark_sanity[gpt_350m] # 64.06s
diff --git a/tests/integration/test_lists/qa/llm_sanity_test.txt b/tests/integration/test_lists/qa/llm_sanity_test.txt
index 75173e9b7c..e9e8aa759b 100644
--- a/tests/integration/test_lists/qa/llm_sanity_test.txt
+++ b/tests/integration/test_lists/qa/llm_sanity_test.txt
@@ -130,9 +130,9 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
 accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
-accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
 
 # Pivot to Pytorch test cases.
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml
index 62a88d3016..8faf3bfdae 100644
--- a/tests/integration/test_lists/test-db/l0_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_b200.yml
@@ -15,8 +15,21 @@ l0_b200:
   tests:
   # ------------- PyTorch tests ---------------
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_auto_dtype
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[attention_dp-cuda_graph-overlap_scheduler]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
   - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
   - test_e2e.py::test_ptq_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
@@ -25,8 +38,6 @@ l0_b200:
   - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
   - unittest/_torch -k "modeling_llama"
   - unittest/_torch/modeling -k "modeling_mixtral"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and not nextn0"
  - unittest/_torch/auto_deploy/unit/singlegpu
   - unittest/_torch/speculative/test_eagle3.py
 - condition:
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 0be700f8f3..e623ec16ae 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -15,15 +15,38 @@ l0_dgx_h100:
   tests:
   # ------------- PyTorch tests ---------------
   - unittest/_torch/multi_gpu
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn0"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep1_nextn2"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn0"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp4_pp1_ep4_nextn2"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_disable_dp"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn0_enable_dp"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp2_pp2_ep1_nextn2"
   - unittest/_torch/multi_gpu_modeling -k "llama and not (tp1 and pp1)"
   - unittest/_torch/auto_deploy/unit/multigpu
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated.py::test_disaggregated_mixed[TinyLlama-1.1B-Chat-v1.0]
@@ -108,6 +131,36 @@ l0_dgx_h100:
     backend: pytorch
   tests:
   # ------------- PyTorch tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - unittest/_torch/auto_deploy/integration/test_ad_build.py
   - unittest/_torch/auto_deploy/integration/test_lm_eval.py
 - condition:
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index dee61df6da..f3262dc6da 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -18,11 +18,20 @@ l0_h100:
   - unittest/_torch -k "not (modeling or multi_gpu or auto_deploy)"
   - unittest/_torch -k "modeling_llama"
   - unittest/_torch/multi_gpu_modeling -k "llama and tp1 and pp1"
+  - unittest/_torch/multi_gpu_modeling -k "deepseek"
   - unittest/_torch/modeling -k "modeling_mixtral"
   - unittest/_torch/modeling -k "modeling_nemotron"
-  - unittest/_torch/multi_gpu_modeling -k "deepseek and tp1 and nextn0"
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
-  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-False-False]
   - test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-instruct-hf-fp8-True-True]
 - condition:
@@ -110,6 +119,30 @@ l0_h100:
   - examples/test_enc_dec.py::test_llm_enc_dec_mmlu[flan-t5-small-float32-tp:1-pp:1-nb:1-disable_fp8] # 4 mins
   - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
   - examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8] # 3 mins
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*h100*'
+      linux_distribution_name: ubuntu*
+  terms:
+    stage: post_merge
+    backend: pytorch
+  tests:
+  # ------------- PyTorch tests ---------------
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[attention_dp-cuda_graph-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-cuda_graph]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-overlap_scheduler]
+  - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index c47a16ba35..86e0c8a706 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -418,6 +418,28 @@ examples/test_gpt.py::test_starcoder_fp8_quantization_2gpu[starcoderplus] SKIP (
 unittest/_torch/auto_deploy/integration/test_lm_eval.py SKIP (https://nvbugs/5144854)
 examples/test_qwen.py::test_llm_qwen1_5_moe_plugin_single_gpu_lora[qwen1.5_moe_a2.7b_chat-Upcycled-Qwen1.5-MoE2.7B-LoRA] SKIP (https://nvbugs/5155141)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-cuda_graph] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5170160)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp2pp2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201514)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-attention_dp-cuda_graph-overlap_scheduler] SKIP (https://nvbugs/5201530)
 full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_auto_dtype SKIP (https://nvbugs/5176851)
 full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int8] SKIP (https://nvbugs/5176851)
 full:L40S/accuracy/test_cli_flow.py::TestGemma2_9BIt::test_weight_only[int4] SKIP (https://nvbugs/5176851)
diff --git a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
index c10b9009fc..df82ed6759 100644
--- a/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
+++ b/tests/unittest/_torch/multi_gpu_modeling/test_deepseek.py
@@ -10,251 +10,15 @@ from utils.util import getSMVersion
 from tensorrt_llm import SamplingParams
 from tensorrt_llm._torch import LLM
 from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
-from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig
+from tensorrt_llm.llmapi import KvCacheConfig
 from tensorrt_llm.llmapi.utils import get_total_gpu_memory
 
-# Test combinations for different scenarios
-# Each tuple contains: (tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, test_id)
-TEST_COMBINATIONS = [
-    # single-gpu test
-    # basic test
-    (1, 1, 1, 0, False, False, False,
-     "tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, True, False, False,
-     "tp1_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, False, True, False,
-     "tp1_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, False, False, True,
-     "tp1_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 0, True, True, True,
-     "tp1_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    # mtp test
-    (1, 1, 1, 2, False, False, False,
-     "tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, False, False, True,
-     "tp1_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, False, True, True,
-     "tp1_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, True, False, True,
-     "tp1_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (1, 1, 1, 2, True, True, True,
-     "tp1_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    # multi-gpu test
-    # tp4
-    (4, 1, 1, 0, False, False, False,
-     "tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, True, False, False,
-     "tp4_pp1_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, False, True, False,
-     "tp4_pp1_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, False, False, True,
-     "tp4_pp1_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 0, True, True, True,
-     "tp4_pp1_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp4, mtp2
-    (4, 1, 1, 2, False, False, False,
-     "tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, False, False, True,
-     "tp4_pp1_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, False, True, True,
-     "tp4_pp1_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, True, False, True,
-     "tp4_pp1_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 1, 2, True, True, True,
-     "tp4_pp1_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    # tp4, ep4
-    (4, 1, 4, 0, False, False, False,
-     "tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, True, False, False,
-     "tp4_pp1_ep4_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, False, True, False,
-     "tp4_pp1_ep4_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, False, False, True,
-     "tp4_pp1_ep4_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 0, True, True, True,
-     "tp4_pp1_ep4_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp4, ep4, mtp2
-    (4, 1, 4, 2, False, False, False,
-     "tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, False, False, True,
-     "tp4_pp1_ep4_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, False, True, True,
-     "tp4_pp1_ep4_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, True, False, True,
-     "tp4_pp1_ep4_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (4, 1, 4, 2, True, True, True,
-     "tp4_pp1_ep4_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp2, pp2
-    (2, 2, 1, 0, False, False, False,
-     "tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, True, False, False,
-     "tp2_pp2_ep1_nextn0_enable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, False, True, False,
-     "tp2_pp2_ep1_nextn0_disable_dp_enable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, False, False, True,
-     "tp2_pp2_ep1_nextn0_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 0, True, True, True,
-     "tp2_pp2_ep1_nextn0_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-    #tp2, pp2, mtp2
-    (2, 2, 1, 2, False, False, False,
-     "tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_disable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, False, False, True,
-     "tp2_pp2_ep1_nextn2_disable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, False, True, True,
-     "tp2_pp2_ep1_nextn2_disable_dp_enable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, True, False, True,
-     "tp2_pp2_ep1_nextn2_enable_dp_disable_cuda_graph_enable_overlap_scheduler"
-     ),
-    (2, 2, 1, 2, True, True, True,
-     "tp2_pp2_ep1_nextn2_enable_dp_enable_cuda_graph_enable_overlap_scheduler"),
-]
-
-
 def similar(a, b, threshold=0.9):
     "similar compare a and b "
     return SequenceMatcher(None, a, b).ratio() >= threshold
 
 
-@pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
-                         ids=["deepseekv3_lite"])
-@pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])
-@pytest.mark.parametrize("quant", ["bf16", "fp8", "fp4"],
-                         ids=["bf16", "fp8", "fp4"])
-@pytest.mark.parametrize("test_config", TEST_COMBINATIONS, ids=lambda x: x[-1])
-def test_deepseek(model_name, backend, quant, test_config):
-    tp_size, pp_size, ep_size, mtp_nextn, enable_dp, enable_cuda_graph, enable_overlap_scheduler, _ = test_config
-
-    model_path = {
-        "bf16": "bf16",
-        "fp8": "fp8",
-        "fp4": "nvfp4_moe_only",
-    }
-    assert quant in model_path.keys()
-
-    is_fp8 = quant == "fp8"
-    is_fp4 = quant == "fp4"
-
-    if (not enable_overlap_scheduler and enable_cuda_graph and not enable_dp
-            and mtp_nextn == 0 and ep_size == 1 and pp_size == 4
-            and tp_size == 1 and is_fp8):
-
-        pytest.skip("https://nvbugspro.nvidia.com/bug/5189673")
-
-    if ep_size > tp_size:
-        pytest.skip(
-            f"Expert parallel size {ep_size} must be less than or equal to tensor parallel size {tp_size}"
-        )
-
-    if torch.cuda.device_count() < tp_size * pp_size:
-        pytest.skip(f"Not enough GPUs available, need {tp_size * pp_size} "
-                    f"but only have {torch.cuda.device_count()}")
-
-    if is_fp8 and getSMVersion() != 90:
-        pytest.skip(f"FP8 is not supported in this SM version {getSMVersion()}")
-
-    if is_fp4 and getSMVersion() < 100:
-        pytest.skip(f"FP4 is not supported in this SM version {getSMVersion()}")
-
-    if is_fp4 and mtp_nextn > 0:
-        pytest.skip(f"FP4 checkpoint has no MTP weights")
-
-    if mtp_nextn > 0 and getSMVersion() < 90:
-        pytest.skip(f"Only Hopper and Blackwell MLA kernel can support MTP now")
-
-    if pp_size > 1 and mtp_nextn > 0:
-        pytest.skip(
-            "PP + MTP is not supported: https://nvbugspro.nvidia.com/bug/5170160"
-        )
-    if pp_size > 2 and enable_cuda_graph and enable_overlap_scheduler:
-        pytest.skip(
-            "Race condition causes incorrect output for some requests: https://nvbugspro.nvidia.com/bug/5177565"
-        )
-
-    if get_total_gpu_memory(0) < 60 * 1024**3:
-        pytest.skip(f"Not enough GPU memory to run. {get_total_gpu_memory(0)}")
-
-    prompts = [
-        "The president of the United States is",
-    ] * 32
-
-    expected_outputs = [
-        " the head of state and head of government of the",
-    ] * 32
-
-    pytorch_config = PyTorchConfig(
-        enable_overlap_scheduler=enable_overlap_scheduler,
-        use_cuda_graph=enable_cuda_graph,
-        kv_cache_dtype="auto",
-        attn_backend=backend,
-    )
-
-    mtp_config = MTPDecodingConfig(
-        num_nextn_predict_layers=mtp_nextn) if mtp_nextn > 0 else None
-
-    model_dir = str(llm_models_root() / model_name / model_path[quant])
-
-    assert Path(model_dir).exists()
-
-    llm = LLM(model=model_dir,
-              tensor_parallel_size=tp_size,
-              pipeline_parallel_size=pp_size,
-              enable_chunked_prefill=False,
-              pytorch_backend_config=pytorch_config,
-              moe_expert_parallel_size=ep_size,
-              moe_tensor_parallel_size=-1,
-              enable_attention_dp=enable_dp,
-              kv_cache_config=KvCacheConfig(enable_block_reuse=False),
-              speculative_config=mtp_config)
-
-    with llm:
-        outputs = llm.generate(
-            prompts,
-            sampling_params=SamplingParams(max_tokens=10),
-        )
-
-    assert len(outputs) == len(expected_outputs), "Output length mismatch"
-    for output, expected in zip(outputs, expected_outputs):
-        output_text = output.outputs[0].text
-        # print(output_text)
-        # print(output.outputs[0].token_ids)
-        # Limited by the kv cache length, the output length of MTP maybe
-        # a little smaller than original model.
-        expected = expected[0:len(output_text)] if mtp_nextn > 0 else expected
-        assert similar(output_text, expected,
-                       1.0), f"Expected '{expected}' but get '{output_text}'"
-
-
 @pytest.mark.parametrize("model_name", ["DeepSeek-V3-Lite"],
                          ids=["deepseekv3_lite"])
 @pytest.mark.parametrize("backend", ["TRTLLM"], ids=["trtllm"])