[TRTLLM-10245][feat] Add accuracy tests for super v3 fp8 model (#10482)

Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
Wanli Jiang, 2026-01-15 10:07:02 +08:00 (committed by GitHub)
parent 0f2d61b8c6
commit 73d1840c12
5 changed files with 48 additions and 0 deletions


@@ -305,6 +305,9 @@ mistral/Mistral-Large-3-675B:
     accuracy: 86.1
 nvidia/Nemotron-Super-V3:
   - accuracy: 83.74
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.25
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 80.85
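For context, the reference file above maps each model to a list of entries keyed by quantization setup (quant_algo plus kv_cache_quant_algo). The sketch below shows how such an entry could be looked up; the file path and the helper function are illustrative assumptions, not the harness's actual API.

# Minimal sketch (not the harness's real lookup): select the reference entry
# matching a quantization setup. "references.yaml" is a placeholder path.
import yaml

def find_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        references = yaml.safe_load(f)
    for entry in references.get(model, []):
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry
    raise KeyError(f"no reference for {model} ({quant_algo}/{kv_cache_quant_algo})")

# The FP8 + FP8-KV-cache entry added above:
fp8_entry = find_reference("references.yaml", "nvidia/Nemotron-Super-V3",
                           quant_algo="FP8", kv_cache_quant_algo="FP8")
assert fp8_entry["accuracy"] == 82.25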


@@ -350,6 +350,9 @@ mistral/Mistral-Large-3-675B:
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
   - accuracy: 81.07
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 78.22
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
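The second reference file uses the same entry format with its own scores. As a rough illustration of how a measured score could be compared against such an entry, a plain-tolerance check is sketched below; the actual harness computes its own pass/fail threshold, so the fixed 1.0-point margin here is purely an assumption.

# Illustrative only: a plain-tolerance check against a reference entry.
def check_accuracy(measured: float, entry: dict, tolerance: float = 1.0) -> None:
    reference = entry["accuracy"]
    assert measured >= reference - tolerance, (
        f"measured {measured:.2f} is more than {tolerance:.2f} points "
        f"below the reference {reference:.2f}")

check_accuracy(measured=78.0, entry={"quant_algo": "FP8",
                                     "kv_cache_quant_algo": "FP8",
                                     "accuracy": 78.22})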


@@ -5359,6 +5359,44 @@ class TestNemotronV3Super(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.skip_less_device_memory(40000)
+    @pytest.mark.parametrize(
+        "attention_dp",
+        [
+            False,
+            True,
+        ],
+        ids=[
+            "attention_dp_off",
+            "attention_dp_on",
+        ],
+    )
+    def test_fp8_4gpus(self, attention_dp):
+        with LLM(
+                f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv",
+                kv_cache_config=KvCacheConfig(
+                    enable_block_reuse=False,
+                    mamba_ssm_cache_dtype="float16",
+                    free_gpu_memory_fraction=0.5,
+                ),
+                max_batch_size=32,
+                tensor_parallel_size=4,
+                moe_expert_parallel_size=4,
+                enable_attention_dp=attention_dp,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=512,
+                                                  enable_padding=True),
+                disable_overlap_scheduler=False,
+                moe_config=MoeConfig(backend="CUTLASS"),
+        ) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
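Outside the test harness, the same FP8 configuration can be exercised through the LLM API directly. The sketch below mirrors the arguments used in the new test; the import locations and the local checkpoint path are assumptions, and the test helper llm_models_root() is replaced by a plain placeholder path.

# Standalone sketch mirroring test_fp8_4gpus (paths and import locations assumed).
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig

def main() -> None:
    model_dir = "/models/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"  # placeholder path
    with LLM(
            model_dir,
            kv_cache_config=KvCacheConfig(
                enable_block_reuse=False,
                mamba_ssm_cache_dtype="float16",
                free_gpu_memory_fraction=0.5,
            ),
            max_batch_size=32,
            tensor_parallel_size=4,
            moe_expert_parallel_size=4,
            enable_attention_dp=False,  # the test parametrizes this on/off
            cuda_graph_config=CudaGraphConfig(max_batch_size=512,
                                              enable_padding=True),
            disable_overlap_scheduler=False,
            moe_config=MoeConfig(backend="CUTLASS"),
    ) as llm:
        # Quick smoke generation; the test instead runs the MMLU and GSM8K evaluators.
        outputs = llm.generate(["The capital of France is"])
        print(outputs[0].outputs[0].text)

if __name__ == "__main__":
    main()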


@@ -252,6 +252,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-True-True]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_off]
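Each line in this test list is a pytest node ID, so a single variant can also be launched locally. A minimal sketch of such an invocation is shown below; it assumes a 4-GPU machine with the FP8 checkpoint available under the models root the tests expect, and does not reproduce the CI's own launcher setup.

# Run one variant of the new FP8 accuracy test by its node ID (taken from the
# list above). Assumes 4 GPUs and the Nemotron Super V3 FP8 checkpoint locally.
import pytest

ret = pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]",
    "-v",
])
raise SystemExit(ret)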


@@ -98,6 +98,8 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
   - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]
   # ------------- Disaggregated serving tests ---------------