Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[TRTLLM-10245][feat] Add accuracy tests for super v3 fp8 model (#10482)
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
commit 73d1840c12 (parent 0f2d61b8c6)
@@ -305,6 +305,9 @@ mistral/Mistral-Large-3-675B:
     accuracy: 86.1
 nvidia/Nemotron-Super-V3:
   - accuracy: 83.74
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.25
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 80.85
@@ -350,6 +350,9 @@ mistral/Mistral-Large-3-675B:
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
   - accuracy: 81.07
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 78.22
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
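Both reference tables use the same per-model schema: a list of entries, each optionally keyed by quant_algo and kv_cache_quant_algo, carrying the expected score for that configuration. A minimal sketch of how such a table can be resolved at test time (lookup_reference and the path argument are illustrative, not the repo's actual loader):

# Hypothetical helper resolving a reference score from YAML shaped like the
# hunks above; the function name and file path are illustrative only.
import yaml

def lookup_reference(path, model, quant_algo=None, kv_cache_quant_algo=None):
    with open(path) as f:
        table = yaml.safe_load(f)
    for entry in table.get(model, []):
        # Entries without quant keys are the unquantized baseline; .get()
        # returns None for them, so they match quant_algo=None lookups.
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"{model}: no entry for {quant_algo}/{kv_cache_quant_algo}")

# Against the first hunk above:
#   lookup_reference(path, "nvidia/Nemotron-Super-V3",
#                    quant_algo="FP8", kv_cache_quant_algo="FP8")  -> 82.25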
@@ -5359,6 +5359,44 @@ class TestNemotronV3Super(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.skip_less_device_memory(40000)
+    @pytest.mark.parametrize(
+        "attention_dp",
+        [
+            False,
+            True,
+        ],
+        ids=[
+            "attention_dp_off",
+            "attention_dp_on",
+        ],
+    )
+    def test_fp8_4gpus(self, attention_dp):
+        with LLM(
+                f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv",
+                kv_cache_config=KvCacheConfig(
+                    enable_block_reuse=False,
+                    mamba_ssm_cache_dtype="float16",
+                    free_gpu_memory_fraction=0.5,
+                ),
+                max_batch_size=32,
+                tensor_parallel_size=4,
+                moe_expert_parallel_size=4,
+                enable_attention_dp=attention_dp,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=512,
+                                                  enable_padding=True),
+                disable_overlap_scheduler=False,
+                moe_config=MoeConfig(backend="CUTLASS"),
+        ) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
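The new test drives the same MMLU and GSM8K evaluations through the LLM API with FP8 weights and an FP8 KV cache on 4 GPUs, toggling attention data parallelism via the parametrized flag. For poking at the same configuration outside the harness, a minimal standalone sketch (the checkpoint path is a placeholder, and the llmapi import locations are assumed from the test file's usage):

# Standalone sketch of the configuration under test; the checkpoint path is
# a placeholder and the import locations are assumptions.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig

if __name__ == "__main__":
    with LLM(
            "/models/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv",  # placeholder
            kv_cache_config=KvCacheConfig(
                enable_block_reuse=False,          # no block reuse for accuracy runs
                mamba_ssm_cache_dtype="float16",   # dtype of the Mamba SSM cache
                free_gpu_memory_fraction=0.5,
            ),
            max_batch_size=32,
            tensor_parallel_size=4,                # needs a 4-GPU world
            moe_expert_parallel_size=4,
            enable_attention_dp=False,             # the knob the test parametrizes
            cuda_graph_config=CudaGraphConfig(max_batch_size=512,
                                              enable_padding=True),
            moe_config=MoeConfig(backend="CUTLASS"),
    ) as llm:
        for out in llm.generate(["The capital of France is"]):
            print(out.outputs[0].text)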
@@ -252,6 +252,8 @@
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-True-True]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_off]
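The bracketed suffixes in these node IDs are generated from the ids= argument of pytest.mark.parametrize in the test above; a self-contained illustration:

# How the parametrize ids above become the bracketed suffixes in test lists.
import pytest

@pytest.mark.parametrize("attention_dp", [False, True],
                         ids=["attention_dp_off", "attention_dp_on"])
def test_fp8_4gpus(attention_dp):
    # Collected as test_fp8_4gpus[attention_dp_off] and
    # test_fp8_4gpus[attention_dp_on], matching the list entries above.
    assert isinstance(attention_dp, bool)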
@@ -98,6 +98,8 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
   - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]
   # ------------- Disaggregated serving tests ---------------
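To run just the newly scheduled cases by node ID, without going through the test-db machinery, pytest can be invoked programmatically; a minimal sketch, with the file path assumed relative to the integration-test root:

# Select the two new FP8 cases directly by pytest node ID.
# Note: per their skip_less_mpi_world_size(4) marker, the tests skip
# themselves unless launched with a 4-rank MPI world.
import sys
import pytest

if __name__ == "__main__":
    sys.exit(pytest.main([
        "accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]",
        "accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]",
        "-v",
    ]))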