diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 96a9ef6b94..a4365a5ce9 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -305,6 +305,9 @@ mistral/Mistral-Large-3-675B:
     accuracy: 86.1
 nvidia/Nemotron-Super-V3:
   - accuracy: 83.74
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.25
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 80.85
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 9cbd7a9f73..208a31f52b 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -350,6 +350,9 @@ mistral/Mistral-Large-3-675B:
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
   - accuracy: 81.07
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 78.22
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
index 2b8639bbfc..9689078c9c 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -5359,6 +5359,44 @@ class TestNemotronV3Super(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
 
+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(4)
+    @pytest.mark.skip_less_device_memory(40000)
+    @pytest.mark.parametrize(
+        "attention_dp",
+        [
+            False,
+            True,
+        ],
+        ids=[
+            "attention_dp_off",
+            "attention_dp_on",
+        ],
+    )
+    def test_fp8_4gpus(self, attention_dp):
+        with LLM(
+                f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv",
+                kv_cache_config=KvCacheConfig(
+                    enable_block_reuse=False,
+                    mamba_ssm_cache_dtype="float16",
+                    free_gpu_memory_fraction=0.5,
+                ),
+                max_batch_size=32,
+                tensor_parallel_size=4,
+                moe_expert_parallel_size=4,
+                enable_attention_dp=attention_dp,
+                cuda_graph_config=CudaGraphConfig(max_batch_size=512,
+                                                  enable_padding=True),
+                disable_overlap_scheduler=False,
+                moe_config=MoeConfig(backend="CUTLASS"),
+        ) as llm:
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
+
     @skip_pre_blackwell
     @pytest.mark.skip_less_mpi_world_size(8)
     @pytest.mark.parametrize(
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index d56861e986..1928574cf3 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -252,6 +252,8 @@ accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-True-True]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]
+accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_on]
 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus[attention_dp_off]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index 91bf2542b7..93606c7f73 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -98,6 +98,8 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp2pp2-fp8kv=True-attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]
+  - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]
   - test_e2e.py::test_ptp_quickstart_advanced_bs1
   - test_e2e.py::test_ptp_quickstart_advanced_deepseek_v3_lite_4gpus_adp_balance[DeepSeek-V3-Lite-FP8-DeepSeek-V3-Lite/fp8]
   # ------------- Disaggregated serving tests ---------------
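A minimal sketch of how the two new cases can be invoked locally, assuming a 4-GPU
Hopper-or-newer machine and that the models root resolved by llm_models_root()
(typically the LLM_MODELS_ROOT environment variable) contains the
Nemotron-SuperV3-phase1-mtp-fp8-fp8kv checkpoint; the bracketed IDs come from the
ids= list in the parametrize decorator, and the QA lists reference the tests relative
to tests/integration/defs:

    # Hypothetical local run; environment setup is an assumption, not part of this diff.
    cd tests/integration/defs
    pytest "accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_off]"
    pytest "accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_fp8_4gpus[attention_dp_on]"

On machines with fewer than 4 MPI ranks, pre-Hopper GPUs, or less than 40 GB of device
memory, the skip markers on the test skip it rather than fail.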