diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 208a31f52b..6504712b58 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -349,10 +349,10 @@ mistral/Mistral-Large-3-675B:
   - spec_dec_algo: Eagle
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
-  - accuracy: 81.07
+  - accuracy: 80.00
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 78.22
+    accuracy: 77.80
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 49d6b46762..92730768fe 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -20,6 +20,7 @@ from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
 from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams
 
+from ..conftest import get_device_count, llm_models_root
 from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
@@ -244,6 +245,9 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Nemotron-Super-V3"
     MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
+
     # Set minimum possible seq len + small buffer, for test speed & memory usage
     MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
                       GSM8K.MAX_INPUT_LEN + GSM8K.MAX_OUTPUT_LEN)
@@ -289,3 +293,45 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
         task.evaluate(llm, sampling_params=sampling_params)
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP8 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [4, 8])
+    def test_fp8(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP8,
+                           tokenizer=self.MODEL_PATH_FP8,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for FP8 model to get the accuracy threshold
+            llm.args.quant_config.quant_algo = QuantAlgo.FP8
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP4 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [1, 4, 8])
+    def test_fp4(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP4,
+                           tokenizer=self.MODEL_PATH_FP4,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for FP4 model to get the accuracy threshold;
+            # the checkpoint uses an FP8 KV cache ("nvfp4-fp8kv"), matching the
+            # NVFP4/FP8 reference entry in mmlu.yaml
+            llm.args.quant_config.quant_algo = QuantAlgo.NVFP4
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
index 493f4d354f..a780c7e4b0 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -218,4 +218,6 @@ l0_dgx_b200:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
index aaa329d5d4..aa823a0450 100644
--- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -322,4 +322,6 @@ l0_dgx_h100:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index ddde25ba75..4e8c4bab79 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -192,6 +192,7 @@ triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
+accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] SKIP (https://nvbugs/5596343)
 examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
 triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
 triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)