diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
index 4313ba15f3..d5836220d0 100644
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -358,3 +358,8 @@ MiniMaxAI/MiniMax-M2:
   - accuracy: 85
   - quant_algo: FP8_BLOCK_SCALES
     accuracy: 85
+nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
+  - accuracy: 82.363
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.121
diff --git a/tests/integration/defs/accuracy/references/mmlu.yaml b/tests/integration/defs/accuracy/references/mmlu.yaml
index 51198b62fe..6698920605 100644
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -392,3 +392,8 @@ nvidia/Nemotron-3-Nano:
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
     accuracy: 74.35
+nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
+  - accuracy: 86.88
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 86.12
diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 9e0a4f23f8..3ee8eb2d62 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -319,10 +319,10 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
     Runs the model via AutoDeploy and verifies benchmark performance on MMLU and GSM8K
     """
 
-    MODEL_NAME = "nvidia/Nemotron-Super-V3"
-    MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
-    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
-    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
+    MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-012726"
+    MODEL_PATH_BF16 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-012726"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-FP8-FP8KV-012726"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-NVFP4-FP8KV-012726"
 
     # Set minimum possible seq len + small buffer, for test speed & memory usage
     MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
@@ -371,7 +371,6 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
         task.evaluate(llm)
         print_memory_usage("After evaluation")
 
-    @pytest.mark.skip("Skipping FP8 test until it is supported")
     @pytest.mark.skip_less_device_memory(180000)
     @pytest.mark.parametrize("world_size", [1, 4, 8])
     def test_fp8(self, world_size):
@@ -394,7 +393,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
 
     @pytest.mark.skip("Skipping FP4 test until it is supported")
     @pytest.mark.skip_less_device_memory(180000)
-    @pytest.mark.parametrize("world_size", [1, 4, 8])
+    @pytest.mark.parametrize("world_size", [4, 8])
     def test_fp4(self, world_size):
         if get_device_count() < world_size:
             pytest.skip("Not enough devices for world size, skipping test")