[None][chore] AutoDeploy: update SuperV3 checkpoints and accuracy thresholds (#11107)

Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
Signed-off-by: Gal Hubara-Agam <96368689+galagam@users.noreply.github.com>
2026-02-16 15:55:08 +08:00 · 2026-02-06 14:55:18 +02:00 · 2026-02-06 14:55:18 +02:00 · f9eed3ecc2
commit f9eed3ecc2
parent b1268e1b37
3 changed files with 15 additions and 6 deletions
--- a/tests/integration/defs/accuracy/references/gsm8k.yaml
+++ b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -358,3 +358,8 @@ MiniMaxAI/MiniMax-M2:
  - accuracy: 85
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 85
+nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
+  - accuracy: 82.363
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 82.121
--- a/tests/integration/defs/accuracy/references/mmlu.yaml
+++ b/tests/integration/defs/accuracy/references/mmlu.yaml
@@ -392,3 +392,8 @@ nvidia/Nemotron-3-Nano:
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 74.35
+nvidia/NVIDIA-Nemotron-3-Super-120B-012726:
+  - accuracy: 86.88
+  - quant_algo: FP8
+    kv_cache_quant_algo: FP8
+    accuracy: 86.12
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -319,10 +319,10 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
    Runs the model via AutoDeploy and verifies benchmark performance on MMLU and GSM8K
    """

-    MODEL_NAME = "nvidia/Nemotron-Super-V3"
-    MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
-    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
-    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"
+    MODEL_NAME = "nvidia/NVIDIA-Nemotron-3-Super-120B-012726"
+    MODEL_PATH_BF16 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-BF16-BF16KV-012726"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-FP8-FP8KV-012726"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/NVIDIA-Nemotron-3-Super-120B-NVFP4-FP8KV-012726"

    # Set minimum possible seq len + small buffer, for test speed & memory usage
    MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
@@ -371,7 +371,6 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
            task.evaluate(llm)
        print_memory_usage("After evaluation")

-    @pytest.mark.skip("Skipping FP8 test until it is supported")
    @pytest.mark.skip_less_device_memory(180000)
    @pytest.mark.parametrize("world_size", [1, 4, 8])
    def test_fp8(self, world_size):
@@ -394,7 +393,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):

    @pytest.mark.skip("Skipping FP4 test until it is supported")
    @pytest.mark.skip_less_device_memory(180000)
-    @pytest.mark.parametrize("world_size", [1, 4, 8])
+    @pytest.mark.parametrize("world_size", [4, 8])
    def test_fp4(self, world_size):
        if get_device_count() < world_size:
            pytest.skip("Not enough devices for world size, skipping test")