mirror of https://github.com/NVIDIA/TensorRT-LLM.git
Signed-off-by: Gal Hubara Agam <96368689+galagam@users.noreply.github.com>
Signed-off-by: Gal Hubara-Agam <96368689+galagam@users.noreply.github.com>

parent ae8f74b620
commit e61c942d1f
@@ -349,10 +349,10 @@ mistral/Mistral-Large-3-675B:
   - spec_dec_algo: Eagle
     accuracy: 85.30
 nvidia/Nemotron-Super-V3:
-  - accuracy: 81.07
+  - accuracy: 80.00
   - quant_algo: FP8
     kv_cache_quant_algo: FP8
-    accuracy: 78.22
+    accuracy: 77.80
   - quant_algo: NVFP4
     kv_cache_quant_algo: FP8
     accuracy: 77.56
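For reference, the accuracy harness resolves a model's expected score from entries like these by matching on quant_algo and kv_cache_quant_algo (entries without those keys are the unquantized baseline). A minimal sketch of that lookup, assuming the YAML structure above; select_threshold and the file path are hypothetical names, not the harness's actual API:

# Minimal sketch (not the harness's real API) of looking up an accuracy
# threshold from a reference file shaped like the YAML above.
import yaml

def select_threshold(ref_path, model_name, quant_algo=None, kv_cache_quant_algo=None):
    with open(ref_path) as f:
        refs = yaml.safe_load(f)
    for entry in refs.get(model_name, []):
        # Entries without quant keys match the unquantized (None) case.
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError(f"no reference entry for {model_name} / {quant_algo}")

# e.g. the FP8 + FP8-KV entry above would yield 77.80:
# select_threshold("references/mmlu.yaml", "nvidia/Nemotron-Super-V3",
#                  quant_algo="FP8", kv_cache_quant_algo="FP8")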
@@ -20,6 +20,7 @@ from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
+from tensorrt_llm.quantization import QuantAlgo
 from tensorrt_llm.sampling_params import SamplingParams

 from ..conftest import get_device_count, llm_models_root
 from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness

@@ -244,6 +245,9 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):

     MODEL_NAME = "nvidia/Nemotron-Super-V3"
+    MODEL_PATH_BF16 = f"{llm_models_root()}/Nemotron-Super-3-120B-A12B-dev"
+    MODEL_PATH_FP8 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-fp8-fp8kv"
+    MODEL_PATH_FP4 = f"{llm_models_root()}/Nemotron-SuperV3-phase1-mtp-nvfp4-fp8kv"

     # Set minimum possible seq len + small buffer, for test speed & memory usage
     MAX_SEQ_LEN = max(MMLU.MAX_INPUT_LEN + MMLU.MAX_OUTPUT_LEN,
                       GSM8K.MAX_INPUT_LEN + GSM8K.MAX_OUTPUT_LEN)
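The tests below call self.get_default_kwargs(), whose definition lies outside this diff. A plausible sketch of how MAX_SEQ_LEN could feed into it, assuming the kwargs are forwarded to AutoDeployLLM; the class name, placeholder value, and keyword names here are assumptions, not the harness's real code:

# Hypothetical sketch only; the real get_default_kwargs() is not shown
# in this diff, and the keyword names below are assumptions.
class TestNemotronSuperV3Sketch:
    MAX_SEQ_LEN = 4096  # placeholder for the max() expression above

    @classmethod
    def get_default_kwargs(cls):
        return {
            # Cap the engine's sequence length at the minimum the MMLU and
            # GSM8K benchmarks require, for test speed and memory usage.
            "max_seq_len": cls.MAX_SEQ_LEN,
        }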
@@ -289,3 +293,45 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
             task.evaluate(llm, sampling_params=sampling_params)
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP8 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [4, 8])
+    def test_fp8(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP8,
+                           tokenizer=self.MODEL_PATH_FP8,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for the FP8 model to select the accuracy threshold
+            llm.args.quant_config.quant_algo = QuantAlgo.FP8
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip("Skipping FP4 test until it is supported")
+    @pytest.mark.skip_less_device_memory(180000)
+    @pytest.mark.parametrize("world_size", [1, 4, 8])
+    def test_fp4(self, world_size):
+        if get_device_count() < world_size:
+            pytest.skip("Not enough devices for world size, skipping test")
+        kwargs = self.get_default_kwargs()
+        sampling_params = self.get_default_sampling_params()
+        with AutoDeployLLM(model=self.MODEL_PATH_FP4,
+                           tokenizer=self.MODEL_PATH_FP4,
+                           world_size=world_size,
+                           **kwargs) as llm:
+            # Manually set quant_config for the NVFP4 model; the KV cache is FP8 per the reference entry
+            llm.args.quant_config.quant_algo = QuantAlgo.NVFP4
+            llm.args.quant_config.kv_cache_quant_algo = QuantAlgo.FP8
+
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=sampling_params)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
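For context, the evaluation pattern above reduces to the LLM API's context-manager usage. A minimal standalone sketch; the model path and prompt are placeholders, not real test assets:

# Minimal standalone sketch of the AutoDeployLLM pattern used above.
# The model path and prompt are hypothetical placeholders.
from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
from tensorrt_llm.sampling_params import SamplingParams

def main():
    with AutoDeployLLM(model="/models/example-checkpoint",  # hypothetical path
                       tokenizer="/models/example-checkpoint",
                       world_size=1) as llm:
        params = SamplingParams(max_tokens=32, temperature=0.0)
        # generate() returns one RequestOutput per prompt.
        for out in llm.generate(["The capital of France is"], params):
            print(out.outputs[0].text)

if __name__ == "__main__":
    main()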
@@ -218,4 +218,6 @@ l0_dgx_b200:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
@@ -322,4 +322,6 @@ l0_dgx_h100:
   tests:
   - unittest/_torch/auto_deploy/unit/multigpu
   - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-4]
-  - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_bf16
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]
+  - accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[8]
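Each entry under tests: is a standard pytest node ID, so an individual case can be launched directly. A quick illustration via pytest's Python entry point; the working directory is assumed to be the one these relative paths resolve from:

# Run one of the node IDs listed above; pytest.main() returns an exit code.
import pytest

ret = pytest.main([
    "accuracy/test_llm_api_autodeploy.py::TestNemotronSuperV3::test_fp8[4]",
    "-q",
])
print("pytest exit code:", ret)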
@@ -192,6 +192,7 @@ triton_server/test_triton.py::test_llava[llava] SKIP (https://nvbugs/5547414)
 full:RTX/accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (https://nvbugs/5569696)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto] SKIP (https://nvbugs/5596343)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-trtllm-auto] SKIP (https://nvbugs/5596343)
 examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16] SKIP (https://nvbugs/5612313)
 triton_server/test_triton.py::test_cpp_unit_tests[cpp-unit-tests] SKIP (https://nvbugs/5619359)
+triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] SKIP (https://nvbugs/5619369)