Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[TRTLLM-4932] Add CLI accuracy tests for Llama-3_3-Nemotron-Super-49B-v1 and LLM API FP8 variant (#4375)
* Add CLI TestNemotronSuper acc tests
* Update mmlu.yaml
* Update yaml files
* Skip FP8 test in CLI
* Address reviews
* Address review comments

Signed-off-by: moraxu <mguzek@nvidia.com>
Parent: 53008d3ee8
Commit: d2e6af2fe4
@@ -434,6 +434,9 @@ class CliFlowAccuracyTestHarness:
             f"--dtype={self.dtype}",
         ]
 
+        if "nemotron_nas" in self.EXAMPLE_FOLDER:
+            convert_cmd.append("--trust_remote_code")
+
         if self.MODEL_FORMAT == "NEMO":
             convert_cmd.append(f"--nemo_ckpt_path={self.MODEL_PATH}")
         else:
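For context (not part of this diff): `--trust_remote_code` is needed because the nemotron-nas checkpoints ship custom modeling code, so any Hugging Face-based loading path has to opt in explicitly. A minimal sketch of the same opt-in outside the converter, assuming the public checkpoint id rather than the local `llm_models_root()` path used by the tests:

```python
# Illustrative sketch only; the converter script handles this internally.
from transformers import AutoConfig

# Loading the config without trust_remote_code=True is expected to fail for
# this architecture because it is defined by code bundled with the checkpoint.
cfg = AutoConfig.from_pretrained(
    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    trust_remote_code=True,
)
print(type(cfg).__name__)
```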
@@ -16,3 +16,5 @@ deepseek-ai/DeepSeek-R1:
     accuracy: 70.45
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 44.95
+  - quant_algo: FP8
+    accuracy: 49.49
@@ -65,5 +65,7 @@ Qwen3/Qwen3-30B-A3B:
     accuracy: 83.43
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
+  - quant_algo: FP8
+    accuracy: 92.42
 nvidia/Nemotron-H-8B-Base-8K:
   - accuracy: 46.20
@@ -121,6 +121,8 @@ Qwen3/Qwen3-30B-A3B:
     accuracy: 80.65
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
+  - quant_algo: FP8
+    accuracy: 79.26
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 57.97
 nvidia/Nemotron-H-8B-Base-8K:
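The YAML hunks above add FP8 reference accuracies alongside the existing entries. A rough sketch of how a per-quant-algo reference could be read from one of these files (illustrative only; the helper name and lookup logic are assumptions, not the harness's real code):

```python
# Illustrative lookup over the accuracy-reference YAML format shown above.
import yaml

def reference_accuracy(yaml_text, model, quant_algo=None):
    entries = yaml.safe_load(yaml_text)[model]
    for entry in entries:
        # Entries without a quant_algo key act as the default (auto dtype) reference.
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

sample = """
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 79.43
  - quant_algo: FP8
    accuracy: 79.26
"""
model = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
assert reference_accuracy(sample, model) == 79.43         # default entry
assert reference_accuracy(sample, model, "FP8") == 79.26  # FP8 variant
```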
@@ -200,6 +200,30 @@ class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness):
         self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+# TODO: Remove the CLI tests once NIMs use PyTorch backend
+class TestLlama3_3NemotronSuper49Bv1(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
+    MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
+    EXAMPLE_FOLDER = "models/core/nemotron_nas"
+
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        self.run(tasks=[MMLU(self.MODEL_NAME)], tp_size=2, dtype='auto')
+
+    @pytest.mark.skip(
+        reason="nemotron-nas scripts have to accommodate fp8 flags")
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_device_not_contain(["H100", "B200"])
+    def test_fp8_prequantized_tp2(self, mocker):
+        mocker.patch.object(
+            self.__class__, "MODEL_PATH",
+            f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
+        )
+        self.run(tasks=[MMLU(self.MODEL_NAME)],
+                 tp_size=2,
+                 quant_algo=QuantAlgo.FP8)
+
+
 class TestPhi2(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/phi-2"
     MODEL_PATH = f"{llm_models_root()}/phi-2"
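As a usage note, the new CLI-flow cases can be selected by node id; the ids below come from the test lists further down in this diff (any extra CI wrapper flags may differ):

```python
# Hypothetical local invocation; CI normally drives these through its own wrapper.
import pytest

pytest.main([
    "accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2",
    "accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2",
    "-v",
])
```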
@@ -891,7 +891,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
             task.evaluate(llm)
 
 
-class TestNemotronSuper(LlmapiAccuracyTestHarness):
+class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
     MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
 
@@ -906,6 +906,20 @@ class TestNemotronSuper(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_device_not_contain(["H100", "B200"])
+    def test_fp8_prequantized_tp2(self):
+        model_path = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
+        with LLM(model_path, tensor_parallel_size=2) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GPQADiamond(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=dict(apply_chat_template=True))
+
 
 class TestNemotronNano(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
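Stripped of the test harness, the new FP8 pre-quantized check boils down to the following sketch; the checkpoint path is a stand-in for the `llm_models_root()` location, and the `generate` call is only a smoke test added here for illustration:

```python
# Standalone sketch of the LLM API FP8 check performed by
# TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 (path assumed).
from tensorrt_llm import LLM
from tensorrt_llm.quantization import QuantAlgo

model_path = "/models/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"

with LLM(model_path, tensor_parallel_size=2) as llm:
    # The pre-quantized checkpoint should be picked up as FP8 automatically.
    assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
    for output in llm.generate(["The capital of France is"]):
        print(output.outputs[0].text)
```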
@@ -445,7 +445,10 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
-accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
+accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
@@ -136,7 +136,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-c
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
-accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]