[TRTLLM-4932] Add CLI accuracy tests for Llama-3_3-Nemotron-Super-49B-v1 and LLM API FP8 variant (#4375)

* Add CLI TestNemotronSuper acc tests

Signed-off-by: moraxu <mguzek@nvidia.com>

* Update mmlu.yaml

Signed-off-by: moraxu <mguzek@nvidia.com>

* Update yaml files

Signed-off-by: moraxu <mguzek@nvidia.com>

* Skip FP8 test in CLI

Signed-off-by: moraxu <mguzek@nvidia.com>

* Address reviews

Signed-off-by: moraxu <mguzek@nvidia.com>

* Address review comments

Signed-off-by: moraxu <mguzek@nvidia.com>

---------

Signed-off-by: moraxu <mguzek@nvidia.com>
This commit is contained in:
Michal Guzek 2025-05-23 12:17:23 -07:00 committed by GitHub
parent 53008d3ee8
commit d2e6af2fe4
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
8 changed files with 54 additions and 3 deletions

View File

@@ -434,6 +434,9 @@ class CliFlowAccuracyTestHarness:
f"--dtype={self.dtype}",
]
if "nemotron_nas" in self.EXAMPLE_FOLDER:
convert_cmd.append("--trust_remote_code")
if self.MODEL_FORMAT == "NEMO":
convert_cmd.append(f"--nemo_ckpt_path={self.MODEL_PATH}")
else:

View File

@@ -16,3 +16,5 @@ deepseek-ai/DeepSeek-R1:
accuracy: 70.45
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
- accuracy: 44.95
- quant_algo: FP8
accuracy: 49.49

View File

@@ -65,5 +65,7 @@ Qwen3/Qwen3-30B-A3B:
accuracy: 83.43
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
- accuracy: 92.57
- quant_algo: FP8
accuracy: 92.42
nvidia/Nemotron-H-8B-Base-8K:
- accuracy: 46.20

View File

@@ -121,6 +121,8 @@ Qwen3/Qwen3-30B-A3B:
accuracy: 80.65
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
- accuracy: 79.43
- quant_algo: FP8
accuracy: 79.26
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
- accuracy: 57.97
nvidia/Nemotron-H-8B-Base-8K:

View File

@@ -200,6 +200,30 @@ class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness):
self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
# TODO: Remove the CLI tests once NIMs use PyTorch backend
class TestLlama3_3NemotronSuper49Bv1(CliFlowAccuracyTestHarness):
MODEL_NAME = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
EXAMPLE_FOLDER = "models/core/nemotron_nas"
@pytest.mark.skip_less_device(2)
def test_auto_dtype_tp2(self):
self.run(tasks=[MMLU(self.MODEL_NAME)], tp_size=2, dtype='auto')
@pytest.mark.skip(
reason="nemotron-nas scripts have to accommodate fp8 flags")
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_device_not_contain(["H100", "B200"])
def test_fp8_prequantized_tp2(self, mocker):
mocker.patch.object(
self.__class__, "MODEL_PATH",
f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
)
self.run(tasks=[MMLU(self.MODEL_NAME)],
tp_size=2,
quant_algo=QuantAlgo.FP8)
class TestPhi2(CliFlowAccuracyTestHarness):
MODEL_NAME = "microsoft/phi-2"
MODEL_PATH = f"{llm_models_root()}/phi-2"

View File

@@ -891,7 +891,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
task.evaluate(llm)
class TestNemotronSuper(LlmapiAccuracyTestHarness):
class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
MODEL_NAME = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
@@ -906,6 +906,20 @@ class TestNemotronSuper(LlmapiAccuracyTestHarness):
task.evaluate(llm,
extra_evaluator_kwargs=dict(apply_chat_template=True))
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_device_not_contain(["H100", "B200"])
def test_fp8_prequantized_tp2(self):
model_path = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
with LLM(model_path, tensor_parallel_size=2) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
task = GPQADiamond(self.MODEL_NAME)
task.evaluate(llm,
extra_evaluator_kwargs=dict(apply_chat_template=True))
class TestNemotronNano(LlmapiAccuracyTestHarness):
MODEL_NAME = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"

View File

@@ -445,7 +445,10 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype

View File

@@ -136,7 +136,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-c
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]