Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[TRTLLM-4932] Add CLI accuracy tests for Llama-3_3-Nemotron-Super-49B-v1 and LLM API FP8 variant (#4375)
* Add CLI TestNemotronSuper acc tests
* Update mmlu.yaml
* Update yaml files
* Skip FP8 test in CLI
* Address reviews
* Address review comments

Signed-off-by: moraxu <mguzek@nvidia.com>
Parent: 53008d3ee8
Commit: d2e6af2fe4
@@ -434,6 +434,9 @@ class CliFlowAccuracyTestHarness:
             f"--dtype={self.dtype}",
         ]
 
+        if "nemotron_nas" in self.EXAMPLE_FOLDER:
+            convert_cmd.append("--trust_remote_code")
+
         if self.MODEL_FORMAT == "NEMO":
             convert_cmd.append(f"--nemo_ckpt_path={self.MODEL_PATH}")
         else:
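For context (not part of this diff): `--trust_remote_code` is needed because the nemotron-nas checkpoints ship custom modeling code, so any Hugging Face-based loading path has to opt in explicitly. A minimal sketch of the same opt-in outside the converter, assuming the public checkpoint id rather than the local `llm_models_root()` path used by the tests:

```python
# Illustrative sketch only; the converter script handles this internally.
from transformers import AutoConfig

# Loading the config without trust_remote_code=True is expected to fail for
# this architecture because it is defined by code bundled with the checkpoint.
cfg = AutoConfig.from_pretrained(
    "nvidia/Llama-3_3-Nemotron-Super-49B-v1",
    trust_remote_code=True,
)
print(type(cfg).__name__)
```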
@@ -16,3 +16,5 @@ deepseek-ai/DeepSeek-R1:
     accuracy: 70.45
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 44.95
+  - quant_algo: FP8
+    accuracy: 49.49
@@ -65,5 +65,7 @@ Qwen3/Qwen3-30B-A3B:
     accuracy: 83.43
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 92.57
+  - quant_algo: FP8
+    accuracy: 92.42
 nvidia/Nemotron-H-8B-Base-8K:
   - accuracy: 46.20
@@ -121,6 +121,8 @@ Qwen3/Qwen3-30B-A3B:
     accuracy: 80.65
 nvidia/Llama-3_3-Nemotron-Super-49B-v1:
   - accuracy: 79.43
+  - quant_algo: FP8
+    accuracy: 79.26
 nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
   - accuracy: 57.97
 nvidia/Nemotron-H-8B-Base-8K:
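The YAML hunks above add FP8 reference accuracies alongside the existing entries. A rough sketch of how a per-quant-algo reference could be read from one of these files (illustrative only; the helper name and lookup logic are assumptions, not the harness's real code):

```python
# Illustrative lookup over the accuracy-reference YAML format shown above.
import yaml

def reference_accuracy(yaml_text, model, quant_algo=None):
    entries = yaml.safe_load(yaml_text)[model]
    for entry in entries:
        # Entries without a quant_algo key act as the default (auto dtype) reference.
        if entry.get("quant_algo") == quant_algo:
            return entry["accuracy"]
    raise KeyError(f"no reference for {model} with quant_algo={quant_algo}")

sample = """
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 79.43
  - quant_algo: FP8
    accuracy: 79.26
"""
model = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
assert reference_accuracy(sample, model) == 79.43         # default entry
assert reference_accuracy(sample, model, "FP8") == 79.26  # FP8 variant
```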
@@ -200,6 +200,30 @@ class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness):
         self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+# TODO: Remove the CLI tests once NIMs use PyTorch backend
+class TestLlama3_3NemotronSuper49Bv1(CliFlowAccuracyTestHarness):
+    MODEL_NAME = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
+    MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
+    EXAMPLE_FOLDER = "models/core/nemotron_nas"
+
+    @pytest.mark.skip_less_device(2)
+    def test_auto_dtype_tp2(self):
+        self.run(tasks=[MMLU(self.MODEL_NAME)], tp_size=2, dtype='auto')
+
+    @pytest.mark.skip(
+        reason="nemotron-nas scripts have to accommodate fp8 flags")
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_device_not_contain(["H100", "B200"])
+    def test_fp8_prequantized_tp2(self, mocker):
+        mocker.patch.object(
+            self.__class__, "MODEL_PATH",
+            f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
+        )
+        self.run(tasks=[MMLU(self.MODEL_NAME)],
+                 tp_size=2,
+                 quant_algo=QuantAlgo.FP8)
+
+
 class TestPhi2(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/phi-2"
     MODEL_PATH = f"{llm_models_root()}/phi-2"
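As a usage note, the new CLI-flow cases can be selected by node id; the ids below come from the test lists further down in this diff (any extra CI wrapper flags may differ):

```python
# Hypothetical local invocation; CI normally drives these through its own wrapper.
import pytest

pytest.main([
    "accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2",
    "accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2",
    "-v",
])
```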
@@ -891,7 +891,7 @@ class TestNemotronNas(LlmapiAccuracyTestHarness):
             task.evaluate(llm)
 
 
-class TestNemotronSuper(LlmapiAccuracyTestHarness):
+class TestLlama3_3NemotronSuper49Bv1(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Llama-3_3-Nemotron-Super-49B-v1"
     MODEL_PATH = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1"
 
@@ -906,6 +906,20 @@ class TestNemotronSuper(LlmapiAccuracyTestHarness):
             task.evaluate(llm,
                           extra_evaluator_kwargs=dict(apply_chat_template=True))
 
+    @pytest.mark.skip_less_device(2)
+    @pytest.mark.skip_device_not_contain(["H100", "B200"])
+    def test_fp8_prequantized_tp2(self):
+        model_path = f"{llm_models_root()}/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"
+        with LLM(model_path, tensor_parallel_size=2) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GPQADiamond(self.MODEL_NAME)
+            task.evaluate(llm,
+                          extra_evaluator_kwargs=dict(apply_chat_template=True))
+
 
 class TestNemotronNano(LlmapiAccuracyTestHarness):
     MODEL_NAME = "nvidia/Llama-3.1-Nemotron-Nano-8B-v1"
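Stripped of the test harness, the new FP8 pre-quantized check boils down to the following sketch; the checkpoint path is a stand-in for the `llm_models_root()` location, and the `generate` call is only a smoke test added here for illustration:

```python
# Standalone sketch of the LLM API FP8 check performed by
# TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 (path assumed).
from tensorrt_llm import LLM
from tensorrt_llm.quantization import QuantAlgo

model_path = "/models/nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8"

with LLM(model_path, tensor_parallel_size=2) as llm:
    # The pre-quantized checkpoint should be picked up as FP8 automatically.
    assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
    for output in llm.generate(["The capital of France is"]):
        print(output.outputs[0].text)
```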
@@ -445,7 +445,10 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
-accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
+accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
@@ -136,7 +136,8 @@ accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-c
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
-accuracy/test_llm_api_pytorch.py::TestNemotronSuper::test_auto_dtype_tp2
+accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
+accuracy/test_cli_flow.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2
 accuracy/test_llm_api_pytorch.py::TestNemotronNano::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]