# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import pytest

from tensorrt_llm.llmapi import (EagleDecodingConfig, LookaheadDecodingConfig,
                                 MedusaDecodingConfig)
from tensorrt_llm.quantization import QuantAlgo

from ..conftest import (llm_models_root, parametrize_with_ids, skip_no_nvls,
                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
                        skip_pre_hopper)
from .accuracy_core import (MMLU, CliFlowAccuracyTestHarness, CnnDailymail,
                            Humaneval, PassKeyRetrieval64k,
                            PassKeyRetrieval128k, SlimPajama6B, ZeroScrolls)


class TestGpt2(CliFlowAccuracyTestHarness):
    MODEL_NAME = "gpt2"
    MODEL_PATH = f"{llm_models_root()}/gpt2"
    EXAMPLE_FOLDER = "gpt"

    def test_auto_dtype(self):
        # float16
        self.run(dtype='auto')

    def test_gemm_plugin(self):
        self.run(extra_build_args=["--gemm_plugin=auto"])

    def test_attention_ootb(self):
        self.run(extra_build_args=[
            "--gpt_attention_plugin=disable", "--context_fmha=disable",
            "--paged_kv_cache=disable", "--remove_input_padding=disable"
        ])

    def test_context_fmha_disabled(self):
        self.run(extra_build_args=["--context_fmha=disable"])

    def test_context_fmha_fp32_acc(self):
        self.run(extra_summarize_args=["--enable_context_fmha_fp32_acc"])

    @pytest.mark.parametrize("precision", ["int8", "int4"])
    def test_weight_only(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo)

    def test_int8_kv_cache(self):
        self.run(kv_cache_quant_algo=QuantAlgo.INT8)

    @skip_post_blackwell
    @parametrize_with_ids("per_token,per_channel", [(False, False),
                                                    (True, True)])
    def test_smooth_quant(self, per_token: bool, per_channel: bool):
        if per_token:
            if per_channel:
                quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
            else:
                quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PER_TOKEN_PLUGIN
        else:
            if per_channel:
                quant_algo = QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TENSOR_PLUGIN
            else:
                quant_algo = QuantAlgo.W8A8_SQ_PER_TENSOR_PLUGIN
        self.run(quant_algo=quant_algo)

    def test_beam_search(self):
        self.run(extra_acc_spec="beam_width=4",
                 extra_build_args=["--max_beam_width=4"],
                 extra_summarize_args=["--num_beams=4", "--length_penalty=2.0"])

    def test_beam_search_large(self, mocker):
        mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
        self.run(extra_acc_spec="beam_width=256",
                 extra_build_args=["--max_beam_width=256"],
                 extra_summarize_args=["--num_beams=256"])

    def test_weight_streaming_ootb(self):
        self.run(extra_build_args=[
            "--gpt_attention_plugin=disable", "--weight_streaming",
            "--remove_input_padding=disable", "--paged_kv_cache=disable"
        ],
                 extra_summarize_args=[
                     "--gpu_weights_percent=0.5", "--use_py_session"
                 ])

    def test_weight_streaming_plugin(self):
        self.run(extra_build_args=["--weight_streaming"],
                 extra_summarize_args=["--gpu_weights_percent=0"])

    def test_cuda_graph(self):
        self.run(extra_summarize_args=["--cuda_graph_mode"])


class TestGpt2Medium(CliFlowAccuracyTestHarness):
    MODEL_NAME = "gpt2-medium"
    MODEL_PATH = f"{llm_models_root()}/gpt2-medium"
f"{llm_models_root()}/gpt2-medium" EXAMPLE_FOLDER = "gpt" def test_auto_dtype(self): self.run(dtype='auto') @skip_pre_ada def test_fp8(self): self.run(quant_algo=QuantAlgo.FP8) @skip_pre_ada def test_fp8_lm_head(self): self.run(quant_algo=QuantAlgo.FP8, extra_convert_args=["--quantize_lm_head"]) class TestSantacoder(CliFlowAccuracyTestHarness): MODEL_NAME = "bigcode/santacoder" MODEL_PATH = f"{llm_models_root()}/santacoder" EXAMPLE_FOLDER = "gpt" def test_auto_dtype(self): # float16 self.run(tasks=[Humaneval(self.MODEL_NAME)], dtype='auto') class TestStarcoder2_3B(CliFlowAccuracyTestHarness): MODEL_NAME = "bigcode/starcoder2-3b" MODEL_PATH = f"{llm_models_root()}/starcoder2-3b" EXAMPLE_FOLDER = "gpt" def test_auto_dtype(self): self.run(tasks=[Humaneval(self.MODEL_NAME)], dtype='auto') class TestStarcoder2_15B(CliFlowAccuracyTestHarness): MODEL_NAME = "bigcode/starcoder2-15b" MODEL_PATH = f"{llm_models_root()}/starcoder2-model" EXAMPLE_FOLDER = "gpt" @skip_post_blackwell def test_smooth_quant_ootb(self): self.run(tasks=[Humaneval(self.MODEL_NAME)], quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL) class TestGptNext(CliFlowAccuracyTestHarness): MODEL_NAME = "gpt-next" MODEL_PATH = f"{llm_models_root()}/gpt-next/megatron_converted_843m_tp1_pp1.nemo" MODEL_FORMAT = "NEMO" EXAMPLE_FOLDER = "gpt" def test_auto_dtype(self): # bfloat16 self.run(dtype='auto') class TestMinitron4BBase(CliFlowAccuracyTestHarness): MODEL_NAME = "nvidia/Minitron-4B-Base" MODEL_PATH = f"{llm_models_root()}/nemotron/Minitron-4B-Base" EXAMPLE_FOLDER = "gpt" def test_auto_dtype(self): self.run(tasks=[Humaneval(self.MODEL_NAME)], dtype='auto') @skip_pre_ada def test_fp8(self, mocker): # Accuracy regression when using large batch size mocker.patch.object(Humaneval, "MAX_BATCH_SIZE", 1) self.run(tasks=[Humaneval(self.MODEL_NAME)], quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness): MODEL_NAME = "nvidia/Nemotron-Mini-4B-Instruct" MODEL_PATH = f"{llm_models_root()}/nemotron/Nemotron-Mini-4B-Instruct" EXAMPLE_FOLDER = "gpt" @skip_pre_ada def test_fp8_prequantized(self, mocker): mocker.patch.object( self.__class__, "MODEL_PATH", f"{llm_models_root()}/nemotron/nemotron-mini-4b-instruct_vfp8-fp8-bf16-export" ) self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) class TestPhi2(CliFlowAccuracyTestHarness): MODEL_NAME = "microsoft/phi-2" MODEL_PATH = f"{llm_models_root()}/phi-2" EXAMPLE_FOLDER = "phi" @skip_post_blackwell def test_auto_dtype(self): self.run(dtype='auto') @skip_post_blackwell @pytest.mark.skip_less_device(2) def test_tp2(self): self.run(tp_size=2) class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct" MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-4k-instruct" EXAMPLE_FOLDER = "phi" def test_auto_dtype(self): self.run(dtype='auto') class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct" MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-128k-instruct" EXAMPLE_FOLDER = "phi" def test_auto_dtype(self): self.run(dtype='auto') class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-3-small-8k-instruct" MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-8k-instruct" EXAMPLE_FOLDER = "phi" def test_auto_dtype(self): self.run(dtype='auto') class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-3-small-128k-instruct" MODEL_PATH = 
f"{llm_models_root()}/Phi-3/Phi-3-small-128k-instruct" EXAMPLE_FOLDER = "phi" def test_auto_dtype(self): self.run(dtype='auto') class TestPhi3_5MiniInstruct(CliFlowAccuracyTestHarness): MODEL_NAME = "microsoft/Phi-3.5-mini-instruct" MODEL_PATH = f"{llm_models_root()}/Phi-3.5/Phi-3.5-mini-instruct" EXAMPLE_FOLDER = "phi" def test_auto_dtype(self): self.run(dtype='auto') # Long sequence length test: # Model FP16 7B + 32K tokens in KV cache = 14 * 1024 MB + 32K * 0.5 MB = 30720 MB + scratch memory @pytest.mark.skip_less_device_memory(40000) class TestLongAlpaca7B(CliFlowAccuracyTestHarness): MODEL_NAME = "Yukang/LongAlpaca-7B" MODEL_PATH = f"{llm_models_root()}/LongAlpaca-7B" EXAMPLE_FOLDER = "llama" def test_auto_dtype(self): self.run(tasks=[ZeroScrolls(self.MODEL_NAME)]) def test_multiblock_aggressive(self): # MMHA + aggressive Multi_block_mode (export TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG=1) self.run(tasks=[ZeroScrolls(self.MODEL_NAME)], extra_build_args=["--gemm_plugin=auto"], env={ "TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG": "1", "TRTLLM_MMHA_BLOCKS_PER_SEQUENCE": "32" }) class TestMamba130M(CliFlowAccuracyTestHarness): MODEL_NAME = "state-spaces/mamba-130m-hf" MODEL_PATH = f"{llm_models_root()}/mamba/mamba-130m-hf" EXAMPLE_FOLDER = "mamba" def test_auto_dtype(self): self.run(dtype='auto') class TestVicuna7B(CliFlowAccuracyTestHarness): MODEL_NAME = "lmsys/vicuna-7b-v1.3" MODEL_PATH = f"{llm_models_root()}/vicuna-7b-v1.3" EXAMPLE_FOLDER = "llama" MEDUSA_MODEL_NAME = "FasterDecoding/medusa-vicuna-7b-v1.3" MEDUSA_MODEL_PATH = f"{llm_models_root()}/medusa-vicuna-7b-v1.3" EAGLE_MODEL_NAME = "yuhuili/EAGLE-Vicuna-7B-v1.3" EAGLE_MODEL_PATH = f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3" def test_lookahead(self, mocker): mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8) self.run(spec_dec_algo=LookaheadDecodingConfig.decoding_type, extra_build_args=[ "--max_draft_len=83", "--speculative_decoding_mode=lookahead_decoding" ], extra_summarize_args=["--lookahead_config=[7,7,7]"]) @parametrize_with_ids("cuda_graph", [False, True]) def test_medusa(self, cuda_graph, mocker): mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa") mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8) extra_summarize_args = [ "--medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], [0, 0, 0, 0], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], [0, 0, 8], [0, 0, 9], [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [0, 0, 0, 1], [1, 6], [0, 7, 0]]" ] if cuda_graph: extra_summarize_args.append("--cuda_graph_mode") self.run(dtype="float16", spec_dec_algo=MedusaDecodingConfig.decoding_type, extra_convert_args=[ f"--medusa_model_dir={self.MEDUSA_MODEL_PATH}", "--num_medusa_heads=4" ], extra_build_args=["--speculative_decoding_mode=medusa"], extra_summarize_args=extra_summarize_args) @skip_post_blackwell @parametrize_with_ids("cuda_graph,chunked_context,typical_acceptance", [(False, False, False), (True, False, False), (True, True, False), (True, False, True)]) def test_eagle(self, cuda_graph, chunked_context, typical_acceptance, mocker): mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "eagle") mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8) extra_summarize_args = [ 
"--eagle_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], [0, 0, 0, 0], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], [0, 0, 8], [0, 0, 9], [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [0, 0, 0, 1], [1, 6], [0, 7, 0]]" ] if cuda_graph: extra_summarize_args.append("--cuda_graph_mode") if chunked_context: extra_summarize_args.append("--enable_chunked_context") if typical_acceptance: extra_summarize_args.extend( ["--eagle_posterior_threshold=0.09", "--temperature=0.7"]) self.run(spec_dec_algo=EagleDecodingConfig.decoding_type, extra_convert_args=[ f"--eagle_model_dir={self.EAGLE_MODEL_PATH}", "--max_draft_len=63", "--num_eagle_layers=4", "--max_non_leaves_per_layer=10" ], extra_build_args=[ "--speculative_decoding_mode=eagle", "--max_draft_len=63" ], extra_summarize_args=extra_summarize_args) class TestLlama7B(CliFlowAccuracyTestHarness): MODEL_NAME = "llama-7b-hf" MODEL_PATH = f"{llm_models_root()}/llama-models/llama-7b-hf" EXAMPLE_FOLDER = "llama" def test_auto_dtype(self): self.run(dtype='auto') def test_beam_search(self): self.run(extra_acc_spec="beam_width=5", extra_build_args=["--max_beam_width=5"], extra_summarize_args=["--num_beams=5"]) @skip_post_blackwell def test_int4_gptq(self): self.run( quant_algo=QuantAlgo.W4A16_GPTQ, extra_convert_args=[ f"--quant_ckpt_path={llm_models_root()}/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors" ]) def test_streamingllm(self): self.run(extra_acc_spec="streamingllm", extra_build_args=["--streamingllm=enable"], extra_summarize_args=[ "--max_attention_window_size=2048", "--sink_token_length=4" ]) def test_manage_weights(self): self.run(extra_build_args=["--fast_build"]) class TestLlama2_7B(CliFlowAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-2-7b-hf" MODEL_PATH = f"{llm_models_root()}/llama-models-v2/llama-v2-7b-hf" EXAMPLE_FOLDER = "llama" def test_auto_dtype(self): self.run(dtype='auto') @skip_post_blackwell def test_smooth_quant(self): self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN) @skip_pre_ada def test_fp8(self): self.run(tasks=[CnnDailymail(self.MODEL_NAME), MMLU(self.MODEL_NAME)], quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) @skip_pre_ada @pytest.mark.skip_less_device(2) @pytest.mark.parametrize("tp_size,pp_size,cp_size", [(2, 1, 1), (1, 2, 1), (1, 1, 2)], ids=["tp2", "pp2", "cp2"]) def test_fp8_2gpus(self, tp_size, pp_size, cp_size): self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8, tp_size=tp_size, pp_size=pp_size, cp_size=cp_size) @skip_pre_ada @pytest.mark.skip_less_device(4) def test_tp2cp2(self): self.run(tp_size=2, cp_size=2) @skip_pre_ada def test_fp8_gemm_plugin(self): self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8, extra_build_args=["--gemm_plugin=fp8"]) @skip_pre_ada def test_fp8_gemm_swiglu_plugin(self): self.run( quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8, extra_build_args=["--gemm_plugin=fp8", "--gemm_swiglu_plugin=fp8"]) @skip_pre_ada def test_fp8_low_latency_gemm_plugin(self): self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8, extra_build_args=["--low_latency_gemm_plugin=fp8"]) @pytest.mark.skip_less_device(2) @skip_post_blackwell def 
    def test_smooth_quant_ootb_tp2(self):
        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL, tp_size=2)

    @pytest.mark.skip_less_device(2)
    @skip_post_blackwell
    def test_int4_awq_tp2(self):
        self.run(quant_algo=QuantAlgo.W4A16_AWQ, tp_size=2)

    @pytest.mark.skip_less_device(2)
    @skip_post_blackwell
    def test_int4_awq_prequantized_tp2(self, mocker):
        mocker.patch.object(
            self.__class__, "MODEL_PATH",
            f"{llm_models_root()}/llama-models-v2/Llama-2-7B-AWQ")
        self.run(quant_algo=QuantAlgo.W4A16_AWQ, tp_size=2)

    @pytest.mark.skip_less_device(2)
    @skip_post_blackwell
    def test_int4_gptq_prequantized_tp2(self, mocker):
        mocker.patch.object(
            self.__class__, "MODEL_PATH",
            f"{llm_models_root()}/llama-models-v2/Llama-2-7B-GPTQ")
        self.run(quant_algo=QuantAlgo.W4A16_GPTQ, tp_size=2)

    def test_weight_sparsity(self):
        self.run(extra_build_args=["--weight_sparsity"])


class TestTinyLlama1_1BChat(CliFlowAccuracyTestHarness):
    MODEL_NAME = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    MODEL_PATH = f"{llm_models_root()}/llama-models-v2/TinyLlama-1.1B-Chat-v1.0"
    EXAMPLE_FOLDER = "llama"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    def test_float32(self):
        self.run(dtype='float32')

    @skip_post_blackwell
    @pytest.mark.parametrize("precision", ["int8", "int4"])
    def test_weight_only(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo)

    @skip_post_blackwell
    @pytest.mark.parametrize("precision", ["int8", "int4"])
    def test_weight_only_int8_kv_cache(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo, kv_cache_quant_algo=QuantAlgo.INT8)

    @skip_post_blackwell
    @pytest.mark.parametrize("precision", ["int8", "int4"])
    def test_weight_only_manage_weights(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo, extra_build_args=["--fast_build"])

    @skip_pre_ada
    def test_fp8(self):
        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

    @pytest.mark.skip_less_device(4)
    def test_pp4(self):
        # Test num_hidden_layers (22) not divisible by pp_size (4)
        self.run(extra_acc_spec="pp_size=4", pp_size=4)


class TestLlama3_8BInstruct(CliFlowAccuracyTestHarness):
    MODEL_NAME = "meta-llama/Meta-Llama-3-8B-Instruct"
    MODEL_PATH = f"{llm_models_root()}/llama-models-v3/llama-v3-8b-instruct-hf"
    EXAMPLE_FOLDER = "llama"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @skip_pre_ada
    def test_fp8(self):
        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

    def test_int8_gptq(self):
        self.run(
            quant_algo=QuantAlgo.W8A16_GPTQ,
            extra_convert_args=[
                f"--quant_ckpt_path={llm_models_root()}/int8-quantized-gptq/llama-3-8b-8bit-gs64-gptq.safetensors"
            ])

    @skip_pre_blackwell
    def test_nvfp4(self):
        self.run(tasks=[MMLU(self.MODEL_NAME)],
                 quant_algo=QuantAlgo.NVFP4,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 extra_build_args=["--gemm_plugin=disable"])

    @pytest.mark.skip(
        reason="Broken by modelopt. Will be fixed in next release")
Will be fixed in next release") @skip_pre_blackwell @pytest.mark.parametrize("fuse_fp4_quant", [False, True], ids=["disable_fused_quant", "enable_fused_quant"]) @pytest.mark.parametrize( "norm_quant_fusion", [False, True], ids=["disable_norm_quant_fusion", "enable_norm_quant_fusion"]) def test_nvfp4_gemm_plugin(self, fuse_fp4_quant: bool, norm_quant_fusion: bool): extra_build_args = ["--gemm_plugin=nvfp4"] if fuse_fp4_quant: extra_build_args.extend([ "--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable", "--fuse_fp4_quant=enable" ]) if norm_quant_fusion: extra_build_args.append("--norm_quant_fusion=enable") self.run(tasks=[MMLU(self.MODEL_NAME)], quant_algo=QuantAlgo.NVFP4, kv_cache_quant_algo=QuantAlgo.FP8, extra_build_args=extra_build_args) class TestLlama3_8BInstructGradient1048k(CliFlowAccuracyTestHarness): MODEL_NAME = "gradientai/Llama-3-8B-Instruct-Gradient-1048k" MODEL_PATH = f"{llm_models_root()}/llama-models-v3/Llama-3-8B-Instruct-Gradient-1048k" EXAMPLE_FOLDER = "llama" @pytest.mark.skip_less_device_memory(60000) def test_long_context(self): self.run(tasks=[PassKeyRetrieval128k(self.MODEL_NAME)]) @pytest.mark.skip_less_device_memory(60000) def test_long_context_ppl(self): self.run(tasks=[SlimPajama6B(self.MODEL_NAME)], extra_build_args=["--gather_context_logits"]) class TestLlama3_1_8B(CliFlowAccuracyTestHarness): MODEL_NAME = "meta-llama/Llama-3.1-8B" MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Meta-Llama-3.1-8B" EXAMPLE_FOLDER = "llama" def test_auto_dtype(self): self.run(dtype='auto') @skip_post_blackwell def test_smooth_quant(self): self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN) @skip_pre_ada def test_fp8(self): self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8) @skip_pre_ada @skip_post_blackwell def test_fp8_rowwise(self): self.run(tasks=[CnnDailymail(self.MODEL_NAME), MMLU(self.MODEL_NAME)], quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN) @skip_pre_ada @skip_post_blackwell def test_fp8_rowwise_meta_recipe(self): self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN, extra_acc_spec="meta_recipe", extra_convert_args=["--use_meta_fp8_rowwise_recipe"]) @pytest.mark.skip_less_device(4) @pytest.mark.parametrize( "gemm_allreduce", [False, pytest.param(True, marks=skip_no_nvls)], ids=["disable_gemm_allreduce_plugin", "enable_gemm_allreduce_plugin"]) def test_tp4(self, gemm_allreduce: bool): extra_build_args = None if gemm_allreduce: extra_build_args = ["--gemm_allreduce_plugin=bfloat16"] self.run( tasks=[PassKeyRetrieval64k(self.MODEL_NAME), MMLU(self.MODEL_NAME)], tp_size=4, extra_build_args=extra_build_args) @skip_pre_ada @skip_post_blackwell @pytest.mark.skip_less_device(4) @pytest.mark.parametrize( "gemm_allreduce", [False, pytest.param(True, marks=skip_no_nvls)], ids=["disable_gemm_allreduce_plugin", "enable_gemm_allreduce_plugin"]) def test_fp8_rowwise_tp4(self, gemm_allreduce: bool): extra_build_args = None if gemm_allreduce: extra_build_args = ["--gemm_allreduce_plugin=bfloat16"] self.run( tasks=[PassKeyRetrieval64k(self.MODEL_NAME), MMLU(self.MODEL_NAME)], quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN, tp_size=4, extra_build_args=extra_build_args) @skip_pre_ada def test_autoq(self): self.run(tasks=[CnnDailymail(self.MODEL_NAME), MMLU(self.MODEL_NAME)], quant_algo=QuantAlgo.MIXED_PRECISION, extra_acc_spec= "autoq_format=int4_awq,fp8,w4a8_awq;auto_quantize_bits=5.8", extra_convert_args=[ "--autoq_format=int4_awq,fp8,w4a8_awq", "--auto_quantize_bits=5.8", "--calib_size=4", "--batch_size=4" ]) class 
class TestLlama3_1_8BInstruct(CliFlowAccuracyTestHarness):
    MODEL_NAME = "meta-llama/Llama-3.1-8B-Instruct"
    MODEL_PATH = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct"
    EXAMPLE_FOLDER = "llama"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @skip_pre_ada
    def test_fp8_prequantized(self, mocker):
        mocker.patch.object(
            self.__class__, "MODEL_PATH",
            f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8")
        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

    @skip_pre_ada
    @skip_post_blackwell
    def test_medusa_fp8_prequantized(self, mocker):
        # nvidia/Llama-3.1-8B-Medusa-FP8
        mocker.patch.object(self.__class__, "MODEL_PATH",
                            f"{llm_models_root()}/llama3.1-medusa-8b-hf_v0.1")
        mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa")
        mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
        extra_summarize_args = [
            "--medusa_choices=[[0], [0, 0], [1], [0, 1], [2], [0, 0, 0], [1, 0], [0, 2], [3], [0, 3], [4], [0, 4], [2, 0], [0, 5], [0, 0, 1], [5], [0, 6], [6], [0, 7], [0, 1, 0], [1, 1], [7], [0, 8], [0, 0, 2], [3, 0], [0, 9], [8], [9], [1, 0, 0], [0, 2, 0], [1, 2], [0, 0, 3], [4, 0], [2, 1], [0, 0, 4], [0, 0, 5], [0, 1, 1], [0, 0, 6], [0, 3, 0], [5, 0], [1, 3], [0, 0, 7], [0, 0, 8], [0, 0, 9], [6, 0], [0, 4, 0], [1, 4], [7, 0], [0, 1, 2], [2, 0, 0], [3, 1], [2, 2], [8, 0], [0, 5, 0], [1, 5], [1, 0, 1], [0, 2, 1], [9, 0], [0, 6, 0], [1, 6], [0, 7, 0]]"
        ]
        self.run(dtype="float16",
                 spec_dec_algo=MedusaDecodingConfig.decoding_type,
                 extra_build_args=["--speculative_decoding_mode=medusa"],
                 extra_summarize_args=extra_summarize_args)


class TestLlama3_2_1B(CliFlowAccuracyTestHarness):
    MODEL_NAME = "meta-llama/Llama-3.2-1B"
    MODEL_PATH = f"{llm_models_root()}/llama-3.2-models/Llama-3.2-1B"
    EXAMPLE_FOLDER = "llama"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @skip_post_blackwell
    def test_smooth_quant(self):
        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN)

    @skip_post_blackwell
    def test_smooth_quant_ootb(self):
        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL)

    @skip_post_blackwell
    def test_smooth_quant_ootb_manage_weights(self):
        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL,
                 extra_build_args=["--fast_build"])

    @skip_post_blackwell
    def test_int4_awq(self):
        self.run(quant_algo=QuantAlgo.W4A16_AWQ)

    @skip_post_blackwell
    def test_int4_awq_int8_kv_cache(self):
        self.run(quant_algo=QuantAlgo.W4A16_AWQ,
                 kv_cache_quant_algo=QuantAlgo.INT8)

    @skip_post_blackwell
    def test_int4_awq_manage_weights(self):
        self.run(quant_algo=QuantAlgo.W4A16_AWQ,
                 extra_build_args=["--fast_build"])

    @skip_pre_ada
    def test_fp8(self):
        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

    @skip_pre_ada
    @pytest.mark.skip_less_device(2)
    @pytest.mark.parametrize(
        "fp8_context_fmha", [False, True],
        ids=["disable_fp8_context_fmha", "enable_fp8_context_fmha"])
    @pytest.mark.parametrize(
        "reduce_fusion", [False, True],
        ids=["disable_reduce_fusion", "enable_reduce_fusion"])
    def test_fp8_tp2(self, fp8_context_fmha: bool, reduce_fusion: bool):
        if fp8_context_fmha:
            extra_build_args = [
                "--use_fp8_context_fmha=enable",
                "--use_paged_context_fmha=enable"
            ]
        else:
            extra_build_args = [
                "--use_fp8_context_fmha=disable",
                "--use_paged_context_fmha=disable"
            ]
        if reduce_fusion:
            extra_build_args.append("--reduce_fusion=enable")
        else:
            extra_build_args.append("--reduce_fusion=disable")
        self.run(quant_algo=QuantAlgo.FP8,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 tp_size=2,
                 extra_build_args=extra_build_args)

    @skip_pre_ada
    @pytest.mark.skip_less_device(2)
    def test_fp8_pp2(self):
        self.run(quant_algo=QuantAlgo.FP8,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 pp_size=2)

    @skip_pre_ada
    @skip_post_blackwell
    def test_fp8_rowwise(self):
        self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN)

    @skip_pre_ada
    @skip_post_blackwell
    def test_fp8_rowwise_meta_recipe(self):
        self.run(quant_algo=QuantAlgo.FP8_PER_CHANNEL_PER_TOKEN,
                 extra_acc_spec="meta_recipe",
                 extra_convert_args=["--use_meta_fp8_rowwise_recipe"])

    @pytest.mark.parametrize("max_gpu_percent", [0.1, 1.0])
    def test_weight_streaming(self, max_gpu_percent: float):
        self.run(extra_build_args=["--weight_streaming"],
                 extra_summarize_args=["--gpu_weights_percent=0"])
        for gpu_percent in [0.1, 0.5, 0.9, 1]:
            if gpu_percent > max_gpu_percent:
                break
            self.extra_summarize_args = [f"--gpu_weights_percent={gpu_percent}"]
            self.evaluate()

    def test_cyclic_kv_cache(self):
        self.run(extra_acc_spec="max_attention_window_size=960",
                 extra_summarize_args=["--max_attention_window_size=960"])

    @pytest.mark.skip(reason="https://nvbugspro.nvidia.com/bug/5166352")
    def test_cyclic_kv_cache_beam_search(self):
        self.run(extra_acc_spec="max_attention_window_size=960;beam_width=4",
                 extra_build_args=["--max_beam_width=4"],
                 extra_summarize_args=[
                     "--max_attention_window_size=960", "--num_beams=4"
                 ])


class TestMixtral8x7B(CliFlowAccuracyTestHarness):
    MODEL_NAME = "mistralai/Mixtral-8x7B-v0.1"
    MODEL_PATH = f"{llm_models_root()}/Mixtral-8x7B-v0.1"
    EXAMPLE_FOLDER = "llama"

    @pytest.mark.skip_less_device(2)
    @pytest.mark.skip_less_device_memory(80000)
    def test_tp2(self):
        self.run(dtype='auto', tp_size=2)

    @skip_pre_ada
    @pytest.mark.skip_less_device(2)
    @pytest.mark.skip_less_device_memory(80000)
    def test_fp8_tp2(self):
        self.run(quant_algo=QuantAlgo.FP8,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 tp_size=2)

    @skip_pre_ada
    @pytest.mark.skip_less_device(4)
    @pytest.mark.skip_less_device_memory(40000)
    def test_fp8_tp2pp2(self):
        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
                        MMLU(self.MODEL_NAME)],
                 quant_algo=QuantAlgo.FP8,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 tp_size=2,
                 pp_size=2)

    @skip_pre_ada
    @pytest.mark.skip_less_device(4)
    @pytest.mark.skip_less_device_memory(40000)
    def test_fp8_tp2pp2_manage_weights(self):
        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
                        MMLU(self.MODEL_NAME)],
                 quant_algo=QuantAlgo.FP8,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 tp_size=2,
                 pp_size=2,
                 extra_build_args=["--fast_build"])

    @skip_pre_blackwell
    def test_nvfp4_prequantized(self, mocker):
        mocker.patch.object(
            self.__class__, "MODEL_PATH",
            f"{llm_models_root()}/nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1")
        self.run(tasks=[MMLU(self.MODEL_NAME)],
                 quant_algo=QuantAlgo.NVFP4,
                 kv_cache_quant_algo=QuantAlgo.FP8)


class TestGemma2B(CliFlowAccuracyTestHarness):
    MODEL_NAME = "google/gemma-2b"
    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-2b"
    EXAMPLE_FOLDER = "gemma"

    def test_auto_dtype(self):
        self.run(dtype='auto', extra_convert_args=["--ckpt-type=hf"])

    @pytest.mark.parametrize("precision", ["int8"])
    def test_weight_only(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

    @skip_post_blackwell
    def test_smooth_quant(self):
        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
                 extra_convert_args=[
                     "--ckpt-type=hf",
                     f"--tokenizer_dir={self.MODEL_PATH}/tokenizer.model"
                 ])

    @skip_pre_ada
    def test_fp8(self):
        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

    @skip_post_blackwell
    def test_int4_awq(self):
        self.run(quant_algo=QuantAlgo.W4A16_AWQ)


@pytest.mark.skip_less_device_memory(40000)
class TestGemma7B(CliFlowAccuracyTestHarness):
    MODEL_NAME = "google/gemma-7b"
    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-7b"
    EXAMPLE_FOLDER = "gemma"

    def test_auto_dtype(self):
        self.run(dtype='auto', extra_convert_args=["--ckpt-type=hf"])

    @pytest.mark.parametrize("precision", ["int8"])
    def test_weight_only(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

    @skip_post_blackwell
    @pytest.mark.skip_less_device_memory(50000)
    def test_smooth_quant(self):
        self.run(quant_algo=QuantAlgo.W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN,
                 extra_convert_args=[
                     "--ckpt-type=hf",
                     f"--tokenizer_dir={self.MODEL_PATH}/tokenizer.model"
                 ])

    @skip_pre_ada
    def test_fp8(self):
        self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)

    @skip_post_blackwell
    def test_int4_awq(self):
        self.run(quant_algo=QuantAlgo.W4A16_AWQ)


@pytest.mark.skip_less_device_memory(40000)
class TestGemma2_9BIt(CliFlowAccuracyTestHarness):
    MODEL_NAME = "google/gemma-2-9b-it"
    MODEL_PATH = f"{llm_models_root()}/gemma/gemma-2-9b-it"
    EXAMPLE_FOLDER = "gemma"

    def test_auto_dtype(self):
        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
                        MMLU(self.MODEL_NAME)],
                 dtype='auto',
                 extra_convert_args=["--ckpt-type=hf"])

    @skip_post_blackwell
    @pytest.mark.parametrize("precision", ["int8", "int4"])
    def test_weight_only(self, precision: str):
        quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
        self.run(quant_algo=quant_algo, extra_convert_args=["--ckpt-type=hf"])

    @skip_pre_hopper
    def test_fp8(self):
        self.run(quant_algo=QuantAlgo.FP8,
                 kv_cache_quant_algo=QuantAlgo.FP8,
                 extra_convert_args=["--device_map=sequential"])


class TestQwen7BChat(CliFlowAccuracyTestHarness):
    MODEL_NAME = "Qwen/Qwen-7B-Chat"
    MODEL_PATH = f"{llm_models_root()}/Qwen-7B-Chat"
    EXAMPLE_FOLDER = "qwen"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    def test_weight_only(self):
        self.run(quant_algo=QuantAlgo.W8A16)

    @skip_post_blackwell
    def test_int4_gptq_prequantized(self, mocker):
        mocker.patch.object(self.__class__, "MODEL_PATH",
                            f"{llm_models_root()}/Qwen-7B-Chat-Int4")
        self.run(quant_algo=QuantAlgo.W4A16_GPTQ)


@pytest.mark.skip_less_device_memory(40000)
class TestQwen1_5MoeA2_7BChat(CliFlowAccuracyTestHarness):
    MODEL_NAME = "Qwen/Qwen1.5-MoE-A2.7B-Chat"
    MODEL_PATH = f"{llm_models_root()}/Qwen1.5-MoE-A2.7B-Chat"
    EXAMPLE_FOLDER = "qwen"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @pytest.mark.skip(reason="https://nvbugs/5100102")
    def test_weight_only(self):
        self.run(quant_algo=QuantAlgo.W8A16)


class TestQwen2_0_5BInstruct(CliFlowAccuracyTestHarness):
    MODEL_NAME = "Qwen/Qwen2-0.5B-Instruct"
    MODEL_PATH = f"{llm_models_root()}/Qwen2-0.5B-Instruct"
    EXAMPLE_FOLDER = "qwen"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @skip_post_blackwell
    def test_weight_only(self):
        self.run(quant_algo=QuantAlgo.W8A16)

    @skip_pre_ada
    def test_fp8(self):
        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
                        MMLU(self.MODEL_NAME)],
                 quant_algo=QuantAlgo.FP8)


class TestQwen2_7BInstruct(CliFlowAccuracyTestHarness):
    MODEL_NAME = "Qwen/Qwen2-7B-Instruct"
    MODEL_PATH = f"{llm_models_root()}/Qwen2-7B-Instruct"
    EXAMPLE_FOLDER = "qwen"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @skip_post_blackwell
    def test_weight_only(self):
        self.run(quant_algo=QuantAlgo.W8A16)

    @skip_post_blackwell
    def test_int4_awq_prequantized(self, mocker):
        mocker.patch.object(self.__class__, "MODEL_PATH",
                            f"{llm_models_root()}/Qwen2-7B-Instruct-AWQ")
        self.run(quant_algo=QuantAlgo.W4A16_AWQ)

@pytest.mark.skip_less_device_memory(40000)
class TestQwen2_57B_A14B(CliFlowAccuracyTestHarness):
    MODEL_NAME = "Qwen/Qwen2-57B-A14B"
    MODEL_PATH = f"{llm_models_root()}/Qwen2-57B-A14B"
    EXAMPLE_FOLDER = "qwen"

    @pytest.mark.skip(reason="https://nvbugs/5063469")
    @pytest.mark.skip_less_device(4)
    def test_tp4(self):
        self.run(tp_size=4)

    @pytest.mark.skip(reason="https://nvbugs/5063469")
    @pytest.mark.skip_less_device(4)
    def test_tp2pp2(self):
        self.run(tp_size=2, pp_size=2)


class TestQwen2_5_1_5BInstruct(CliFlowAccuracyTestHarness):
    MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"
    MODEL_PATH = f"{llm_models_root()}/Qwen2.5-1.5B-Instruct"
    EXAMPLE_FOLDER = "qwen"

    def test_auto_dtype(self):
        self.run(dtype='auto')

    @skip_post_blackwell
    def test_weight_only(self):
        self.run(quant_algo=QuantAlgo.W8A16)

    @skip_pre_ada
    def test_fp8(self):
        self.run(tasks=[CnnDailymail(self.MODEL_NAME),
                        MMLU(self.MODEL_NAME)],
                 quant_algo=QuantAlgo.FP8)