meta-llama/Llama-2-7b-hf:
  - accuracy: 46.69
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 46.76
meta-llama/Meta-Llama-3-8B-Instruct:
  - accuracy: 67.74
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.47
meta-llama/Llama-3.1-8B:
  - accuracy: 66.06
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.16
  - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
    accuracy: 65.55
  - quant_algo: MIXED_PRECISION
    extra_acc_spec: autoq_format=int4_awq,fp8,w4a8_awq;auto_quantize_bits=5.8
    accuracy: 64.99
meta-llama/Llama-3.1-8B-Instruct:
  - accuracy: 68.17
  - spec_dec_algo: Eagle
    accuracy: 68.20
  - spec_dec_algo: NGram
    accuracy: 68.17
  - quant_algo: FP8
    accuracy: 67.93
  - quant_algo: FP8
    extra_acc_spec: temperature=0.8,top_p=0.95
    accuracy: 64.62
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 67.87
  - quant_algo: FP8
    kv_cache_quant_algo: NVFP4
    accuracy: 66.45
meta-llama/Llama-3.2-1B:
  - quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
    accuracy: 32.72
  - quant_algo: W8A8_SQ_PER_CHANNEL
    accuracy: 32.07
  - quant_algo: W4A16_AWQ
    accuracy: 30.56
  - quant_algo: W4A16_AWQ
    kv_cache_quant_algo: INT8
    accuracy: 31.29
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 31.02
  - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
    accuracy: 33.97
  - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
    extra_acc_spec: meta_recipe
    accuracy: 33.87
  - extra_acc_spec: max_attention_window_size=960
    accuracy: 32.82
meta-llama/Llama-3.2-3B:
  - accuracy: 57.92
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 60.60
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 81.31
  - quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 81.31
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 78.78
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 80.40
  - quant_algo: FP8
    accuracy: 80.40
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
  - accuracy: 86.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 86.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 86.40
meta-llama/Llama-4-Scout-17B-16E-Instruct:
  - accuracy: 80.00
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 79.60
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 78.58
mistralai/Mistral-7B-v0.1:
  - accuracy: 66
mistralai/Mistral-7B-Instruct-v0.3:
  - quant_algo: W4A16
    accuracy: 59.23
  - quant_algo: W4A16_AWQ
    accuracy: 61.06
  - quant_algo: W4A8_AWQ
    accuracy: 60.04
mistralai/Mixtral-8x7B-v0.1:
  - accuracy: 71.35
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 71.27
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 69.64
mistralai/Mixtral-8x7B-Instruct-v0.1:
  - accuracy: 68.0
mistralai/Mixtral-8x22B-v0.1:
  - quant_algo: FP8
    accuracy: 77.63
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
  - accuracy: 81.7
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 81.1
google/gemma-2-9b-it:
  - accuracy: 73.05
google/gemma-3-1b-it:
  - accuracy: 39.0
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 37.5
google/gemma-3-27b-it:
  - accuracy: 77.80
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 76.80
Qwen/Qwen2-0.5B-Instruct:
  - accuracy: 45.30
  - quant_algo: FP8
    accuracy: 45.03
Qwen/Qwen2.5-0.5B-Instruct:
  - accuracy: 49.59
  - quant_algo: FP8
    accuracy: 48.59
Qwen/Qwen2.5-1.5B-Instruct:
  - accuracy: 61.45
  - quant_algo: FP8
    accuracy: 61.43
Qwen/Qwen2.5-7B-Instruct:
  - accuracy: 75.32
  - quant_algo: FP8
    accuracy: 75.32
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 75.32
Qwen/QwQ-32B:
  - accuracy: 82.60
deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 71.40
  - quant_algo: NVFP4
    accuracy: 70.60
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 70.60
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 71.27
  - spec_dec_algo: MTP
    accuracy: 71.39
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 71.29
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 87.33
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 87.33
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 87.33
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 87.33
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 87.573
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 87.573
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 87.573
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 87.573
Qwen3/Qwen3-8B:
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 72.70
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 72.70
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 76.12
  - accuracy: 76.12
  - spec_dec_algo: Eagle
    accuracy: 76.12
Qwen3/Qwen3-30B-A3B:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 79.53
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 80.65
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 80.65
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 79.78
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 79.78
  - quant_algo: W4A16_MXFP4
    accuracy: 79.80
Qwen3/Qwen3-235B-A22B:
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 86
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 86
  - spec_dec_algo: Eagle
    quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 86
moonshotai/Kimi-K2-Instruct:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 87.65
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 79.43
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 79.26
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 57.97
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 57.12
nvidia/Nemotron-H-8B-Base-8K:
  - accuracy: 69.590
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 69.180
nvidia/Nemotron-H-47B-Base-8K:
  - accuracy: 83.26
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 82.68
nvidia/Nemotron-H-56B-Base-8K:
  - accuracy: 83.82
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.82
microsoft/Phi-4-mini-instruct:
  - accuracy: 68.98
  - quant_algo: FP8
    accuracy: 68.30
bigcode/starcoder2-7b:
  - accuracy: 41.35
  - quant_algo: FP8
    accuracy: 41.35
mistralai/Codestral-22B-v0.1:
  - accuracy: 61.72
  - quant_algo: FP8
    accuracy: 61.72
# Created a dummy accuracy to track tp_size=2 for phi4-mini model.
# TODO: update once https://nvbugs/5393849 is fixed.
microsoft/Phi-4-mini-instruct-tp2:
  - accuracy: 0.0
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 83.70
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.36
kanana-1.5-2.1b-instruct-2505:
  - accuracy: 56.89
speakleash/Bielik-11B-v2.2-Instruct:
  - accuracy: 64.47
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 64.36
mistralai/Ministral-8B-Instruct-2410:
  - accuracy: 66.35
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 65.96
microsoft/Phi-4-multimodal-instruct:
  - accuracy: 69.69
microsoft/Phi-4-multimodal-instruct-long-rope:
  - accuracy: 65.98
LGAI-EXAONE/EXAONE-4.0-32B:
  - accuracy: 78.52
GPT-OSS/BF16:
  - accuracy: 77.50
GPT-OSS/MXFP4:
  - accuracy: 75.50
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 75.50
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 75.50
mistralai/Mistral-Nemo-12b-Base:
  - accuracy: 69.66
  - quant_algo: FP8
    accuracy: 69.66