# Accuracy reference thresholds (YAML).
# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-04 10:11:47 +08:00.
# Commit note: adds some level of FP8 support to Mistral Small 3.1 by:
#   * disabling quantization for the vision sub-model, since `modelopt` does not
#     support quantizing it (yet).
#   * extending existing accuracy tests to use a modelopt-produced FP8 checkpoint.
# Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
# Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
meta-llama/Llama-2-7b-hf:
  - accuracy: 46.69
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 46.76
meta-llama/Meta-Llama-3-8B-Instruct:
  - accuracy: 67.74
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.47
meta-llama/Llama-3.1-8B:
  - accuracy: 66.06
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.16
  - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
    accuracy: 65.55
  - quant_algo: MIXED_PRECISION
    extra_acc_spec: autoq_format=int4_awq,fp8,w4a8_awq;auto_quantize_bits=5.8
    accuracy: 64.99
meta-llama/Llama-3.1-8B-Instruct:
  - accuracy: 68.17
  - spec_dec_algo: Eagle
    accuracy: 68.20
  - spec_dec_algo: NGram
    accuracy: 68.17
  - quant_algo: FP8
    accuracy: 67.93
  - quant_algo: FP8
    extra_acc_spec: temperature=0.8,top_p=0.95
    accuracy: 64.62
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 67.87
  - quant_algo: FP8
    kv_cache_quant_algo: NVFP4
    accuracy: 66.45
meta-llama/Llama-3.2-1B:
  - quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
    accuracy: 32.72
  - quant_algo: W8A8_SQ_PER_CHANNEL
    accuracy: 32.07
  - quant_algo: W4A16_AWQ
    accuracy: 30.56
  - quant_algo: W4A16_AWQ
    kv_cache_quant_algo: INT8
    accuracy: 31.29
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 31.02
  - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
    accuracy: 33.97
  - quant_algo: FP8_PER_CHANNEL_PER_TOKEN
    extra_acc_spec: meta_recipe
    accuracy: 33.87
  - extra_acc_spec: max_attention_window_size=960
    accuracy: 32.82
meta-llama/Llama-3.2-3B:
  - accuracy: 57.92
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 60.60
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 81.31
  - quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 81.31
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 78.78
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 80.40
  - quant_algo: FP8
    accuracy: 80.40
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
  - accuracy: 86.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 86.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 86.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 86.40
meta-llama/Llama-4-Scout-17B-16E-Instruct:
  - accuracy: 80.00
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 79.60
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 78.58
mistralai/Mistral-7B-v0.1:
  - accuracy: 66
mistralai/Mistral-7B-Instruct-v0.3:
  - quant_algo: W4A16
    accuracy: 59.23
  - quant_algo: W4A16_AWQ
    accuracy: 61.06
  - quant_algo: W4A8_AWQ
    accuracy: 60.04
mistralai/Mixtral-8x7B-v0.1:
  - accuracy: 71.35
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 71.27
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 69.64
mistralai/Mixtral-8x7B-Instruct-v0.1:
  - accuracy: 68.0
mistralai/Mixtral-8x22B-v0.1:
  - quant_algo: FP8
    accuracy: 77.63
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
  - accuracy: 81.7
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 81.1
google/gemma-2-9b-it:
  - accuracy: 73.05
google/gemma-3-1b-it:
  - accuracy: 39.0
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 37.5
google/gemma-3-27b-it:
  - accuracy: 77.80
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 76.80
Qwen/Qwen2-0.5B-Instruct:
  - accuracy: 45.30
  - quant_algo: FP8
    accuracy: 45.03
Qwen/Qwen2.5-0.5B-Instruct:
  - accuracy: 49.59
  - quant_algo: FP8
    accuracy: 48.59
Qwen/Qwen2.5-1.5B-Instruct:
  - accuracy: 61.45
  - quant_algo: FP8
    accuracy: 61.43
Qwen/Qwen2.5-7B-Instruct:
  - accuracy: 75.32
  - quant_algo: FP8
    accuracy: 75.32
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 75.32
Qwen/QwQ-32B:
  - accuracy: 82.60
deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 71.40
  - quant_algo: NVFP4
    accuracy: 70.60
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 70.60
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 71.27
  - spec_dec_algo: MTP
    accuracy: 71.39
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 71.29
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 87.33
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 87.33
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 87.33
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 87.33
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 87.573
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 87.573
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 87.573
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 87.573
Qwen3/Qwen3-8B:
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 72.70
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 72.70
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 76.12
  - accuracy: 76.12
  - spec_dec_algo: Eagle
    accuracy: 76.12
Qwen3/Qwen3-30B-A3B:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 79.53
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 80.65
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 80.65
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 79.78
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 79.78
  - quant_algo: W4A16_MXFP4
    accuracy: 79.80
Qwen3/Qwen3-235B-A22B:
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 86
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 86
  - spec_dec_algo: Eagle
    quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 86
moonshotai/Kimi-K2-Instruct:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 87.65
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 79.43
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 79.26
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 57.97
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 57.12
nvidia/Nemotron-H-8B-Base-8K:
  - accuracy: 69.590
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 69.180
nvidia/Nemotron-H-47B-Base-8K:
  - accuracy: 83.26
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 82.68
nvidia/Nemotron-H-56B-Base-8K:
  - accuracy: 83.82
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.82
microsoft/Phi-4-mini-instruct:
  - accuracy: 68.98
  - quant_algo: FP8
    accuracy: 68.30
bigcode/starcoder2-7b:
  - accuracy: 41.35
  - quant_algo: FP8
    accuracy: 41.35
mistralai/Codestral-22B-v0.1:
  - accuracy: 61.72
  - quant_algo: FP8
    accuracy: 61.72
# Created a dummy accuracy to track tp_size=2 for phi4-mini model.
# TODO: update once https://nvbugs/5393849 is fixed.
microsoft/Phi-4-mini-instruct-tp2:
  - accuracy: 0.0
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 83.70
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.36
kanana-1.5-2.1b-instruct-2505:
  - accuracy: 56.89
speakleash/Bielik-11B-v2.2-Instruct:
  - accuracy: 64.47
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 64.36
mistralai/Ministral-8B-Instruct-2410:
  - accuracy: 66.35
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 65.96
microsoft/Phi-4-multimodal-instruct:
  - accuracy: 69.69
microsoft/Phi-4-multimodal-instruct-long-rope:
  - accuracy: 65.98
LGAI-EXAONE/EXAONE-4.0-32B:
  - accuracy: 78.52
GPT-OSS/BF16:
  - accuracy: 77.50
GPT-OSS/MXFP4:
  - accuracy: 75.50
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 75.50
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 75.50
mistralai/Mistral-Nemo-12b-Base:
  - accuracy: 69.66
  - quant_algo: FP8
    accuracy: 69.66