mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
316 lines
7.4 KiB
YAML
316 lines
7.4 KiB
YAML
meta-llama/Llama-3.1-8B-Instruct:
|
|
- accuracy: 74.20
|
|
- spec_dec_algo: NGram
|
|
accuracy: 74.20
|
|
- spec_dec_algo: Eagle
|
|
accuracy: 74.20
|
|
- quant_algo: FP8
|
|
accuracy: 74.30
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 72.85
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: NVFP4
|
|
accuracy: 69.75
|
|
meta-llama/Llama-3.3-70B-Instruct:
|
|
- accuracy: 83.78
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 87.33
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.30
|
|
- quant_algo: FP8
|
|
accuracy: 90.30
|
|
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
|
|
- accuracy: 92.20
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 92.20
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
spec_dec_algo: Eagle
|
|
accuracy: 92.20
|
|
meta-llama/Llama-4-Scout-17B-16E-Instruct:
|
|
- accuracy: 89.70
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 88.61
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 89.45
|
|
deepseek-ai/DeepSeek-V3-Lite:
|
|
- accuracy: 64.74
|
|
- quant_algo: NVFP4
|
|
accuracy: 63.71
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 63.71
|
|
- quant_algo: NVFP4
|
|
spec_dec_algo: MTP
|
|
accuracy: 63.71
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
spec_dec_algo: MTP
|
|
accuracy: 63.71
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
accuracy: 64.74
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 64.74
|
|
- spec_dec_algo: MTP
|
|
accuracy: 64.44
|
|
- spec_dec_algo: MTP
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 64.44
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
spec_dec_algo: MTP
|
|
accuracy: 64.14
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
kv_cache_quant_algo: FP8
|
|
spec_dec_algo: MTP
|
|
accuracy: 64.14
|
|
deepseek-ai/DeepSeek-R1:
|
|
- quant_algo: NVFP4
|
|
accuracy: 95.42
|
|
- quant_algo: NVFP4
|
|
spec_dec_algo: MTP
|
|
accuracy: 95.42
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 95.42
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
spec_dec_algo: MTP
|
|
accuracy: 95.42
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
accuracy: 95.413
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
spec_dec_algo: MTP
|
|
accuracy: 95.413
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 95.413
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
kv_cache_quant_algo: FP8
|
|
spec_dec_algo: MTP
|
|
accuracy: 95.413
|
|
deepseek-ai/DeepSeek-V3.2-Exp:
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
accuracy: 95.6
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
spec_dec_algo: MTP
|
|
accuracy: 95.6
|
|
- quant_algo: NVFP4
|
|
accuracy: 95.6
|
|
- quant_algo: NVFP4
|
|
spec_dec_algo: MTP
|
|
accuracy: 95.6
|
|
Qwen3/Qwen3-4B:
|
|
- spec_dec_algo: Eagle
|
|
accuracy: 85.823
|
|
Qwen3/Qwen3-8B:
|
|
- accuracy: 87.1114
|
|
- spec_dec_algo: Eagle
|
|
accuracy: 87.1114
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 87.1114
|
|
Qwen3/Qwen3-30B-A3B:
|
|
- accuracy: 83.43
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
accuracy: 84.36
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 83.43
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 83.43
|
|
- spec_dec_algo: Eagle
|
|
accuracy: 83.43
|
|
Qwen3/Qwen3-235B-A22B:
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.78
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.78
|
|
- spec_dec_algo: Eagle
|
|
quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.78
|
|
Qwen3/Qwen3-Next-80B-A3B-Thinking:
|
|
- accuracy: 81.577
|
|
Qwen3/Qwen3-Next-80B-A3B-Instruct:
|
|
- accuracy: 92.72
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.86
|
|
moonshotai/Kimi-K2-Instruct:
|
|
- quant_algo: FP8_BLOCK_SCALES
|
|
accuracy: 94.84
|
|
moonshotai/Kimi-K2-Thinking:
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.84
|
|
- quant_algo: NVFP4
|
|
accuracy: 90.84
|
|
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
|
|
- accuracy: 92.57
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 92.42
|
|
nvidia/Nemotron-H-8B-Base-8K:
|
|
- accuracy: 46.20
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.78
|
|
nvidia/Nemotron-H-47B-Base-8K:
|
|
- accuracy: 88.82
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 88.55
|
|
nvidia/Nemotron-H-56B-Base-8K:
|
|
- accuracy: 89.27
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 89.27
|
|
nvidia/Nemotron-MOE:
|
|
- accuracy: 88.249
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 86.884
|
|
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
|
|
- accuracy: 37.15
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 28.39
|
|
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
|
|
- accuracy: 94.43
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 94.16
|
|
kanana-1.5-2.1b-instruct-2505:
|
|
- accuracy: 75.81
|
|
speakleash/Bielik-11B-v2.2-Instruct:
|
|
- accuracy: 41.51
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 40.41
|
|
google/gemma-3-1b-it:
|
|
- accuracy: 25.52 # score getting from lm-eval with HF implementation
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 23.96
|
|
google/gemma-3-27b-it:
|
|
- accuracy: 91.66
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.66
|
|
mistralai/Ministral-8B-Instruct-2410:
|
|
- accuracy: 79.25
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 78.35
|
|
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
|
|
- accuracy: 89.23
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 89.23
|
|
microsoft/Phi-4-multimodal-instruct:
|
|
- accuracy: 81.19
|
|
- quant_algo: FP8
|
|
accuracy: 80.82
|
|
- quant_algo: NVFP4
|
|
accuracy: 69.33
|
|
microsoft/Phi-4-multimodal-instruct-long-rope:
|
|
- accuracy: 75.85
|
|
microsoft/Phi-4-mini-instruct:
|
|
- accuracy: 82.30
|
|
microsoft/phi-4:
|
|
- accuracy: 90.30
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.64
|
|
mistralai/Codestral-22B-v0.1:
|
|
- accuracy: 67.10
|
|
GPT-OSS/BF16:
|
|
- accuracy: 90.3
|
|
- kv_cache_quant_algo: FP8
|
|
accuracy: 90.3
|
|
GPT-OSS/120B-MXFP4:
|
|
- accuracy: 90.3
|
|
- spec_dec_algo: Eagle
|
|
accuracy: 90.3
|
|
- quant_algo: W4A8_MXFP4_MXFP8
|
|
accuracy: 90.3
|
|
- quant_algo: W4A8_MXFP4_MXFP8
|
|
spec_dec_algo: Eagle
|
|
accuracy: 90.3
|
|
- quant_algo: W4A8_MXFP4_MXFP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.3
|
|
- quant_algo: W4A8_MXFP4_FP8
|
|
accuracy: 90.3
|
|
- quant_algo: W4A8_MXFP4_FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.3
|
|
- quant_algo: W4A16_MXFP4
|
|
accuracy: 90.3
|
|
- quant_algo: W4A16_MXFP4
|
|
spec_dec_algo: Eagle
|
|
accuracy: 90.3
|
|
- quant_algo: W4A16_MXFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 90.3
|
|
GPT-OSS/20B-MXFP4:
|
|
- accuracy: 85.0
|
|
- quant_algo: W4A8_MXFP4_MXFP8
|
|
accuracy: 85.0
|
|
- quant_algo: W4A8_MXFP4_MXFP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.0
|
|
- quant_algo: W4A16_MXFP4
|
|
accuracy: 85.0
|
|
- quant_algo: W4A16_MXFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.0
|
|
GPT-OSS/20B-NVFP4:
|
|
- accuracy: 85.0
|
|
- quant_algo: NVFP4
|
|
accuracy: 85.0
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 85.0
|
|
LGAI-EXAONE/EXAONE-4.0-32B:
|
|
- accuracy: 88.36
|
|
ByteDance-Seed/Seed-OSS-36B-Instruct:
|
|
- accuracy: 90.8
|
|
zai-org/GLM-4.6:
|
|
- accuracy: 81.3
|
|
- spec_dec_algo: MTP
|
|
accuracy: 81.3
|
|
- quant_algo: NVFP4
|
|
spec_dec_algo: MTP
|
|
accuracy: 88.0
|
|
bigcode/starcoder2-3b:
|
|
- accuracy: 20.2
|
|
bigcode/starcoder2-7b:
|
|
- accuracy: 26.5
|
|
bigcode/starcoder2-15b:
|
|
- accuracy: 54.5
|
|
mistral/Mistral-Large-3-675B:
|
|
- accuracy: 86.1
|
|
- spec_dec_algo: Eagle
|
|
accuracy: 86.1
|
|
nvidia/Nemotron-Super-V3:
|
|
- accuracy: 83.74
|
|
- quant_algo: NVFP4
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 80.85
|
|
nvidia/Nemotron-3-Nano:
|
|
- accuracy: 69.37
|
|
- quant_algo: FP8
|
|
kv_cache_quant_algo: FP8
|
|
accuracy: 68.73
|