# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
# Synced 2026-01-14 06:27:45 +08:00
meta-llama/Llama-3.1-8B-Instruct:
  - accuracy: 74.20
  - quant_algo: FP8
    accuracy: 74.30
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 72.85
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 83.78
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 75.61
  - quant_algo: FP8
    accuracy: 83.30
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
  - accuracy: 92.20
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.30
meta-llama/Llama-4-Scout-17B-16E-Instruct:
  - accuracy: 89.70
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 89.61
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 89.00
deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 64.74
  - quant_algo: NVFP4
    accuracy: 63.71
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.71
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 63.71
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 63.71
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 64.74
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 64.74
  - spec_dec_algo: MTP
    accuracy: 64.44
  - spec_dec_algo: MTP
    kv_cache_quant_algo: FP8
    accuracy: 64.44
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 64.14
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 64.14
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 95.42
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 95.42
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 95.42
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 95.413
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 95.413
Qwen3/Qwen3-30B-A3B:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 84.36
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.43
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 83.43
Qwen3/Qwen3-235B-A22B:
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 85.78
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 85.78
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 92.57
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 92.42
nvidia/Nemotron-H-8B-Base-8K:
  - accuracy: 46.20
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 85.78
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 37.15
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 28.39
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 94.43
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 94.16
kanana-1.5-2.1b-instruct-2505:
  - accuracy: 75.81
speakleash/Bielik-11B-v2.2-Instruct:
  - accuracy: 41.51
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 40.41