# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
# Signed-off-by: Dongfeng Yu <dongfengy@nvidia.com>
# Signed-off-by: dongfengy <99041270+dongfengy@users.noreply.github.com>
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 45.96
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 45.55
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 48.03
  - quant_algo: FP8
    accuracy: 48.03
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 48.03
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 70.45
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 70.06
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 70.45
deepseek-ai/DeepSeek-V3.2-Exp:
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 80.0
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 80.0
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 80.0
  - quant_algo: NVFP4
    accuracy: 80.0
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 80.0
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 44.95
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 42.42
    # GPQA diamond only contains 198 samples, so the score tends to have large variance.
    # We repeated evaluation 7 times to choose a lower bound score for FP8, 42.42.
    # random_seed=0: 47.98
    # random_seed=1: 42.42
    # random_seed=2: 52.02
    # random_seed=3: 51.52
    # random_seed=4: 48.48
    # random_seed=5: 47.47
    # random_seed=6: 45.96
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 40.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 39.39
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 58.08
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 57.07
GPT-OSS/120B-MXFP4:
  - accuracy: 65.0
  - spec_dec_algo: Eagle
    accuracy: 65.0
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 65.0
  - quant_algo: W4A8_MXFP4_MXFP8
    spec_dec_algo: Eagle
    accuracy: 65.0
  - quant_algo: W4A8_MXFP4_MXFP8
    kv_cache_quant_algo: FP8
    accuracy: 65.0
  - quant_algo: W4A16_MXFP4
    accuracy: 65.0
  - quant_algo: W4A16_MXFP4
    spec_dec_algo: Eagle
    accuracy: 65.0
  - quant_algo: W4A16_MXFP4
    kv_cache_quant_algo: FP8
    accuracy: 65.0