# TensorRT-LLMs/tests/integration/defs/accuracy/references/gsm8k.yaml
# Yibin Li 1ce483c999
# [TRTLLM-7967][feat] Adding Starcoder2 PyTorch Backend Support (#8923)
# Signed-off-by: Yibin Li <109242046+yibinl-nvidia@users.noreply.github.com>
# 2025-11-24 11:23:22 -08:00

# 279 lines
# 6.6 KiB
# YAML

# GSM8K reference entries for Llama-3.1-8B-Instruct. The first entry sets no
# quant/spec-dec keys; every later entry lists its quant_algo,
# kv_cache_quant_algo and/or spec_dec_algo next to the accuracy.
meta-llama/Llama-3.1-8B-Instruct:
  - accuracy: 74.20
  - spec_dec_algo: NGram
    accuracy: 74.20
  - spec_dec_algo: Eagle
    accuracy: 74.20
  - quant_algo: FP8
    accuracy: 74.30
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 72.85
  - quant_algo: FP8
    kv_cache_quant_algo: NVFP4
    accuracy: 69.75
# GSM8K reference entries for Llama-3.3-70B-Instruct.
# NOTE(review): the quantized entries (87.33 / 90.30) sit well above the 83.78
# entry that sets no quant keys -- confirm the 83.78 figure is current.
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 83.78
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 87.33
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 90.30
  - quant_algo: FP8
    accuracy: 90.30
# GSM8K reference entries for Llama-4-Maverick; all three configurations
# listed here carry the same 92.20 value.
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
  - accuracy: 92.20
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 92.20
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    spec_dec_algo: Eagle
    accuracy: 92.20
# GSM8K reference entries for Llama-4-Scout: one entry without quant keys,
# then NVFP4 and FP8 weights, each paired with an FP8 KV cache.
meta-llama/Llama-4-Scout-17B-16E-Instruct:
  - accuracy: 89.70
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 88.61
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 89.45
# GSM8K reference entries for DeepSeek-V3-Lite. Entries are grouped as:
# no quant keys, NVFP4 variants, FP8_BLOCK_SCALES variants, MTP-only variants,
# and FP8_BLOCK_SCALES combined with MTP.
deepseek-ai/DeepSeek-V3-Lite:
  - accuracy: 64.74
  - quant_algo: NVFP4
    accuracy: 63.71
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 63.71
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 63.71
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 63.71
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 64.74
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 64.74
  - spec_dec_algo: MTP
    accuracy: 64.44
  - spec_dec_algo: MTP
    kv_cache_quant_algo: FP8
    accuracy: 64.44
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 64.14
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 64.14
# GSM8K reference entries for DeepSeek-R1. There is no entry without
# quant_algo; NVFP4 variants use 95.42 and FP8_BLOCK_SCALES variants 95.413.
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 95.42
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 95.42
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 95.42
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 95.42
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 95.413
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 95.413
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 95.413
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    spec_dec_algo: MTP
    accuracy: 95.413
# GSM8K reference entries for DeepSeek-V3.2-Exp: FP8_BLOCK_SCALES and NVFP4,
# each with and without MTP; all four share the value 95.6.
deepseek-ai/DeepSeek-V3.2-Exp:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 95.6
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 95.6
  - quant_algo: NVFP4
    accuracy: 95.6
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 95.6
# GSM8K reference entries for Qwen3-8B; the Eagle and FP8 entries reuse the
# same 87.1114 value as the entry without quant/spec-dec keys.
Qwen3/Qwen3-8B:
  - accuracy: 87.1114
  - spec_dec_algo: Eagle
    accuracy: 87.1114
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 87.1114
# GSM8K reference entries for Qwen3-30B-A3B. Only the FP8_BLOCK_SCALES entry
# (84.36) differs from the 83.43 used by every other configuration here.
Qwen3/Qwen3-30B-A3B:
  - accuracy: 83.43
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 84.36
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 83.43
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 83.43
  - spec_dec_algo: Eagle
    accuracy: 83.43
# GSM8K reference entries for Qwen3-235B-A22B. Every entry is quantized with
# an FP8 KV cache; there is no entry without quant_algo.
Qwen3/Qwen3-235B-A22B:
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 85.78
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 85.78
  - spec_dec_algo: Eagle
    quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 85.78
# GSM8K reference for Qwen3-Next-80B-A3B-Thinking; a single entry with no
# quant/spec-dec keys.
Qwen3/Qwen3-Next-80B-A3B-Thinking:
  - accuracy: 81.577
# GSM8K reference for Kimi-K2-Instruct; only an FP8_BLOCK_SCALES entry exists.
moonshotai/Kimi-K2-Instruct:
  - quant_algo: FP8_BLOCK_SCALES
    accuracy: 94.84
# GSM8K reference entries for Llama-3_3-Nemotron-Super-49B-v1: one entry
# without quant keys and one with FP8 weights plus FP8 KV cache.
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 92.57
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 92.42
# GSM8K reference entries for Nemotron-H-8B-Base-8K.
# NOTE(review): the FP8 value (85.78) is far above the 46.20 entry without
# quant keys and equals the Qwen3/Qwen3-235B-A22B figures in this file --
# possibly a copy-paste; confirm against a measured run. Value left unchanged.
nvidia/Nemotron-H-8B-Base-8K:
  - accuracy: 46.20
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 85.78
# GSM8K reference entries for Nemotron-H-47B-Base-8K: one entry without
# quant keys, one with FP8 weights and FP8 KV cache.
nvidia/Nemotron-H-47B-Base-8K:
  - accuracy: 88.82
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 88.55
# GSM8K reference entries for Nemotron-H-56B-Base-8K; the FP8 entry reuses
# the 89.27 value of the entry without quant keys.
nvidia/Nemotron-H-56B-Base-8K:
  - accuracy: 89.27
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 89.27
# GSM8K reference entries for Nemotron-MOE: one entry without quant keys,
# one with FP8 weights and FP8 KV cache.
nvidia/Nemotron-MOE:
  - accuracy: 88.249
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 86.884
# GSM8K reference entries for Llama-3.1-Nemotron-Nano-8B-v1: one entry
# without quant keys, one with FP8 weights and FP8 KV cache.
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 37.15
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 28.39
# GSM8K reference entries for Llama-3_1-Nemotron-Ultra-253B-v1: one entry
# without quant keys, one with FP8 weights and FP8 KV cache.
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 94.43
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 94.16
# GSM8K reference for kanana-1.5-2.1b-instruct-2505; a single entry with no
# quant/spec-dec keys. Note this key has no "org/" prefix.
kanana-1.5-2.1b-instruct-2505:
  - accuracy: 75.81
# GSM8K reference entries for Bielik-11B-v2.2-Instruct: one entry without
# quant keys, one with FP8 weights and FP8 KV cache.
speakleash/Bielik-11B-v2.2-Instruct:
  - accuracy: 41.51
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 40.41
# GSM8K reference entries for gemma-3-1b-it: one entry without quant keys,
# one with FP8 weights and FP8 KV cache.
google/gemma-3-1b-it:
  - accuracy: 25.52  # score obtained from lm-eval with the HF implementation
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 23.96
# GSM8K reference entries for gemma-3-27b-it: one entry without quant keys,
# one with FP8 weights and FP8 KV cache.
google/gemma-3-27b-it:
  - accuracy: 91.66
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 90.66
# GSM8K reference entries for Ministral-8B-Instruct-2410: one entry without
# quant keys, one with FP8 weights and FP8 KV cache.
mistralai/Ministral-8B-Instruct-2410:
  - accuracy: 79.25
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 78.35
# GSM8K reference entries for Mistral-Small-3.1-24B-Instruct-2503; the FP8
# entry reuses the 89.23 value of the entry without quant keys.
mistralai/Mistral-Small-3.1-24B-Instruct-2503:
  - accuracy: 89.23
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 89.23
# GSM8K reference entries for Phi-4-multimodal-instruct: one entry without
# quant keys, then FP8 and NVFP4 weights (no kv_cache_quant_algo on either).
microsoft/Phi-4-multimodal-instruct:
  - accuracy: 81.19
  - quant_algo: FP8
    accuracy: 80.82
  - quant_algo: NVFP4
    accuracy: 69.33
# GSM8K reference for Phi-4-multimodal-instruct-long-rope; a single entry
# with no quant/spec-dec keys.
microsoft/Phi-4-multimodal-instruct-long-rope:
  - accuracy: 75.85
# GSM8K reference for Phi-4-mini-instruct; a single entry with no
# quant/spec-dec keys.
microsoft/Phi-4-mini-instruct:
  - accuracy: 82.30
# GSM8K reference entries for phi-4: one entry without quant keys, one with
# FP8 weights and FP8 KV cache.
microsoft/phi-4:
  - accuracy: 90.30
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 90.64
# GSM8K reference for Codestral-22B-v0.1; a single entry with no
# quant/spec-dec keys.
mistralai/Codestral-22B-v0.1:
  - accuracy: 67.10
# GSM8K reference entries for GPT-OSS/BF16: one entry without quant keys and
# one that sets only kv_cache_quant_algo (FP8); both use 90.3.
GPT-OSS/BF16:
  - accuracy: 90.3
  - kv_cache_quant_algo: FP8
    accuracy: 90.3
# GSM8K reference entries for GPT-OSS/120B-MXFP4. Every configuration listed
# (W4A8_MXFP4_MXFP8, W4A8_MXFP4_FP8, W4A16_MXFP4, with optional Eagle or FP8
# KV cache) carries the same 90.3 value.
GPT-OSS/120B-MXFP4:
  - accuracy: 90.3
  - spec_dec_algo: Eagle
    accuracy: 90.3
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 90.3
  - quant_algo: W4A8_MXFP4_MXFP8
    spec_dec_algo: Eagle
    accuracy: 90.3
  - quant_algo: W4A8_MXFP4_MXFP8
    kv_cache_quant_algo: FP8
    accuracy: 90.3
  - quant_algo: W4A8_MXFP4_FP8
    accuracy: 90.3
  - quant_algo: W4A8_MXFP4_FP8
    kv_cache_quant_algo: FP8
    accuracy: 90.3
  - quant_algo: W4A16_MXFP4
    accuracy: 90.3
  - quant_algo: W4A16_MXFP4
    spec_dec_algo: Eagle
    accuracy: 90.3
  - quant_algo: W4A16_MXFP4
    kv_cache_quant_algo: FP8
    accuracy: 90.3
# GSM8K reference entries for GPT-OSS/20B-MXFP4. Every configuration listed
# (W4A8_MXFP4_MXFP8 and W4A16_MXFP4, with or without FP8 KV cache) uses 85.0.
GPT-OSS/20B-MXFP4:
  - accuracy: 85.0
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 85.0
  - quant_algo: W4A8_MXFP4_MXFP8
    kv_cache_quant_algo: FP8
    accuracy: 85.0
  - quant_algo: W4A16_MXFP4
    accuracy: 85.0
  - quant_algo: W4A16_MXFP4
    kv_cache_quant_algo: FP8
    accuracy: 85.0
# GSM8K reference for EXAONE-4.0-32B; a single entry with no quant/spec-dec
# keys.
LGAI-EXAONE/EXAONE-4.0-32B:
  - accuracy: 88.36
# GSM8K reference for Seed-OSS-36B-Instruct; a single entry with no
# quant/spec-dec keys.
ByteDance-Seed/Seed-OSS-36B-Instruct:
  - accuracy: 90.8
# GSM8K reference entries for GLM-4.6.
# NOTE(review): the NVFP4 + MTP entry (88.0) is well above the 81.3 entry
# that sets no quant keys -- confirm both figures are current.
zai-org/GLM-4.6:
  - accuracy: 81.3
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 88.0
# GSM8K reference for starcoder2-3b; a single entry with no quant/spec-dec
# keys.
bigcode/starcoder2-3b:
  - accuracy: 20.2
# GSM8K reference for starcoder2-7b; a single entry with no quant/spec-dec
# keys.
bigcode/starcoder2-7b:
  - accuracy: 26.5
# GSM8K reference for starcoder2-15b; a single entry with no quant/spec-dec
# keys.
bigcode/starcoder2-15b:
  - accuracy: 54.5