# TensorRT-LLM/tests/integration/defs/accuracy/references/mmlu.yaml
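# Reference MMLU accuracies used by the accuracy integration tests.
# Layout (a descriptive sketch inferred from the entries below, not an authoritative schema):
#   <model id>:
#   - accuracy: <float>            # baseline reference score
#   - quant_algo: <algo>           # optional: weight/activation quantization
#     kv_cache_quant_algo: <algo>  # optional: KV-cache quantization
#     spec_dec_algo: <algo>        # optional: speculative decoding (e.g. EAGLE3, NGRAM, MTP)
#     extra_acc_spec: <string>     # optional: extra evaluation settings (e.g. sampling parameters)
#     accuracy: <float>            # reference score for that configuration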
meta-llama/Llama-2-7b-hf:
- accuracy: 46.69
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 46.76
meta-llama/Meta-Llama-3-8B-Instruct:
- accuracy: 67.74
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 63.47
meta-llama/Llama-3.1-8B:
- accuracy: 66.06
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 63.16
- quant_algo: FP8_PER_CHANNEL_PER_TOKEN
  accuracy: 65.55
- quant_algo: MIXED_PRECISION
  extra_acc_spec: autoq_format=int4_awq,fp8,w4a8_awq;auto_quantize_bits=5.8
  accuracy: 64.99
meta-llama/Llama-3.1-8B-Instruct:
- accuracy: 68.17
- spec_dec_algo: EAGLE3
  accuracy: 68.20
- spec_dec_algo: NGRAM
  accuracy: 68.17
- quant_algo: FP8
  accuracy: 67.93
- quant_algo: FP8
  extra_acc_spec: temperature=0.8,top_p=0.95
  accuracy: 64.62
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 67.87
meta-llama/Llama-3.2-1B:
- quant_algo: W8A8_SQ_PER_CHANNEL_PER_TOKEN_PLUGIN
  accuracy: 32.72
- quant_algo: W8A8_SQ_PER_CHANNEL
  accuracy: 32.07
- quant_algo: W4A16_AWQ
  accuracy: 30.56
- quant_algo: W4A16_AWQ
  kv_cache_quant_algo: INT8
  accuracy: 31.29
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 31.02
- quant_algo: FP8_PER_CHANNEL_PER_TOKEN
  accuracy: 33.97
- quant_algo: FP8_PER_CHANNEL_PER_TOKEN
  extra_acc_spec: meta_recipe
  accuracy: 33.87
- extra_acc_spec: max_attention_window_size=960
  accuracy: 32.82
meta-llama/Llama-3.2-3B:
- accuracy: 57.92
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 60.60
meta-llama/Llama-3.3-70B-Instruct:
- accuracy: 81.31
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 79.31
- quant_algo: FP8
  accuracy: 81.02
meta-llama/Llama-4-Maverick-17B-128E-Instruct:
- accuracy: 86.40
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 86.45
meta-llama/Llama-4-Scout-17B-16E-Instruct:
- accuracy: 80.00
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 80.00
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 80.00
mistralai/Mistral-7B-v0.1:
- accuracy: 66
mistralai/Mistral-7B-Instruct-v0.3:
- quant_algo: W4A16
  accuracy: 59.23
- quant_algo: W4A16_AWQ
  accuracy: 61.06
- quant_algo: W4A8_AWQ
  accuracy: 60.04
mistralai/Mixtral-8x7B-v0.1:
- accuracy: 71.35
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 71.27
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 69.64
mistralai/Mixtral-8x7B-Instruct-v0.1:
- accuracy: 68.0
mistralai/Mixtral-8x22B-v0.1:
- quant_algo: FP8
  accuracy: 77.63
google/gemma-2-9b-it:
- accuracy: 73.05
Qwen/Qwen2-0.5B-Instruct:
- accuracy: 45.30
- quant_algo: FP8
  accuracy: 45.03
Qwen/Qwen2.5-0.5B-Instruct:
- accuracy: 49.59
- quant_algo: FP8
  accuracy: 48.59
Qwen/Qwen2.5-1.5B-Instruct:
- accuracy: 61.45
- quant_algo: FP8
  accuracy: 61.43
Qwen/Qwen2.5-7B-Instruct:
- accuracy: 75.32
- quant_algo: FP8
  accuracy: 75.32
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 75.32
deepseek-ai/DeepSeek-V3-Lite:
- accuracy: 71.40
- quant_algo: NVFP4
  accuracy: 70.60
- quant_algo: NVFP4
  spec_dec_algo: MTP
  accuracy: 70.60
- quant_algo: FP8_BLOCK_SCALES
  accuracy: 71.27
- spec_dec_algo: MTP
  accuracy: 71.39
- quant_algo: FP8_BLOCK_SCALES
  spec_dec_algo: MTP
  accuracy: 71.29
deepseek-ai/DeepSeek-R1:
- quant_algo: NVFP4
  accuracy: 87.33
- quant_algo: NVFP4
  spec_dec_algo: MTP
  accuracy: 87.33
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 87.33
- quant_algo: FP8_BLOCK_SCALES
  accuracy: 87.573
- quant_algo: FP8_BLOCK_SCALES
  spec_dec_algo: MTP
  accuracy: 87.573
Qwen3/Qwen3-8B:
- quant_algo: FP8_BLOCK_SCALES
  accuracy: 76.12
Qwen3/Qwen3-30B-A3B:
- quant_algo: FP8_BLOCK_SCALES
  accuracy: 79.53
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 80.65
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 80.65
Qwen3/Qwen3-235B-A22B:
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 86
- quant_algo: NVFP4
  kv_cache_quant_algo: FP8
  accuracy: 86
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
- accuracy: 79.43
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 79.26
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
- accuracy: 57.97
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 57.12
nvidia/Nemotron-H-8B-Base-8K:
- accuracy: 69.590
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 69.180
microsoft/Phi-4-mini-instruct:
- accuracy: 68.98
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
- accuracy: 83.70
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 83.36
kanana-1.5-2.1b-instruct-2505:
- accuracy: 56.89
speakleash/Bielik-11B-v2.2-Instruct:
- accuracy: 64.47
- quant_algo: FP8
  kv_cache_quant_algo: FP8
  accuracy: 64.36