TensorRT-LLMs/tests/integration/defs/accuracy/references/gpqa_diamond.yaml
xinhe-nv 0a467b00cc
[https://nvbugs/5409414][fix] fix Not registered specs (#6660)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
Co-authored-by: Larry <197874197+LarryXFly@users.noreply.github.com>
2025-08-07 17:55:53 +10:00

44 lines
1.2 KiB
YAML

# Reference accuracies for meta-llama/Llama-3.3-70B-Instruct.
# Each list item is one configuration; the keys other than `accuracy`
# identify the configuration the score applies to.
# NOTE(review): the item with only `accuracy` is presumably the
# unquantized baseline - confirm against the consumer of this file.
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 45.96
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 45.55
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 48.03
  - quant_algo: FP8
    accuracy: 48.03
# Reference accuracies for deepseek-ai/DeepSeek-R1.
# Every item here sets `quant_algo: NVFP4`; the items differ in the
# optional `spec_dec_algo` / `kv_cache_quant_algo` keys.
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 70.45
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 70.06
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 70.45
# Reference accuracies for nvidia/Llama-3_3-Nemotron-Super-49B-v1.
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 44.95
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 42.42
    # GPQA Diamond contains only 198 samples, so the score tends to have
    # large variance. The FP8 evaluation was repeated 7 times and the
    # lowest score, 42.42, was chosen as the lower-bound reference.
    # random_seed=0: 47.98
    # random_seed=1: 42.42
    # random_seed=2: 52.02
    # random_seed=3: 51.52
    # random_seed=4: 48.48
    # random_seed=5: 47.47
    # random_seed=6: 45.96
# Reference accuracies for nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
# one item without quantization keys and one FP8 item with an FP8 KV cache.
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 40.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 39.39
# Reference accuracies for nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
# one item without quantization keys and one FP8 item with an FP8 KV cache.
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 58.08
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 57.07