# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
# Signed-off-by: Dongfeng Yu <dongfengy@nvidia.com>
# Signed-off-by: dongfengy <99041270+dongfengy@users.noreply.github.com>
meta-llama/Llama-3.3-70B-Instruct:
  - accuracy: 45.96
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 45.55
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 48.03
  - quant_algo: FP8
    accuracy: 48.03
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 48.03
deepseek-ai/DeepSeek-R1:
  - quant_algo: NVFP4
    accuracy: 70.45
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 70.06
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 70.45
deepseek-ai/DeepSeek-V3.2-Exp:
  - quant_algo: FP8_BLOCK_SCALES
    kv_cache_quant_algo: FP8
    accuracy: 80.0
  - quant_algo: NVFP4
    kv_cache_quant_algo: FP8
    accuracy: 80.0
  - quant_algo: FP8_BLOCK_SCALES
    spec_dec_algo: MTP
    accuracy: 80.0
  - quant_algo: NVFP4
    accuracy: 80.0
  - quant_algo: NVFP4
    spec_dec_algo: MTP
    accuracy: 80.0
nvidia/Llama-3_3-Nemotron-Super-49B-v1:
  - accuracy: 44.95
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 42.42
    # GPQA diamond only contains 198 samples, so the score tends to have large variance.
    # We repeated evaluation 7 times to choose a lower bound score for FP8, 42.42.
    # random_seed=0: 47.98
    # random_seed=1: 42.42
    # random_seed=2: 52.02
    # random_seed=3: 51.52
    # random_seed=4: 48.48
    # random_seed=5: 47.47
    # random_seed=6: 45.96
nvidia/Llama-3.1-Nemotron-Nano-8B-v1:
  - accuracy: 40.40
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 39.39
nvidia/Llama-3_1-Nemotron-Ultra-253B-v1:
  - accuracy: 58.08
  - quant_algo: FP8
    kv_cache_quant_algo: FP8
    accuracy: 57.07
GPT-OSS/120B-MXFP4:
  - accuracy: 65.0
  - spec_dec_algo: Eagle
    accuracy: 65.0
  - quant_algo: W4A8_MXFP4_MXFP8
    accuracy: 65.0
  - quant_algo: W4A8_MXFP4_MXFP8
    spec_dec_algo: Eagle
    accuracy: 65.0
  - quant_algo: W4A8_MXFP4_MXFP8
    kv_cache_quant_algo: FP8
    accuracy: 65.0
  - quant_algo: W4A16_MXFP4
    accuracy: 65.0
  - quant_algo: W4A16_MXFP4
    spec_dec_algo: Eagle
    accuracy: 65.0
  - quant_algo: W4A16_MXFP4
    kv_cache_quant_algo: FP8
    accuracy: 65.0