# TensorRT-LLMs/tests/scripts/perf-sanity/benchmark_config.yaml

# Perf-sanity benchmark matrix: a list of test cases, each giving a model
# label, a GPU/parallelism layout, backend and memory settings, sequence
# lengths, and the concurrency sweep to run.
# NOTE(review): the field meanings below are inferred from the key names only;
# confirm against the script that consumes this file.
#   tp / ep                 presumably tensor- / expert-parallel sizes
#   isl / osl               presumably input / output sequence lengths (tokens)
#   concurrency_iterations  presumably [concurrency, iterations] pairs
#   "" (empty string)       appears to mean "not set" for moe_backend and
#                           moe_max_num_tokens -- TODO confirm
test_cases:
  # Case 1: 70B-FP8 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
  # moe_backend and moe_max_num_tokens are left empty for this model.
  - id: 1
    model: "70B-FP8"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 1024
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    # Second element shrinks as the first grows (10, 10, 5, 2).
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 2: 70B-FP8 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
  # Same settings as case 1 except for the longer isl.
  - id: 2
    model: "70B-FP8"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 8192
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 3: 70B-FP8 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
  - id: 3
    model: "70B-FP8"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 1024
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 4: 70B-FP8 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
  - id: 4
    model: "70B-FP8"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 8192
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 5: 70B-FP4 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
  # Mirrors case 1 with the FP4 model label.
  - id: 5
    model: "70B-FP4"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 1024
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 6: 70B-FP4 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
  - id: 6
    model: "70B-FP4"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 8192
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 7: 70B-FP4 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
  - id: 7
    model: "70B-FP4"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 1024
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 8: 70B-FP4 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
  - id: 8
    model: "70B-FP4"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.9
    max_batch_size: 1024
    isl: 8192
    osl: 1024
    max_num_tokens: 16384
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 9: Scout-FP8 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
  # Uses a lower free_gpu_mem_fraction (0.8) and max_batch_size (512) than the
  # 70B cases.
  - id: 9
    model: "Scout-FP8"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 10: Scout-FP8 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
  - id: 10
    model: "Scout-FP8"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 8192
    osl: 1024
    # isl + osl + 128 = 8192 + 1024 + 128 = 9344. The previous value (9334)
    # broke the pattern used elsewhere in this file (2176 for isl=1024, 9344
    # for the R1 isl=8192 cases) and looks like a digit typo.
    max_num_tokens: 9344
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 11: Scout-FP8 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
  - id: 11
    model: "Scout-FP8"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 12: Scout-FP8 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
  - id: 12
    model: "Scout-FP8"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 8192
    osl: 1024
    # isl + osl + 128 = 8192 + 1024 + 128 = 9344. The previous value (9334)
    # broke the pattern used elsewhere in this file (2176 for isl=1024, 9344
    # for the R1 isl=8192 cases) and looks like a digit typo.
    max_num_tokens: 9344
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 13: Scout-FP4 on 1 GPU (tp=1, ep=1), isl=1024 / osl=1024.
  - id: 13
    model: "Scout-FP4"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 14: Scout-FP4 on 1 GPU (tp=1, ep=1), isl=8192 / osl=1024.
  - id: 14
    model: "Scout-FP4"
    gpus: 1
    tp: 1
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 8192
    osl: 1024
    # isl + osl + 128 = 8192 + 1024 + 128 = 9344. The previous value (9334)
    # broke the pattern used elsewhere in this file (2176 for isl=1024, 9344
    # for the R1 isl=8192 cases) and looks like a digit typo.
    max_num_tokens: 9344
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 15: Scout-FP4 on 4 GPUs (tp=4, ep=1), isl=1024 / osl=1024.
  - id: 15
    model: "Scout-FP4"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 16: Scout-FP4 on 4 GPUs (tp=4, ep=1), isl=8192 / osl=1024.
  - id: 16
    model: "Scout-FP4"
    gpus: 4
    tp: 4
    ep: 1
    attn_backend: "TRTLLM"
    moe_backend: ""
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 8192
    osl: 1024
    # isl + osl + 128 = 8192 + 1024 + 128 = 9344. The previous value (9334)
    # broke the pattern used elsewhere in this file (2176 for isl=1024, 9344
    # for the R1 isl=8192 cases) and looks like a digit typo.
    max_num_tokens: 9344
    moe_max_num_tokens: ""
    concurrency_iterations:
      - [1, 10]
      - [8, 10]
      - [64, 5]
      - [512, 2]

  # Case 17: R1-FP8 on 8 GPUs (tp=8, ep=8), isl=1024 / osl=1024,
  # enable_attention_dp off. First case that names a moe_backend (DEEPGEMM).
  - id: 17
    model: "R1-FP8"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "DEEPGEMM"
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 1024
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    moe_max_num_tokens: ""
    # Only the two smallest sweep points are listed for this configuration.
    concurrency_iterations:
      - [1, 10]
      - [8, 10]

  # Case 18: R1-FP8 on 8 GPUs (tp=8, ep=8), isl=8192 / osl=1024,
  # enable_attention_dp off, moe_backend DEEPGEMM.
  - id: 18
    model: "R1-FP8"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "DEEPGEMM"
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 1024
    isl: 8192
    osl: 1024
    # 9344 = isl + osl + 128.
    max_num_tokens: 9344
    moe_max_num_tokens: ""
    # Only the two smallest sweep points are listed for this configuration.
    concurrency_iterations:
      - [1, 10]
      - [8, 10]

  # Case 19: R1-FP8 on 8 GPUs (tp=8, ep=8), isl=1024 / osl=1024,
  # enable_attention_dp on, moe_backend DEEPGEMM.
  - id: 19
    model: "R1-FP8"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "DEEPGEMM"
    enable_attention_dp: true
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    # NOTE(review): 37376 is much larger than max_num_tokens here (2176),
    # whereas the isl=8192 R1-FP8 attention-DP case sets both to 9344 --
    # confirm this value is intended.
    moe_max_num_tokens: 37376
    # Sweep starts at 64 and extends to 4096 for the attention-DP variant.
    concurrency_iterations:
      - [64, 5]
      - [512, 2]
      - [4096, 2]

  # Case 20: R1-FP8 on 8 GPUs (tp=8, ep=8), isl=8192 / osl=1024,
  # enable_attention_dp on, moe_backend DEEPGEMM.
  - id: 20
    model: "R1-FP8"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "DEEPGEMM"
    enable_attention_dp: true
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 8192
    osl: 1024
    # 9344 = isl + osl + 128; moe_max_num_tokens is set to the same value.
    max_num_tokens: 9344
    moe_max_num_tokens: 9344
    # Sweep starts at 64 and extends to 4096 for the attention-DP variant.
    concurrency_iterations:
      - [64, 5]
      - [512, 2]
      - [4096, 2]

  # Case 21: R1-FP4 on 8 GPUs (tp=8, ep=8), isl=1024 / osl=1024,
  # enable_attention_dp off, moe_backend TRTLLM.
  - id: 21
    model: "R1-FP4"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "TRTLLM"
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 1024
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    moe_max_num_tokens: ""
    # Only the two smallest sweep points are listed for this configuration.
    concurrency_iterations:
      - [1, 10]
      - [8, 10]

  # Case 22: R1-FP4 on 8 GPUs (tp=8, ep=8), isl=8192 / osl=1024,
  # enable_attention_dp off, moe_backend TRTLLM.
  - id: 22
    model: "R1-FP4"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "TRTLLM"
    enable_attention_dp: false
    free_gpu_mem_fraction: 0.8
    max_batch_size: 1024
    isl: 8192
    osl: 1024
    # 9344 = isl + osl + 128.
    max_num_tokens: 9344
    moe_max_num_tokens: ""
    # Only the two smallest sweep points are listed for this configuration.
    concurrency_iterations:
      - [1, 10]
      - [8, 10]

  # Case 23: R1-FP4 on 8 GPUs (tp=8, ep=8), isl=1024 / osl=1024,
  # enable_attention_dp on. Uses moe_backend CUTLASS, unlike the R1-FP4 cases
  # with attention DP off (TRTLLM).
  - id: 23
    model: "R1-FP4"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "CUTLASS"
    enable_attention_dp: true
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 1024
    osl: 1024
    # 2176 = isl + osl + 128.
    max_num_tokens: 2176
    # NOTE(review): 37376 is much larger than max_num_tokens here (2176),
    # whereas the isl=8192 R1-FP4 attention-DP case sets both to 9344 --
    # confirm this value is intended.
    moe_max_num_tokens: 37376
    # Sweep starts at 64 and extends to 4096 for the attention-DP variant.
    concurrency_iterations:
      - [64, 5]
      - [512, 2]
      - [4096, 2]

  # Case 24: R1-FP4 on 8 GPUs (tp=8, ep=8), isl=8192 / osl=1024,
  # enable_attention_dp on, moe_backend CUTLASS.
  - id: 24
    model: "R1-FP4"
    gpus: 8
    tp: 8
    ep: 8
    attn_backend: "TRTLLM"
    moe_backend: "CUTLASS"
    enable_attention_dp: true
    free_gpu_mem_fraction: 0.8
    max_batch_size: 512
    isl: 8192
    osl: 1024
    # 9344 = isl + osl + 128; moe_max_num_tokens is set to the same value.
    max_num_tokens: 9344
    moe_max_num_tokens: 9344
    # Sweep starts at 64 and extends to 4096 for the attention-DP variant.
    concurrency_iterations:
      - [64, 5]
      - [512, 2]
      - [4096, 2]