# TensorRT-LLMs/examples/layer_wise_benchmarks/config_gen.yaml

# Benchmark target
# NOTE(review): the consumer of this file is not visible here; the per-key
# notes below are inferred from key names and should be confirmed against the
# script that loads this config.
model: nvidia/DeepSeek-R1-0528-FP4-v2  # presumably a model identifier or path -- TODO confirm how it is resolved
layer_indices: [5]  # list with a single entry; presumably the layer(s) to benchmark
run_type: GEN  # presumably selects the generation phase -- confirm accepted values
scaled_from: null  # explicitly unset

# KV cache related args
# NOTE(review): units and exact semantics are defined by the consumer of this
# file, which is not visible here.
tokens_per_block: 32  # presumably the number of tokens held per KV cache block -- TODO confirm
# 9220 is the sum spelled out in the inline note; what each term stands for
# is not stated in this file.
max_seq_len: 9220  # 8192 + 1024 + 4
enable_attention_dp: true  # presumably "dp" = data parallelism for attention -- confirm

# Model init args
# NOTE(review): accepted values for these keys are defined by the consumer of
# this file, which is not visible here.
# Per the inline note, 4096 is sized for the largest case considered (MTP3).
max_num_tokens: 4096  # MTP3 as max
moe_backend: CUTLASS  # presumably selects the MoE kernel implementation -- confirm accepted values
use_cuda_graph: true  # presumably enables CUDA graph execution -- TODO confirm

# Per iteration args
# NOTE(review): how these values are combined per iteration is defined by the
# consumer of this file, which is not visible here.
batch_size: 128
# Per the inline formula, the current value 1 corresponds to MTP = 0; raise it
# together with MTP (e.g. 4 for MTP = 3).
seq_len_q: 1  # Set to (1 + MTP)
seq_len_kv_cache: 8193  # 8192 + 1; presumably the KV cache length per sequence -- TODO confirm
balance_method: Balanced  # presumably a load-balancing policy name -- confirm accepted values
balance_ratio: null  # explicitly unset