# mirror of https://github.com/NVIDIA/TensorRT-LLM.git
# synced 2026-01-23 12:12:39 +08:00
# 22 lines, 434 B, YAML
# Run settings for the model checkpoint named in `model`.
# NOTE(review): the tool that consumes this file is not visible here; the
# meaning of each key should be confirmed against that tool's schema.
model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: GEN  # NOTE(review): presumably the generation (decode) phase; confirm
scaled_from: null

# KV cache related args
tokens_per_block: 32
max_seq_len: 9220  # 8192 + 1024 + 4
enable_attention_dp: true

# Model init args
max_num_tokens: 4096  # MTP3 as max
moe_backend: CUTLASS
use_cuda_graph: true

# Per iteration args
batch_size: 128
seq_len_q: 1  # Set to (1 + MTP)
seq_len_kv_cache: 8193
balance_method: Balanced
balance_ratio: null