# Run configuration for a single model checkpoint.
# One key per line: on a single collapsed line, everything after the first
# `#` would be read as a comment and the keys below it silently dropped.

model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: GEN  # NOTE(review): presumably the generation phase -- confirm
scaled_from: null

# KV cache related args
tokens_per_block: 32
max_seq_len: 9220  # 8192 + 1024 + 4
enable_attention_dp: true

# Model init args
max_num_tokens: 4096  # MTP3 as max
moe_backend: CUTLASS
use_cuda_graph: true

# Per iteration args
batch_size: 128
seq_len_q: 1  # Set to (1 + MTP); 1 corresponds to MTP = 0
seq_len_kv_cache: 8193
balance_method: Balanced
balance_ratio: null