---
# Run configuration: model / layer selection, KV cache settings, model init
# settings, and per-iteration settings. One key per line so that every
# setting is parsed as its own top-level mapping entry.

model: nvidia/DeepSeek-R1-0528-FP4-v2
layer_indices: [5]
run_type: GEN
scaled_from: null

# KV cache related args
tokens_per_block: 32
enable_attention_dp: true

# Model init args
moe_backend: CUTLASS
use_cuda_graph: true

# Per iteration args
batch_size: 128
# NOTE(review): MTP presumably refers to the number of multi-token-prediction
# draft tokens -- confirm against the consumer of this config.
seq_len_q: 1  # Set to (1 + MTP)
seq_len_kv_cache: 8193
balance_method: Balanced
balance_ratio: null