TensorRT-LLM/examples/disaggregated/slurm/submit.sh

#!/bin/bash
echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."
# concurrency 8
concurrency=8
ctx_num=1
total_node_num=8
ntasks_per_node=4 # 4 GPUs per GB200 node
ntasks=$((total_node_num * ntasks_per_node))
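# 8 nodes x 4 tasks per node = 32 Slurm tasks, i.e. one task per GPU in the allocation.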
# `--segment` (passed to sbatch below) makes sure that all allocated nodes are in the same NVLink domain
# disaggr_torch.slurm arguments:
# num_ctx_servers=$1
# ctx_tp_size=$2
# ctx_batch_size=$3
# ctx_max_num_tokens=$4
# ctx_enable_attention_dp=$5
# num_gen_servers=$6
# gen_tp_size=$7
# gen_batch_size=$8
# gen_max_num_tokens=$9
# gen_enable_attention_dp=${10}
# gen_gpu_memory_fraction=${11}
# eplb_num_slots=${12}
# mtp_size=${13}
# concurrency=${14}
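# For reference, the sbatch invocation below binds these arguments as follows:
#   num_ctx_servers=1        ctx_tp_size=4                 ctx_batch_size=4
#   ctx_max_num_tokens=4480  ctx_enable_attention_dp=true
#   num_gen_servers=1        gen_tp_size=8                 gen_batch_size=1024
#   gen_max_num_tokens=1024  gen_enable_attention_dp=true
#   gen_gpu_memory_fraction=0.8  eplb_num_slots=0  mtp_size=0  concurrency=8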
# This command submits a job spanning 8 nodes (32 GPUs in total).
# The deployment includes 1 context worker with DEP4 and 1 generation worker with DEP8.
sbatch --nodes=${total_node_num} \
    --ntasks=${ntasks} \
    --ntasks-per-node=${ntasks_per_node} \
    --gres=gpu:${ntasks_per_node} \
    --segment=${total_node_num} \
    disaggr_torch.slurm \
    ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 0 "$concurrency"
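
# A minimal sketch (not part of the original script) of how the same submission could
# be swept over several client concurrencies. Only `disaggr_torch.slurm` and its
# argument order are taken from the comments above; the concurrency values and the
# loop itself are illustrative assumptions, so the block is left commented out.
#
# for concurrency in 1 8 64 512; do
#     sbatch --nodes=${total_node_num} \
#         --ntasks=${ntasks} \
#         --ntasks-per-node=${ntasks_per_node} \
#         --gres=gpu:${ntasks_per_node} \
#         --segment=${total_node_num} \
#         disaggr_torch.slurm \
#         ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 0 "$concurrency"
# done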