TensorRT-LLMs/examples/wide_ep/slurm_scripts/submit.sh
Kaiyu Xie f08286c679
doc: Refactor documents and examples of disaggregated serving and wide ep (#6054)
Signed-off-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
2025-07-23 09:20:57 +08:00

39 lines
1.9 KiB
Bash

#!/bin/bash
# !!!
# Please find the `disaggr_torch.slurm` script in the `examples/disaggregated/slurm/` directory.
# Make sure that SLURM parameters are correctly set in `disaggr_torch.slurm` before executing this script.
# !!!
#
# Submits a sweep of jobs through `sbatch`, one job per (dep size, batch size,
# EPLB value) combination listed in main(). `disaggr_torch.slurm` is passed to
# sbatch as a bare file name, so it must be reachable from the current
# working directory.
#
# `set -e` is deliberately NOT enabled: a rejected submission must not prevent
# the remaining jobs of the sweep from being submitted. Each failure is
# reported on stderr and the script exits non-zero if any submission failed.

mtp_size=0
ntasks_per_node=4 # 4 GPUs per GB200 node

#######################################
# Computes the `ctx_num` argument for a given concurrency:
# ceil(concurrency / 5500), using integer arithmetic only.
# Arguments: $1 - concurrency (non-negative integer)
# Outputs:   ctx_num on stdout
#######################################
ctx_num_for() {
  local concurrency=$1
  printf '%d\n' "$(( (concurrency + 5499) / 5500 ))"
}

#######################################
# Builds and submits one `disaggr_torch.slurm` job.
# Globals:   mtp_size (read), ntasks_per_node (read)
# Arguments: $1 - dep size (8, 16 or 32 in this sweep); multiplied by the
#                 batch size to get the concurrency and forwarded as the
#                 7th positional argument of disaggr_torch.slurm
#            $2 - number of nodes added on top of ctx_num to get the total
#                 node count (presumably the generation-side nodes — confirm
#                 against disaggr_torch.slurm)
#            $3 - batch size
#            $4 - 11th positional argument of disaggr_torch.slurm ("0.8" or
#                 "0.7" in this sweep; presumably a memory fraction — confirm
#                 against disaggr_torch.slurm)
#            $5 - EPLB value forwarded as the 12th positional argument
# Outputs:   whatever sbatch prints; a diagnostic on stderr on failure
# Returns:   0 if sbatch accepted the job, 1 otherwise
#######################################
submit_job() {
  local dep_size=$1 gen_node_num=$2 batch=$3 fraction=$4 eplb=$5
  local concurrency ctx_num total_node_num ntasks

  concurrency=$(( batch * dep_size ))
  ctx_num=$(ctx_num_for "$concurrency") || return 1
  total_node_num=$(( ctx_num + gen_node_num ))
  ntasks=$(( total_node_num * ntasks_per_node ))

  local -a sbatch_opts=(
    "--nodes=${total_node_num}"
    "--ntasks=${ntasks}"
    "--ntasks-per-node=${ntasks_per_node}"
    "--segment=${total_node_num}"
  )
  # Positional arguments are forwarded in the exact order the original
  # command lines used; only the values that vary across the sweep are
  # parameters of this function, the rest are fixed literals.
  local -a script_args=(
    "$ctx_num" 4 4 4480 true
    1 "$dep_size" 1024 1024 true
    "$fraction" "$eplb" "$mtp_size" "$concurrency"
  )

  if ! sbatch "${sbatch_opts[@]}" disaggr_torch.slurm "${script_args[@]}"; then
    printf '%s: sbatch failed (dep%s, batch=%s, eplb=%s, concurrency=%s)\n' \
      "${0##*/}" "$dep_size" "$batch" "$eplb" "$concurrency" >&2
    return 1
  fi
}

#######################################
# Submits the whole sweep, in the same order as the original flat script.
# Returns:   0 if every submission succeeded, 1 if at least one failed
#######################################
main() {
  local failed=0 batch eplb

  # dep8
  for batch in 1 64 1024; do
    submit_job 8 2 "$batch" "0.8" 0 || failed=$(( failed + 1 ))
  done

  # dep16 eplb0, 256, 288
  for batch in 1 64 1024; do
    for eplb in 0 256 288; do
      submit_job 16 4 "$batch" "0.7" "$eplb" || failed=$(( failed + 1 ))
    done
  done

  # dep32 eplb288
  for batch in 512; do
    submit_job 32 8 "$batch" "0.7" 288 || failed=$(( failed + 1 ))
  done

  if (( failed > 0 )); then
    printf '%s: %d job submission(s) failed\n' "${0##*/}" "$failed" >&2
    return 1
  fi
  return 0
}

# Last command of the script, so its return value becomes the exit status.
main "$@"