TensorRT-LLM/examples/disaggregated/slurm/submit.sh

#!/bin/bash
echo "Make sure that SLURM parameters are correctly set in \`disaggr_torch.slurm\` before executing this script."
# concurrency 8
concurrency=8
ctx_num=1
total_node_num=8
ntasks_per_node=4 # 4 GPUs per GB200 node
ntasks=$((total_node_num * ntasks_per_node))
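# 8 nodes x 4 tasks per node = 32 Slurm tasks, i.e. one task per GPU in the allocation.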
# `--segment` (passed to sbatch below) makes sure that all allocated nodes are in the same NVLink domain
# disaggr_torch.slurm arguments:
# num_ctx_servers=$1
# ctx_tp_size=$2
# ctx_batch_size=$3
# ctx_max_num_tokens=$4
# ctx_enable_attention_dp=$5
# num_gen_servers=$6
# gen_tp_size=$7
# gen_batch_size=$8
# gen_max_num_tokens=$9
# gen_enable_attention_dp=${10}
# gen_gpu_memory_fraction=${11}
# eplb_num_slots=${12}
# mtp_size=${13}
# concurrency=${14}
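# For reference, the sbatch invocation below binds these arguments as follows:
#   num_ctx_servers=1        ctx_tp_size=4                 ctx_batch_size=4
#   ctx_max_num_tokens=4480  ctx_enable_attention_dp=true
#   num_gen_servers=1        gen_tp_size=8                 gen_batch_size=1024
#   gen_max_num_tokens=1024  gen_enable_attention_dp=true
#   gen_gpu_memory_fraction=0.8  eplb_num_slots=0  mtp_size=0  concurrency=8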
# This command submits a job spanning 8 nodes (32 GPUs in total).
# The deployment includes 1 context worker with DEP4 and 1 generation worker with DEP8.
sbatch --nodes=${total_node_num} \
    --ntasks=${ntasks} \
    --ntasks-per-node=${ntasks_per_node} \
    --gres=gpu:${ntasks_per_node} \
    --segment=${total_node_num} \
    disaggr_torch.slurm \
    ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 0 "$concurrency"
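
# A minimal sketch (not part of the original script) of how the same submission could
# be swept over several client concurrencies. Only `disaggr_torch.slurm` and its
# argument order are taken from the comments above; the concurrency values and the
# loop itself are illustrative assumptions, so the block is left commented out.
#
# for concurrency in 1 8 64 512; do
#     sbatch --nodes=${total_node_num} \
#         --ntasks=${ntasks} \
#         --ntasks-per-node=${ntasks_per_node} \
#         --gres=gpu:${ntasks_per_node} \
#         --segment=${total_node_num} \
#         disaggr_torch.slurm \
#         ${ctx_num} 4 4 4480 true 1 8 1024 1024 true "0.8" 0 0 "$concurrency"
# done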