mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Signed-off-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com> Co-authored-by: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com>
139 lines
5.5 KiB
Bash
139 lines
5.5 KiB
Bash
#!/bin/bash
#
# Submission helper for the disaggregated serving benchmark.
# Edit the configuration variables in this file, then run it from the
# directory that contains the SLURM batch script named by `slurm_file`.

# Strict mode: abort on command failure, on use of an unset variable, and
# when any stage of a pipeline fails.
set -euo pipefail

echo "Please find the \`disaggr_torch.slurm\` script in the \`examples/disaggregated/slurm/benchmark/\` directory."

# Configuration
slurm_file="disaggr_torch.slurm"

# SLURM Configuration
partition="<partition>"
account="<account>"
job_time="02:00:00"
job_name="<job_name>"

##############################################################
# User Configuration - Review and edit the following variables

numa_bind=true
benchmark_mode="e2e" # e2e or gen_only

# Hardware Configuration
gpus_per_node=4 # Modify this with your hardware configuration

# Benchmark Configuration
use_nv_sa_benchmark=false # Whether to use NVIDIA SA benchmark script instead of default one
isl=1024                  # Input sequence length
osl=1024                  # Output sequence length
multi_round=10            # Number of benchmark rounds
benchmark_ratio=0.8       # Benchmark ratio
streaming=true            # Enable streaming mode
cache_max_tokens=4608     # Cache transceiver max tokens
seq_offset=203            # Offset added to sequence lengths
# Dataset file for benchmarking
dataset_file="<dataset_file>"

# Environment Configuration
# Directories to mount into the container
container_mount="<container_mount>" # path1:path1,path2:path2
# Container image
container_image="<container_image>"
# Path to the model directory
model_path="<model_path>"
# Path to the TensorRT-LLM repository
trtllm_repo="<trtllm_repo>"
# Set to true to do a clean build of TensorRT-LLM from source
build_wheel=false

# Workspace Configuration
work_dir="$(pwd)" # path to the work directory containing the scripts

# Profiling Configuration
nsys_on=false # Set to true to enable profiling

##############################################################

# Check if SLURM file exists (resolved relative to the current directory)
if [[ ! -f "${slurm_file}" ]]; then
  echo "Error: SLURM script '${slurm_file}' not found" >&2
  exit 1
fi

# Validate required paths.
# Each check is written as `test || { report; exit 1; }` so that a passing
# check leaves a zero exit status instead of a failed `[[ ! ... ]]`.
[[ -d "${model_path}" ]] || { echo "Error: model_path not found: ${model_path}" >&2; exit 1; }
[[ -d "${work_dir}" ]] || { echo "Error: work_dir '${work_dir}' not found" >&2; exit 1; }
[[ -f "${dataset_file}" ]] || { echo "Error: dataset_file '${dataset_file}' not found" >&2; exit 1; }

#######################################
# Calculate required nodes based on tensor parallel size and server count.
# Each server gets ceil(tp_size / gpus_per_node) nodes.
# Globals:   gpus_per_node (read)
# Arguments: $1 - tensor parallel size of one server
#            $2 - number of servers
# Outputs:   total node count on stdout
# Returns:   1 if gpus_per_node is not a positive integer, 0 otherwise
#######################################
calc_nodes() {
  local tp_size=$1
  local num_servers=$2

  # Reject a zero/empty/non-numeric value up front: it would otherwise
  # surface as a division-by-zero inside a command substitution.
  if ! [[ "${gpus_per_node:-}" =~ ^[1-9][0-9]*$ ]]; then
    echo "Error: gpus_per_node must be a positive integer, got '${gpus_per_node:-}'" >&2
    return 1
  fi

  # Integer ceiling division, then scale by the number of servers.
  local nodes_per_server=$(( (tp_size + gpus_per_node - 1) / gpus_per_node ))
  echo $(( nodes_per_server * num_servers ))
}

#######################################
# Submit a single benchmark job via sbatch.
# Globals (read):
#   partition, account, job_time, job_name, slurm_file, gpus_per_node,
#   use_nv_sa_benchmark, isl, osl, multi_round, benchmark_ratio, streaming,
#   cache_max_tokens, dataset_file, container_mount, container_image,
#   model_path, trtllm_repo, build_wheel, work_dir, nsys_on, seq_offset,
#   numa_bind, benchmark_mode
# Arguments:
#   $1-$7   context servers:    num, tp_size, pp_size, batch_size,
#           max_num_tokens, enable_attention_dp, gpu_frac
#   $8-$14  generation servers: num, tp_size, pp_size, batch_size,
#           max_num_tokens, enable_attention_dp, gpu_frac
#   $15     gen_eplb_num_slots
#   $16     mtp_size
#   $17     concurrency list (passed on as a single argument)
# Returns:
#   non-zero if fewer than 17 arguments are given or the node calculation
#   fails; sbatch is not invoked in either case.
#######################################
run_single() {
  if (( $# < 17 )); then
    echo "Error: run_single expects 17 arguments, got $#" >&2
    return 1
  fi

  # Context server params
  local ctx_num=$1
  local ctx_tp_size=$2
  local ctx_pp_size=$3
  local ctx_batch_size=$4
  local ctx_max_num_tokens=$5
  local ctx_enable_attention_dp=$6
  local ctx_gpu_frac=$7
  # Generation server params
  local gen_num=$8
  local gen_tp_size=$9
  local gen_pp_size=${10}
  local gen_batch_size=${11}
  local gen_max_num_tokens=${12}
  local gen_enable_attention_dp=${13}
  local gen_gpu_frac=${14}
  local gen_eplb_num_slots=${15}
  local mtp_size=${16}
  local gen_concurrency_list=${17}

  # Calculate total nodes needed.
  # Declaration is separate from assignment so a calc_nodes failure is not
  # masked by the exit status of `local`.
  # NOTE(review): only the TP sizes feed the node count; ctx_pp_size and
  # gen_pp_size are forwarded to the SLURM script but do not change the
  # allocation here - confirm this is intended when pp_size > 1.
  local gen_nodes ctx_nodes
  gen_nodes=$(calc_nodes "$gen_tp_size" "$gen_num") || return
  ctx_nodes=$(calc_nodes "$ctx_tp_size" "$ctx_num") || return
  local total_nodes=$((ctx_nodes + gen_nodes))
  local total_tasks=$((total_nodes * gpus_per_node))

  # sbatch options are collected in an array so optional flags can be
  # appended without relying on unquoted word splitting.
  local -a submit_opts=(
    --partition="${partition}"
    --gres="gpu:${gpus_per_node}"
    --account="${account}"
    --time="${job_time}"
    --job-name="${job_name}"
    --nodes="${total_nodes}"
    --ntasks="${total_tasks}"
    --ntasks-per-node="${gpus_per_node}"
    --segment="${total_nodes}"
  )

  # Handle SLURM reservation if needed.
  # NOTE(review): the reservation name is hard-coded - confirm it exists on
  # the target cluster before using gen_eplb_num_slots > 0.
  if (( gen_eplb_num_slots > 0 )); then
    submit_opts+=(--reservation=sla_res_fw_11)
  fi

  # Submit job (traced so the exact sbatch command line is logged).
  set -x
  sbatch \
    "${submit_opts[@]}" \
    "${slurm_file}" \
    "${ctx_num}" "${ctx_tp_size}" "${ctx_pp_size}" "${ctx_batch_size}" "${ctx_max_num_tokens}" "${ctx_enable_attention_dp}" "${ctx_gpu_frac}" \
    "${gen_num}" "${gen_tp_size}" "${gen_pp_size}" "${gen_batch_size}" "${gen_max_num_tokens}" "${gen_enable_attention_dp}" "${gen_gpu_frac}" \
    "${gen_eplb_num_slots}" "${mtp_size}" "${gen_concurrency_list}" \
    "${gpus_per_node}" "${use_nv_sa_benchmark}" "${isl}" "${osl}" "${multi_round}" "${benchmark_ratio}" \
    "${streaming}" "${cache_max_tokens}" "${dataset_file}" "${container_mount}" "${container_image}" \
    "${model_path}" "${trtllm_repo}" "${build_wheel}" "${work_dir}" "${nsys_on}" "${seq_offset}" "${numa_bind}" "${benchmark_mode}"
  set +x
}

# Example benchmark configuration
# Each run_single call below submits one job; columns follow the header.
#          |------------------- context -----------------|  |---------------------- generation ----------------------|
#          num  tp  pp  batch  tokens  attn_dp  gpu_frac     num  tp  pp  batch  tokens  attn_dp  gpu_frac  eplb  mtp  concurrency
# 1k-1k
run_single 1 4 1 4 4608 true 0.85 1 16 1 64 256 true "0.7" 0 3 "512 1075"
run_single 2 4 1 4 4608 true 0.85 1 16 1 128 256 true "0.7" 0 1 "2150"
run_single 1 4 1 4 4608 true 0.85 1 32 1 16 64 true "0.6" 0 3 "512"
run_single 1 4 1 4 4608 true 0.85 1 32 1 32 32 true "0.7" 0 0 "1075"
run_single 1 4 1 4 4608 true 0.85 1 16 1 64 64 true "0.75" 0 0 "1075"
run_single 2 4 1 4 4608 true 0.85 1 16 1 256 256 true "0.75" 0 0 "2048 4300"