#!/bin/bash
set -euo pipefail

echo "Please find the \`disaggr_torch.slurm\` script in the \`examples/disaggregated/slurm/benchmark/\` directory."

# Configuration
slurm_file="disaggr_torch.slurm"

# SLURM Configuration
partition=""
account=""
job_time="02:00:00"
job_name=""

##############################################################
# User Configuration - Review and edit the following variables
numa_bind=true
benchmark_mode="e2e"            # e2e or gen_only

# Hardware Configuration
gpus_per_node=4                 # Modify this to match your hardware configuration

# Benchmark Configuration
use_nv_sa_benchmark=false       # Whether to use the NVIDIA SA benchmark script instead of the default one
isl=1024                        # Input sequence length
osl=1024                        # Output sequence length
multi_round=10                  # Number of benchmark rounds
benchmark_ratio=0.8             # Benchmark ratio
streaming=true                  # Enable streaming mode
cache_max_tokens=4608           # Cache transceiver max tokens
seq_offset=203                  # Offset added to sequence lengths

# Dataset file for benchmarking
dataset_file=""

# Environment Configuration
# Directories mounted into the container
container_mount=""              # path1:path1,path2:path2
# Container image
container_image=""
# Path to the model directory
model_path=""
# Path to the TensorRT-LLM repository
trtllm_repo=""
# Set to true to do a clean build of TensorRT-LLM from source
build_wheel=false

# Workspace Configuration
work_dir=$(pwd)                 # Path to the work directory containing the scripts

# Profiling Configuration
nsys_on=false                   # Set to true to enable profiling
##############################################################

# Check that the SLURM script exists
if [[ ! -f "${slurm_file}" ]]; then
    echo "Error: SLURM script '${slurm_file}' not found" >&2
    exit 1
fi

# Validate required paths
[[ ! -d "${model_path}" ]] && { echo "Error: model_path '${model_path}' not found" >&2; exit 1; }
[[ ! -d "${work_dir}" ]] && { echo "Error: work_dir '${work_dir}' not found" >&2; exit 1; }
[[ ! -f "${dataset_file}" ]] && { echo "Error: dataset_file '${dataset_file}' not found" >&2; exit 1; }
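
# calc_nodes below rounds the per-server GPU count up to whole nodes:
# ceil(tp_size / gpus_per_node) * num_servers. A worked example with
# gpus_per_node=4 (the value set above) and integer division:
#   tp_size=16, num_servers=1 -> (16 + 3) / 4 * 1 = 4 nodes
#   tp_size=4,  num_servers=2 -> (4 + 3)  / 4 * 2 = 2 nodes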
-f "${dataset_file}" ]] && { echo "Error: dataset_file '${dataset_file}' not found" >&2; exit 1; } # Calculate required nodes based on tensor parallel size and server count calc_nodes() { local tp_size=$1 local num_servers=$2 echo $(( (tp_size + gpus_per_node - 1) / gpus_per_node * num_servers )) } # Submit a single benchmark job run_single() { # Context server params local ctx_num=$1 local ctx_tp_size=$2 local ctx_pp_size=$3 local ctx_batch_size=$4 local ctx_max_num_tokens=$5 local ctx_enable_attention_dp=$6 local ctx_gpu_frac=$7 # Generation server params local gen_num=$8 local gen_tp_size=$9 local gen_pp_size=${10} local gen_batch_size=${11} local gen_max_num_tokens=${12} local gen_enable_attention_dp=${13} local gen_gpu_frac=${14} local gen_eplb_num_slots=${15} local mtp_size=${16} local gen_concurrency_list=${17} # Calculate total nodes needed local gen_nodes=$(calc_nodes "$gen_tp_size" "$gen_num") local ctx_nodes=$(calc_nodes "$ctx_tp_size" "$ctx_num") local total_nodes=$((ctx_nodes + gen_nodes)) local total_tasks=$((total_nodes * gpus_per_node)) # Handle SLURM reservation if needed local reservation_str="" [[ $gen_eplb_num_slots -gt 0 ]] && reservation_str="--reservation=sla_res_fw_11" # Submit job set -x sbatch \ --partition="${partition}" \ --gres=gpu:${gpus_per_node} \ --account="${account}" \ --time="${job_time}" \ --job-name="${job_name}" \ --nodes="${total_nodes}" \ --ntasks="${total_tasks}" \ --ntasks-per-node="${gpus_per_node}" \ --segment="${total_nodes}" \ ${reservation_str} \ "${slurm_file}" \ "${ctx_num}" "${ctx_tp_size}" "${ctx_pp_size}" "${ctx_batch_size}" "${ctx_max_num_tokens}" "${ctx_enable_attention_dp}" "${ctx_gpu_frac}" \ "${gen_num}" "${gen_tp_size}" "${gen_pp_size}" "${gen_batch_size}" "${gen_max_num_tokens}" "${gen_enable_attention_dp}" "${gen_gpu_frac}" \ "${gen_eplb_num_slots}" "${mtp_size}" "${gen_concurrency_list}" \ "${gpus_per_node}" "${use_nv_sa_benchmark}" "${isl}" "${osl}" "${multi_round}" "${benchmark_ratio}" \ "${streaming}" "${cache_max_tokens}" "${dataset_file}" "${container_mount}" "${container_image}" \ "${model_path}" "${trtllm_repo}" "${build_wheel}" "${work_dir}" "${nsys_on}" "${seq_offset}" "${numa_bind}" "${benchmark_mode}" set +x } # Example benchmark configuration # |------------------- context -----------------| |---------------------- generation ----------------------| # num tp pp batch tokens attn_dp gpu_frac num tp pp batch tokens attn_dp gpu_frac eplb mtp concurrency # 1k-1k run_single 1 4 1 4 4608 true 0.85 1 16 1 64 256 true "0.7" 0 3 "512 1075" run_single 2 4 1 4 4608 true 0.85 1 16 1 128 256 true "0.7" 0 1 "2150" run_single 1 4 1 4 4608 true 0.85 1 32 1 16 64 true "0.6" 0 3 "512" run_single 1 4 1 4 4608 true 0.85 1 32 1 32 32 true "0.7" 0 0 "1075" run_single 1 4 1 4 4608 true 0.85 1 16 1 64 64 true "0.75" 0 0 "1075" run_single 2 4 1 4 4608 true 0.85 1 16 1 256 256 true "0.75" 0 0 "2048 4300"