TensorRT-LLM/examples/disaggregated/slurm/service_discovery_example/launch.slurm
Lizhi Zhou 24167d00eb
[TRTLLM-8431][doc] update public doc and example, add etcd auto-scaling tests (#8602)
Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
2025-10-28 17:04:53 -07:00

74 lines
2.7 KiB
Plaintext

#!/bin/bash
#SBATCH --partition=${partition}
#SBATCH --account=${account}
#SBATCH --job-name=${job_name}
#SBATCH --time=02:00:00

# NOTE(review): #SBATCH lines are comments to the shell and are NOT expanded at
# submit time — ${partition}/${account}/${job_name} are template placeholders;
# substitute them before `sbatch` (e.g. envsubst) or pass the values as CLI flags.

# Fail fast on errors, unset variables, and failed pipeline stages.
set -euo pipefail

# Deployment knobs, overridable from the environment (empty by default).
container_image="${container_image:-}"
mount_paths="${mount_paths:-}"
work_path="${work_path:-}"
enable_etcd="${enable_etcd:-0}"

# Service ports: proxy, context server, generation server.
disagg_port="8000"
ctx_port="8001"
gen_port="8002"

# Use the first node of the allocation as the disaggregated server node.
disagg_server_node=$(scontrol show hostnames "$SLURM_JOB_NODELIST" | head -n 1)
if [[ "$enable_etcd" == "1" ]]; then
  # Optionally launch an etcd server; the container image must have etcd installed.
  disagg_cluster_uri="etcd://${disagg_server_node}:2379"
  srun --container-image="${container_image}" \
    --container-mounts="${mount_paths}" \
    -w "$disagg_server_node" -N 1 --ntasks-per-node=1 \
    --mpi=pmix \
    bash -c "etcd" &
  sleep 5 # wait for etcd to start
else
  # Or use the disaggregated server's HTTP address as the built-in service
  # discovery server.
  disagg_cluster_uri="http://${disagg_server_node}:${disagg_port}"
fi
# Write the disaggregated proxy config. The nested keys under `disagg_cluster`
# must be indented, or the YAML parses as flat top-level keys.
cat >"${work_path}/disagg_config.yaml" << EOL
hostname: localhost
port: ${disagg_port}
backend: pytorch
disagg_cluster:
  cluster_uri: ${disagg_cluster_uri}
  cluster_name: example_cluster
EOL
# Extra LLM-API options for the context server. The keys under
# `cache_transceiver_config` must be indented to nest correctly in YAML.
cat >"${work_path}/ctx_extra-llm-api-config.yaml" << EOL
disable_overlap_scheduler: True
cache_transceiver_config:
  backend: UCX
  max_tokens_in_buffer: 2048
EOL
# Extra LLM-API options for the generation server. The keys under
# `cache_transceiver_config` must be indented to nest correctly in YAML.
cat >"${work_path}/gen_extra-llm-api-config.yaml" << EOL
cache_transceiver_config:
  backend: UCX
  max_tokens_in_buffer: 2048
EOL
# Launch a proxy without any context/generation servers; workers register
# themselves later through disagg_cluster_uri.
srun --container-image="${container_image}" \
  --container-mounts="${mount_paths}" \
  -w "$disagg_server_node" -N 1 --ntasks-per-node=1 \
  --mpi=pmix \
  bash -c "trtllm-llmapi-launch trtllm-serve disaggregated -c ${work_path}/disagg_config.yaml" &
# Launch a context server with `tp_size=8` using two 4-GPU nodes; it registers
# itself through disagg_cluster_uri.
srun --container-image="${container_image}" \
  --container-mounts="${mount_paths}" \
  -N 2 --ntasks-per-node=4 \
  --mpi=pmix \
  bash -c "trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tp_size 8 --host 0.0.0.0 --port ${ctx_port} --extra_llm_api_options ${work_path}/ctx_extra-llm-api-config.yaml --disagg_cluster_uri ${disagg_cluster_uri} --server-role context" &
# Launch a generation server with `tp_size=4` using one 4-GPU node.
srun --container-image="${container_image}" \
  --container-mounts="${mount_paths}" \
  -N 1 --ntasks-per-node=4 \
  --mpi=pmix \
  bash -c "trtllm-llmapi-launch trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --tp_size 4 --host 0.0.0.0 --port ${gen_port} --extra_llm_api_options ${work_path}/gen_extra-llm-api-config.yaml --disagg_cluster_uri ${disagg_cluster_uri} --server-role generation" &

# Keep the batch script alive: if it exits, SLURM terminates the job and kills
# all backgrounded srun steps above. (`wait` is a no-op if nothing is running.)
wait