diff --git a/examples/llm-api/llm_mgmn_llm_distributed.sh b/examples/llm-api/llm_mgmn_llm_distributed.sh
index fa23ae2548..453600beee 100644
--- a/examples/llm-api/llm_mgmn_llm_distributed.sh
+++ b/examples/llm-api/llm_mgmn_llm_distributed.sh
@@ -8,6 +8,44 @@
 #SBATCH -e logs/llmapi-distributed.err
 #SBATCH -J llmapi-distributed-task
 
+##############################################################################
+# OVERVIEW:
+# This script demonstrates running a custom LLM API Python script on SLURM
+# with distributed inference support. It executes quickstart_advanced.py with
+# tensor parallelism across multiple GPUs/nodes.
+#
+# WHAT TO MODIFY:
+# 1. SLURM Parameters (lines 2-9):
+#    - Replace with your SLURM account name
+#    - Replace with your SLURM partition name
+#    - Adjust -N (number of nodes) based on your TP size
+#    - Adjust --ntasks-per-node (GPUs per node) to match your setup
+#
+# 2. Environment Variables (set before running sbatch):
+#    - CONTAINER_IMAGE: Docker image with TensorRT-LLM installed
+#    - MOUNT_DIR: Host directory to mount in container
+#    - MOUNT_DEST: Container mount destination path
+#    - WORKDIR: Working directory inside container
+#    - SOURCE_ROOT: Path to TensorRT-LLM source code
+#    - PROLOGUE: Commands to run before main task (e.g., module loads)
+#    - LOCAL_MODEL: Path to your pre-downloaded model directory
+#
+# 3. Script Configuration (lines 39, 51-54):
+#    - Line 39: Change $script to point to your own Python script
+#    - Line 52: Modify --model_dir to use your model path
+#    - Line 53: Customize --prompt with your test prompt
+#    - Line 54: Adjust --tp_size to match your node/GPU setup
+#
+# EXAMPLE USAGE:
+#   export CONTAINER_IMAGE="nvcr.io/nvidia/tensorrt_llm:latest"
+#   export LOCAL_MODEL="/path/to/llama-model"
+#   sbatch llm_mgmn_llm_distributed.sh
+#
+# NOTE: This is a template - you can replace quickstart_advanced.py with any
+# LLM API Python script. The trtllm-llmapi-launch wrapper handles the
+# distributed execution setup automatically.
+##############################################################################
+
 ### :section Slurm
 ### :title Run LLM-API with pytorch backend on Slurm
 ### :order 0
@@ -16,8 +54,8 @@
 # The trtllm-llmapi-launch is a script that launches the LLM-API code on
 # Slurm-like systems, and can support multi-node and multi-GPU setups.
 
-# Note that, the number of MPI processes should be the same as the model world
-# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
+# IMPORTANT: Total MPI processes (nodes × ntasks-per-node) must equal tp_size.
+# e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
 # each, or 4 nodes with 4 gpus for each or other combinations.
 
 # This docker image should have tensorrt_llm installed, or you need to install
diff --git a/examples/llm-api/llm_mgmn_trtllm_bench.sh b/examples/llm-api/llm_mgmn_trtllm_bench.sh
index 43c126368d..4bd7b1d8c8 100644
--- a/examples/llm-api/llm_mgmn_trtllm_bench.sh
+++ b/examples/llm-api/llm_mgmn_trtllm_bench.sh
@@ -8,6 +8,42 @@
 #SBATCH -e logs/trtllm-bench.err
 #SBATCH -J trtllm-bench
 
+##############################################################################
+# OVERVIEW:
+# This script runs trtllm-bench throughput benchmarking on SLURM with multi-node,
+# multi-GPU setup. It prepares a synthetic dataset and then benchmarks the model
+# using the PyTorch backend with tensor parallelism.
+#
+# WHAT TO MODIFY:
+# 1. SLURM Parameters (lines 2-9):
+#    - Replace with your SLURM account name
+#    - Replace with your SLURM partition name
+#    - Adjust -N (number of nodes) based on your TP size
+#    - Adjust --ntasks-per-node (GPUs per node) to match your setup
+#
+# 2. Environment Variables (set before running sbatch):
+#    - CONTAINER_IMAGE: Docker image with TensorRT-LLM installed
+#    - MOUNT_DIR: Host directory to mount in container
+#    - MOUNT_DEST: Container mount destination path
+#    - WORKDIR: Working directory inside container
+#    - SOURCE_ROOT: Path to TensorRT-LLM source code
+#    - PROLOGUE: Commands to run before main task (e.g., module loads)
+#    - LOCAL_MODEL: Path to your pre-downloaded model directory
+#    - MODEL_NAME: Name of the model to benchmark
+#    - EXTRA_ARGS: (Optional) Additional benchmark arguments
+#
+# 3. Model Configuration (lines 87-94):
+#    - --tp 16: Adjust tensor parallelism size to match your node/GPU setup
+#    - --num-requests (line 56): Change number of benchmark requests
+#    - --input-mean/output-mean (lines 57-58): Adjust token lengths
+#
+# EXAMPLE USAGE:
+#   export CONTAINER_IMAGE="nvcr.io/nvidia/tensorrt_llm:latest"
+#   export LOCAL_MODEL="/path/to/llama-model"
+#   export MODEL_NAME="meta-llama/Llama-2-7b-hf"
+#   sbatch llm_mgmn_trtllm_bench.sh
+##############################################################################
+
 ### :title Run trtllm-bench with pytorch backend on Slurm
 ### :order 1
 ### :section Slurm
@@ -16,8 +52,8 @@
 # The trtllm-llmapi-launch is a script that launches the LLM-API code on
 # Slurm-like systems, and can support multi-node and multi-GPU setups.
 
-# Note that, the number of MPI processes should be the same as the model world
-# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
+# IMPORTANT: Total MPI processes (nodes × ntasks-per-node) must equal tensor_parallel_size.
+# e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
 # each, or 4 nodes with 4 gpus for each or other combinations.
 
 # This docker image should have tensorrt_llm installed, or you need to install
diff --git a/examples/llm-api/llm_mgmn_trtllm_serve.sh b/examples/llm-api/llm_mgmn_trtllm_serve.sh
index a0cd8ce11f..ad377c9baa 100644
--- a/examples/llm-api/llm_mgmn_trtllm_serve.sh
+++ b/examples/llm-api/llm_mgmn_trtllm_serve.sh
@@ -8,6 +8,42 @@
 #SBATCH -e logs/trtllm-serve.err
 #SBATCH -J trtllm-serve
 
+##############################################################################
+# OVERVIEW:
+# This script launches trtllm-serve (OpenAI-compatible API server) on SLURM
+# with multi-node, multi-GPU support. The server will start on all allocated
+# nodes and serve the model with tensor parallelism.
+#
+# WHAT TO MODIFY:
+# 1. SLURM Parameters (lines 2-9):
+#    - Replace with your SLURM account name
+#    - Replace with your SLURM partition name
+#    - Adjust -N (number of nodes) based on your TP size
+#    - Adjust --ntasks-per-node (GPUs per node) to match your setup
+#
+# 2. Environment Variables (set before running sbatch):
+#    - CONTAINER_IMAGE: Docker image with TensorRT-LLM installed
+#    - MOUNT_DIR: Host directory to mount in container
+#    - MOUNT_DEST: Container mount destination path
+#    - WORKDIR: Working directory inside container
+#    - SOURCE_ROOT: Path to TensorRT-LLM source code
+#    - PROLOGUE: Commands to run before main task (e.g., module loads)
+#    - LOCAL_MODEL: Path to your pre-downloaded model directory
+#    - ADDITIONAL_OPTIONS: (Optional) Extra trtllm-serve options
+#
+# 3. Server Configuration (lines 51-55):
+#    - --tp_size 16: Adjust tensor parallelism to match your node/GPU setup
+#    - --host 0.0.0.0: Server bind address (0.0.0.0 allows external access)
+#
+# EXAMPLE USAGE:
+#   export CONTAINER_IMAGE="nvcr.io/nvidia/tensorrt_llm:latest"
+#   export LOCAL_MODEL="/path/to/llama-model"
+#   sbatch llm_mgmn_trtllm_serve.sh
+#
+# NOTE: After the server starts, you can send requests to it using curl or
+# the OpenAI Python client. Check the output logs for the server address.
+##############################################################################
+
 ### :title Run trtllm-serve with pytorch backend on Slurm
 ### :order 2
 ### :section Slurm
@@ -16,8 +52,8 @@
 # The trtllm-llmapi-launch is a script that launches the LLM-API code on
 # Slurm-like systems, and can support multi-node and multi-GPU setups.
 
-# Note that, the number of MPI processes should be the same as the model world
-# size. e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
+# IMPORTANT: Total MPI processes (nodes × ntasks-per-node) must equal tp_size.
+# e.g. For tensor_parallel_size=16, you may use 2 nodes with 8 gpus for
 # each, or 4 nodes with 4 gpus for each or other combinations.
 
 # This docker image should have tensorrt_llm installed, or you need to install
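
Once the trtllm-serve job above is running, the quickest smoke test is a plain curl request
against the OpenAI-compatible completions endpoint. The sketch below is illustrative only: it
assumes the server listens on the trtllm-serve default port 8000, that <server-host> is the node
name reported in the SLURM output log, and that the "model" field matches the path passed via
LOCAL_MODEL; adjust all three to your deployment.

  # Hypothetical smoke test for the server launched by llm_mgmn_trtllm_serve.sh.
  # <server-host> and the model path are placeholders taken from the job logs;
  # the port assumes the trtllm-serve default of 8000.
  curl -s http://<server-host>:8000/v1/completions \
      -H "Content-Type: application/json" \
      -d '{"model": "/path/to/llama-model", "prompt": "Hello, my name is", "max_tokens": 32}'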