TensorRT-LLMs/examples/layer_wise_benchmarks/slurm_init_containers.sh
Tailing Yuan 51ef0379d2
[None][feat] Add a parser to layer-wise benchmarks (#9440)
Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
2025-11-25 05:45:16 -08:00

56 lines
1.8 KiB
Bash
Executable File

#!/bin/bash
set -euo pipefail
# CONTAINER_IMAGE=
CONTAINER_NAME=${CONTAINER_NAME:-layer_wise_benchmarks}
CONTAINER_MOUNTS=$(realpath "$(pwd)/../.."):$(realpath "$(pwd)/../..")
# This script only works inside (or pointed at) an existing Slurm job.
if [[ -z "${SLURM_JOB_ID:-}" ]]; then
    echo "Please set SLURM_JOB_ID" >&2
    exit 1
fi

# Ask Slurm how many nodes the job has: -h drops the header line and
# -o "%D" prints only the node count.
NODES=$(squeue -j "$SLURM_JOB_ID" -h -o "%D")

# squeue can succeed yet print nothing (job no longer listed) or print several
# lines (more than one matching job record). Stop here with a clear message
# instead of handing an unusable value to `srun -N` later on.
if [[ ! "$NODES" =~ ^[[:space:]]*[1-9][0-9]*[[:space:]]*$ ]]; then
    echo "Could not determine the node count of Slurm job $SLURM_JOB_ID (squeue printed \"$NODES\")" >&2
    exit 1
fi
# Resolve CONTAINER_IMAGE when the caller did not provide one: pick the Docker
# image for this CPU architecture from the properties file, then import it
# into a local .sqsh file (only when that file does not exist yet).
if [[ -z "${CONTAINER_IMAGE:-}" ]]; then
    # Read Docker image from current_image_tags.properties
    properties_file=../../jenkins/current_image_tags.properties
    if [[ ! -r "$properties_file" ]]; then
        echo "Cannot read $properties_file (resolved relative to $(pwd))" >&2
        exit 1
    fi
    # shellcheck source=/dev/null
    source "$properties_file"

    MACHINE="$(uname -m)"
    case "$MACHINE" in
        x86_64)
            # ${VAR:?msg} aborts with msg instead of a bare "unbound variable".
            DOCKER_IMAGE=${LLM_DOCKER_IMAGE:?LLM_DOCKER_IMAGE is not set after sourcing $properties_file}
            ;;
        aarch64)
            DOCKER_IMAGE=${LLM_SBSA_DOCKER_IMAGE:?LLM_SBSA_DOCKER_IMAGE is not set after sourcing $properties_file}
            ;;
        *)
            echo "Unsupported machine hardware name \"$MACHINE\"" >&2
            exit 1
            ;;
    esac

    # Change "urm.nvidia.com/sw-tensorrt-docker/..." to "urm.nvidia.com#sw-tensorrt-docker/..." to bypass credentials
    # (only the first "/" is replaced).
    DOCKER_IMAGE="${DOCKER_IMAGE/\//#}"
    echo "CONTAINER_IMAGE was not set, using Docker image $DOCKER_IMAGE"

    # Import to .sqsh file
    # File name: take the part after "#", split it at ":", turn every "/" of
    # the first piece into "+", then join as "<piece1>+<piece2>.sqsh".
    SQSH_FILE_NAME=$(awk -F'#' '{
        split($2, ref, ":")
        gsub(/\//, "+", ref[1])
        print ref[1] "+" ref[2] ".sqsh"
    }' <<<"$DOCKER_IMAGE")
    CONTAINER_IMAGE="../../enroot/$SQSH_FILE_NAME"

    if [[ ! -f "$CONTAINER_IMAGE" ]]; then
        echo "Container image file $CONTAINER_IMAGE does not exist, importing ..."
        # Make sure the output directory exists before enroot writes into it.
        mkdir -p -- "$(dirname -- "$CONTAINER_IMAGE")"
        srun -N 1 --pty enroot import -o "$CONTAINER_IMAGE" "docker://$DOCKER_IMAGE"
    fi
fi
# Start one task per node in the named container and install the Python
# requirements plus this repository (editable) from the current directory.
WORKDIR=$(realpath "$(pwd)")

# Shell snippet executed inside each container. The steps are chained with &&
# so that a failing pip command stops the remaining ones.
install_script="pip install -U packaging &&
pip install -r ../../requirements.txt --no-build-isolation &&
pip install -e ../.."

# srun options, collected before tracing is enabled so that only the final
# srun command line shows up in the trace.
srun_args=(
    -N "$NODES"
    --ntasks-per-node 1
    --container-image "$CONTAINER_IMAGE"
    --container-name "$CONTAINER_NAME"
    --container-mounts "$CONTAINER_MOUNTS"
    --container-workdir "$WORKDIR"
)

set -x
srun "${srun_args[@]}" bash -c "$install_script"