TensorRT-LLM/jenkins/scripts/slurm_install.sh

#!/bin/bash
# Set up error handling
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
# Source utilities
bashUtilsPath="$(dirname "${BASH_SOURCE[0]}")/bash_utils.sh"
source "$bashUtilsPath"
slurm_install_setup() {
    cd "$resourcePathNode"
    llmSrcNode="$resourcePathNode/TensorRT-LLM/src"
    # Use a unique lock file for this job ID and node
    lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
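    # Local rank 0 performs the install and then creates the lock file;
    # every other task on the node waits below for the file to appear.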
    if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
        # Clear any stale lock from a previous attempt (rm -f is a no-op if absent)
        rm -f "$lock_file"
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
retry_command apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
retry_command pip3 install --retries 10 ray[default]
fi
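        # Install dev requirements from the unpacked source tree, then
        # force-reinstall the prebuilt wheel; --no-deps replaces only the
        # wheel itself, leaving the dependencies installed above untouched.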
retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
touch "$lock_file"
    else
        echo "(Waiting for install lock) Current directory: $(pwd)"
        while [ ! -f "$lock_file" ]; do
            sleep 10
        done
    fi
}
# Only run slurm_install_setup when script is executed directly (not sourced)
if [[ "${BASH_SOURCE[0]}" == "${0}" ]]; then
slurm_install_setup
fi
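
# Illustrative invocation only (paths and task counts here are hypothetical):
# the script is meant to run once per task under Slurm, e.g.
#   srun --ntasks-per-node=8 bash /path/to/slurm_install.sh
# so that SLURM_LOCALID distinguishes the installing task from the waiters.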