[None][infra] Some improvements for Slurm execution path in the CI (#10316)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-12-29 19:49:44 +08:00 committed by GitHub
parent 9cee32ab39
commit 965578ca21
4 changed files with 133 additions and 24 deletions

View File

@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
slurmRunner = null
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
if (cluster.containerRuntime.toString() == "DOCKER") {
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
} else if (cluster.containerRuntime.toString() == "ENROOT") {
slurmRunner = runInEnrootOnNode(nodeName)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -799,7 +799,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=200",
"COLUMNS=400",
extraInternalEnv,
portEnvVars,
pytestUtil,
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
}
// data/cache mounts
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
if (cluster.containerRuntime.toString() == "DOCKER") {
mounts += [
"/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
]
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
} else if (cluster.containerRuntime.toString() == "ENROOT") {
if (!cluster.scratchPath) {
throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
}
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptInstallPathNode,
true
)
Utils.exec(pipeline, script: "echo \"Script for Bash utilities: \" && cat ${scriptBashUtilsLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptBashUtilsLocalPath,
scriptBashUtilsPathNode,
true
)
// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def containerImageArg = container
def srunPrologue = ""
if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
if (cluster.containerRuntime.toString() == "ENROOT") {
def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
containerImageArg = enrootImagePath
@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Job submission failed, no job ID returned."
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted job \$jobId"
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
tail -f ${outputPath} &
tailPid=\$!
# Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
# Kill tail -f process
kill \$tailPid
# Check if the job failed or not
sleep 5
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
sleep 10
# Retry getting status and exit code as sacct might be delayed
for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
break
fi
echo "Waiting for sacct to update... attempt \$i"
sleep 10
done
if [ -z "\$EXIT_CODE" ]; then
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1
fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi
if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE"

View File

@@ -0,0 +1,45 @@
#!/bin/bash
# Retry a command with a specified number of retries and interval.
# Arguments:
# max_retries (optional): The maximum number of times to retry the command. Default: 3.
# interval (optional): The time in seconds to wait between retries. Default: 60.
# command: The command to run and its arguments.
# Usage:
# retry_command [max_retries] [interval] command...
# If only one numeric argument is provided, it is treated as max_retries.
function retry_command() {
    local max_retries=3
    local interval=60
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        max_retries=$1
        shift
    fi
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        interval=$1
        shift
    fi
    local cmd=("$@")
    local count=0
    local rc=0
    while [ $count -lt $max_retries ]; do
        if "${cmd[@]}"; then
            return 0
        else
            # Capture the failing command's exit code here; after a plain `fi`
            # with no executed branch, $? would read back as 0.
            rc=$?
        fi
        count=$((count + 1))
        echo "Command failed with exit code $rc. Attempt $count/$max_retries."
        if [ $count -lt $max_retries ]; then
            echo "Retrying in $interval seconds..."
            sleep $interval
        fi
    done
    echo "Command failed after $max_retries attempts."
    return $rc
}
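
A brief usage sketch for retry_command; the commands, URL, and values below are illustrative only, not taken from the CI scripts:

    source jenkins/scripts/bash_utils.sh

    # Retry up to 5 times, sleeping 30 seconds between attempts.
    retry_command 5 30 wget -nv https://example.com/artifact.tar.gz

    # With no numeric arguments the defaults (3 retries, 60 s) apply; compound
    # commands go through `bash -c`, as slurm_install.sh does below.
    retry_command bash -c "apt-get update && apt-get install -y libffi-dev"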

View File

@@ -4,22 +4,25 @@
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
# Source utilities
bashUtilsPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_install\.sh/bash_utils.sh/')"
source "$bashUtilsPath"
slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
if [ $SLURM_LOCALID -eq 0 ]; then
wget -nv $llmTarfile
tar -zxf $tarName
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
apt-get install -y libffi-dev
retry_command apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
pip3 install --retries 10 ray[default]
retry_command pip3 install --retries 10 ray[default]
fi
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

View File

@@ -63,9 +63,10 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
# Only the first process will save the coverage config file
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
fi
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
@@ -95,8 +96,17 @@ echo "Full Command: $pytestCommand"
done
fi
# Turn off "exit on error" so the following lines always run
set +e
pytest_exit_code=0
perf_check_exit_code=0
perf_report_exit_code=0
perf_sanity_check_exit_code=0
eval $pytestCommand
echo "Rank${SLURM_PROCID} Pytest finished execution"
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [[ "$stageName" == *PyTorch* ]]; then
@@ -109,15 +119,38 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
$stageName/perf_script_test_results.csv \
$basePerfPath
echo "Check Perf Result"
perf_check_exit_code=$?
echo "Create Perf Report"
python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path $stageName/report.pdf \
--files $stageName/perf_script_test_results.csv \
$basePerfPath
perf_report_exit_code=$?
echo "Rank${SLURM_PROCID} Perf report finished execution with exit code $perf_report_exit_code"
if [ "$perf_check_exit_code" -eq 0 ] && [ "$perf_report_exit_code" -ne 0 ]; then
perf_check_exit_code=$perf_report_exit_code
fi
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then
final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_check_exit_code
elif [ "$perf_sanity_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_sanity_check_exit_code
else
final_exit_code=0
fi
echo "Rank${SLURM_PROCID} Final Slurm run finished execution with exit code $final_exit_code"
exit $final_exit_code