Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

[None][infra] Some improvements for Slurm execution path in the CI (#10316)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>

parent: 9cee32ab39
commit: 965578ca21
@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
     }

     slurmRunner = null
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         slurmRunner = runInEnrootOnNode(nodeName)
     } else {
         throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -799,7 +799,7 @@ def getPytestBaseCommandLine(
         "LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
         "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
         "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
-        "COLUMNS=200",
+        "COLUMNS=400",
         extraInternalEnv,
         portEnvVars,
         pytestUtil,
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
     }

     // data/cache mounts
-    if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
+    if (cluster.containerRuntime.toString() == "DOCKER") {
         mounts += [
             "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
         ]
-    } else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    } else if (cluster.containerRuntime.toString() == "ENROOT") {
         if (!cluster.scratchPath) {
             throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
         }
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
     def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
     def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
     def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
+    def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
+    def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
         scriptInstallPathNode,
         true
     )
+    Utils.exec(pipeline, script: "echo \"Script for Bash utilities: \" && cat ${scriptBashUtilsLocalPath}")
+    Utils.copyFileToRemoteHost(
+        pipeline,
+        remote,
+        scriptBashUtilsLocalPath,
+        scriptBashUtilsPathNode,
+        true
+    )

     // Generate Test List and Upload to Frontend Node
     def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG

     def containerImageArg = container
     def srunPrologue = ""
-    if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
+    if (cluster.containerRuntime.toString() == "ENROOT") {
         def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
         containerImageArg = enrootImagePath

@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 set -xEeuo pipefail
 trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

-echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
-echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
-
+echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
 export jobWorkspace=$jobWorkspace
 export tarName=$tarName
 export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 touch "${outputPath}"
 jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
 if [ -z "\$jobId" ]; then
-    echo "Error: Job submission failed, no job ID returned."
+    echo "Error: Slurm job submission failed, no job ID returned."
     exit 1
 fi
-echo "Submitted job \$jobId"
+echo "Submitted Slurm job \$jobId"
+echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
 tail -f ${outputPath} &
 tailPid=\$!
 # Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
 # Kill tail -f process
 kill \$tailPid
 # Check if the job failed or not
-sleep 5
-STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
-EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+sleep 10
+# Retry getting status and exit code as sacct might be delayed
+for i in {1..3}; do
+    STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
+    EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
+
+    if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
+        break
+    fi
+    echo "Waiting for sacct to update... attempt \$i"
+    sleep 10
+done
+
+if [ -z "\$EXIT_CODE" ]; then
+    echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
+    EXIT_CODE=1
+fi
+if [ -z "\$STATUS" ]; then
+    echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
+    STATUS="UNKNOWN"
+fi
+
 if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
     echo "Pytest succeed in Slurm job \$jobId"
     echo "Status: \$STATUS | Exit_code \$EXIT_CODE"
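The script body above appears to be embedded in a Groovy pipeline string, hence the escaped \$ variables. For clarity, a minimal standalone sketch of the same submit-and-poll pattern in plain Bash, with a placeholder launch script name and no Jenkins plumbing:

    #!/bin/bash
    # Submit the batch script (placeholder name), then poll sacct until it reports
    # both a state and an exit code; sacct can lag briefly after the job finishes.
    jobId=$(sbatch launch.sh | awk '{print $4}')
    for i in {1..3}; do
        STATUS=$(sacct -j "$jobId" --format=State --noheader | head -n 1 | awk '{print $1}')
        EXIT_CODE=$(sacct -j "$jobId" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')
        if [ -n "$STATUS" ] && [ -n "$EXIT_CODE" ]; then
            break
        fi
        sleep 10
    done
    if [[ "$STATUS" == "COMPLETED" && "$EXIT_CODE" -eq 0 ]]; then
        echo "Slurm job $jobId passed"
    fi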
jenkins/scripts/bash_utils.sh (new file, 45 lines)
@@ -0,0 +1,45 @@
+#!/bin/bash
+
+# Retry a command with a specified number of retries and interval.
+# Arguments:
+#   max_retries (optional): The maximum number of times to retry the command. Default: 3.
+#   interval (optional): The time in seconds to wait between retries. Default: 60.
+#   command: The command to run and its arguments.
+# Usage:
+#   retry_command [max_retries] [interval] command...
+#   If only one numeric argument is provided, it is treated as max_retries.
+function retry_command() {
+    local max_retries=3
+    local interval=60
+
+    if [[ "$1" =~ ^[0-9]+$ ]]; then
+        max_retries=$1
+        shift
+    fi
+
+    if [[ "$1" =~ ^[0-9]+$ ]]; then
+        interval=$1
+        shift
+    fi
+
+    local cmd=("$@")
+
+    local count=0
+    local rc=0
+
+    while [ $count -lt $max_retries ]; do
+        if "${cmd[@]}"; then
+            return 0
+        fi
+        rc=$?
+        count=$((count + 1))
+        echo "Command failed with exit code $rc. Attempt $count/$max_retries."
+        if [ $count -lt $max_retries ]; then
+            echo "Retrying in $interval seconds..."
+            sleep $interval
+        fi
+    done
+
+    echo "Command failed after $max_retries attempts."
+    return $rc
+}
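For illustration only, a minimal usage sketch of the new retry_command helper; the URL and archive name below are placeholders, and the real call sites are the retry_command lines added in the next hunk:

    #!/bin/bash
    source jenkins/scripts/bash_utils.sh

    # Up to 5 attempts, 30 seconds apart (both numeric arguments given).
    retry_command 5 30 wget -nv https://example.com/artifact.tar.gz

    # Defaults (3 attempts, 60-second interval); compound commands go through bash -c
    # so the whole pipeline is retried as one unit.
    retry_command bash -c "wget -nv https://example.com/artifact.tar.gz && tar -zxf artifact.tar.gz"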
@@ -4,22 +4,25 @@
 set -xEeuo pipefail
 trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

+# Source utilities
+bashUtilsPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_install\.sh/bash_utils.sh/')"
+source "$bashUtilsPath"
+
 slurm_install_setup() {
     cd $resourcePathNode
     llmSrcNode=$resourcePathNode/TensorRT-LLM/src

     if [ $SLURM_LOCALID -eq 0 ]; then
-        wget -nv $llmTarfile
-        tar -zxf $tarName
+        retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
         which python3
         python3 --version
-        apt-get install -y libffi-dev
+        retry_command apt-get install -y libffi-dev
         nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
         if [[ $pytestCommand == *--run-ray* ]]; then
-            pip3 install --retries 10 ray[default]
+            retry_command pip3 install --retries 10 ray[default]
         fi
-        cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
-        cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
+        retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
+        retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
         gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
         hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
         echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
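The bashUtilsPath line above derives the helper's path from the install script's own location, so the lookup still works when both files are copied to the node with a job-UID prefix. A quick illustration with a hypothetical path:

    #!/bin/bash
    # Hypothetical on-node location, mirroring the "${jobUID}-" prefix used when the
    # scripts are copied to the job workspace.
    example="/workspace/abc123-slurm_install.sh"
    echo "$(dirname "$example")/$(basename "$example" | sed 's/slurm_install\.sh/bash_utils.sh/')"
    # Prints: /workspace/abc123-bash_utils.sh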
@@ -63,9 +63,10 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
 # Only the first process will save the coverage config file
 if [ $SLURM_PROCID -eq 0 ]; then
     sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
-else
-    # Sleep 10 seconds to wait for the coverage config file to be saved
-    sleep 10
 fi
+# Sleep 10 seconds to wait for the coverage config file to be saved
+sleep 10
+
 containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
 containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
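The change above moves the wait out of the else branch, so every rank (including rank 0) pauses after the rank-0 write rather than only the non-zero ranks. A stripped-down sketch of the pattern, with hypothetical paths:

    #!/bin/bash
    SLURM_PROCID=${SLURM_PROCID:-0}
    coverageConfigFile="/tmp/.coveragerc"           # hypothetical path
    touch "$coverageConfigFile"                     # ensure the file exists for the demo
    if [ "$SLURM_PROCID" -eq 0 ]; then
        sed -i "s|---wheel_path---|/opt/wheel|g" "$coverageConfigFile"   # placeholder wheel path
    fi
    # All ranks give the file time to land on shared storage before reading it.
    sleep 10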
@@ -95,8 +96,17 @@ echo "Full Command: $pytestCommand"
     done
 fi

+# Turn off "exit on error" so the following lines always run
+set +e
+
+pytest_exit_code=0
+perf_check_exit_code=0
+perf_report_exit_code=0
+perf_sanity_check_exit_code=0
+
 eval $pytestCommand
-echo "Rank${SLURM_PROCID} Pytest finished execution"
+pytest_exit_code=$?
+echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"

 if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
     if [[ "$stageName" == *PyTorch* ]]; then
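The set +e added here lets the script record a non-zero pytest exit code instead of aborting (assuming errexit is active earlier in the script, as in the companion scripts shown above). A minimal sketch of the capture pattern, where false stands in for the real pytest command:

    #!/bin/bash
    set -xEeuo pipefail

    set +e                      # disable errexit so a failure does not abort the script
    false                       # stands in for `eval $pytestCommand`
    pytest_exit_code=$?
    echo "captured exit code $pytest_exit_code; post-processing still runs"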
@@ -109,15 +119,38 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
     python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
         $stageName/perf_script_test_results.csv \
         $basePerfPath
-    echo "Check Perf Result"
+    perf_check_exit_code=$?
+
     echo "Create Perf Report"
     python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
         --output_path $stageName/report.pdf \
         --files $stageName/perf_script_test_results.csv \
         $basePerfPath
+    perf_report_exit_code=$?
+    echo "Rank${SLURM_PROCID} Perf report finished execution with exit code $perf_report_exit_code"
+
+    if [ "$perf_check_exit_code" -eq 0 ] && [ "$perf_report_exit_code" -ne 0 ]; then
+        perf_check_exit_code=$perf_report_exit_code
+    fi
+    echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
 fi

 if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
+    echo "Check Perf-Sanity Result"
     python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
         $jobWorkspace
+    perf_sanity_check_exit_code=$?
+    echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
 fi
+
+if [ "$pytest_exit_code" -ne 0 ]; then
+    final_exit_code=$pytest_exit_code
+elif [ "$perf_check_exit_code" -ne 0 ]; then
+    final_exit_code=$perf_check_exit_code
+elif [ "$perf_sanity_check_exit_code" -ne 0 ]; then
+    final_exit_code=$perf_sanity_check_exit_code
+else
+    final_exit_code=0
+fi
+echo "Rank${SLURM_PROCID} Final Slurm run finished execution with exit code $final_exit_code"
+exit $final_exit_code