[None][infra] add retry logic to get slurm sbatch job log when ssh dropped (#9167)
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
Parent: 0d1f5ad7a2
Commit: 5bd37ce41e
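In outline, the change replaces the single long-lived SSH session that used to submit the job and tail its log (the old scriptExec) with three small remote scripts: one that submits and records the job ID, one that tails the sbatch log while polling sacct, and one that only prints the current job state. The pipeline then drives a polling loop over short SSH calls, so a dropped connection loses at most one call that can be re-issued. A minimal sketch of that control flow, condensed from the hunks below (same Utils helpers and script-path variables as in the diff; not the exact pipeline code):

// Illustrative sketch of the new control flow, not the exact pipeline code.
// Submit once; the submit script is safe to retry because it cleans stale files first.
Utils.exec(pipeline, timeout: false, numRetries: 3,
    script: Utils.sshUserCmd(remote, scriptSubmitPathNode))

// Poll until sacct reports a terminal state for the recorded job ID.
while (true) {
    def state = Utils.exec(pipeline, returnStdout: true,
        script: Utils.sshUserCmd(remote, scriptStatusPathNode)).trim()
    if (!state || state in ["RUNNING", "PENDING", "CONFIGURING"]) {
        // Re-attach to the sbatch log; if this SSH call drops,
        // the next loop iteration simply re-attaches.
        Utils.exec(pipeline, timeout: false,
            script: Utils.sshUserCmd(remote, scriptTrackPathNode))
    } else {
        break
    }
}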
@@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
def sbatchLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

stage("[${stageName}] Initializing Test") {
@@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')

// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${outputPath}
#SBATCH --output=${sbatchLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)

def scriptExec = """#!/bin/bash
set -xEeuo pipefail
def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

# Clean up previous job intermediate files so that retry can work
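The new scriptSubmit drops the -x tracing that scriptExec used but keeps -Eeuo pipefail and adds an ERR trap; because of -E the trap also fires inside functions and subshells, so a failing command reports its file, line, command, and exit code before the script exits, which is what surfaces in the Jenkins log when a submit retry fires. A self-contained sketch of that prologue as the pipeline would assemble it (the variable name is illustrative):

// Illustrative only: strict-mode + ERR-trap prologue like the one in scriptSubmit.
// In a Groovy GString, \$ leaves the dollar sign for bash instead of interpolating.
def bashStrictPrologue = """#!/bin/bash
    set -Eeuo pipefail
    trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
""".replaceAll("(?m)^\\s*", "").trim()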
@@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"
rm -rf "${sbatchLogPath}"

touch "${outputPath}"
touch ${sbatchLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
tail -f ${outputPath} &
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptSubmitPathLocal,
scriptSubmitPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptSubmitPathNode
),
numRetries: 3
)
def sbatchJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
).trim()
def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
tailPid=\$!
# Wait until sbatch job is done.
while squeue -j \$jobId -o %T >/dev/null 2>&1; do
sleep 300
while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
echo "job is still running"
sleep 300
else
echo "Job \$jobId finished with state: \$state"
break
fi
done
# Kill tail -f process
kill \$tailPid
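Two details in the submit and track scripts above are easy to miss: sbatch prints "Submitted batch job <id>", which is why the job ID is taken from the fourth awk field, and sacct lists step rows such as <id>.batch alongside the job itself, which is why the state query filters on JobIDRaw (the empty-string concatenation in the awk pattern forces a string comparison). A hedged sketch of how that state query could be built as a reusable Groovy helper (the closure name is hypothetical, not part of the change):

// Hypothetical helper mirroring the state query used by scriptTrack/scriptStatus.
// It prints a single state word (e.g. COMPLETED, FAILED) or nothing if accounting
// has no record for the job yet.
def sacctStateCmd = { String jobIdFile ->
    "jobId=\$(cat ${jobIdFile}) && " +
    "sacct -j \$jobId --format=JobIDRaw,State --noheader | " +
    "awk -v jobId=\$jobId '\"\"\$1\"\" == jobId {print \$2}'"
}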
@@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptExecPathLocal,
scriptExecPathNode,
scriptTrackPathLocal,
scriptTrackPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
Utils.exec(
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
"\"${scriptExecPathNode}\""
),
numRetries: 3
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)
}

sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {
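Note that both the bash poll in scriptTrack and the Groovy loop above treat an empty sacct answer the same as a running job; right after submission sacct may not have a record yet, and classifying that window as "still in progress" avoids declaring the job finished prematurely. A small sketch of that classification (the helper name is hypothetical):

// Hypothetical helper matching the checks used above: anything other than a known
// in-progress answer is treated as a terminal state.
def stillInProgress = { String state ->
    !state || state in ["RUNNING", "PENDING", "CONFIGURING"]
}
assert stillInProgress("")          // no sacct record yet -> keep waiting
assert stillInProgress("PENDING")
assert !stillInProgress("COMPLETED")
assert !stillInProgress("FAILED")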