[None][infra] add retry logic to get the Slurm sbatch job log when the SSH connection drops (#9167)
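The change splits the single remote exec script into separate submit, track, and status scripts so that the long-running log tail can be re-attached if the SSH session carrying it is lost. A minimal sketch of the node-side pattern, with placeholder workspace/log paths and a placeholder job script (job.sh) rather than the exact contents the pipeline writes:

#!/bin/bash
# Sketch only: submit once and record the job ID, then tail the log and poll
# sacct in a step that is safe to re-run after a dropped connection.
# WORKSPACE, LOG, and job.sh are placeholders, not the pipeline's real paths.
set -Eeuo pipefail
WORKSPACE=/tmp/slurm-demo
LOG="$WORKSPACE/job-output.log"

# Submit step (runs once).
mkdir -p "$WORKSPACE"
: > "$LOG"
jobId=$(sbatch --output="$LOG" job.sh | awk '{print $4}')
[ -n "$jobId" ] || { echo "sbatch returned no job ID" >&2; exit 1; }
echo "$jobId" > "$WORKSPACE/slurm_job_id.txt"

# Track step (re-runnable): attach to the log, poll until a terminal state.
jobId=$(cat "$WORKSPACE/slurm_job_id.txt")
tail -f "$LOG" &
tailPid=$!
while true; do
    state=$(sacct -j "$jobId" --format=JobIDRaw,State --noheader \
        | awk -v jobId="$jobId" '$1 == jobId {print $2}')
    case "$state" in
        ""|RUNNING|PENDING|CONFIGURING) sleep 300 ;;
        *) echo "Job $jobId finished with state: $state"; break ;;
    esac
done
kill "$tailPid"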

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
yuanjingx87 2026-01-03 18:11:37 -08:00 committed by GitHub
parent 0d1f5ad7a2
commit 5bd37ce41e

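In the diff below, the Jenkins side of the retry is a Groovy while(true) loop that asks the remote host for the job state and re-attaches to the sbatch log whenever the job is still running. Roughly the same control flow, rendered as a standalone bash sketch with a placeholder host and placeholder remote script paths (the pipeline itself drives this over Utils.exec/sshUserCmd):

#!/bin/bash
# Sketch only: HOST, TRACK, and STATUS are placeholders for the Slurm login
# node and the uploaded slurm_track.sh / slurm_status.sh scripts.
set -u
HOST=user@slurm-login-node
TRACK=/workspace/slurm_track.sh
STATUS=/workspace/slurm_status.sh

while true; do
    # Ask for the job state; treat an unreachable host as "unknown/running".
    state=$(ssh "$HOST" "bash $STATUS" 2>/dev/null || true)
    case "$state" in
        ""|RUNNING|PENDING|CONFIGURING)
            echo "Job still running, (re)attaching to the sbatch log"
            # tail -f plus a completion wait; if this ssh drops, just loop.
            ssh "$HOST" "bash $TRACK" || true
            ;;
        *)
            echo "Job finished with state: $state"
            break
            ;;
    esac
done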

@@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
- def outputPath = "${jobWorkspace}/job-output.log"
+ def sbatchLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
+ def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
+ def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
+ def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
+ def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
+ def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
+ def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
stage("[${stageName}] Initializing Test") {
@@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')
// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
- #SBATCH --output=${outputPath}
+ #SBATCH --output=${sbatchLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)
- def scriptExec = """#!/bin/bash
- set -xEeuo pipefail
+ def scriptSubmit = """#!/bin/bash
+ set -Eeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
# Clean up previous job intermediate files so that retry can work
@@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"
rm -rf "${sbatchLogPath}"
touch "${outputPath}"
touch ${sbatchLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
- tail -f ${outputPath} &
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptSubmitPathLocal,
scriptSubmitPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptSubmitPathNode
),
numRetries: 3
)
def sbatchJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
).trim()
def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
tailPid=\$!
# Wait until sbatch job is done.
- while squeue -j \$jobId -o %T >/dev/null 2>&1; do
- sleep 300
+ while true; do
+ state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
+ if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
+ echo "job is still running"
+ sleep 300
+ else
+ echo "Job \$jobId finished with state: \$state"
+ break
+ fi
done
# Kill tail -f process
kill \$tailPid
@@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()
- pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
- Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
+ pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.copyFileToRemoteHost(
pipeline,
remote,
- scriptExecPathLocal,
- scriptExecPathNode,
+ scriptTrackPathLocal,
+ scriptTrackPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
Utils.exec(
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
- timeout: false,
- script: Utils.sshUserCmd(
- remote,
- "\"${scriptExecPathNode}\""
- ),
- numRetries: 3
+ remote,
+ scriptStatusPathLocal,
+ scriptStatusPathNode,
+ true
)
}
sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {