[None][ci] Some improvements for Slurm CI setup (#7407)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-09-01 10:57:36 +08:00 committed by GitHub
parent e257cb3533
commit c5148f52d5


@@ -154,16 +154,11 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
"-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
"${slurmOutputFile} | tail -n1\""
"${slurmOutputFile} | tail -n1 || true\""
),
returnStdout: true
).trim()
if (!slurmJobID || !slurmJobID.isNumber()) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
error("Slurm job did not submit successfully. No job ID found.")
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
@@ -180,10 +175,18 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
)
)
if (!slurmJobID || !slurmJobID.isNumber()) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
echo "Slurm job did not submit successfully. No job ID found."
} else {
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
}
}
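The rename step works because Slurm expands the %j placeholder in --output paths to the numeric job ID once the job runs, so the pipeline-side copy of the path has to be rewritten the same way before the log can be found. A small sketch of the substitution, with an illustrative path only:

// Illustrative path and ID; only the %j substitution itself matters here.
def slurmOutputFile = "/home/svc_tensorrt/slurm-logs/example-uid-slurm-%j.out"
def slurmJobID = "123456"
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
assert newSlurmOutputFile.endsWith("slurm-123456.out")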
@@ -198,6 +201,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
CloudManager.destroyNode(nodeName)
Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +223,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
)
)
@@ -314,7 +323,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
slurmJobID = jobIDs ? jobIDs[-1] : null
if (!slurmJobID || !slurmJobID.isNumber()) {
error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -361,12 +370,22 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw if it's a different IOException
}
} finally {
stage('Clean up SLURM Resources') {
Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
CloudManager.destroyNode(nodeName)
Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
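The cleanup workaround pairs retry(3) with a catch that converts any exception back into a step failure via error(), which is what makes retry re-run the body. A hedged sketch of the pattern, with doCleanup() as a hypothetical placeholder:

// retry(n) re-executes its body only when the body fails, so the catch block
// turns any exception raised during cleanup (including interruptions) into an
// explicit step failure, giving cleanup up to three attempts before the stage fails.
retry(3) {
    try {
        doCleanup()
    } catch (Exception e) {
        error "Cleanup attempt failed: ${e.getMessage()}, retrying."
    }
}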
@@ -420,7 +439,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def isAarch64 = config.contains("aarch64")
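SlurmConfig.getOutputFilePath itself is not part of this diff; the following is a hypothetical sketch of what such a helper might look like, assuming it only joins the log directory, the job UID, and Slurm's %j placeholder (consistent with the %j rename earlier in this change). The real implementation may differ.

// Hypothetical helper, for illustration only.
static String getOutputFilePath(String logDir, String jobUID) {
    // %j is expanded by Slurm to the numeric job ID at run time
    return "${logDir}/${jobUID}-slurm-%j.out"
}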
@@ -474,6 +493,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
// TODO: check if the tee always returns 0
def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
@@ -515,8 +535,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)
stage('Clean up SLURM Resources') {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
@@ -644,7 +671,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
@@ -1526,7 +1553,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
stage ("[${stageName}] Run Pytest")
{
echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
def extraInternalEnv = ""
def pytestTestTimeout = "3600"
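The dmesg calls above are guarded with "|| true" because dmesg can fail even when run as uid 0, for example in containers where reading the kernel ring buffer is restricted (kernel.dmesg_restrict=1 or a missing CAP_SYSLOG). A hedged sketch of the guarded step:

// "|| true" keeps a harmless dmesg failure from failing the whole sh step
// while still printing the ring buffer when it is readable.
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'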