From c5148f52d5cf8f4d796f6a07fd0acc620c034e7d Mon Sep 17 00:00:00 2001
From: Yanchao Lu
Date: Mon, 1 Sep 2025 10:57:36 +0800
Subject: [PATCH] [None][ci] Some improvements for Slurm CI setup (#7407)

Signed-off-by: Yanchao Lu
---
 jenkins/L0_Test.groovy | 65 ++++++++++++++++++++++++++++++------------
 1 file changed, 46 insertions(+), 19 deletions(-)

diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 363dcf322f..0b9d706828 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -154,16 +154,11 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
                 "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
                 "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
                 "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
-                "${slurmOutputFile} | tail -n1\""
+                "${slurmOutputFile} | tail -n1 || true\""
             ),
             returnStdout: true
         ).trim()
 
-        if (!slurmJobID || !slurmJobID.isNumber()) {
-            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
-            error("Slurm job did not submit successfully. No job ID found.")
-        }
-
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
 
         Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
@@ -180,10 +175,18 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
            pipeline,
            script: Utils.sshUserCmd(
                remote,
-               "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
+               "\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
            )
        )
 
+        if (!slurmJobID || !slurmJobID.isNumber()) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
+            echo "Slurm job did not submit successfully. No job ID found."
+        } else {
+            def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
+        }
+
        Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
    }
 }
@@ -198,6 +201,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
        allowAnyHosts: true,
    ]
 
+    Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+
+    CloudManager.destroyNode(nodeName)
+
+    Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+
    Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
 
    Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +223,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
        pipeline,
        script: Utils.sshUserCmd(
            remote,
-           "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
+           "\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
        )
    )
 
@@ -314,7 +323,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
            slurmJobID = jobIDs ? jobIDs[-1] : null
 
            if (!slurmJobID || !slurmJobID.isNumber()) {
-               error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+               echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
            }
            Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
            Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -361,12 +370,22 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                    error "The Slurm node does not come online in the waiting period. Terminating the job."
                }
            }
+        } catch (Exception e) {
+            if (e.getMessage()?.contains("Failed to kill container")) {
+                echo "Known benign error ignored: ${e.getMessage()}"
+            } else {
+                throw e // Re-throw any other exception
+            }
        } finally {
-            stage('Clean up SLURM Resources') {
-                Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
-                CloudManager.destroyNode(nodeName)
-                Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
-                cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+            stage("Clean up SLURM Resources") {
+                // Workaround for interruptions during SLURM resource cleanup
+                retry(3) {
+                    try {
+                        cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+                    } catch (Exception e) {
+                        error "Error during SLURM resource cleanup: ${e.getMessage()}; retrying."
+                    }
+                }
            }
        }
    }
@@ -420,7 +439,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
        def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
        def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
        def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-       slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
+       slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
        def testListPathNode = "${jobWorkspace}/${testList}.txt"
        def waivesListPathNode = "${jobWorkspace}/waives.txt"
        def isAarch64 = config.contains("aarch64")
@@ -474,6 +493,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
            def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
 
            scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+            // TODO: check if the tee always returns 0
            def scriptContent = """#!/bin/bash
                export jobWorkspace=$jobWorkspace
                export tarName=$tarName
@@ -515,8 +535,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
        } finally {
            uploadResults(pipeline, cluster, jobUID, stageName)
 
-            stage('Clean up SLURM Resources') {
-                cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+            stage("Clean up SLURM Resources") {
+                // Workaround for interruptions during SLURM resource cleanup
+                retry(3) {
+                    try {
+                        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+                    } catch (Exception e) {
+                        error "Error during SLURM resource cleanup: ${e.getMessage()}; retrying."
+                    }
+                }
            }
        }
    }
@@ -644,7 +671,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
        if (stageIsInterrupted) {
            echo "Stage is interrupted, skip to upload test result."
        } else {
-           sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
+           sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
            if (noResultIfSuccess && !stageIsFailed) {
                // Clean up the workspace
                sh """
@@ -1526,7 +1553,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
    stage ("[${stageName}] Run Pytest") {
        echoNodeAndGpuInfo(pipeline, stageName)
 
-       sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
+       sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
 
        def extraInternalEnv = ""
        def pytestTestTimeout = "3600"