[None][ci] Some improvements for Slurm CI setup (#7407)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-09-01 10:57:36 +08:00 committed by GitHub
parent e257cb3533
commit c5148f52d5


@@ -154,16 +154,11 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
"-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
"-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
"${slurmOutputFile} | tail -n1\""
"${slurmOutputFile} | tail -n1 || true\""
),
returnStdout: true
).trim()
if (!slurmJobID || !slurmJobID.isNumber()) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
error("Slurm job did not submit successfully. No job ID found.")
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
@@ -180,10 +175,18 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
)
)
if (!slurmJobID || !slurmJobID.isNumber()) {
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
echo "Slurm job did not submit successfully. No job ID found."
} else {
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
}
}
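The rename step works because Slurm expands the %j placeholder in --output paths to the numeric job ID once the job runs, so the pipeline-side copy of the path has to be rewritten the same way before the log can be found. A small sketch of the substitution, with an illustrative path only:

// Illustrative path and ID; only the %j substitution itself matters here.
def slurmOutputFile = "/home/svc_tensorrt/slurm-logs/example-uid-slurm-%j.out"
def slurmJobID = "123456"
def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
assert newSlurmOutputFile.endsWith("slurm-123456.out")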
@@ -198,6 +201,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
CloudManager.destroyNode(nodeName)
Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +223,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
pipeline,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
"\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
)
)
@@ -314,7 +323,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
slurmJobID = jobIDs ? jobIDs[-1] : null
if (!slurmJobID || !slurmJobID.isNumber()) {
error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
}
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -361,12 +370,22 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw if it's a different IOException
}
} finally {
stage('Clean up SLURM Resources') {
Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
CloudManager.destroyNode(nodeName)
Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
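The cleanup workaround pairs retry(3) with a catch that converts any exception back into a step failure via error(), which is what makes retry re-run the body. A hedged sketch of the pattern, with doCleanup() as a hypothetical placeholder:

// retry(n) re-executes its body only when the body fails, so the catch block
// turns any exception raised during cleanup (including interruptions) into an
// explicit step failure, giving cleanup up to three attempts before the stage fails.
retry(3) {
    try {
        doCleanup()
    } catch (Exception e) {
        error "Cleanup attempt failed: ${e.getMessage()}, retrying."
    }
}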
@@ -420,7 +439,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def isAarch64 = config.contains("aarch64")
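SlurmConfig.getOutputFilePath itself is not part of this diff; the following is a hypothetical sketch of what such a helper might look like, assuming it only joins the log directory, the job UID, and Slurm's %j placeholder (consistent with the %j rename earlier in this change). The real implementation may differ.

// Hypothetical helper, for illustration only.
static String getOutputFilePath(String logDir, String jobUID) {
    // %j is expanded by Slurm to the numeric job ID at run time
    return "${logDir}/${jobUID}-slurm-%j.out"
}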
@@ -474,6 +493,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
// TODO: check if the tee always returns 0
def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
@@ -515,8 +535,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)
stage('Clean up SLURM Resources') {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
stage("Clean up SLURM Resources") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
} catch (Exception e) {
error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
}
}
}
}
}
@@ -644,7 +671,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
@@ -1526,7 +1553,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
stage ("[${stageName}] Run Pytest")
{
echoNodeAndGpuInfo(pipeline, stageName)
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
def extraInternalEnv = ""
def pytestTestTimeout = "3600"
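The dmesg calls above are guarded with "|| true" because dmesg can fail even when run as uid 0, for example in containers where reading the kernel ring buffer is restricted (kernel.dmesg_restrict=1 or a missing CAP_SYSLOG). A hedged sketch of the guarded step:

// "|| true" keeps a harmless dmesg failure from failing the whole sh step
// while still printing the ring buffer when it is readable.
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'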