Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[None][fix] Fix a typo in the Slurm CI codes (#7485)
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent 931816fee1
commit c622f61609
@@ -684,7 +684,7 @@ pipeline {
         }
         cmd += imageKeyToTag.values().join(" ")
         withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
-            sh cmd
+            trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
         }
     }
 }
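The change above swaps a bare `sh cmd` for the repository's retry wrapper. The wrapper's implementation is not part of this diff; the following is only a rough sketch, under the assumption that it behaves like a typical shared-library retry helper (the parameter names are copied from the call site, everything else is guessed):

    // Hypothetical sketch only -- not the actual trtllm_utils implementation.
    // At the call site, Groovy collects the named arguments (script:, numRetries:, ...)
    // into a Map that is passed as the leading argument, followed by `this`.
    def llmExecStepWithRetry(Map args, def pipeline) {
        def numRetries = args.numRetries ?: 3
        def timeoutSec = args.shortCommondRunTimeMax ?: 3600   // parameter name as used at the call site
        def lastError = null
        for (int attempt = 1; attempt <= numRetries; attempt++) {
            try {
                pipeline.timeout(time: timeoutSec, unit: 'SECONDS') {
                    pipeline.sh(args.script)
                }
                return
            } catch (Exception e) {
                lastError = e
                pipeline.echo("Attempt ${attempt}/${numRetries} failed: ${e.message}")
            }
        }
        throw lastError
    }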
@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
     Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
 
-    Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+    Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
 
     Utils.exec(
         pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         )
     )
 
+    Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
         )
     )
 
+    Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
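Both hunks above add the same fixed 30-second pause between cancelling a Slurm job and the follow-up SSH command. A minimal, hypothetical sketch of that pattern is below; the helper name, the scancel call, and the workspace removal are assumptions, since the surrounding Utils.exec / Utils.sshUserCmd calls are only partially visible in this diff:

    // Illustrative only: cancel the job, wait for Slurm to tear it down, then clean up.
    def cleanUpSlurmJobSketch(def pipeline, String sshTarget, String slurmJobID, String jobWorkspace) {
        pipeline.sh "ssh ${sshTarget} 'scancel ${slurmJobID} || true'"      // assumed cancel step

        // New in this commit: give Slurm time to actually terminate the job first.
        pipeline.sh "echo Sleeping to allow Slurm job termination; sleep 30"

        pipeline.sh "ssh ${sshTarget} 'rm -rf ${jobWorkspace}'"             // assumed cleanup step
    }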
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerGpuOption = ""
+        def dockerGPUOption = ""
 
         node(nodeName) {
            sh """
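This rename matters because the code in the next hunk assigns to dockerGPUOption; with the declaration spelled dockerGpuOption, that assignment never touches the declared local. A standalone Groovy illustration of the failure mode follows; it is not the CI code, and in a sandboxed Jenkins pipeline the exact symptom can differ (a leaked script-level variable or a MissingPropertyException):

    // Standalone Groovy script demonstrating the bug class fixed by the rename.
    def declareAndAssign() {
        def dockerGpuOption = ""          // local variable, declared with one spelling
        dockerGPUOption = "--gpus all"    // different spelling: this is not the local above;
                                          // in a script it falls through to the script binding
    }

    declareAndAssign()
    println binding.variables             // contains dockerGPUOption -- it leaked out of the function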
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
            // Dynamically set GPU arguments based on environment variables
            // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+           // It's intentional to check NV_GPU first.
            dockerGPUOption = sh(script: """
                if [ -n "\$NV_GPU" ]; then
                    echo "--gpus '\\"device=\$NV_GPU\\"'"
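The hunk above only shows the first branch of the GPU-option selection. A hedged sketch of the complete pattern is below; the CUDA_VISIBLE_DEVICES fallback, the default branch, and the returnStdout handling are assumptions, not part of the diff:

    // Sketch of the dynamic GPU-argument selection; only the NV_GPU branch appears in the diff.
    dockerGPUOption = sh(script: """
        if [ -n "\$NV_GPU" ]; then
            echo "--gpus '\\"device=\$NV_GPU\\"'"
        elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
            echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"          # assumed fallback
        else
            echo "--gpus all"                                            # assumed default
        fi
    """, returnStdout: true).trim()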
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
                "-v /tmp/ccache:${CCACHE_DIR}:rw " +
                "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
-               "--cap-add syslog"
+               "--cap-add=SYSLOG"
 
            echo "Final dockerArgs: ${dockerArgs}"
 
@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     ].join(" ")
 
     def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-    scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-    // TODO: check if the tee always returns 0
+    def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
     def scriptContent = """#!/bin/bash
+        set -o pipefail
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
         export llmTarfile=$llmTarfile
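The last hunk also explains the removed TODO: without `set -o pipefail`, a construct like `some_command | tee log` in the generated launch script reports tee's exit status (almost always 0) and masks failures of the left-hand command. A small illustration, written as ordinary Jenkins sh steps rather than the real slurm_launch.sh:

    // Illustration only: why the generated slurm_launch.sh needs `set -o pipefail`.
    node {
        // Without pipefail, the pipeline's exit status is tee's (0), so the failure
        // of `false` is masked and this step succeeds.
        sh '''#!/bin/bash
            false | tee out.log
            echo "pipeline exit status without pipefail: $?"
        '''

        // With pipefail, the same pipeline reports the failing command's status and
        // the step fails -- failures inside the launch script now surface.
        sh '''#!/bin/bash
            set -o pipefail
            false | tee out.log
        '''
    }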