[None][fix] Fix a typo in the Slurm CI code (#7485)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
Yanchao Lu 2025-09-04 13:56:27 +08:00 committed by GitHub
parent 931816fee1
commit c622f61609
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 11 additions and 6 deletions

View File

@@ -684,7 +684,7 @@ pipeline {
}
cmd += imageKeyToTag.values().join(" ")
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
sh cmd
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
}
}
}

View File

@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
Utils.exec(
pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
)
)
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
)
)
Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
if (CloudManager.isNodeOnline(nodeName)) {
def dockerGpuOption = ""
def dockerGPUOption = ""
node(nodeName) {
sh """
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
dockerGPUOption = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
"-v /tmp/ccache:${CCACHE_DIR}:rw " +
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add syslog"
"--cap-add=SYSLOG"
echo "Final dockerArgs: ${dockerArgs}"
@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
].join(" ")
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
// TODO: check if the tee always returns 0
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
set -o pipefail
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile