Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[None][fix] Fix a typo in the Slurm CI codes (#7485)
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent 931816fee1
commit c622f61609
@@ -684,7 +684,7 @@ pipeline {
         }
         cmd += imageKeyToTag.values().join(" ")
         withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
-            sh cmd
+            trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 3, shortCommondRunTimeMax: 7200)
         }
     }
 }
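The change above swaps a bare `sh cmd` for the repository's retry wrapper. The wrapper's implementation is not part of this diff; the following is only a rough sketch, under the assumption that it behaves like a typical shared-library retry helper (the parameter names are copied from the call site, everything else is guessed):

    // Hypothetical sketch only -- not the actual trtllm_utils implementation.
    // At the call site, Groovy collects the named arguments (script:, numRetries:, ...)
    // into a Map that is passed as the leading argument, followed by `this`.
    def llmExecStepWithRetry(Map args, def pipeline) {
        def numRetries = args.numRetries ?: 3
        def timeoutSec = args.shortCommondRunTimeMax ?: 3600   // parameter name as used at the call site
        def lastError = null
        for (int attempt = 1; attempt <= numRetries; attempt++) {
            try {
                pipeline.timeout(time: timeoutSec, unit: 'SECONDS') {
                    pipeline.sh(args.script)
                }
                return
            } catch (Exception e) {
                lastError = e
                pipeline.echo("Attempt ${attempt}/${numRetries} failed: ${e.message}")
            }
        }
        throw lastError
    }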
@@ -169,7 +169,7 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
 
     Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
 
-    Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
+    Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job completion; sleep 30")
 
     Utils.exec(
         pipeline,
@@ -179,6 +179,8 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
         )
     )
 
+    Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
@@ -228,6 +230,8 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
         )
     )
 
+    Utils.exec(pipeline, script: "echo Sleeping to allow Slurm job termination; sleep 30")
+
     Utils.exec(
         pipeline,
         script: Utils.sshUserCmd(
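Both hunks above add the same fixed 30-second pause between cancelling a Slurm job and the follow-up SSH command. A minimal, hypothetical sketch of that pattern is below; the helper name, the scancel call, and the workspace removal are assumptions, since the surrounding Utils.exec / Utils.sshUserCmd calls are only partially visible in this diff:

    // Illustrative only: cancel the job, wait for Slurm to tear it down, then clean up.
    def cleanUpSlurmJobSketch(def pipeline, String sshTarget, String slurmJobID, String jobWorkspace) {
        pipeline.sh "ssh ${sshTarget} 'scancel ${slurmJobID} || true'"      // assumed cancel step

        // New in this commit: give Slurm time to actually terminate the job first.
        pipeline.sh "echo Sleeping to allow Slurm job termination; sleep 30"

        pipeline.sh "ssh ${sshTarget} 'rm -rf ${jobWorkspace}'"             // assumed cleanup step
    }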
@@ -354,7 +358,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     }
 
     if (CloudManager.isNodeOnline(nodeName)) {
-        def dockerGpuOption = ""
+        def dockerGPUOption = ""
 
         node(nodeName) {
            sh """
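This rename matters because the code in the next hunk assigns to dockerGPUOption; with the declaration spelled dockerGpuOption, that assignment never touches the declared local. A standalone Groovy illustration of the failure mode follows; it is not the CI code, and in a sandboxed Jenkins pipeline the exact symptom can differ (a leaked script-level variable or a MissingPropertyException):

    // Standalone Groovy script demonstrating the bug class fixed by the rename.
    def declareAndAssign() {
        def dockerGpuOption = ""          // local variable, declared with one spelling
        dockerGPUOption = "--gpus all"    // different spelling: this is not the local above;
                                          // in a script it falls through to the script binding
    }

    declareAndAssign()
    println binding.variables             // contains dockerGPUOption -- it leaked out of the function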
@@ -373,6 +377,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
 
            // Dynamically set GPU arguments based on environment variables
            // https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
+           // It's intentional to check NV_GPU first.
            dockerGPUOption = sh(script: """
                if [ -n "\$NV_GPU" ]; then
                    echo "--gpus '\\"device=\$NV_GPU\\"'"
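The hunk above only shows the first branch of the GPU-option selection. A hedged sketch of the complete pattern is below; the CUDA_VISIBLE_DEVICES fallback, the default branch, and the returnStdout handling are assumptions, not part of the diff:

    // Sketch of the dynamic GPU-argument selection; only the NV_GPU branch appears in the diff.
    dockerGPUOption = sh(script: """
        if [ -n "\$NV_GPU" ]; then
            echo "--gpus '\\"device=\$NV_GPU\\"'"
        elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
            echo "--gpus '\\"device=\$CUDA_VISIBLE_DEVICES\\"'"          # assumed fallback
        else
            echo "--gpus all"                                            # assumed default
        fi
    """, returnStdout: true).trim()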
@@ -392,7 +397,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                "-v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro " +
                "-v /tmp/ccache:${CCACHE_DIR}:rw " +
                "-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
-               "--cap-add syslog"
+               "--cap-add=SYSLOG"
 
            echo "Final dockerArgs: ${dockerArgs}"
 
@@ -522,9 +527,9 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     ].join(" ")
 
     def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
-    scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
-    // TODO: check if the tee always returns 0
+    def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
     def scriptContent = """#!/bin/bash
+        set -o pipefail
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
         export llmTarfile=$llmTarfile
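The last hunk also explains the removed TODO: without `set -o pipefail`, a construct like `some_command | tee log` in the generated launch script reports tee's exit status (almost always 0) and masks failures of the left-hand command. A small illustration, written as ordinary Jenkins sh steps rather than the real slurm_launch.sh:

    // Illustration only: why the generated slurm_launch.sh needs `set -o pipefail`.
    node {
        // Without pipefail, the pipeline's exit status is tee's (0), so the failure
        // of `false` is masked and this step succeeds.
        sh '''#!/bin/bash
            false | tee out.log
            echo "pipeline exit status without pipefail: $?"
        '''

        // With pipefail, the same pipeline reports the failing command's status and
        // the step fails -- failures inside the launch script now surface.
        sh '''#!/bin/bash
            set -o pipefail
            false | tee out.log
        '''
    }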