[None][ci] Some improvements for Slurm CI setup (#7407)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>

parent e257cb3533
commit c5148f52d5
@@ -154,16 +154,11 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
                 "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
                 "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
                 "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
-                "${slurmOutputFile} | tail -n1\""
+                "${slurmOutputFile} | tail -n1 || true\""
             ),
             returnStdout: true
         ).trim()

-        if (!slurmJobID || !slurmJobID.isNumber()) {
-            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile}\""))
-            error("Slurm job did not submit successfully. No job ID found.")
-        }
-
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")

         Utils.exec(pipeline, script: "echo Sleeping to allow slurm job termination; sleep 30")
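Note on the hunk above: the Slurm job ID is scraped from the output log with a single `sed -n` pass. Each `-e ... p` expression prints capture group `\1` for one known message format ("Submitted batch job N", "srun: job N queued", "srun: job N has been allocated"), and `tail -n1` keeps the most recent match; the new `|| true` keeps the SSH'd command at exit code 0 when the log file does not exist yet, leaving failure handling to the job-ID check that now runs after cleanup (next hunk). A minimal local sketch of the same extraction, with a hypothetical log path and contents:

    // Hypothetical local sketch (not pipeline code): scrape a job ID the
    // same way the SSH'd sed command above does.
    def logFile = "/tmp/example-slurm_output.log"   // hypothetical path
    new File(logFile).text = "srun: job 4242 queued and waiting for resources\n"
    def script = "sed -n " +
        "-e 's/.*Submitted batch job \\([0-9]\\+\\).*/\\1/p' " +
        "-e 's/.*srun: job \\([0-9]\\+\\) queued.*/\\1/p' " +
        "-e 's/.*srun: job \\([0-9]\\+\\) has been allocated.*/\\1/p' " +
        "${logFile} | tail -n1 || true"
    def slurmJobID = ["bash", "-c", script].execute().text.trim()
    assert slurmJobID == "4242" && slurmJobID.isNumber()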
@@ -180,10 +175,18 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
             pipeline,
             script: Utils.sshUserCmd(
                 remote,
-                "rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
+                "\"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID} || true\""
             )
         )

+        if (!slurmJobID || !slurmJobID.isNumber()) {
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"cat ${slurmOutputFile} || true\""))
+            echo "Slurm job did not submit successfully. No job ID found."
+        } else {
+            def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
+            Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mv ${slurmOutputFile} ${newSlurmOutputFile} || true\""))
+        }
+
         Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID} cleaned up")
     }
 }
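Note on the rename above: `%j` is Slurm's job-ID placeholder for output-file patterns, and `slurmOutputFile` (set via `SlurmConfig.getOutputFilePath` in a later hunk) still contains it literally, since the concrete ID is unknown at submission time. Once the ID has been scraped, substituting it into the pattern yields the concrete file name; on failure the raw log is dumped with `cat` instead, and each command is suffixed with `|| true` so cleanup never fails on a missing file. A sketch of the substitution with hypothetical values:

    // Hypothetical values; replace("%j", ...) turns the pattern into the
    // concrete per-job log name once the job ID is known.
    def slurmOutputFile = "/home/svc_tensorrt/slurm-logs/abc123-%j.log"
    def slurmJobID = "4242"
    def newSlurmOutputFile = slurmOutputFile.replace("%j", slurmJobID)
    assert newSlurmOutputFile == "/home/svc_tensorrt/slurm-logs/abc123-4242.log"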
@@ -198,6 +201,12 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
         allowAnyHosts: true,
     ]

+    Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
+
+    CloudManager.destroyNode(nodeName)
+
+    Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
+
     Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

     Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
@@ -214,7 +223,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
         pipeline,
         script: Utils.sshUserCmd(
             remote,
-            "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
+            "\"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh || true\""
         )
     )

@@ -314,7 +323,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
     slurmJobID = jobIDs ? jobIDs[-1] : null

     if (!slurmJobID || !slurmJobID.isNumber()) {
-        error("Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}")
+        echo "Slurm job did not submit successfully. No job ID found.\nSubmission output:\n${slurmSubmitOutput}"
     }
     Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobID}")
     Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
@@ -361,12 +370,22 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
                     error "The Slurm node does not come online in the waiting period. Terminating the job."
                 }
             }
+        } catch (Exception e) {
+            if (e.getMessage()?.contains("Failed to kill container")) {
+                echo "Known benign error ignored: ${e.getMessage()}"
+            } else {
+                throw e // Re-throw if it's a different IOException
+            }
         } finally {
-            stage('Clean up SLURM Resources') {
-                Utils.exec(pipeline, script: "echo Sleeping to allow docker stop; sleep 30")
-                CloudManager.destroyNode(nodeName)
-                Utils.exec(pipeline, script: "echo Sleeping to allow node destruction; sleep 30")
-                cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+            stage("Clean up SLURM Resources") {
+                // Workaround to handle the interruption during clean up SLURM resources
+                retry(3) {
+                    try {
+                        cleanUpNodeResources(pipeline, cluster, nodeName, slurmJobID)
+                    } catch (Exception e) {
+                        error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
+                    }
+                }
             }
         }
     }
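Note on the workaround above: `retry(3)` re-runs its closure whenever the body throws, and funnelling every exception into `error(...)` turns each failure, including an interruption mid-cleanup, into a retryable stage error instead of aborting the build on the first attempt. The node teardown itself (docker-stop sleep, `CloudManager.destroyNode`, destruction sleep) moved into `cleanUpNodeResources` (earlier hunk), so the whole sequence is retried as a unit. A minimal sketch of the pattern, with a hypothetical flaky step:

    // Sketch of the retry-on-failure pattern used above (Jenkins pipeline
    // context assumed; flakyCleanupStep is hypothetical).
    retry(3) {
        try {
            flakyCleanupStep()   // may throw on a transient failure
        } catch (Exception e) {
            // error() fails this attempt; retry() then starts the next one.
            error "Cleanup failed: ${e.getMessage()}, retrying."
        }
    }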
@@ -420,7 +439,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
     def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
     def scriptRunNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
     def scriptLaunch = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
-    slurmOutputFile = "${jobWorkspace}/${jobUID}-slurm_output.log"
+    slurmOutputFile = SlurmConfig.getOutputFilePath("/home/svc_tensorrt/slurm-logs", jobUID)
     def testListPathNode = "${jobWorkspace}/${testList}.txt"
     def waivesListPathNode = "${jobWorkspace}/waives.txt"
     def isAarch64 = config.contains("aarch64")
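Note on `SlurmConfig.getOutputFilePath` above: its source is not part of this diff, so the shape below is purely hypothetical, inferred from the call site and the `%j` rename in cleanUpNodeResourcesMultiNodes; the change centralizes the log location under /home/svc_tensorrt/slurm-logs instead of deriving it ad hoc from the job workspace:

    // Purely hypothetical sketch; the real SlurmConfig is not shown in this
    // diff. "%j" is Slurm's placeholder for the numeric job ID.
    class SlurmConfig {
        static String getOutputFilePath(String logDir, String jobUID) {
            return "${logDir}/${jobUID}-slurm_output-%j.log"
        }
    }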
@@ -474,6 +493,7 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL

     def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
     scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
+    // TODO: check if the tee always returns 0
     def scriptContent = """#!/bin/bash
         export jobWorkspace=$jobWorkspace
         export tarName=$tarName
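Note on the TODO above: it can be answered in general terms. In `cmd | tee file`, the exit status of the pipeline is that of the last command, i.e. `tee`, which normally exits 0 even when `cmd` fails, so the failure is masked unless bash's `pipefail` option is set. A hedged sketch of the fix, with a hypothetical command:

    // Without pipefail, "some_command | tee" exits with tee's status (0)
    // even when some_command fails; pipefail propagates the failure.
    sh '''
        set -o pipefail
        some_command 2>&1 | tee output.log   # hypothetical command
    '''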
@@ -515,8 +535,15 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
         } finally {
             uploadResults(pipeline, cluster, jobUID, stageName)

-            stage('Clean up SLURM Resources') {
-                cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+            stage("Clean up SLURM Resources") {
+                // Workaround to handle the interruption during clean up SLURM resources
+                retry(3) {
+                    try {
+                        cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID, slurmOutputFile)
+                    } catch (Exception e) {
+                        error "Error during clean up SLURM resources: ${e.getMessage()} and retrying."
+                    }
+                }
             }
         }
     }
@@ -644,7 +671,7 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
     if (stageIsInterrupted) {
         echo "Stage is interrupted, skip to upload test result."
     } else {
-        sh 'if [ "$(id -u)" -eq 0 ]; then dmesg; fi'
+        sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
         if (noResultIfSuccess && !stageIsFailed) {
             // Clean up the workspace
             sh """
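Note on the `dmesg` guards in this hunk and the next: even when `id -u` reports root inside a container, `dmesg` can still fail (for example when the host sets kernel.dmesg_restrict and the container lacks CAP_SYSLOG), so the new `|| true` keeps a purely diagnostic command from failing the stage:

    // The hardened guard pattern from the hunk above: diagnostics run only
    // as root and are never allowed to fail the stage.
    sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'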
@@ -1526,7 +1553,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
     stage ("[${stageName}] Run Pytest")
     {
         echoNodeAndGpuInfo(pipeline, stageName)
-        sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C; fi'
+        sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'

         def extraInternalEnv = ""
         def pytestTestTimeout = "3600"