[None][ci] Some tweaks for the CI pipeline (#10359)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu authored 2026-01-05 00:10:47 +08:00, committed by GitHub
parent afc533193d
commit c4f27fa4c0
9 changed files with 183 additions and 161 deletions


@ -585,7 +585,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
parallelJobs.failFast = enableFailFast parallelJobs.failFast = enableFailFast
if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) { if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) {
def key = "Build with build type Debug" def key = "Build With Build Type Debug"
parallelJobs += [ parallelJobs += [
(key): { (key): {
script { script {
@ -628,7 +628,7 @@ pipeline {
HF_DATASETS_OFFLINE=1 HF_DATASETS_OFFLINE=1
} }
stages { stages {
stage("BuildJob") { stage("Build Job") {
steps { steps {
launchStages(this, params.targetArch, params.enableFailFast, globalVars) launchStages(this, params.targetArch, params.enableFailFast, globalVars)
} }


@ -276,7 +276,7 @@ def buildImage(config, imageKeyToTag)
} }
// Step 2: Build the images // Step 2: Build the images
stage ("Install packages") { stage ("Install Package") {
sh "pwd && ls -alh" sh "pwd && ls -alh"
sh "env | sort" sh "env | sort"
sh "apk add make git" sh "apk add make git"
@ -380,7 +380,7 @@ def buildImage(config, imageKeyToTag)
} }
if (customTag) { if (customTag) {
stage ("custom tag: ${customTag} (${arch})") { stage ("Custom Tag: ${customTag} (${arch})") {
sh """ sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \ cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \ BASE_IMAGE=${BASE_IMAGE} \
@ -395,7 +395,7 @@ def buildImage(config, imageKeyToTag)
} catch (Exception ex) { } catch (Exception ex) {
containerGenFailure = ex containerGenFailure = ex
} finally { } finally {
stage ("Docker logout") { stage ("Docker Logout") {
withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) { withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
sh "docker logout urm.nvidia.com" sh "docker logout urm.nvidia.com"
sh "docker logout ${DEFAULT_GIT_URL}:5005" sh "docker logout ${DEFAULT_GIT_URL}:5005"
@ -424,14 +424,14 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
def release_action = params.action def release_action = params.action
def buildConfigs = [ def buildConfigs = [
"Build trtllm release (x86_64)": [ "Build Internal release (x86_64 trtllm)": [
target: "trtllm", target: "trtllm",
action: release_action, action: release_action,
customTag: LLM_BRANCH_TAG + "-x86_64", customTag: LLM_BRANCH_TAG + "-x86_64",
build_wheel: true, build_wheel: true,
dockerfileStage: "release", dockerfileStage: "release",
], ],
"Build trtllm release (SBSA)": [ "Build Internal release (SBSA trtllm)": [
target: "trtllm", target: "trtllm",
action: release_action, action: release_action,
customTag: LLM_BRANCH_TAG + "-sbsa", customTag: LLM_BRANCH_TAG + "-sbsa",
@ -439,21 +439,21 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
arch: "arm64", arch: "arm64",
dockerfileStage: "release", dockerfileStage: "release",
], ],
"Build CI image (x86_64 tritondevel)": [:], "Build CI Image (x86_64 tritondevel)": [:],
"Build CI image (SBSA tritondevel)": [ "Build CI Image (SBSA tritondevel)": [
arch: "arm64", arch: "arm64",
], ],
"Build CI image (RockyLinux8 Python310)": [ "Build CI Image (RockyLinux8 Python310)": [
target: "rockylinux8", target: "rockylinux8",
args: "PYTHON_VERSION=3.10.12", args: "PYTHON_VERSION=3.10.12",
postTag: "-py310", postTag: "-py310",
], ],
"Build CI image (RockyLinux8 Python312)": [ "Build CI Image (RockyLinux8 Python312)": [
target: "rockylinux8", target: "rockylinux8",
args: "PYTHON_VERSION=3.12.3", args: "PYTHON_VERSION=3.12.3",
postTag: "-py312", postTag: "-py312",
], ],
"Build NGC devel and release (x86_64)": [ "Build NGC devel And release (x86_64)": [
target: "ngc-release", target: "ngc-release",
action: release_action, action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'", args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
@ -464,7 +464,7 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
], ],
dockerfileStage: "release", dockerfileStage: "release",
], ],
"Build NGC devel and release (SBSA)": [ "Build NGC devel And release (SBSA)": [
target: "ngc-release", target: "ngc-release",
action: release_action, action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'", args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
@ -583,7 +583,7 @@ pipeline {
} }
} }
} }
stage("Upload Artifacts") { stage("Upload Artifact") {
steps { steps {
script { script {
String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
@ -594,7 +594,7 @@ pipeline {
} }
} }
} }
stage("Wait for Build Jobs Complete") { stage("Wait For Build Job Complete") {
when { when {
expression { expression {
RUN_SANITY_CHECK RUN_SANITY_CHECK
@ -655,7 +655,7 @@ pipeline {
} }
} }
} }
stage("Sanity Check for NGC Images") { stage("Sanity Check For NGC Image") {
when { when {
expression { expression {
RUN_SANITY_CHECK RUN_SANITY_CHECK
@ -691,7 +691,7 @@ pipeline {
} }
} }
} }
stage("Register NGC Images for Security Checks") { stage("Register NGC Image For Security Check") {
when { when {
expression { expression {
return params.nspect_id && params.action == "push" return params.nspect_id && params.action == "push"


@ -451,7 +451,7 @@ def launchReleaseCheck(pipeline)
} }
def image = "urm.nvidia.com/docker/golang:1.22" def image = "urm.nvidia.com/docker/golang:1.22"
stageName = "Release Check" stageName = "Release-Check"
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", { trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
stage("[${stageName}] Run") { stage("[${stageName}] Run") {
if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) { if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
@ -834,7 +834,7 @@ def collectTestResults(pipeline, testFilter)
{ {
collectResultPodSpec = createKubernetesPodConfig("", "agent") collectResultPodSpec = createKubernetesPodConfig("", "agent")
trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", { trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", {
stage ("Collect test result") { stage ("Collect Test Result") {
sh "rm -rf **/*.xml *.tar.gz" sh "rm -rf **/*.xml *.tar.gz"
testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results" testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results"
@ -864,7 +864,7 @@ def collectTestResults(pipeline, testFilter)
junit(testResults: '**/results*.xml', allowEmptyResults : true) junit(testResults: '**/results*.xml', allowEmptyResults : true)
} // Collect test result stage } // Collect test result stage
stage("Rerun report") { stage("Rerun Report") {
sh "rm -rf rerun && mkdir -p rerun" sh "rm -rf rerun && mkdir -p rerun"
sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true" sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true"
sh "find rerun -type f" sh "find rerun -type f"
@ -904,7 +904,7 @@ def collectTestResults(pipeline, testFilter)
} }
} // Rerun report stage } // Rerun report stage
try { try {
stage("Test coverage") { stage("Test Coverage") {
sh "ls" sh "ls"
def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","") def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","")
sh "echo ${CUR_PATH}" sh "echo ${CUR_PATH}"
@ -1030,14 +1030,15 @@ def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64"
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
{ {
stages = [ stages = [
"Release Check": { "Release-Check": {
script { script {
launchReleaseCheck(this) launchReleaseCheck(this)
} }
}, },
"x86_64-linux": { "x86_64-Linux": {
script { script {
stage("Build") { def testStageName = "[Build-x86_64] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [ def additionalParameters = [
'dockerImage': globalVars["LLM_DOCKER_IMAGE"], 'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"], 'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
@ -1045,7 +1046,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
] ]
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters) launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
} }
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false def singleGpuTestFailed = false
stage(testStageName) { stage(testStageName) {
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) { if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
@ -1135,24 +1137,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
} }
} }
}, },
"SBSA-linux": { "SBSA-Linux": {
script { script {
def jenkinsUrl = ""
def credentials = ""
def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") { if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run" echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run"
return return
} }
stage("Build") { def testStageName = "[Build-SBSA] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [ def additionalParameters = [
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"], "dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
] ]
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters) launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
} }
testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) { stage(testStageName) {
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) { if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
echo "SBSA test job is skipped due to Jenkins configuration" echo "SBSA test job is skipped due to Jenkins configuration"
@ -1269,9 +1270,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images") testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images") testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
echo "Will run Build-Docker-Images job" echo "Will run Build-Docker-Images job"
stages.remove("x86_64-linux") stages.remove("x86_64-Linux")
stages.remove("SBSA-linux") stages.remove("SBSA-Linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled." echo "Build-Docker-Images job is set explicitly. Both x86_64-Linux and SBSA-Linux sub-pipelines will be disabled."
} }
parallelJobs = stages.collectEntries{key, value -> [key, { parallelJobs = stages.collectEntries{key, value -> [key, {
@ -1339,11 +1340,11 @@ pipeline {
} }
} }
} }
stage("Build and Test") { stage("Build And Test") {
steps { steps {
script { script {
if (isReleaseCheckMode) { if (isReleaseCheckMode) {
stage("Release Check") { stage("Release-Check") {
script { script {
launchReleaseCheck(this) launchReleaseCheck(this)
} }


@ -125,7 +125,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
def hasTimeoutTest = false def hasTimeoutTest = false
def downloadResultSucceed = false def downloadResultSucceed = false
pipeline.stage('Submit Test Results') { pipeline.stage('Submit Test Result') {
sh "mkdir -p ${stageName}" sh "mkdir -p ${stageName}"
// Download timeout test results // Download timeout test results
def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt" def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt"
@ -554,7 +554,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
] ]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
stage('Request Node via SLURM') { stage('Request Node Via Slurm') {
println("Selected Cluster: ${cluster.name}") println("Selected Cluster: ${cluster.name}")
def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint) def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
@ -603,7 +603,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
} }
} }
stage('Checking if the Node is Online') { stage('Check If Node Is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [ def remote = [
@ -696,20 +696,18 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
} }
slurmRunner = null slurmRunner = null
echo "${stageName} Slurm partition timeout: ${partition.time}"
def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
if (cluster.containerRuntime.toString() == "DOCKER") { if (cluster.containerRuntime.toString() == "DOCKER") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true) slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
} else if (cluster.containerRuntime.toString() == "ENROOT") { } else if (cluster.containerRuntime.toString() == "ENROOT") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout) slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
} else { } else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}") throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
} }
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner) executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally { } finally {
stage("Clean up SLURM Resources") { stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources // Workaround to handle the interruption during clean up SLURM resources
retry(3) { retry(3) {
try { try {
@ -805,7 +803,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend", "LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}", "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}", "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=400", "COLUMNS=300",
extraInternalEnv, extraInternalEnv,
portEnvVars, portEnvVars,
pytestUtil, pytestUtil,
@ -893,9 +891,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name // Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg") def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -933,19 +929,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh" def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt" def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt" def waivesListPathNode = "${jobWorkspace}/waives.txt"
def sbatchLogPath = "${jobWorkspace}/job-output.log" def slurmJobLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh") def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh" def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh") def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh" def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh") def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh" def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc" def coverageConfigFile = "${jobWorkspace}/.coveragerc"
stage("[${stageName}] Initializing Test") { stage("Initialize Test") {
println("Selected Cluster: ${cluster.name}")
// Create Job Workspace folder in Frontend Node // Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3) Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)
@ -1052,7 +1046,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Generate Job Launch Script // Generate Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = getMountListForSlurmTest(cluster, true).join(",") def mounts = getMountListForSlurmTest(cluster, true).join(",")
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment) String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode)
if (taskArgs == null) { if (taskArgs == null) {
error "Invalid Slurm test stage name is set" error "Invalid Slurm test stage name is set"
} }
@ -1140,7 +1134,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptLaunchPrefix = """#!/bin/bash def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment} #SBATCH ${exemptionComment}
#SBATCH --output=${sbatchLogPath} #SBATCH --output=${slurmJobLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')} ${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs} #SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"} ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@ -1182,8 +1176,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix) pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" ")) pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")
// Output is the corresponding scriptLaunchPathLocal script under the disaggMode // Output is the corresponding scriptLaunchPathLocal script under the disaggMode
sh """ sh """
@ -1218,8 +1210,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode, scriptLaunchPathNode,
true true
) )
def filesToKeepWhenRetry = [
scriptRunPathNode,
scriptInstallPathNode,
scriptBashUtilsPathNode,
scriptLaunchPathNode,
scriptSubmitPathNode,
scriptTrackPathNode,
testListPathNode,
waivesListPathNode,
coverageConfigFile
]
def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")
def scriptSubmit = """#!/bin/bash def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
# Clean up previous job intermediate files so that retry can work # Clean up previous job intermediate files so that retry can work
@ -1227,26 +1233,26 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt") previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}" echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt" # Wait for 120 seconds to ensure the previous job is canceled
# Wait for 60 seconds to ensure the previous job is canceled sleep 120
sleep 60
fi fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${sbatchLogPath}"
touch ${sbatchLogPath} # Clean up workspace: remove all files/dirs not in the keep list
find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +
touch ${slurmJobLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}') jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned." echo "Error: Slurm job submission failed, no job ID returned."
exit 1 exit 1
fi fi
echo "Submitted Slurm job \$jobId" echo "Submitted Slurm job \$jobId"
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve # Save Slurm job ID for later steps to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
""".replaceAll("(?m)^\\s*", "").trim() """.replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit) pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}")
Utils.copyFileToRemoteHost( Utils.copyFileToRemoteHost(
pipeline, pipeline,
remote, remote,
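
The rewritten submit script in this hunk makes retries safe: it cancels any job recorded by a previous attempt, prunes the workspace down to a keep-list with a single find call, and stores the new job ID for the later tracking step. A standalone sketch of that flow (the workspace location and script names are placeholders, and the keep-list is shortened):

    #!/bin/bash
    # Retry-safe sbatch submission: cancel leftovers, prune workspace, submit, record job ID.
    set -Eeuo pipefail

    workspace=/tmp/slurm-job-workspace          # placeholder for the per-job workspace
    launch_script="$workspace/slurm_launch.sh"  # placeholder for the generated launch script
    job_log="$workspace/job-output.log"

    # Cancel a job left over from a previous attempt, if any.
    if [ -f "$workspace/slurm_job_id.txt" ]; then
        scancel "$(cat "$workspace/slurm_job_id.txt")" || true
        sleep 120   # give Slurm time to finish cancelling
    fi

    # Remove everything in the workspace except what the retry still needs.
    find "$workspace" -maxdepth 1 -mindepth 1 \
        ! -name "$(basename "$launch_script")" \
        ! -name 'waives.txt' \
        ! -name '.coveragerc' \
        -exec rm -rf {} +

    touch "$job_log"

    # sbatch prints "Submitted batch job <id>"; the fourth field is the job ID.
    job_id=$(sbatch "$launch_script" | awk '{print $4}')
    [ -n "$job_id" ] || { echo "sbatch returned no job ID"; exit 1; }
    echo "$job_id" > "$workspace/slurm_job_id.txt"

Turning the keep-list into a chain of "! -name" arguments is what the Groovy findKeepWhenRetryArgs expression above does, so a single find ... -exec rm -rf {} + can delete everything else in one batch.
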
@ -1255,8 +1261,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true true
) )
} }
stage("[${stageName}] Run Pytest") { stage("[${stageName}] Run Pytest") {
// Submit the sbatch job // Submit the Slurm job
Utils.exec( Utils.exec(
pipeline, pipeline,
timeout: false, timeout: false,
@ -1266,42 +1273,56 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
), ),
numRetries: 3 numRetries: 3
) )
def sbatchJobId = Utils.exec(
def slurmJobId = Utils.exec(
pipeline, pipeline,
returnStdout: true,
script: Utils.sshUserCmd( script: Utils.sshUserCmd(
remote, remote,
"cat $jobWorkspace/slurm_job_id.txt" "\"cat ${jobWorkspace}/slurm_job_id.txt\""
) ),
returnStdout: true,
numRetries: 3
).trim() ).trim()
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}")
def scriptTrack = """#!/bin/bash def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt) set -xEeuo pipefail
tail -f ${sbatchLogPath} & trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
jobId=${slurmJobId}
tail -f ${slurmJobLogPath} &
tailPid=\$! tailPid=\$!
# Wait until sbatch job is done.
# Wait until Slurm job is done
while true; do while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}') # Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0)
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations)
echo "job is still running"
if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then
echo "Slurm job \$jobId is still running"
sleep 300 sleep 300
else else
echo "Job \$jobId finished with state: \$state" echo "Slurm job \$jobId finished with state: \$STATUS"
break break
fi fi
done done
# Kill tail -f process # Kill tail -f process
kill \$tailPid kill \$tailPid
# Check if the job failed or not
# Wait briefly to ensure accounting is consistent
sleep 10 sleep 10
# Retry getting status and exit code as sacct might be delayed
# Get exit code (STATUS is already known from loop break)
# Retry for exit code if missing
for i in {1..3}; do for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}') # Use awk to parse exit code from format like "0:0"
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}') EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then if [ -n "\$EXIT_CODE" ]; then
break break
fi fi
echo "Waiting for sacct to update... attempt \$i" echo "Waiting for sacct exit code to update... attempt \$i"
sleep 10 sleep 10
done done
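
The tracking script assembled above follows a standard sacct pattern: stream the job log with tail, poll the allocation state until it leaves PENDING/RUNNING/CONFIGURING, then read the exit code once accounting has settled. A condensed sketch with a placeholder job ID and log path:

    #!/bin/bash
    # Follow a Slurm job's log and wait for the allocation to finish.
    job_id=123456               # placeholder job ID
    job_log=/tmp/job-output.log # placeholder log path

    tail -f "$job_log" &
    tail_pid=$!

    # --allocations keeps sacct on the allocation itself, not steps like 123456.batch or 123456.0.
    while true; do
        state=$(sacct -j "$job_id" --format=State -Pn --allocations)
        if [[ -z $state || $state == "RUNNING" || $state == "PENDING" || $state == "CONFIGURING" ]]; then
            sleep 300
        else
            echo "job $job_id finished with state: $state"
            break
        fi
    done
    kill "$tail_pid"

    sleep 10   # let accounting settle before reading the exit code
    # ExitCode is reported as "<code>:<signal>"; keep only the code.
    exit_code=$(sacct -j "$job_id" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')
    if [[ $state == "COMPLETED" && ${exit_code:-1} -eq 0 ]]; then
        exit 0
    fi
    exit 1
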
@ -1309,11 +1330,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1." echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1 EXIT_CODE=1
fi fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi
# We already have valid STATUS from the loop that caused the break
if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId" echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE" echo "Status: \$STATUS | Exit_code \$EXIT_CODE"
@ -1324,7 +1342,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1 exit 1
fi fi
""".replaceAll("(?m)^\\s*", "").trim() """.replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack) pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}")
Utils.copyFileToRemoteHost( Utils.copyFileToRemoteHost(
pipeline, pipeline,
remote, remote,
@ -1332,52 +1352,23 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptTrackPathNode, scriptTrackPathNode,
true true
) )
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)
sh "cat $scriptStatusPathLocal" // Track the Slurm job
while (true) { Utils.exec(
// Check if the job is done by running sacct via SSH pipeline,
def result = Utils.exec( timeout: false,
pipeline, script: Utils.sshUserCmd(
returnStdout: true, remote,
script: Utils.sshUserCmd( scriptTrackPathNode
remote, ),
scriptStatusPathNode numRetries: 3
) )
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
} }
echo "Finished test stage execution." echo "Finished test stage execution."
} }
} finally { } finally {
uploadResults(pipeline, cluster, jobUID, stageName) uploadResults(pipeline, cluster, jobUID, stageName)
stage("Clean up SLURM Resources") { stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources // Workaround to handle the interruption during clean up SLURM resources
retry(3) { retry(3) {
try { try {
@ -1736,7 +1727,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
targetCloud = "kubernetes" targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device. // DGX Spark requires a special setting for accessing the device.
// It has 128GB unified memory as per spec. Use half of the memory at the CPU side. // It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
if (type == "gb10x") { if (type.contains("gb10x")) {
targetCloud = "nvks-sparks-cloud" targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi" memorySize = "64Gi"
tolerations = """ tolerations = """
@ -1755,7 +1746,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
// The following GPU types doesn't support dynamic driver flashing. // The following GPU types doesn't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) { if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
if (type == "gb10x") { if (type.contains("gb10x")) {
selectors = """ selectors = """
kubernetes.io/arch: ${arch} kubernetes.io/arch: ${arch}
kubernetes.io/os: linux kubernetes.io/os: linux
@ -2595,7 +2586,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
} }
// Step 2: run tests // Step 2: run tests
stage ("Setup environment") stage ("Setup Environment")
{ {
// Random sleep to avoid resource contention // Random sleep to avoid resource contention
sleep(10 * Math.random()) sleep(10 * Math.random())
@ -2647,7 +2638,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
} }
if (testFilter[(DEBUG_MODE)]) { if (testFilter[(DEBUG_MODE)]) {
stage("Interactive debug session") stage("Interactive Debug Session")
{ {
testFilter[(DEBUG_MODE)] = false testFilter[(DEBUG_MODE)] = false
@ -2848,7 +2839,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
} }
// Generate comprehensive rerun report if any reruns occurred // Generate comprehensive rerun report if any reruns occurred
stage ("[${stageName}] Generate Report") { stage ("Generate Report") {
generateRerunReport(stageName, llmSrc) generateRerunReport(stageName, llmSrc)
} }
@ -2859,7 +2850,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
if (perfMode) { if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv" basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}" basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") { stage("Check Perf Result") {
def perfCheckResult = sh( def perfCheckResult = sh(
script: """ script: """
python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
@ -2872,7 +2863,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
} }
} }
stage("Create perf report") { stage("Create Perf Report") {
sh """ sh """
python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \ python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path ${stageName}/report.pdf \ --output_path ${stageName}/report.pdf \
@ -2883,7 +2874,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
} }
if (stageName.contains("PerfSanity")) { if (stageName.contains("PerfSanity")) {
stage ("Check perf result") { stage ("Check PerfSanity Result") {
def perfCheckResult = sh( def perfCheckResult = sh(
script: """ script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \ python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
@ -3079,7 +3070,7 @@ def ensureStageResultNotUploaded(stageName) {
if(!GlobalState.uploadResultStageNames.contains(stageName)) { if(!GlobalState.uploadResultStageNames.contains(stageName)) {
GlobalState.uploadResultStageNames.add(stageName) GlobalState.uploadResultStageNames.add(stageName)
} else { } else {
stage('Upload Test Results') { stage('Upload Test Result') {
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
error "Upload test results for ${stageName} failed because it has already been uploaded." error "Upload test results for ${stageName} failed because it has already been uploaded."
} }
@ -3288,7 +3279,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test // PerfSanity post-merge tests
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
@ -3311,8 +3302,7 @@ def launchTestJobs(pipeline, testFilter)
parallelJobs += parallelSlurmJobs parallelJobs += parallelSlurmJobs
// Try to match what are being tested on x86 H100_PCIe. // SBSA machines from the Blossom machine pool
// SBSA machines from the Blossom machine pool
SBSATestConfigs = [ SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1], "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also named as GB10 Grace Blackwell Superchip. // DGX Spark is also named as GB10 Grace Blackwell Superchip.
@ -3328,13 +3318,13 @@ def launchTestJobs(pipeline, testFilter)
// Disable GB300 stages due to nodes will be offline temporarily. // Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test // PerfSanity pre-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
// Perf sanity post merge test // PerfSanity post-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
@ -3355,9 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity pre merge tests // PerfSanity post-merge tests
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
@ -3539,7 +3527,7 @@ def launchTestJobs(pipeline, testFilter)
} }
if (checkPipStage) { if (checkPipStage) {
stage("Run LLMAPI tests") { stage("Run LLMAPI Test") {
pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch)
trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", { trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", {
echo "###### Prerequisites Start ######" echo "###### Prerequisites Start ######"
@ -3751,8 +3739,8 @@ def launchTestJobs(pipeline, testFilter)
parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, { parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, {
stage(key) { stage(key) {
if (key in testFilter[REUSE_STAGE_LIST]) { if (key in testFilter[REUSE_STAGE_LIST]) {
stage("Skip - reused") { stage("Skip - Reused") {
echo "Skip - Passed in the last pipeline." echo "Skip - Passed in the previous pipelines."
} }
} else if (values instanceof List) { } else if (values instanceof List) {
trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", { trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
@ -3876,7 +3864,7 @@ pipeline {
OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials") OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
} }
stages { stages {
stage("Setup environment") stage("Setup Environment")
{ {
steps steps
{ {
@ -3891,7 +3879,7 @@ pipeline {
} }
} }
} }
stage("Check Test Lists") stage("Check Test List")
{ {
when { when {
expression { expression {


@ -12,7 +12,14 @@ slurm_install_setup() {
cd $resourcePathNode cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src llmSrcNode=$resourcePathNode/TensorRT-LLM/src
# Use unique lock file for this job ID
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
if [ $SLURM_LOCALID -eq 0 ]; then if [ $SLURM_LOCALID -eq 0 ]; then
if [ -f "$lock_file" ]; then
rm -f "$lock_file"
fi
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName" retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3 which python3
python3 --version python3 --version
@ -27,11 +34,11 @@ slurm_install_setup() {
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)" echo "(Writing install lock) Current directory: $(pwd)"
touch install_lock.lock touch "$lock_file"
else else
echo "(Waiting for install lock) Current directory: $(pwd)" echo "(Waiting for install lock) Current directory: $(pwd)"
while [ ! -f install_lock.lock ]; do while [ ! -f "$lock_file" ]; do
sleep 5 sleep 10
done done
fi fi
} }
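
The lock file above now embeds the Slurm job and node IDs, so a retried job landing on the same node cannot be released early by a stale lock left from an earlier run. The barrier itself is the usual rank-0 pattern; a sketch (the real script downloads the source tarball and installs the wheel where the comment sits):

    #!/bin/bash
    # Per-node install barrier keyed by job and node ID.
    lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"

    if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
        rm -f "$lock_file"        # drop a stale lock from a previous attempt
        # ... one-time per-node setup goes here (download, install, etc.) ...
        touch "$lock_file"        # signal the other local ranks
    else
        while [ ! -f "$lock_file" ]; do
            sleep 10              # other ranks wait until rank 0 finishes
        done
    fi
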


@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
if [ $SLURM_PROCID -eq 0 ]; then if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile" sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else else
# Sleep 10 seconds to wait for the coverage config file to be saved # Sleep 30 seconds to wait for the coverage config file to be saved
sleep 10 sleep 30
fi fi
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}') containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@ -108,6 +108,25 @@ eval $pytestCommand
pytest_exit_code=$? pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code" echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
# Remove this after the issue is resolved
if [ $pytest_exit_code -eq 4 ]; then
echo "DEBUG: Pytest failed with usage error (exit code 4)"
echo "DEBUG: Directory state at $(pwd):"
ls -l
echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
ls -l $llmSrcNode/tests/integration/defs
echo "DEBUG: conftest.py content:"
md5sum $llmSrcNode/tests/integration/defs/conftest.py
echo "DEBUG: pytest.ini content:"
md5sum $llmSrcNode/tests/integration/defs/pytest.ini
echo "DEBUG: Check importability of conftest.py"
python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv" basePerfFilename="base_perf_pytorch.csv"
@ -136,11 +155,11 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
fi fi
if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
echo "Check Perf-Sanity Result" echo "Check PerfSanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \ python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace $jobWorkspace
perf_sanity_check_exit_code=$? perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code" echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code"
fi fi
if [ "$pytest_exit_code" -ne 0 ]; then if [ "$pytest_exit_code" -ne 0 ]; then


@ -15,6 +15,7 @@ Note:
All the perf tests will be excluded since they are generated dynamically. All the perf tests will be excluded since they are generated dynamically.
""" """
import argparse import argparse
import glob
import os import os
import subprocess import subprocess
@ -42,7 +43,13 @@ def verify_l0_test_lists(llm_src):
test_list = f"{llm_src}/l0_test.txt" test_list = f"{llm_src}/l0_test.txt"
# Remove dynamically generated perf tests # Remove dynamically generated perf tests
subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True) # Exclude perf_sanity tests from being removed since they are different and statically defined
for file_path in glob.glob(os.path.join(test_db_path, "*perf*")):
if "perf_sanity" not in os.path.basename(file_path):
try:
os.remove(file_path)
except OSError:
pass
subprocess.run( subprocess.run(
f"trt-test-db -d {test_db_path} --test-names --output {test_list}", f"trt-test-db -d {test_db_path} --test-names --output {test_list}",
shell=True, shell=True,
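
The loop above removes the dynamically generated perf test-db files while keeping the statically defined perf_sanity ones. The same selection can be expressed as a single find invocation (shown only to make the filter explicit; the script itself uses glob and os.remove, and the path below is a placeholder):

    # Delete *perf* files in the test-db directory, except those whose name contains perf_sanity.
    test_db_path=/path/to/test-db   # placeholder
    find "$test_db_path" -maxdepth 1 -type f -name '*perf*' ! -name '*perf_sanity*' -exec rm -f {} +
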


@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
import psutil import psutil
logger.warning( logger.warning(
f"\nWarning: pynvml not available, using fallback commands for memory monitoring" f"pynvml not available, using fallback commands for memory monitoring")
)
gpu_memory = {} gpu_memory = {}
system_total_mb = 0 system_total_mb = 0


@ -542,3 +542,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)