diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index 261c0a6d3a..5ecaa43a22 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -585,7 +585,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars) parallelJobs.failFast = enableFailFast if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) { - def key = "Build with build type Debug" + def key = "Build With Build Type Debug" parallelJobs += [ (key): { script { @@ -628,7 +628,7 @@ pipeline { HF_DATASETS_OFFLINE=1 } stages { - stage("BuildJob") { + stage("Build Job") { steps { launchStages(this, params.targetArch, params.enableFailFast, globalVars) } diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index c3fb2fac3c..f26d5537ed 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -276,7 +276,7 @@ def buildImage(config, imageKeyToTag) } // Step 2: Build the images - stage ("Install packages") { + stage ("Install Package") { sh "pwd && ls -alh" sh "env | sort" sh "apk add make git" @@ -380,7 +380,7 @@ def buildImage(config, imageKeyToTag) } if (customTag) { - stage ("custom tag: ${customTag} (${arch})") { + stage ("Custom Tag: ${customTag} (${arch})") { sh """ cd ${LLM_ROOT} && make -C docker ${target}_${action} \ BASE_IMAGE=${BASE_IMAGE} \ @@ -395,7 +395,7 @@ def buildImage(config, imageKeyToTag) } catch (Exception ex) { containerGenFailure = ex } finally { - stage ("Docker logout") { + stage ("Docker Logout") { withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) { sh "docker logout urm.nvidia.com" sh "docker logout ${DEFAULT_GIT_URL}:5005" @@ -424,14 +424,14 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) { def release_action = params.action def buildConfigs = [ - "Build trtllm release (x86_64)": [ + "Build Internal release (x86_64 trtllm)": [ target: "trtllm", action: release_action, customTag: LLM_BRANCH_TAG + "-x86_64", build_wheel: true, dockerfileStage: "release", ], - "Build trtllm release (SBSA)": [ + "Build Internal release (SBSA trtllm)": [ target: "trtllm", action: release_action, customTag: LLM_BRANCH_TAG + "-sbsa", @@ -439,21 +439,21 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) { arch: "arm64", dockerfileStage: "release", ], - "Build CI image (x86_64 tritondevel)": [:], - "Build CI image (SBSA tritondevel)": [ + "Build CI Image (x86_64 tritondevel)": [:], + "Build CI Image (SBSA tritondevel)": [ arch: "arm64", ], - "Build CI image (RockyLinux8 Python310)": [ + "Build CI Image (RockyLinux8 Python310)": [ target: "rockylinux8", args: "PYTHON_VERSION=3.10.12", postTag: "-py310", ], - "Build CI image (RockyLinux8 Python312)": [ + "Build CI Image (RockyLinux8 Python312)": [ target: "rockylinux8", args: "PYTHON_VERSION=3.12.3", postTag: "-py312", ], - "Build NGC devel and release (x86_64)": [ + "Build NGC devel And release (x86_64)": [ target: "ngc-release", action: release_action, args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'", @@ -464,7 +464,7 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) { ], dockerfileStage: "release", ], - "Build NGC devel and release (SBSA)": [ + "Build NGC devel And release (SBSA)": [ target: "ngc-release", action: release_action, args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'", @@ -583,7 +583,7 @@ pipeline { } } } - stage("Upload Artifacts") { + stage("Upload Artifact") { steps { script { String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag @@ -594,7 +594,7 @@ pipeline { } } } - stage("Wait for Build 
Jobs Complete") { + stage("Wait For Build Job Complete") { when { expression { RUN_SANITY_CHECK @@ -655,7 +655,7 @@ pipeline { } } } - stage("Sanity Check for NGC Images") { + stage("Sanity Check For NGC Image") { when { expression { RUN_SANITY_CHECK @@ -691,7 +691,7 @@ pipeline { } } } - stage("Register NGC Images for Security Checks") { + stage("Register NGC Image For Security Check") { when { expression { return params.nspect_id && params.action == "push" diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 3e81b22a09..adbfc46baa 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -451,7 +451,7 @@ def launchReleaseCheck(pipeline) } def image = "urm.nvidia.com/docker/golang:1.22" - stageName = "Release Check" + stageName = "Release-Check" trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", { stage("[${stageName}] Run") { if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) { @@ -834,7 +834,7 @@ def collectTestResults(pipeline, testFilter) { collectResultPodSpec = createKubernetesPodConfig("", "agent") trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", { - stage ("Collect test result") { + stage ("Collect Test Result") { sh "rm -rf **/*.xml *.tar.gz" testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results" @@ -864,7 +864,7 @@ def collectTestResults(pipeline, testFilter) junit(testResults: '**/results*.xml', allowEmptyResults : true) } // Collect test result stage - stage("Rerun report") { + stage("Rerun Report") { sh "rm -rf rerun && mkdir -p rerun" sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true" sh "find rerun -type f" @@ -904,7 +904,7 @@ def collectTestResults(pipeline, testFilter) } } // Rerun report stage try { - stage("Test coverage") { + stage("Test Coverage") { sh "ls" def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","") sh "echo ${CUR_PATH}" @@ -1030,14 +1030,15 @@ def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64" def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) { stages = [ - "Release Check": { + "Release-Check": { script { launchReleaseCheck(this) } }, - "x86_64-linux": { + "x86_64-Linux": { script { - stage("Build") { + def testStageName = "[Build-x86_64] ${env.localJobCredentials ? "Remote Run" : "Run"}" + stage(testStageName) { def additionalParameters = [ 'dockerImage': globalVars["LLM_DOCKER_IMAGE"], 'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"], @@ -1045,7 +1046,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) ] launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters) } - def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}" + + testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}" def singleGpuTestFailed = false stage(testStageName) { if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) { @@ -1135,24 +1137,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) } } }, - "SBSA-linux": { + "SBSA-Linux": { script { - def jenkinsUrl = "" - def credentials = "" - def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? 
"Remote Run" : "Run"}" - def singleGpuTestFailed = false - if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") { echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run" return } - stage("Build") { + def testStageName = "[Build-SBSA] ${env.localJobCredentials ? "Remote Run" : "Run"}" + stage(testStageName) { def additionalParameters = [ "dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"], ] launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters) } + + testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}" + def singleGpuTestFailed = false stage(testStageName) { if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) { echo "SBSA test job is skipped due to Jenkins configuration" @@ -1269,9 +1270,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images") testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images") echo "Will run Build-Docker-Images job" - stages.remove("x86_64-linux") - stages.remove("SBSA-linux") - echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled." + stages.remove("x86_64-Linux") + stages.remove("SBSA-Linux") + echo "Build-Docker-Images job is set explicitly. Both x86_64-Linux and SBSA-Linux sub-pipelines will be disabled." } parallelJobs = stages.collectEntries{key, value -> [key, { @@ -1339,11 +1340,11 @@ pipeline { } } } - stage("Build and Test") { + stage("Build And Test") { steps { script { if (isReleaseCheckMode) { - stage("Release Check") { + stage("Release-Check") { script { launchReleaseCheck(this) } diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 3aefeaaf0e..53ce1ee3db 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -125,7 +125,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st def hasTimeoutTest = false def downloadResultSucceed = false - pipeline.stage('Submit Test Results') { + pipeline.stage('Submit Test Result') { sh "mkdir -p ${stageName}" // Download timeout test results def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt" @@ -554,7 +554,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, ] Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") - stage('Request Node via SLURM') { + stage('Request Node Via Slurm') { println("Selected Cluster: ${cluster.name}") def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint) @@ -603,7 +603,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, } } - stage('Checking if the Node is Online') { + stage('Check If Node Is Online') { withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ @@ -696,20 +696,18 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, } slurmRunner = null + echo "${stageName} Slurm partition timeout: ${partition.time}" + def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT if (cluster.containerRuntime.toString() == "DOCKER") { - echo "${stageName} partitionTimeout: ${partition.time}" - def partitionTimeout = partition.time ? 
partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true) } else if (cluster.containerRuntime.toString() == "ENROOT") { - echo "${stageName} partitionTimeout: ${partition.time}" - def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout) } else { throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}") } executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner) } finally { - stage("Clean up SLURM Resources") { + stage("Clean Up Slurm Resource") { // Workaround to handle the interruption during clean up SLURM resources retry(3) { try { @@ -805,7 +803,7 @@ def getPytestBaseCommandLine( "LLM_BACKEND_ROOT=${llmSrc}/triton_backend", "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}", "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}", - "COLUMNS=400", + "COLUMNS=300", extraInternalEnv, portEnvVars, pytestUtil, @@ -893,9 +891,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Create a unique suffix for the job name String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" - def perfSanityMode = stageName.contains("PerfSanity") def disaggMode = stageName.contains("PerfSanity-Disagg") - def setSegment = disaggMode Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") @@ -933,19 +929,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh" def testListPathNode = "${jobWorkspace}/${testList}.txt" def waivesListPathNode = "${jobWorkspace}/waives.txt" - def sbatchLogPath = "${jobWorkspace}/job-output.log" + def slurmJobLogPath = "${jobWorkspace}/job-output.log" def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh") def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh" def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh") def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh" def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh") def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh" - def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh") - def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh" - def isAarch64 = config.contains("aarch64") def coverageConfigFile = "${jobWorkspace}/.coveragerc" - stage("[${stageName}] Initializing Test") { + stage("Initialize Test") { + println("Selected Cluster: ${cluster.name}") // Create Job Workspace folder in Frontend Node Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3) @@ -1052,7 +1046,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Generate Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") def mounts = getMountListForSlurmTest(cluster, true).join(",") - String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment) + String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode) if (taskArgs == null) { error "Invalid Slurm test stage name is set" } @@ -1140,7 +1134,7 @@ def runLLMTestlistWithSbatch(pipeline, 
platform, testList, config=VANILLA_CONFIG def scriptLaunchPrefix = """#!/bin/bash #SBATCH ${exemptionComment} - #SBATCH --output=${sbatchLogPath} + #SBATCH --output=${slurmJobLogPath} ${taskArgs.collect { "#SBATCH $it" }.join('\n')} #SBATCH ${partition.additionalArgs} ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"} @@ -1182,8 +1176,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix) pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" ")) - Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}") - Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}") // Output is the corresponding scriptLaunchPathLocal script under the disaggMode sh """ @@ -1218,8 +1210,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG scriptLaunchPathNode, true ) + + def filesToKeepWhenRetry = [ + scriptRunPathNode, + scriptInstallPathNode, + scriptBashUtilsPathNode, + scriptLaunchPathNode, + scriptSubmitPathNode, + scriptTrackPathNode, + testListPathNode, + waivesListPathNode, + coverageConfigFile + ] + def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("") + def scriptSubmit = """#!/bin/bash - set -Eeuo pipefail + set -xEeuo pipefail trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR # Clean up previous job intermediate files so that retry can work @@ -1227,26 +1233,26 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt") echo "Found previous Slurm job ID: \${previous_job_id}" scancel "\${previous_job_id}" || true - rm -rf "${jobWorkspace}/slurm_job_id.txt" - # Wait for 60 seconds to ensure the previous job is canceled - sleep 60 + # Wait for 120 seconds to ensure the previous job is canceled + sleep 120 fi - rm -rf "${jobWorkspace}/results.xml" - rm -rf "${jobWorkspace}/report.csv" - rm -rf "${jobWorkspace}/unfinished_test.txt" - rm -rf "${sbatchLogPath}" - touch ${sbatchLogPath} + # Clean up workspace: remove all files/dirs not in the keep list + find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} + + + touch ${slurmJobLogPath} jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}') if [ -z "\$jobId" ]; then echo "Error: Slurm job submission failed, no job ID returned." 
exit 1 fi echo "Submitted Slurm job \$jobId" - # save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve - echo \$jobId > $jobWorkspace/slurm_job_id.txt + # Save Slurm job ID for later steps to retrieve + echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt" """.replaceAll("(?m)^\\s*", "").trim() + pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit) + Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, @@ -1255,8 +1261,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG true ) } + stage("[${stageName}] Run Pytest") { - // Submit the sbatch job + // Submit the Slurm job Utils.exec( pipeline, timeout: false, @@ -1266,42 +1273,56 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG ), numRetries: 3 ) - def sbatchJobId = Utils.exec( + + def slurmJobId = Utils.exec( pipeline, - returnStdout: true, script: Utils.sshUserCmd( remote, - "cat $jobWorkspace/slurm_job_id.txt" - ) + "\"cat ${jobWorkspace}/slurm_job_id.txt\"" + ), + returnStdout: true, + numRetries: 3 ).trim() + Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}") + def scriptTrack = """#!/bin/bash - jobId=\$(cat $jobWorkspace/slurm_job_id.txt) - tail -f ${sbatchLogPath} & + set -xEeuo pipefail + trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR + + jobId=${slurmJobId} + tail -f ${slurmJobLogPath} & tailPid=\$! - # Wait until sbatch job is done. + + # Wait until Slurm job is done while true; do - state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}') - if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then - echo "job is still running" + # Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0) + STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations) + + if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then + echo "Slurm job \$jobId is still running" sleep 300 else - echo "Job \$jobId finished with state: \$state" + echo "Slurm job \$jobId finished with state: \$STATUS" break fi done + # Kill tail -f process kill \$tailPid - # Check if the job failed or not + + # Wait briefly to ensure accounting is consistent sleep 10 - # Retry getting status and exit code as sacct might be delayed + + # Get exit code (STATUS is already known from loop break) + # Retry for exit code if missing for i in {1..3}; do - STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}') + # Use awk to parse exit code from format like "0:0" EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}') - if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then + if [ -n "\$EXIT_CODE" ]; then break fi - echo "Waiting for sacct to update... attempt \$i" + echo "Waiting for sacct exit code to update... attempt \$i" sleep 10 done @@ -1309,11 +1330,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG echo "Error: Failed to get exit code from sacct after retries, defaulting to 1." EXIT_CODE=1 fi - if [ -z "\$STATUS" ]; then - echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN." 
- STATUS="UNKNOWN" - fi + # We already have valid STATUS from the loop that caused the break if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then echo "Pytest succeed in Slurm job \$jobId" echo "Status: \$STATUS | Exit_code \$EXIT_CODE" @@ -1324,7 +1342,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG exit 1 fi """.replaceAll("(?m)^\\s*", "").trim() + pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack) + Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, @@ -1332,52 +1352,23 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG scriptTrackPathNode, true ) - def scriptStatus = """#!/bin/bash - jobId=\$(cat $jobWorkspace/slurm_job_id.txt) - sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}' - """ - pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus) - Utils.copyFileToRemoteHost( - pipeline, - remote, - scriptStatusPathLocal, - scriptStatusPathNode, - true - ) - sh "cat $scriptStatusPathLocal" - while (true) { - // Check if the job is done by running sacct via SSH - def result = Utils.exec( - pipeline, - returnStdout: true, - script: Utils.sshUserCmd( - remote, - scriptStatusPathNode - ) - ).trim() - if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") { - echo "Slurm job $sbatchJobId is still running, pulling the job log." - // Pulling the sbatch output log - Utils.exec( - pipeline, - timeout: false, - script: Utils.sshUserCmd( - remote, - scriptTrackPathNode - ) - ) - } else { - echo "Slurm job $sbatchJobId is done." - break - } - } + // Track the Slurm job + Utils.exec( + pipeline, + timeout: false, + script: Utils.sshUserCmd( + remote, + scriptTrackPathNode + ), + numRetries: 3 + ) } echo "Finished test stage execution." } } finally { uploadResults(pipeline, cluster, jobUID, stageName) - stage("Clean up SLURM Resources") { + stage("Clean Up Slurm Resource") { // Workaround to handle the interruption during clean up SLURM resources retry(3) { try { @@ -1736,7 +1727,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod targetCloud = "kubernetes" // DGX Spark requires a special setting for accessing the device. // It has 128GB unified memory as per spec. Use half of the memory at the CPU side. - if (type == "gb10x") { + if (type.contains("gb10x")) { targetCloud = "nvks-sparks-cloud" memorySize = "64Gi" tolerations = """ @@ -1755,7 +1746,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod // The following GPU types doesn't support dynamic driver flashing. 
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) { - if (type == "gb10x") { + if (type.contains("gb10x")) { selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux @@ -2595,7 +2586,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } // Step 2: run tests - stage ("Setup environment") + stage ("Setup Environment") { // Random sleep to avoid resource contention sleep(10 * Math.random()) @@ -2647,7 +2638,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } if (testFilter[(DEBUG_MODE)]) { - stage("Interactive debug session") + stage("Interactive Debug Session") { testFilter[(DEBUG_MODE)] = false @@ -2848,7 +2839,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } // Generate comprehensive rerun report if any reruns occurred - stage ("[${stageName}] Generate Report") { + stage ("Generate Report") { generateRerunReport(stageName, llmSrc) } @@ -2859,7 +2850,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO if (perfMode) { basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv" basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}" - stage("Check perf result") { + stage("Check Perf Result") { def perfCheckResult = sh( script: """ python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ @@ -2872,7 +2863,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" } } - stage("Create perf report") { + stage("Create Perf Report") { sh """ python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \ --output_path ${stageName}/report.pdf \ @@ -2883,7 +2874,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } if (stageName.contains("PerfSanity")) { - stage ("Check perf result") { + stage ("Check PerfSanity Result") { def perfCheckResult = sh( script: """ python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \ @@ -3079,7 +3070,7 @@ def ensureStageResultNotUploaded(stageName) { if(!GlobalState.uploadResultStageNames.contains(stageName)) { GlobalState.uploadResultStageNames.add(stageName) } else { - stage('Upload Test Results') { + stage('Upload Test Result') { catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { error "Upload test results for ${stageName} failed because it has already been uploaded." } @@ -3288,7 +3279,7 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true], "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], - // Perf sanity post merge test + // PerfSanity post-merge tests // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4], @@ -3311,8 +3302,7 @@ def launchTestJobs(pipeline, testFilter) parallelJobs += parallelSlurmJobs - // Try to match what are being tested on x86 H100_PCIe. 
-// SBSA machines from the Blossom machine pool + // SBSA machines from the Blossom machine pool SBSATestConfigs = [ "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1], // DGX Spark is also named as GB10 Grace Blackwell Superchip. @@ -3328,13 +3318,13 @@ def launchTestJobs(pipeline, testFilter) // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4], - // Perf sanity pre merge test + // PerfSanity pre-merge tests "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4], - // Perf sanity post merge test + // PerfSanity post-merge tests "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4], @@ -3355,9 +3345,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], - // Perf sanity pre merge tests - // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], - // Perf sanity post merge tests + // PerfSanity post-merge tests "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2], "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], @@ -3539,7 +3527,7 @@ def launchTestJobs(pipeline, testFilter) } if (checkPipStage) { - stage("Run LLMAPI tests") { + stage("Run LLMAPI Test") { pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", { echo "###### Prerequisites Start ######" @@ -3751,8 +3739,8 @@ def launchTestJobs(pipeline, testFilter) parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, { stage(key) { if (key in testFilter[REUSE_STAGE_LIST]) { - stage("Skip - reused") { - echo "Skip - Passed in the last pipeline." + stage("Skip - Reused") { + echo "Skip - Passed in the previous pipelines." 
} } else if (values instanceof List) { trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", { @@ -3876,7 +3864,7 @@ pipeline { OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials") } stages { - stage("Setup environment") + stage("Setup Environment") { steps { @@ -3891,7 +3879,7 @@ pipeline { } } } - stage("Check Test Lists") + stage("Check Test List") { when { expression { diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh index bd312180e7..cb1ec4bc83 100644 --- a/jenkins/scripts/slurm_install.sh +++ b/jenkins/scripts/slurm_install.sh @@ -12,7 +12,14 @@ slurm_install_setup() { cd $resourcePathNode llmSrcNode=$resourcePathNode/TensorRT-LLM/src + # Use unique lock file for this job ID + lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock" + if [ $SLURM_LOCALID -eq 0 ]; then + if [ -f "$lock_file" ]; then + rm -f "$lock_file" + fi + retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName" which python3 python3 --version @@ -27,11 +34,11 @@ slurm_install_setup() { hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" echo "(Writing install lock) Current directory: $(pwd)" - touch install_lock.lock + touch "$lock_file" else echo "(Waiting for install lock) Current directory: $(pwd)" - while [ ! -f install_lock.lock ]; do - sleep 5 + while [ ! -f "$lock_file" ]; do + sleep 10 done fi } diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 584ae1e7c9..f319b4e976 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest if [ $SLURM_PROCID -eq 0 ]; then sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile" else - # Sleep 10 seconds to wait for the coverage config file to be saved - sleep 10 + # Sleep 30 seconds to wait for the coverage config file to be saved + sleep 30 fi containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}') @@ -108,6 +108,25 @@ eval $pytestCommand pytest_exit_code=$? echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code" +# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4) +# Remove this after the issue is resolved +if [ $pytest_exit_code -eq 4 ]; then + echo "DEBUG: Pytest failed with usage error (exit code 4)" + echo "DEBUG: Directory state at $(pwd):" + ls -l + echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:" + ls -l $llmSrcNode/tests/integration/defs + + echo "DEBUG: conftest.py content:" + md5sum $llmSrcNode/tests/integration/defs/conftest.py + + echo "DEBUG: pytest.ini content:" + md5sum $llmSrcNode/tests/integration/defs/pytest.ini + + echo "DEBUG: Check importability of conftest.py" + python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')" +fi + if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then if [[ "$stageName" == *PyTorch* ]]; then basePerfFilename="base_perf_pytorch.csv" @@ -136,11 +155,11 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then fi if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then - echo "Check Perf-Sanity Result" + echo "Check PerfSanity Result" python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \ $jobWorkspace perf_sanity_check_exit_code=$? 
- echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code" + echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code" fi if [ "$pytest_exit_code" -ne 0 ]; then diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py index c799d433fc..ec610cddd4 100755 --- a/scripts/check_test_list.py +++ b/scripts/check_test_list.py @@ -15,6 +15,7 @@ Note: All the perf tests will be excluded since they are generated dynamically. """ import argparse +import glob import os import subprocess @@ -42,7 +43,13 @@ def verify_l0_test_lists(llm_src): test_list = f"{llm_src}/l0_test.txt" # Remove dynamically generated perf tests - subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True) + # Exclude perf_sanity tests from being removed since they are different and statically defined + for file_path in glob.glob(os.path.join(test_db_path, "*perf*")): + if "perf_sanity" not in os.path.basename(file_path): + try: + os.remove(file_path) + except OSError: + pass subprocess.run( f"trt-test-db -d {test_db_path} --test-names --output {test_list}", shell=True, diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 1f187ce4e0..c06b0d18bc 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml(): import psutil logger.warning( - f"\nWarning: pynvml not available, using fallback commands for memory monitoring" - ) + f"pynvml not available, using fallback commands for memory monitoring") gpu_memory = {} system_total_mb = 0 diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1ac827c014..fd5900dfd5 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -542,3 +542,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536) +perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)