[None][ci] Some tweaks for the CI pipeline (#10359)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2026-01-05 00:10:47 +08:00 committed by GitHub
parent afc533193d
commit c4f27fa4c0
9 changed files with 183 additions and 161 deletions

View File

@ -585,7 +585,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
parallelJobs.failFast = enableFailFast
if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) {
def key = "Build with build type Debug"
def key = "Build With Build Type Debug"
parallelJobs += [
(key): {
script {
@ -628,7 +628,7 @@ pipeline {
HF_DATASETS_OFFLINE=1
}
stages {
stage("BuildJob") {
stage("Build Job") {
steps {
launchStages(this, params.targetArch, params.enableFailFast, globalVars)
}

View File

@ -276,7 +276,7 @@ def buildImage(config, imageKeyToTag)
}
// Step 2: Build the images
stage ("Install packages") {
stage ("Install Package") {
sh "pwd && ls -alh"
sh "env | sort"
sh "apk add make git"
@ -380,7 +380,7 @@ def buildImage(config, imageKeyToTag)
}
if (customTag) {
stage ("custom tag: ${customTag} (${arch})") {
stage ("Custom Tag: ${customTag} (${arch})") {
sh """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
@ -395,7 +395,7 @@ def buildImage(config, imageKeyToTag)
} catch (Exception ex) {
containerGenFailure = ex
} finally {
stage ("Docker logout") {
stage ("Docker Logout") {
withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
sh "docker logout urm.nvidia.com"
sh "docker logout ${DEFAULT_GIT_URL}:5005"
@ -424,14 +424,14 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
def release_action = params.action
def buildConfigs = [
"Build trtllm release (x86_64)": [
"Build Internal release (x86_64 trtllm)": [
target: "trtllm",
action: release_action,
customTag: LLM_BRANCH_TAG + "-x86_64",
build_wheel: true,
dockerfileStage: "release",
],
"Build trtllm release (SBSA)": [
"Build Internal release (SBSA trtllm)": [
target: "trtllm",
action: release_action,
customTag: LLM_BRANCH_TAG + "-sbsa",
@ -439,21 +439,21 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
arch: "arm64",
dockerfileStage: "release",
],
"Build CI image (x86_64 tritondevel)": [:],
"Build CI image (SBSA tritondevel)": [
"Build CI Image (x86_64 tritondevel)": [:],
"Build CI Image (SBSA tritondevel)": [
arch: "arm64",
],
"Build CI image (RockyLinux8 Python310)": [
"Build CI Image (RockyLinux8 Python310)": [
target: "rockylinux8",
args: "PYTHON_VERSION=3.10.12",
postTag: "-py310",
],
"Build CI image (RockyLinux8 Python312)": [
"Build CI Image (RockyLinux8 Python312)": [
target: "rockylinux8",
args: "PYTHON_VERSION=3.12.3",
postTag: "-py312",
],
"Build NGC devel and release (x86_64)": [
"Build NGC devel And release (x86_64)": [
target: "ngc-release",
action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
@ -464,7 +464,7 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
],
dockerfileStage: "release",
],
"Build NGC devel and release (SBSA)": [
"Build NGC devel And release (SBSA)": [
target: "ngc-release",
action: release_action,
args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
@ -583,7 +583,7 @@ pipeline {
}
}
}
stage("Upload Artifacts") {
stage("Upload Artifact") {
steps {
script {
String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
@ -594,7 +594,7 @@ pipeline {
}
}
}
stage("Wait for Build Jobs Complete") {
stage("Wait For Build Job Complete") {
when {
expression {
RUN_SANITY_CHECK
@ -655,7 +655,7 @@ pipeline {
}
}
}
stage("Sanity Check for NGC Images") {
stage("Sanity Check For NGC Image") {
when {
expression {
RUN_SANITY_CHECK
@ -691,7 +691,7 @@ pipeline {
}
}
}
stage("Register NGC Images for Security Checks") {
stage("Register NGC Image For Security Check") {
when {
expression {
return params.nspect_id && params.action == "push"

View File

@ -451,7 +451,7 @@ def launchReleaseCheck(pipeline)
}
def image = "urm.nvidia.com/docker/golang:1.22"
stageName = "Release Check"
stageName = "Release-Check"
trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", {
stage("[${stageName}] Run") {
if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
@ -834,7 +834,7 @@ def collectTestResults(pipeline, testFilter)
{
collectResultPodSpec = createKubernetesPodConfig("", "agent")
trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", {
stage ("Collect test result") {
stage ("Collect Test Result") {
sh "rm -rf **/*.xml *.tar.gz"
testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results"
@ -864,7 +864,7 @@ def collectTestResults(pipeline, testFilter)
junit(testResults: '**/results*.xml', allowEmptyResults : true)
} // Collect test result stage
stage("Rerun report") {
stage("Rerun Report") {
sh "rm -rf rerun && mkdir -p rerun"
sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true"
sh "find rerun -type f"
@ -904,7 +904,7 @@ def collectTestResults(pipeline, testFilter)
}
} // Rerun report stage
try {
stage("Test coverage") {
stage("Test Coverage") {
sh "ls"
def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","")
sh "echo ${CUR_PATH}"
@ -1030,14 +1030,15 @@ def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64"
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
{
stages = [
"Release Check": {
"Release-Check": {
script {
launchReleaseCheck(this)
}
},
"x86_64-linux": {
"x86_64-Linux": {
script {
stage("Build") {
def testStageName = "[Build-x86_64] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
@ -1045,7 +1046,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
]
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
}
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
@ -1135,24 +1137,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
}
}
},
"SBSA-linux": {
"SBSA-Linux": {
script {
def jenkinsUrl = ""
def credentials = ""
def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") {
echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run"
return
}
stage("Build") {
def testStageName = "[Build-SBSA] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def additionalParameters = [
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
]
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
}
testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
echo "SBSA test job is skipped due to Jenkins configuration"
@ -1269,9 +1270,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
echo "Will run Build-Docker-Images job"
stages.remove("x86_64-linux")
stages.remove("SBSA-linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled."
stages.remove("x86_64-Linux")
stages.remove("SBSA-Linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-Linux and SBSA-Linux sub-pipelines will be disabled."
}
parallelJobs = stages.collectEntries{key, value -> [key, {
@ -1339,11 +1340,11 @@ pipeline {
}
}
}
stage("Build and Test") {
stage("Build And Test") {
steps {
script {
if (isReleaseCheckMode) {
stage("Release Check") {
stage("Release-Check") {
script {
launchReleaseCheck(this)
}

View File

@ -125,7 +125,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
def hasTimeoutTest = false
def downloadResultSucceed = false
pipeline.stage('Submit Test Results') {
pipeline.stage('Submit Test Result') {
sh "mkdir -p ${stageName}"
// Download timeout test results
def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt"
@ -554,7 +554,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
stage('Request Node via SLURM') {
stage('Request Node Via Slurm') {
println("Selected Cluster: ${cluster.name}")
def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint)
@ -603,7 +603,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
}
stage('Checking if the Node is Online') {
stage('Check If Node Is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
@ -696,20 +696,18 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
slurmRunner = null
echo "${stageName} Slurm partition timeout: ${partition.time}"
def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
if (cluster.containerRuntime.toString() == "DOCKER") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true)
} else if (cluster.containerRuntime.toString() == "ENROOT") {
echo "${stageName} partitionTimeout: ${partition.time}"
def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT
slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
}
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
@ -805,7 +803,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=400",
"COLUMNS=300",
extraInternalEnv,
portEnvVars,
pytestUtil,
@ -893,9 +891,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -933,19 +929,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def sbatchLogPath = "${jobWorkspace}/job-output.log"
def slurmJobLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
stage("[${stageName}] Initializing Test") {
stage("Initialize Test") {
println("Selected Cluster: ${cluster.name}")
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3)
@ -1052,7 +1046,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Generate Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = getMountListForSlurmTest(cluster, true).join(",")
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment)
String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode)
if (taskArgs == null) {
error "Invalid Slurm test stage name is set"
}
@ -1140,7 +1134,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${sbatchLogPath}
#SBATCH --output=${slurmJobLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@ -1182,8 +1176,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")
// Output is the corresponding scriptLaunchPathLocal script under the disaggMode
sh """
@ -1218,8 +1210,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)
def filesToKeepWhenRetry = [
scriptRunPathNode,
scriptInstallPathNode,
scriptBashUtilsPathNode,
scriptLaunchPathNode,
scriptSubmitPathNode,
scriptTrackPathNode,
testListPathNode,
waivesListPathNode,
coverageConfigFile
]
def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("")
def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
# Clean up previous job intermediate files so that retry can work
@ -1227,26 +1233,26 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
# Wait for 120 seconds to ensure the previous job is canceled
sleep 120
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${sbatchLogPath}"
touch ${sbatchLogPath}
# Clean up workspace: remove all files/dirs not in the keep list
find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} +
touch ${slurmJobLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
# Save Slurm job ID for later steps to retrieve
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@ -1255,8 +1261,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)
}
stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
// Submit the Slurm job
Utils.exec(
pipeline,
timeout: false,
@ -1266,42 +1273,56 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
),
numRetries: 3
)
def sbatchJobId = Utils.exec(
def slurmJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
"\"cat ${jobWorkspace}/slurm_job_id.txt\""
),
returnStdout: true,
numRetries: 3
).trim()
Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}")
def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
jobId=${slurmJobId}
tail -f ${slurmJobLogPath} &
tailPid=\$!
# Wait until sbatch job is done.
# Wait until Slurm job is done
while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
echo "job is still running"
# Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0)
STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations)
if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then
echo "Slurm job \$jobId is still running"
sleep 300
else
echo "Job \$jobId finished with state: \$state"
echo "Slurm job \$jobId finished with state: \$STATUS"
break
fi
done
# Kill tail -f process
kill \$tailPid
# Check if the job failed or not
# Wait briefly to ensure accounting is consistent
sleep 10
# Retry getting status and exit code as sacct might be delayed
# Get exit code (STATUS is already known from loop break)
# Retry for exit code if missing
for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
# Use awk to parse exit code from format like "0:0"
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
if [ -n "\$EXIT_CODE" ]; then
break
fi
echo "Waiting for sacct to update... attempt \$i"
echo "Waiting for sacct exit code to update... attempt \$i"
sleep 10
done
@ -1309,11 +1330,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1
fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi
# We already have valid STATUS from the loop that caused the break
if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE"
@ -1324,7 +1342,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@ -1332,52 +1352,23 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptTrackPathNode,
true
)
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)
sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
// Track the Slurm job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
),
numRetries: 3
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)
stage("Clean up SLURM Resources") {
stage("Clean Up Slurm Resource") {
// Workaround to handle the interruption during clean up SLURM resources
retry(3) {
try {
@ -1736,7 +1727,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
targetCloud = "kubernetes"
// DGX Spark requires a special setting for accessing the device.
// It has 128GB unified memory as per spec. Use half of the memory at the CPU side.
if (type == "gb10x") {
if (type.contains("gb10x")) {
targetCloud = "nvks-sparks-cloud"
memorySize = "64Gi"
tolerations = """
@ -1755,7 +1746,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
// The following GPU types don't support dynamic driver flashing.
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) {
if (type == "gb10x") {
if (type.contains("gb10x")) {
selectors = """
kubernetes.io/arch: ${arch}
kubernetes.io/os: linux
@ -2595,7 +2586,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
// Step 2: run tests
stage ("Setup environment")
stage ("Setup Environment")
{
// Random sleep to avoid resource contention
sleep(10 * Math.random())
@ -2647,7 +2638,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
if (testFilter[(DEBUG_MODE)]) {
stage("Interactive debug session")
stage("Interactive Debug Session")
{
testFilter[(DEBUG_MODE)] = false
@ -2848,7 +2839,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
// Generate comprehensive rerun report if any reruns occurred
stage ("[${stageName}] Generate Report") {
stage ("Generate Report") {
generateRerunReport(stageName, llmSrc)
}
@ -2859,7 +2850,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") {
stage("Check Perf Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \
@ -2872,7 +2863,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
stage("Create perf report") {
stage("Create Perf Report") {
sh """
python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path ${stageName}/report.pdf \
@ -2883,7 +2874,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
if (stageName.contains("PerfSanity")) {
stage ("Check perf result") {
stage ("Check PerfSanity Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
@ -3079,7 +3070,7 @@ def ensureStageResultNotUploaded(stageName) {
if(!GlobalState.uploadResultStageNames.contains(stageName)) {
GlobalState.uploadResultStageNames.add(stageName)
} else {
stage('Upload Test Results') {
stage('Upload Test Result') {
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
error "Upload test results for ${stageName} failed because it has already been uploaded."
}
@ -3288,7 +3279,7 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// PerfSanity post-merge tests
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
@ -3311,8 +3302,7 @@ def launchTestJobs(pipeline, testFilter)
parallelJobs += parallelSlurmJobs
// Try to match what are being tested on x86 H100_PCIe.
// SBSA machines from the Blossom machine pool
// SBSA machines from the Blossom machine pool
SBSATestConfigs = [
"GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1],
// DGX Spark is also named as GB10 Grace Blackwell Superchip.
@ -3328,13 +3318,13 @@ def launchTestJobs(pipeline, testFilter)
// Disable GB300 stages due to nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test
// PerfSanity pre-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
// Perf sanity post merge test
// PerfSanity post-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
@ -3355,9 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity pre merge tests
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
// PerfSanity post-merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
@ -3539,7 +3527,7 @@ def launchTestJobs(pipeline, testFilter)
}
if (checkPipStage) {
stage("Run LLMAPI tests") {
stage("Run LLMAPI Test") {
pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch)
trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", {
echo "###### Prerequisites Start ######"
@ -3751,8 +3739,8 @@ def launchTestJobs(pipeline, testFilter)
parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, {
stage(key) {
if (key in testFilter[REUSE_STAGE_LIST]) {
stage("Skip - reused") {
echo "Skip - Passed in the last pipeline."
stage("Skip - Reused") {
echo "Skip - Passed in the previous pipelines."
}
} else if (values instanceof List) {
trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", {
@ -3876,7 +3864,7 @@ pipeline {
OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials")
}
stages {
stage("Setup environment")
stage("Setup Environment")
{
steps
{
@ -3891,7 +3879,7 @@ pipeline {
}
}
}
stage("Check Test Lists")
stage("Check Test List")
{
when {
expression {

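Note on the hunks above: the tracking script waits for the submitted Slurm job by polling sacct and then derives pass/fail from the recorded exit code. A minimal standalone sketch of that polling pattern follows; the job ID, log path, and poll interval are placeholders for illustration, not values taken from this pipeline.

#!/bin/bash
# Minimal sketch of sacct-based job tracking (placeholder job ID and log path).
set -euo pipefail
JOB_ID="$1"
LOG_FILE="${2:-job-output.log}"
# Stream the job log while waiting.
tail -f "$LOG_FILE" &
TAIL_PID=$!
while true; do
    # --allocations matches only the allocation itself, not steps like <id>.batch;
    # -P produces parseable output and -n drops the header line.
    STATE=$(sacct -j "$JOB_ID" --format=State -Pn --allocations)
    case "$STATE" in
        ""|RUNNING|PENDING|CONFIGURING) sleep 60 ;;
        *) echo "Job $JOB_ID finished with state: $STATE"; break ;;
    esac
done
kill "$TAIL_PID" 2>/dev/null || true
# ExitCode is reported as "<code>:<signal>"; keep the numeric code.
EXIT_CODE=$(sacct -j "$JOB_ID" --format=ExitCode -Pn --allocations | awk -F: '{print $1}')
[ "$STATE" = "COMPLETED" ] && [ "${EXIT_CODE:-1}" -eq 0 ]

Invoked, for example, as ./track.sh 12345 job-output.log once sbatch has returned the job ID.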
View File

@ -12,7 +12,14 @@ slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
# Use unique lock file for this job ID
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
if [ $SLURM_LOCALID -eq 0 ]; then
if [ -f "$lock_file" ]; then
rm -f "$lock_file"
fi
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
@ -27,11 +34,11 @@ slurm_install_setup() {
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
touch install_lock.lock
touch "$lock_file"
else
echo "(Waiting for install lock) Current directory: $(pwd)"
while [ ! -f install_lock.lock ]; do
sleep 5
while [ ! -f "$lock_file" ]; do
sleep 10
done
fi
}

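Note on the hunk above: scoping the lock file to the Slurm job and node means a stale install_lock.lock from an earlier job can no longer release waiting ranks too early. A minimal sketch of that rank-0-installs, other-ranks-wait pattern, with the actual install step replaced by a placeholder:

#!/bin/bash
# Sketch: local rank 0 installs and signals via a per-job, per-node lock file;
# the other ranks on the same node wait for it. The install step is a placeholder.
set -euo pipefail
lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock"
if [ "${SLURM_LOCALID:-0}" -eq 0 ]; then
    rm -f "$lock_file"            # clear any stale lock left by a previous run
    echo "rank 0: installing..."  # placeholder for the real download/install commands
    touch "$lock_file"            # signal completion to the waiting ranks
else
    while [ ! -f "$lock_file" ]; do
        sleep 10
    done
fi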
View File

@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
# Sleep 30 seconds to wait for the coverage config file to be saved
sleep 30
fi
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
@ -108,6 +108,25 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4)
# Remove this after the issue is resolved
if [ $pytest_exit_code -eq 4 ]; then
echo "DEBUG: Pytest failed with usage error (exit code 4)"
echo "DEBUG: Directory state at $(pwd):"
ls -l
echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:"
ls -l $llmSrcNode/tests/integration/defs
echo "DEBUG: conftest.py content:"
md5sum $llmSrcNode/tests/integration/defs/conftest.py
echo "DEBUG: pytest.ini content:"
md5sum $llmSrcNode/tests/integration/defs/pytest.ini
echo "DEBUG: Check importability of conftest.py"
python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
@ -136,11 +155,11 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
fi
if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
echo "Check Perf-Sanity Result"
echo "Check PerfSanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then

View File

@ -15,6 +15,7 @@ Note:
All the perf tests will be excluded since they are generated dynamically.
"""
import argparse
import glob
import os
import subprocess
@ -42,7 +43,13 @@ def verify_l0_test_lists(llm_src):
test_list = f"{llm_src}/l0_test.txt"
# Remove dynamically generated perf tests
subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True)
# Exclude perf_sanity tests from being removed since they are different and statically defined
for file_path in glob.glob(os.path.join(test_db_path, "*perf*")):
if "perf_sanity" not in os.path.basename(file_path):
try:
os.remove(file_path)
except OSError:
pass
subprocess.run(
f"trt-test-db -d {test_db_path} --test-names --output {test_list}",
shell=True,

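Note on the hunk above: the Python change stops deleting the statically defined perf_sanity test lists while still removing the dynamically generated perf ones. As an illustrative shell-level equivalent of the same exclusion (not what the script actually uses), find can skip those files by name:

#!/bin/bash
# Illustrative alternative only: delete generated *perf* test-db files while
# keeping anything whose name contains perf_sanity. TEST_DB_PATH is a placeholder.
set -euo pipefail
TEST_DB_PATH="${1:?usage: $0 <test_db_path>}"
find "$TEST_DB_PATH" -maxdepth 1 -type f -name '*perf*' ! -name '*perf_sanity*' -print -delete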
View File

@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml():
import psutil
logger.warning(
f"\nWarning: pynvml not available, using fallback commands for memory monitoring"
)
f"pynvml not available, using fallback commands for memory monitoring")
gpu_memory = {}
system_total_mb = 0

View File

@ -542,3 +542,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890)
disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536)
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)