diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy index 261c0a6d3a..5ecaa43a22 100644 --- a/jenkins/Build.groovy +++ b/jenkins/Build.groovy @@ -585,7 +585,7 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars) parallelJobs.failFast = enableFailFast if (cpu_arch == X86_64_TRIPLE && !reuseArtifactPath) { - def key = "Build with build type Debug" + def key = "Build With Build Type Debug" parallelJobs += [ (key): { script { @@ -628,7 +628,7 @@ pipeline { HF_DATASETS_OFFLINE=1 } stages { - stage("BuildJob") { + stage("Build Job") { steps { launchStages(this, params.targetArch, params.enableFailFast, globalVars) } diff --git a/jenkins/BuildDockerImage.groovy b/jenkins/BuildDockerImage.groovy index c3fb2fac3c..f26d5537ed 100644 --- a/jenkins/BuildDockerImage.groovy +++ b/jenkins/BuildDockerImage.groovy @@ -276,7 +276,7 @@ def buildImage(config, imageKeyToTag) } // Step 2: Build the images - stage ("Install packages") { + stage ("Install Package") { sh "pwd && ls -alh" sh "env | sort" sh "apk add make git" @@ -380,7 +380,7 @@ def buildImage(config, imageKeyToTag) } if (customTag) { - stage ("custom tag: ${customTag} (${arch})") { + stage ("Custom Tag: ${customTag} (${arch})") { sh """ cd ${LLM_ROOT} && make -C docker ${target}_${action} \ BASE_IMAGE=${BASE_IMAGE} \ @@ -395,7 +395,7 @@ def buildImage(config, imageKeyToTag) } catch (Exception ex) { containerGenFailure = ex } finally { - stage ("Docker logout") { + stage ("Docker Logout") { withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) { sh "docker logout urm.nvidia.com" sh "docker logout ${DEFAULT_GIT_URL}:5005" @@ -424,14 +424,14 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) { def release_action = params.action def buildConfigs = [ - "Build trtllm release (x86_64)": [ + "Build Internal release (x86_64 trtllm)": [ target: "trtllm", action: release_action, customTag: LLM_BRANCH_TAG + "-x86_64", build_wheel: true, dockerfileStage: "release", ], - "Build trtllm release (SBSA)": [ + "Build Internal release (SBSA trtllm)": [ target: "trtllm", action: release_action, customTag: LLM_BRANCH_TAG + "-sbsa", @@ -439,21 +439,21 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) { arch: "arm64", dockerfileStage: "release", ], - "Build CI image (x86_64 tritondevel)": [:], - "Build CI image (SBSA tritondevel)": [ + "Build CI Image (x86_64 tritondevel)": [:], + "Build CI Image (SBSA tritondevel)": [ arch: "arm64", ], - "Build CI image (RockyLinux8 Python310)": [ + "Build CI Image (RockyLinux8 Python310)": [ target: "rockylinux8", args: "PYTHON_VERSION=3.10.12", postTag: "-py310", ], - "Build CI image (RockyLinux8 Python312)": [ + "Build CI Image (RockyLinux8 Python312)": [ target: "rockylinux8", args: "PYTHON_VERSION=3.12.3", postTag: "-py312", ], - "Build NGC devel and release (x86_64)": [ + "Build NGC devel And release (x86_64)": [ target: "ngc-release", action: release_action, args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'", @@ -464,7 +464,7 @@ def launchBuildJobs(pipeline, globalVars, imageKeyToTag) { ], dockerfileStage: "release", ], - "Build NGC devel and release (SBSA)": [ + "Build NGC devel And release (SBSA)": [ target: "ngc-release", action: release_action, args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'", @@ -583,7 +583,7 @@ pipeline { } } } - stage("Upload Artifacts") { + stage("Upload Artifact") { steps { script { String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag @@ -594,7 +594,7 @@ pipeline { } } } - stage("Wait for Build 
Jobs Complete") { + stage("Wait For Build Job Complete") { when { expression { RUN_SANITY_CHECK @@ -655,7 +655,7 @@ pipeline { } } } - stage("Sanity Check for NGC Images") { + stage("Sanity Check For NGC Image") { when { expression { RUN_SANITY_CHECK @@ -691,7 +691,7 @@ pipeline { } } } - stage("Register NGC Images for Security Checks") { + stage("Register NGC Image For Security Check") { when { expression { return params.nspect_id && params.action == "push" diff --git a/jenkins/L0_MergeRequest.groovy b/jenkins/L0_MergeRequest.groovy index 3e81b22a09..adbfc46baa 100644 --- a/jenkins/L0_MergeRequest.groovy +++ b/jenkins/L0_MergeRequest.groovy @@ -451,7 +451,7 @@ def launchReleaseCheck(pipeline) } def image = "urm.nvidia.com/docker/golang:1.22" - stageName = "Release Check" + stageName = "Release-Check" trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "package"), "trt-llm", { stage("[${stageName}] Run") { if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) { @@ -834,7 +834,7 @@ def collectTestResults(pipeline, testFilter) { collectResultPodSpec = createKubernetesPodConfig("", "agent") trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", { - stage ("Collect test result") { + stage ("Collect Test Result") { sh "rm -rf **/*.xml *.tar.gz" testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results" @@ -864,7 +864,7 @@ def collectTestResults(pipeline, testFilter) junit(testResults: '**/results*.xml', allowEmptyResults : true) } // Collect test result stage - stage("Rerun report") { + stage("Rerun Report") { sh "rm -rf rerun && mkdir -p rerun" sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true" sh "find rerun -type f" @@ -904,7 +904,7 @@ def collectTestResults(pipeline, testFilter) } } // Rerun report stage try { - stage("Test coverage") { + stage("Test Coverage") { sh "ls" def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","") sh "echo ${CUR_PATH}" @@ -1030,14 +1030,15 @@ def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64" def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) { stages = [ - "Release Check": { + "Release-Check": { script { launchReleaseCheck(this) } }, - "x86_64-linux": { + "x86_64-Linux": { script { - stage("Build") { + def testStageName = "[Build-x86_64] ${env.localJobCredentials ? "Remote Run" : "Run"}" + stage(testStageName) { def additionalParameters = [ 'dockerImage': globalVars["LLM_DOCKER_IMAGE"], 'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"], @@ -1045,7 +1046,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) ] launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters) } - def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}" + + testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}" def singleGpuTestFailed = false stage(testStageName) { if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) { @@ -1135,24 +1137,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) } } }, - "SBSA-linux": { + "SBSA-Linux": { script { - def jenkinsUrl = "" - def credentials = "" - def testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? 
"Remote Run" : "Run"}" - def singleGpuTestFailed = false - if (testFilter[(ONLY_ONE_GROUP_CHANGED)] == "Docs") { echo "SBSA build job is skipped due to Jenkins configuration or conditional pipeline run" return } - stage("Build") { + def testStageName = "[Build-SBSA] ${env.localJobCredentials ? "Remote Run" : "Run"}" + stage(testStageName) { def additionalParameters = [ "dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"], ] launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters) } + + testStageName = "[Test-SBSA-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}" + def singleGpuTestFailed = false stage(testStageName) { if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) { echo "SBSA test job is skipped due to Jenkins configuration" @@ -1269,9 +1270,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars) testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images") testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images") echo "Will run Build-Docker-Images job" - stages.remove("x86_64-linux") - stages.remove("SBSA-linux") - echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled." + stages.remove("x86_64-Linux") + stages.remove("SBSA-Linux") + echo "Build-Docker-Images job is set explicitly. Both x86_64-Linux and SBSA-Linux sub-pipelines will be disabled." } parallelJobs = stages.collectEntries{key, value -> [key, { @@ -1339,11 +1340,11 @@ pipeline { } } } - stage("Build and Test") { + stage("Build And Test") { steps { script { if (isReleaseCheckMode) { - stage("Release Check") { + stage("Release-Check") { script { launchReleaseCheck(this) } diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index 3aefeaaf0e..53ce1ee3db 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -125,7 +125,7 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st def hasTimeoutTest = false def downloadResultSucceed = false - pipeline.stage('Submit Test Results') { + pipeline.stage('Submit Test Result') { sh "mkdir -p ${stageName}" // Download timeout test results def timeoutTestFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/unfinished_test.txt" @@ -554,7 +554,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, ] Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client") - stage('Request Node via SLURM') { + stage('Request Node Via Slurm') { println("Selected Cluster: ${cluster.name}") def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, entrypoint) @@ -603,7 +603,7 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, } } - stage('Checking if the Node is Online') { + stage('Check If Node Is Online') { withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) { def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host) def remote = [ @@ -696,20 +696,18 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG, } slurmRunner = null + echo "${stageName} Slurm partition timeout: ${partition.time}" + def partitionTimeout = partition?.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT if (cluster.containerRuntime.toString() == "DOCKER") { - echo "${stageName} partitionTimeout: ${partition.time}" - def partitionTimeout = partition.time ? 
partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, partitionTimeout, true) } else if (cluster.containerRuntime.toString() == "ENROOT") { - echo "${stageName} partitionTimeout: ${partition.time}" - def partitionTimeout = partition.time ? partition.time : SlurmConfig.DEFAULT_TIMEOUT_SHORT slurmRunner = runInEnrootOnNode(nodeName, partitionTimeout) } else { throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}") } executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner) } finally { - stage("Clean up SLURM Resources") { + stage("Clean Up Slurm Resource") { // Workaround to handle the interruption during clean up SLURM resources retry(3) { try { @@ -805,7 +803,7 @@ def getPytestBaseCommandLine( "LLM_BACKEND_ROOT=${llmSrc}/triton_backend", "LLM_MODELS_ROOT=${MODEL_CACHE_DIR}", "MODEL_CACHE_DIR=${MODEL_CACHE_DIR}", - "COLUMNS=400", + "COLUMNS=300", extraInternalEnv, portEnvVars, pytestUtil, @@ -893,9 +891,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Create a unique suffix for the job name String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase() def jobUID = "${cluster.host}-multi_node_test-${customSuffix}" - def perfSanityMode = stageName.contains("PerfSanity") def disaggMode = stageName.contains("PerfSanity-Disagg") - def setSegment = disaggMode Utils.exec(pipeline, script: "env | sort && pwd && ls -alh") @@ -933,19 +929,17 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh" def testListPathNode = "${jobWorkspace}/${testList}.txt" def waivesListPathNode = "${jobWorkspace}/waives.txt" - def sbatchLogPath = "${jobWorkspace}/job-output.log" + def slurmJobLogPath = "${jobWorkspace}/job-output.log" def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh") def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh" def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh") def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh" def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh") def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh" - def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh") - def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh" - def isAarch64 = config.contains("aarch64") def coverageConfigFile = "${jobWorkspace}/.coveragerc" - stage("[${stageName}] Initializing Test") { + stage("Initialize Test") { + println("Selected Cluster: ${cluster.name}") // Create Job Workspace folder in Frontend Node Utils.exec(pipeline, script: Utils.sshUserCmd(remote, "\"mkdir -p ${jobWorkspace}\""), numRetries: 3) @@ -1052,7 +1046,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG // Generate Job Launch Script def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#") def mounts = getMountListForSlurmTest(cluster, true).join(",") - String[] taskArgs = getNodeArgs(nodeCount, gpuCount, setSegment) + String[] taskArgs = getNodeArgs(nodeCount, gpuCount, disaggMode) if (taskArgs == null) { error "Invalid Slurm test stage name is set" } @@ -1140,7 +1134,7 @@ def runLLMTestlistWithSbatch(pipeline, 
platform, testList, config=VANILLA_CONFIG def scriptLaunchPrefix = """#!/bin/bash #SBATCH ${exemptionComment} - #SBATCH --output=${sbatchLogPath} + #SBATCH --output=${slurmJobLogPath} ${taskArgs.collect { "#SBATCH $it" }.join('\n')} #SBATCH ${partition.additionalArgs} ${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"} @@ -1182,8 +1176,6 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix) pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" ")) - Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}") - Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}") // Output is the corresponding scriptLaunchPathLocal script under the disaggMode sh """ @@ -1218,8 +1210,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG scriptLaunchPathNode, true ) + + def filesToKeepWhenRetry = [ + scriptRunPathNode, + scriptInstallPathNode, + scriptBashUtilsPathNode, + scriptLaunchPathNode, + scriptSubmitPathNode, + scriptTrackPathNode, + testListPathNode, + waivesListPathNode, + coverageConfigFile + ] + def findKeepWhenRetryArgs = filesToKeepWhenRetry.collect { " ! -name \"\$(basename \"${it}\")\"" }.join("") + def scriptSubmit = """#!/bin/bash - set -Eeuo pipefail + set -xEeuo pipefail trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR # Clean up previous job intermediate files so that retry can work @@ -1227,26 +1233,26 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt") echo "Found previous Slurm job ID: \${previous_job_id}" scancel "\${previous_job_id}" || true - rm -rf "${jobWorkspace}/slurm_job_id.txt" - # Wait for 60 seconds to ensure the previous job is canceled - sleep 60 + # Wait for 120 seconds to ensure the previous job is canceled + sleep 120 fi - rm -rf "${jobWorkspace}/results.xml" - rm -rf "${jobWorkspace}/report.csv" - rm -rf "${jobWorkspace}/unfinished_test.txt" - rm -rf "${sbatchLogPath}" - touch ${sbatchLogPath} + # Clean up workspace: remove all files/dirs not in the keep list + find "${jobWorkspace}" -maxdepth 1 -mindepth 1 ${findKeepWhenRetryArgs} -exec rm -rf {} + + + touch ${slurmJobLogPath} jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}') if [ -z "\$jobId" ]; then echo "Error: Slurm job submission failed, no job ID returned." 
exit 1 fi echo "Submitted Slurm job \$jobId" - # save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve - echo \$jobId > $jobWorkspace/slurm_job_id.txt + # Save Slurm job ID for later steps to retrieve + echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt" """.replaceAll("(?m)^\\s*", "").trim() + pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit) + Utils.exec(pipeline, script: "echo \"Script to submit the final Slurm job: \" && cat ${scriptSubmitPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, @@ -1255,8 +1261,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG true ) } + stage("[${stageName}] Run Pytest") { - // Submit the sbatch job + // Submit the Slurm job Utils.exec( pipeline, timeout: false, @@ -1266,42 +1273,56 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG ), numRetries: 3 ) - def sbatchJobId = Utils.exec( + + def slurmJobId = Utils.exec( pipeline, - returnStdout: true, script: Utils.sshUserCmd( remote, - "cat $jobWorkspace/slurm_job_id.txt" - ) + "\"cat ${jobWorkspace}/slurm_job_id.txt\"" + ), + returnStdout: true, + numRetries: 3 ).trim() + Utils.exec(pipeline, script: "echo Slurm job ID: ${slurmJobId}") + def scriptTrack = """#!/bin/bash - jobId=\$(cat $jobWorkspace/slurm_job_id.txt) - tail -f ${sbatchLogPath} & + set -xEeuo pipefail + trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR + + jobId=${slurmJobId} + tail -f ${slurmJobLogPath} & tailPid=\$! - # Wait until sbatch job is done. + + # Wait until Slurm job is done while true; do - state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}') - if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then - echo "job is still running" + # Use --allocations to ensure we match the exact job ID and not job steps (like 123.batch, 123.0) + STATUS=\$(sacct -j \$jobId --format=State -Pn --allocations) + + if [[ -z \$STATUS || \$STATUS == "RUNNING" || \$STATUS == "PENDING" || \$STATUS == "CONFIGURING" ]]; then + echo "Slurm job \$jobId is still running" sleep 300 else - echo "Job \$jobId finished with state: \$state" + echo "Slurm job \$jobId finished with state: \$STATUS" break fi done + # Kill tail -f process kill \$tailPid - # Check if the job failed or not + + # Wait briefly to ensure accounting is consistent sleep 10 - # Retry getting status and exit code as sacct might be delayed + + # Get exit code (STATUS is already known from loop break) + # Retry for exit code if missing for i in {1..3}; do - STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}') + # Use awk to parse exit code from format like "0:0" EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}') - if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then + if [ -n "\$EXIT_CODE" ]; then break fi - echo "Waiting for sacct to update... attempt \$i" + echo "Waiting for sacct exit code to update... attempt \$i" sleep 10 done @@ -1309,11 +1330,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG echo "Error: Failed to get exit code from sacct after retries, defaulting to 1." EXIT_CODE=1 fi - if [ -z "\$STATUS" ]; then - echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN." 
- STATUS="UNKNOWN" - fi + # We already have valid STATUS from the loop that caused the break if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then echo "Pytest succeed in Slurm job \$jobId" echo "Status: \$STATUS | Exit_code \$EXIT_CODE" @@ -1324,7 +1342,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG exit 1 fi """.replaceAll("(?m)^\\s*", "").trim() + pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack) + Utils.exec(pipeline, script: "echo \"Script to track Slurm job and pull the log: \" && cat ${scriptTrackPathLocal}") Utils.copyFileToRemoteHost( pipeline, remote, @@ -1332,52 +1352,23 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG scriptTrackPathNode, true ) - def scriptStatus = """#!/bin/bash - jobId=\$(cat $jobWorkspace/slurm_job_id.txt) - sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}' - """ - pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus) - Utils.copyFileToRemoteHost( - pipeline, - remote, - scriptStatusPathLocal, - scriptStatusPathNode, - true - ) - sh "cat $scriptStatusPathLocal" - while (true) { - // Check if the job is done by running sacct via SSH - def result = Utils.exec( - pipeline, - returnStdout: true, - script: Utils.sshUserCmd( - remote, - scriptStatusPathNode - ) - ).trim() - if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") { - echo "Slurm job $sbatchJobId is still running, pulling the job log." - // Pulling the sbatch output log - Utils.exec( - pipeline, - timeout: false, - script: Utils.sshUserCmd( - remote, - scriptTrackPathNode - ) - ) - } else { - echo "Slurm job $sbatchJobId is done." - break - } - } + // Track the Slurm job + Utils.exec( + pipeline, + timeout: false, + script: Utils.sshUserCmd( + remote, + scriptTrackPathNode + ), + numRetries: 3 + ) } echo "Finished test stage execution." } } finally { uploadResults(pipeline, cluster, jobUID, stageName) - stage("Clean up SLURM Resources") { + stage("Clean Up Slurm Resource") { // Workaround to handle the interruption during clean up SLURM resources retry(3) { try { @@ -1736,7 +1727,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod targetCloud = "kubernetes" // DGX Spark requires a special setting for accessing the device. // It has 128GB unified memory as per spec. Use half of the memory at the CPU side. - if (type == "gb10x") { + if (type.contains("gb10x")) { targetCloud = "nvks-sparks-cloud" memorySize = "64Gi" tolerations = """ @@ -1755,7 +1746,7 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod // The following GPU types doesn't support dynamic driver flashing. 
if (REQUIRED_NO_DRIVER_TYPES.any { type.contains(it) }) { - if (type == "gb10x") { + if (type.contains("gb10x")) { selectors = """ kubernetes.io/arch: ${arch} kubernetes.io/os: linux @@ -2595,7 +2586,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } // Step 2: run tests - stage ("Setup environment") + stage ("Setup Environment") { // Random sleep to avoid resource contention sleep(10 * Math.random()) @@ -2647,7 +2638,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } if (testFilter[(DEBUG_MODE)]) { - stage("Interactive debug session") + stage("Interactive Debug Session") { testFilter[(DEBUG_MODE)] = false @@ -2848,7 +2839,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } // Generate comprehensive rerun report if any reruns occurred - stage ("[${stageName}] Generate Report") { + stage ("Generate Report") { generateRerunReport(stageName, llmSrc) } @@ -2859,7 +2850,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO if (perfMode) { basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv" basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}" - stage("Check perf result") { + stage("Check Perf Result") { def perfCheckResult = sh( script: """ python3 ${llmSrc}/tests/integration/defs/perf/sanity_perf_check.py \ @@ -2872,7 +2863,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO error "Performance regression detected and failing the build (exit code: ${perfCheckResult})" } } - stage("Create perf report") { + stage("Create Perf Report") { sh """ python3 ${llmSrc}/tests/integration/defs/perf/create_perf_comparison_report.py \ --output_path ${stageName}/report.pdf \ @@ -2883,7 +2874,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO } if (stageName.contains("PerfSanity")) { - stage ("Check perf result") { + stage ("Check PerfSanity Result") { def perfCheckResult = sh( script: """ python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \ @@ -3079,7 +3070,7 @@ def ensureStageResultNotUploaded(stageName) { if(!GlobalState.uploadResultStageNames.contains(stageName)) { GlobalState.uploadResultStageNames.add(stageName) } else { - stage('Upload Test Results') { + stage('Upload Test Result') { catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') { error "Upload test results for ${stageName} failed because it has already been uploaded." } @@ -3288,7 +3279,7 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true], "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4], "DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4], - // Perf sanity post merge test + // PerfSanity post-merge tests // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4], // "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4], @@ -3311,8 +3302,7 @@ def launchTestJobs(pipeline, testFilter) parallelJobs += parallelSlurmJobs - // Try to match what are being tested on x86 H100_PCIe. 
-// SBSA machines from the Blossom machine pool + // SBSA machines from the Blossom machine pool SBSATestConfigs = [ "GH200-TensorRT-Post-Merge-1": ["gh200", "l0_gh200", 1, 1], // DGX Spark is also named as GB10 Grace Blackwell Superchip. @@ -3328,13 +3318,13 @@ def launchTestJobs(pipeline, testFilter) // Disable GB300 stages due to nodes will be offline temporarily. // "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1], // "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4], - // Perf sanity pre merge test + // PerfSanity pre-merge tests "GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4], - // Perf sanity post merge test + // PerfSanity post-merge tests "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4], "GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4], @@ -3355,9 +3345,7 @@ def launchTestJobs(pipeline, testFilter) "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2], - // Perf sanity pre merge tests - // "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], - // Perf sanity post merge tests + // PerfSanity post-merge tests "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2], "GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2], "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3], @@ -3539,7 +3527,7 @@ def launchTestJobs(pipeline, testFilter) } if (checkPipStage) { - stage("Run LLMAPI tests") { + stage("Run LLMAPI Test") { pipInstallSanitySpec = createKubernetesPodConfig(values[5], gpu_type, k8s_arch) trtllm_utils.launchKubernetesPod(pipeline, pipInstallSanitySpec, "trt-llm", { echo "###### Prerequisites Start ######" @@ -3751,8 +3739,8 @@ def launchTestJobs(pipeline, testFilter) parallelJobsFiltered = parallelJobsFiltered.collectEntries { key, values -> [key, { stage(key) { if (key in testFilter[REUSE_STAGE_LIST]) { - stage("Skip - reused") { - echo "Skip - Passed in the last pipeline." + stage("Skip - Reused") { + echo "Skip - Passed in the previous pipelines." 
} } else if (values instanceof List) { trtllm_utils.launchKubernetesPod(pipeline, values[0], "trt-llm", { @@ -3876,7 +3864,7 @@ pipeline { OPEN_SEARCH_DB_CREDENTIALS=credentials("open_search_db_credentials") } stages { - stage("Setup environment") + stage("Setup Environment") { steps { @@ -3891,7 +3879,7 @@ pipeline { } } } - stage("Check Test Lists") + stage("Check Test List") { when { expression { diff --git a/jenkins/scripts/slurm_install.sh b/jenkins/scripts/slurm_install.sh index bd312180e7..cb1ec4bc83 100644 --- a/jenkins/scripts/slurm_install.sh +++ b/jenkins/scripts/slurm_install.sh @@ -12,7 +12,14 @@ slurm_install_setup() { cd $resourcePathNode llmSrcNode=$resourcePathNode/TensorRT-LLM/src + # Use unique lock file for this job ID + lock_file="install_lock_job_${SLURM_JOB_ID:-local}_node_${SLURM_NODEID:-0}.lock" + if [ $SLURM_LOCALID -eq 0 ]; then + if [ -f "$lock_file" ]; then + rm -f "$lock_file" + fi + retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName" which python3 python3 --version @@ -27,11 +34,11 @@ slurm_install_setup() { hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}" echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName" echo "(Writing install lock) Current directory: $(pwd)" - touch install_lock.lock + touch "$lock_file" else echo "(Waiting for install lock) Current directory: $(pwd)" - while [ ! -f install_lock.lock ]; do - sleep 5 + while [ ! -f "$lock_file" ]; do + sleep 10 done fi } diff --git a/jenkins/scripts/slurm_run.sh b/jenkins/scripts/slurm_run.sh index 584ae1e7c9..f319b4e976 100755 --- a/jenkins/scripts/slurm_run.sh +++ b/jenkins/scripts/slurm_run.sh @@ -64,8 +64,8 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest if [ $SLURM_PROCID -eq 0 ]; then sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile" else - # Sleep 10 seconds to wait for the coverage config file to be saved - sleep 10 + # Sleep 30 seconds to wait for the coverage config file to be saved + sleep 30 fi containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}') @@ -108,6 +108,25 @@ eval $pytestCommand pytest_exit_code=$? echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code" +# DEBUG: Diagnose intermittent "unrecognized arguments" failure (Exit Code 4) +# Remove this after the issue is resolved +if [ $pytest_exit_code -eq 4 ]; then + echo "DEBUG: Pytest failed with usage error (exit code 4)" + echo "DEBUG: Directory state at $(pwd):" + ls -l + echo "DEBUG: Directory state at $llmSrcNode/tests/integration/defs:" + ls -l $llmSrcNode/tests/integration/defs + + echo "DEBUG: conftest.py content:" + md5sum $llmSrcNode/tests/integration/defs/conftest.py + + echo "DEBUG: pytest.ini content:" + md5sum $llmSrcNode/tests/integration/defs/pytest.ini + + echo "DEBUG: Check importability of conftest.py" + python3 -c "import sys; sys.path.insert(0, '.'); import conftest; print('DEBUG: conftest imported successfully')" +fi + if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then if [[ "$stageName" == *PyTorch* ]]; then basePerfFilename="base_perf_pytorch.csv" @@ -136,11 +155,11 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then fi if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then - echo "Check Perf-Sanity Result" + echo "Check PerfSanity Result" python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \ $jobWorkspace perf_sanity_check_exit_code=$? 
- echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code" + echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code" fi if [ "$pytest_exit_code" -ne 0 ]; then diff --git a/scripts/check_test_list.py b/scripts/check_test_list.py index c799d433fc..ec610cddd4 100755 --- a/scripts/check_test_list.py +++ b/scripts/check_test_list.py @@ -15,6 +15,7 @@ Note: All the perf tests will be excluded since they are generated dynamically. """ import argparse +import glob import os import subprocess @@ -42,7 +43,13 @@ def verify_l0_test_lists(llm_src): test_list = f"{llm_src}/l0_test.txt" # Remove dynamically generated perf tests - subprocess.run(f"rm -f {test_db_path}/*perf*", shell=True, check=True) + # Exclude perf_sanity tests from being removed since they are different and statically defined + for file_path in glob.glob(os.path.join(test_db_path, "*perf*")): + if "perf_sanity" not in os.path.basename(file_path): + try: + os.remove(file_path) + except OSError: + pass subprocess.run( f"trt-test-db -d {test_db_path} --test-names --output {test_list}", shell=True, diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 1f187ce4e0..c06b0d18bc 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2689,8 +2689,7 @@ def get_gpu_memory_wo_pynvml(): import psutil logger.warning( - f"\nWarning: pynvml not available, using fallback commands for memory monitoring" - ) + f"pynvml not available, using fallback commands for memory monitoring") gpu_memory = {} system_total_mb = 0 diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 1ac827c014..fd5900dfd5 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -542,3 +542,4 @@ disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backen disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16] SKIP (https://nvbugs/5769890) disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-v3-8b-hf] SKIP (https://nvbugs/5769890,https://nvbugs/5748683) accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_pp4_mtp] SKIP (https://nvbugs/5779536) +perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] SKIP (https://nvbugs/5778381)