[None][ci] Move remaining DGX-B200 tests to LBD (#9876)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-12-28 13:55:39 +08:00 committed by GitHub
parent c59aa8bec5
commit 270be801aa
6 changed files with 69 additions and 46 deletions


@@ -372,7 +372,7 @@ def buildImage(config, imageKeyToTag)
IMAGE_WITH_TAG=${imageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
""", sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
}
if (target == "ngc-release") {
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
@@ -726,7 +726,7 @@ pipeline {
cmd += "--image "
cmd += imageKeyToTag.values().join(" ")
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
trtllm_utils.llmExecStepWithRetry(this, script: cmd, numRetries: 6, shortCommondRunTimeMax: 7200)
trtllm_utils.llmExecStepWithRetry(this, script: cmd, sleepInSecs: 600, numRetries: 6, shortCommondRunTimeMax: 7200)
}
}
}
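
The two hunks above only tune retry behavior: the Docker image build step now retries up to 6 times instead of 2 (keeping its randomized sleep), and the NSPECT scan step gains a fixed 600-second pause between its 6 attempts. As a rough illustration of the retry-with-sleep pattern these options configure (the llmExecStepWithRetry helper itself is Groovy and not part of this diff; CMD, NUM_RETRIES, and SLEEP_SECS below are placeholder names, not its real internals):

#!/bin/bash
# Minimal sketch of retry-with-sleep; purely illustrative, not the actual Jenkins helper.
CMD='echo "pretend this is the build command"'   # placeholder command
NUM_RETRIES=6
SLEEP_SECS=600
attempt=1
until bash -c "$CMD"; do
    if [ "$attempt" -ge "$NUM_RETRIES" ]; then
        echo "Command failed after ${NUM_RETRIES} attempts" >&2
        exit 1
    fi
    echo "Attempt ${attempt} failed; sleeping ${SLEEP_SECS}s before retrying" >&2
    sleep "$SLEEP_SECS"
    attempt=$((attempt + 1))
done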


@@ -1241,7 +1241,8 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
def dockerBuildJob = [
"Build-Docker-Images": {
script {
stage("[Build-Docker-Images] Remote Run") {
def testStageName = "[Build-Docker-Images] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
def branch = env.gitlabBranch ? env.gitlabBranch : "main"
if (globalVars[GITHUB_PR_API_URL]) {
branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()


@@ -461,7 +461,7 @@ def cleanUpSlurmResources(def pipeline, SlurmCluster cluster, String jobUID){
def cleanupCommands = [
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
"rm -rf ${jobWorkspace} || true",
].join(" && ")
].join(" ; ")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
@@ -511,7 +511,7 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, St
def cleanupCommands = [
"rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-${entrypoint} || true",
"rm -rf ${cluster.scratchPath}/users/svc_tensorrt/containers/container-${slurmJobID}.sqsh || true",
].join(" && ")
].join(" ; ")
Utils.exec(
pipeline,
script: Utils.sshUserCmd(
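
Both cleanup hunks switch the command separator from "&&" to ";". With "&&" a non-zero exit from one step short-circuits the rest of the chain; with ";" every cleanup command runs regardless, which is presumably the intent here: even though each command already ends in "|| true", the ";" join removes any dependence on that guard. A two-line bash illustration (placeholder commands):

# With '&&' the second command is skipped when the first fails; with ';' it still runs.
false && echo "skipped when the previous command fails"
false ;  echo "runs regardless of the previous command's exit code"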
@@ -939,7 +939,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm srun job: \" && cat ${scriptRunLocalPath}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job to submit: \" && cat ${scriptRunLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@@ -948,7 +948,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
true
)
Utils.exec(pipeline, script: "echo \"Script to install environment: \" && cat ${scriptInstallLocalPath}")
Utils.exec(pipeline, script: "echo \"Script to install TensorRT LLM dependencies: \" && cat ${scriptInstallLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@@ -1093,7 +1093,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
srunArgs = [
"--container-name=multi_node_test-\${SLURM_JOB_ID}",
"--container-image=$containerImageArg",
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
"--container-workdir=$jobWorkspace",
"--container-mounts=$mounts",
"--container-env=NVIDIA_IMEX_CHANNELS"
]
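
The container working directory now points at the per-job workspace instead of the shared scripts directory, presumably so that files written relative to the working directory land in the job's workspace. These arguments are assembled into an srun call that uses the pyxis/enroot container plugin; a hedged sketch of the resulting invocation (image path, mounts, workspace, and launched script are placeholders):

# Approximate shape of the srun command built from the arguments above (pyxis/enroot flags).
# All concrete paths below are illustrative, not the cluster's real layout.
srun --container-name="multi_node_test-${SLURM_JOB_ID}" \
     --container-image="/scratch/containers/container-${SLURM_JOB_ID}.sqsh" \
     --container-workdir="/scratch/users/svc_tensorrt/job-workspace" \
     --container-mounts="/scratch:/scratch" \
     --container-env=NVIDIA_IMEX_CHANNELS \
     bash ./slurm_run.sh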
@@ -1115,16 +1115,21 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')
// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${outputPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${(partition?.name && partition.name != "unspecified") ? "#SBATCH --partition=${partition.name}" : ""}
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
set -Eeuo pipefail
# SBATCH directives must appear before any executable commands.
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
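
The rewritten sbatch prefix does three things: it keeps every #SBATCH directive ahead of any executable command (a hard requirement, since Slurm stops parsing directives at the first non-comment line), it enables shell tracing (-x) on top of -Eeuo pipefail with an ERR trap that reports the failing file, line, command, and exit code, and it records $SLURM_JOB_ID in $jobWorkspace/slurm_job_id.txt from inside the batch script so a later step can find and cancel the job. A standalone illustration of the same error-handling prologue (the failing command is artificial):

#!/bin/bash
# Same tracing + ERR-trap prologue as the sbatch prefix above; 'false' only exists to trigger it.
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
echo "doing work"
false   # prints e.g.: Error in file ./demo.sh on line 6: false (exit 1)
echo "never reached"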
@@ -1156,8 +1161,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
pipeline.writeFile(file: scriptLaunchPrefixPathLocal, text: scriptLaunchPrefix)
pipeline.writeFile(file: scriptLaunchSrunArgsPathLocal, text: srunArgs.join(" "))
Utils.exec(pipeline, script: "echo \"Script launch prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Srun args content: \" && cat ${scriptLaunchSrunArgsPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job prefix: \" && cat ${scriptLaunchPrefixPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm srun job args: \" && cat ${scriptLaunchSrunArgsPathLocal}")
// Output is the corresponding scriptLaunchPathLocal script under the disaggMode
sh """
@@ -1184,7 +1189,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
pipeline.writeFile(file: scriptLaunchPathLocal, text: scriptContent)
}
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm sbatch job: \" && cat ${scriptLaunchPathLocal}")
Utils.exec(pipeline, script: "echo \"Script for Slurm sbatch job to submit: \" && cat ${scriptLaunchPathLocal}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
@@ -1194,9 +1199,24 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
)
def scriptExec = """#!/bin/bash
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
touch ${outputPath}
# Clean up previous job intermediate files so that retry can work
if [ -f "${jobWorkspace}/slurm_job_id.txt" ]; then
previous_job_id=\$(cat "${jobWorkspace}/slurm_job_id.txt")
echo "Found previous Slurm job ID: \${previous_job_id}"
scancel "\${previous_job_id}" || true
rm -rf "${jobWorkspace}/slurm_job_id.txt"
# Wait for 60 seconds to ensure the previous job is canceled
sleep 60
fi
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"
touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Job submission failed, no job ID returned."
@@ -1460,7 +1480,8 @@ def cacheErrorAndUploadResult(stageName, taskRunner, finallyRunner, noResultIfSu
if (stageIsInterrupted) {
echo "Stage is interrupted, skip to upload test result."
} else {
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
// Temporarily disable to reduce the log size
// sh 'if [ "$(id -u)" -eq 0 ]; then dmesg || true; fi'
if (noResultIfSuccess && !stageIsFailed) {
// Clean up the workspace
sh """
@@ -2603,7 +2624,8 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
def containerPortNum = GlobalState.PORT_SECTION_SIZE
// Some clusters do not allow dmesg -C so we add || true
sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
// Temporarily disable to reduce the log size
// sh 'if [ "$(id -u)" -eq 0 ]; then dmesg -C || true; fi'
def pytestCommand = getPytestBaseCommandLine(
llmSrc,
stageName,
@@ -3124,11 +3146,11 @@ def launchTestJobs(pipeline, testFilter)
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Ray-1": ["dgx-h100-x4-oci", "l0_dgx_h100", 1, 1, 4],
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4", "l0_dgx_b200", 1, 1, 4],
"DGX_B200-4_GPUs-PyTorch-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Ray-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 1, 4, 1, true],
"DGX_B200-8_GPUs-PyTorch-1": ["b200-x8-lbd", "l0_dgx_b200", 1, 1, 8, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-trtllm", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-trtllm", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["b200-x4-lbd", "l0_dgx_b200", 1, 2, 4, 1, true],
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["b200-x4-lbd", "l0_dgx_b200", 2, 2, 4, 1, true],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test


@@ -1,7 +1,7 @@
#!/bin/bash
# Set up error handling
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
slurm_install_setup() {
@@ -23,8 +23,10 @@ slurm_install_setup() {
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
echo "(Writing install lock) Current directory: $(pwd)"
touch install_lock.lock
else
echo "(Waiting for install lock) Current directory: $(pwd)"
while [ ! -f install_lock.lock ]; do
sleep 5
done
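
The extra echo lines only log the working directory on each side of the install lock, but they highlight the handshake this script relies on: one rank performs the per-node setup and then creates install_lock.lock, while every other rank polls until the lock appears. A condensed, self-contained sketch of that pattern (the gating condition is not visible in this hunk, so the SLURM_PROCID check and the setup body below are assumptions):

#!/bin/bash
# Condensed lock-file handshake: one rank installs, the others wait for the lock file.
# The rank-0 check and the setup body are illustrative; the real condition lives outside this hunk.
if [ "${SLURM_PROCID:-0}" -eq 0 ]; then
    echo "(Writing install lock) Current directory: $(pwd)"
    # ... one-time environment setup would go here (placeholder) ...
    touch install_lock.lock
else
    echo "(Waiting for install lock) Current directory: $(pwd)"
    while [ ! -f install_lock.lock ]; do
        sleep 5
    done
fi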


@@ -1,7 +1,7 @@
#!/bin/bash
# Set up error handling
set -Eeuo pipefail
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
cd $resourcePathNode
@@ -29,10 +29,8 @@ set_value_in_command() {
echo "$result"
}
# Only the first process will save the job ID and set the git config
# Only the first process will set the git config
if [ $SLURM_PROCID -eq 0 ]; then
# Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo $SLURM_JOB_ID > $jobWorkspace/slurm_job_id.txt
# Update HOME/.gitconfig
if ! git config --global --get-all safe.directory | grep -Fxq "*"; then
git config --global --add safe.directory "*"


@@ -24,7 +24,7 @@ l0_dgx_b200:
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_trtllm-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4]
@@ -66,17 +66,17 @@ l0_dgx_b200:
backend: pytorch
orchestrator: mpi
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (180)
- accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (360)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_mtp] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_bs8_mtp] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput_mtp_trtllm] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_mtp1] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_mtp1] TIMEOUT (60)
- accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] TIMEOUT (60)
- condition:
ranges:
system_gpu_count:
@@ -92,15 +92,15 @@ l0_dgx_b200:
backend: pytorch
orchestrator: mpi
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (180)
- accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (90)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus_corner_case TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[baseline_fp8kv] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[latency] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[skip_indexer] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[baseline_fp8kv] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[latency] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus[skip_indexer] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_nvfp4_multi_gpus_chunked_prefill[baseline_fp8kv] TIMEOUT (60)
- accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_fp8[latency_moe_deepgemm] TIMEOUT (60)
- condition:
ranges:
system_gpu_count: