[None][ci] Some improvements for Slurm CI (#7689)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Authored by Yanchao Lu on 2025-09-14 16:56:32 +08:00; committed by GitHub
parent 1f43854496
commit 89fc136972
4 changed files with 201 additions and 38 deletions

View File

@@ -283,7 +283,7 @@ def buildImage(config, imageKeyToTag)
sh "git config --global --add safe.directory '*'"
withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
sh "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}"
trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
}
withCredentials([
@@ -294,7 +294,7 @@ def buildImage(config, imageKeyToTag)
),
string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
]) {
sh "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}"
trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
}
}
def containerGenFailure = null
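Both registry logins in buildImage are now wrapped in trtllm_utils.llmExecStepWithRetry, the pipeline's retry helper, so a transient failure of docker login against urm.nvidia.com or ${DEFAULT_GIT_URL}:5005 no longer fails the build outright. The helper's implementation is not part of this diff; a rough, hypothetical shell sketch of the same retry idea (the attempt count, delay, and the REGISTRY/USERNAME/PASSWORD placeholders are illustrative, not the helper's actual behavior):

# Hypothetical retry loop around docker login; all values are placeholders.
max_attempts=3
attempt=1
until docker login "$REGISTRY" -u "$USERNAME" -p "$PASSWORD"; do
    if [ "$attempt" -ge "$max_attempts" ]; then
        echo "docker login failed after $max_attempts attempts" >&2
        exit 1
    fi
    sleep $((attempt * 10))   # simple linear backoff between attempts
    attempt=$((attempt + 1))
done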

View File

@@ -105,24 +105,28 @@ REQUIRED_NO_DRIVER_TYPES = ["dgx-h100", "dgx-h200", "gh200"]
ENABLE_NGC_DEVEL_IMAGE_TEST = params.enableNgcDevelImageTest ?: false
ENABLE_NGC_RELEASE_IMAGE_TEST = params.enableNgcReleaseImageTest ?: false
COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o ServerAliveInterval=60 -o ServerAliveCountMax=5"
COMMON_SSH_OPTIONS = "-o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null -o TCPKeepAlive=no -o ServerAliveInterval=30 -o ServerAliveCountMax=20"
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
def downloadSucceed = false
pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p ${COMMON_SSH_OPTIONS} ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
@@ -136,8 +140,9 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
println("No results xml to submit")
}
}
if (downloadSucceed) {
junit(testResults: "${stageName}/results*.xml")
junit(allowEmptyResults: true, testResults: "${stageName}/results*.xml")
}
}
}
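Each SSH/scp step in this file now connects through SlurmConfig.getRandomLoginNode(cluster.host) rather than the fixed cluster.ip, spreading CI traffic across the cluster's login nodes. The helper itself is not shown in this diff; presumably it maps the cluster host entry to its list of login nodes and picks one at random. A rough shell analog of that idea, with a hypothetical comma-separated LOGIN_NODES list:

# Hypothetical analog: pick one login node at random from a comma-separated list.
LOGIN_NODES="login01.example.com,login02.example.com,login03.example.com"
login_node=$(echo "$LOGIN_NODES" | tr ',' '\n' | shuf -n 1)
echo "Using login node: $login_node"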
@@ -145,9 +150,10 @@ def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String st
//TODO: consolidate slurm related code for both multi nodes and single nodes
def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID, String slurmOutputFile) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -207,9 +213,10 @@ def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jo
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName, String slurmJobID) {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -290,13 +297,15 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
def nodeSecret = CloudManager.createNode(nodeName, customWorkspace)
def slurmJobID = null
def dockerArgs = null
try {
// Run ssh command to start node in desired cluster via SLURM
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -314,6 +323,8 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
Utils.exec(pipeline, script: "cat ${jenkinsSetupPath}")
Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
def slurmSubmitOutput = Utils.exec(
pipeline,
timeout: false,
@@ -353,9 +364,10 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
stage('Checking if the Node is Online') {
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -373,8 +385,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
if (CloudManager.isNodeOnline(nodeName)) {
def dockerGPUOption = ""
node(nodeName) {
sh """
env | sort
@@ -393,7 +403,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
// Dynamically set GPU arguments based on environment variables
// https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/docker-specialized.html
// It's intentional to check NV_GPU first.
dockerGPUOption = sh(script: """
dockerArgs = sh(script: """
if [ -n "\$NV_GPU" ]; then
echo "--gpus '\\"device=\$NV_GPU\\"'"
elif [ -n "\$CUDA_VISIBLE_DEVICES" ]; then
@@ -404,7 +414,7 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
""", returnStdout: true).trim()
}
def dockerArgs = "${dockerGPUOption} " +
dockerArgs = "${dockerArgs} " +
"--cap-add=SYS_ADMIN " +
"--ipc=host " +
"--entrypoint=\"\" " +
@@ -415,18 +425,17 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
"-v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw " +
"--cap-add=SYSLOG"
echo "Final dockerArgs: ${dockerArgs}"
if (partition.clusterName == "dlcluster") {
dockerArgs += " -e NVIDIA_IMEX_CHANNELS=0"
}
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
echo "Final dockerArgs: ${dockerArgs}"
} else {
error "The Slurm node does not come online in the waiting period. Terminating the job."
}
}
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
} finally {
stage("Clean up SLURM Resources") {
// Workaround to handle interruption while cleaning up SLURM resources
@@ -473,9 +482,10 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
passwordVariable: 'PASSWORD'
)
]) {
def randomLoginNode = SlurmConfig.getRandomLoginNode(cluster.host)
def remote = [
ip : cluster.ip,
host : cluster.host,
ip : randomLoginNode,
host : randomLoginNode,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
@@ -545,7 +555,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
set -o pipefail
set -Eeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
@@ -571,6 +582,8 @@ def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILL
}
stage('Run Test') {
Utils.exec(pipeline, script: "echo Sleeping before Slurm job submission; sleep \$((RANDOM % 29 + 1))")
Utils.exec(
pipeline,
timeout: false,
@@ -1940,14 +1953,18 @@ def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
stage('Pull Docker Image') {
docker.image(image).pull()
}
docker.image(image).inside(dockerArgs) {
runner()
// We submit the Slurm job with a timeout of SlurmConfig.DEFAULT_TIMEOUT minutes (300).
// The timeout here keeps the stage from hanging if the Slurm job gets stuck.
timeout(time: SlurmConfig.DEFAULT_TIMEOUT, unit: 'MINUTES') {
docker.image(image).inside(dockerArgs) {
runner()
}
}
} catch (Exception e) {
if (e.getMessage()?.contains("Failed to kill container")) {
echo "Known benign error ignored: ${e.getMessage()}"
} else {
throw e // Re-throw if it's a different IOException
throw e // Re-throw if it's a different Exception
}
}
}
@@ -2128,10 +2145,11 @@ def launchTestJobs(pipeline, testFilter)
multiNodesSBSAConfigs = [
// Each stage tests 1 testcase with 8 GPUs and 2 nodes.
"GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
// Disable GB200 multi-node testing in L0 pre-merge until the configuration issue is resolved (https://nvbugs/5455140)
// "GB200-8_GPUs-2_Nodes-PyTorch-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 4, 8, 2],
// "GB200-8_GPUs-2_Nodes-PyTorch-4": ["gb200-multi-node", "l0_gb200_multi_nodes", 4, 4, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-multi-node", "l0_gb200_multi_nodes", 2, 5, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-multi-node", "l0_gb200_multi_nodes", 3, 5, 8, 2],

View File

@@ -1,4 +1,9 @@
#!/bin/bash
# Set up error handling
set -Eeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
@@ -27,21 +32,25 @@ if [ $SLURM_LOCALID -eq 0 ]; then
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"
touch install_lock.lock
else
while [ ! -f install_lock.lock ]; do
sleep 5
done
fi
testList="$testList_$splitId"
export CPP_TEST_TIMEOUT_OVERRIDDEN=$pytestTestTimeout
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy
# TODO: Move back to tensorrt_llm/llmapi/trtllm-llmapi-launch later
llmapiLaunchScript="$llmSrcNode/jenkins/scripts/trtllm-llmapi-launch"
chmod +x $llmapiLaunchScript
cd $llmSrcNode/tests/integration/defs
testCmdLines=(
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
"$llmapiLaunchScript"
"pytest"
"-v"
"--timeout-method=thread"
@@ -88,6 +97,13 @@ echo "Library Path:"
echo "$LD_LIBRARY_PATH"
env | sort
fullCmd="${testCmdLines[*]}"
echo "Running: $testCase"
echo "Full Command: $fullCmd"
# Turn off "exit on error" so the following lines always run
set +e
trap - ERR
eval $fullCmd
exitCode=$?
echo "Pytest exit code: $exitCode"
exit $exitCode
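The test-run script above now starts in strict mode (set -Eeuo pipefail plus an ERR trap that reports the failing file, line, and command) and deliberately relaxes it with set +e and trap - ERR right before launching pytest, so the exit code can be captured, logged, and propagated instead of aborting at the first nonzero status. A minimal, self-contained sketch of that pattern, with a placeholder command standing in for the real test invocation:

#!/bin/bash
# Strict mode: -E keeps the ERR trap active in functions and subshells, -e exits
# on error, -u flags unset variables, -o pipefail fails a pipeline on any stage.
set -Eeuo pipefail
trap 'rc=$?; echo "Error in ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR

echo "setup steps run under strict mode"

# Relax strict mode around the one command whose exit code we want to inspect.
set +e
trap - ERR
false   # placeholder for the real test command
exitCode=$?
echo "captured exit code: $exitCode"
exit $exitCode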

View File

@@ -0,0 +1,129 @@
#!/bin/bash
set -Eeo pipefail
task_with_command=("$@")
native_mpi_rank=$OMPI_COMM_WORLD_RANK
mpi_rank=${SLURM_PROCID:-${OMPI_COMM_WORLD_RANK:-${PMI_RANK:-${PMI_ID:-0}}}}
log_stderr() { echo -e "\033[33m$@\033[0m" >&2; }
log_stderr "mpi_rank: $mpi_rank"
pid=$(ps -o pid= -p $$ | tr -d ' ')
# Tell TRTLLM to spawn an additional process for the Proxy
export TLLM_SPAWN_PROXY_PROCESS=1
function mpi_world_size {
if [ -n "$SLURM_NTASKS" ]; then
echo "$SLURM_NTASKS"
elif [ -n "$OMPI_COMM_WORLD_SIZE" ]; then
echo "$OMPI_COMM_WORLD_SIZE"
else
echo "1"
fi
}
function export_free_tcp_addr_for_spawn_proxy_process {
# find free port starting from 10012
local free_port=$(python -c 'import socket; s=socket.socket();
port = 10012
while True:
    try:
        s.bind(("", port))
        break
    except OSError:
        port += 1
print(port); s.close()')
export TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR="tcp://127.0.0.1:${free_port}"
log_stderr "TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR: $TLLM_SPAWN_PROXY_PROCESS_IPC_ADDR"
export TLLM_SPAWN_PROXY_PROCESS_IPC_HMAC_KEY=$(openssl rand -hex 32)
}
export tllm_mpi_size=$(mpi_world_size)
log_stderr "tllm_mpi_size: $tllm_mpi_size"
export_free_tcp_addr_for_spawn_proxy_process
if [ -z "$mpi_rank" ] || [ "$mpi_rank" -eq 0 ]; then
log_stderr "rank${mpi_rank} run ${task_with_command[@]} in background"
# MPI doesn't allow spawning a process that shares the MPI environment from
# inside an MPI process; a duplicate MPI_Init in the child process would cause
# undefined behavior. Thus we need to clean the MPI environment in the parent
# process before spawning the child process, and restore the MPI environment
# later before running MPI operations in the parent process.
mpi_blacklist=(
OMPI_ PMIX_ PMI_ SLURM_ MPI_ UCX_
I_MPI_ HYDRA_ KMP_ MPICH_ MV2_ CRAY_
)
(
# Remove MPI-related variables only in the subshell context
for var in $(compgen -e); do
for prefix in "${mpi_blacklist[@]}"; do
if [[ "$var" == "$prefix"* ]]; then
unset "$var"
break
fi
done
done
# Turn off "exit on error" so the following lines always run
set +e
# Execute the task with cleaned environment
"${task_with_command[@]}"
task_exit_code=$?
echo "Task exit code: $task_exit_code"
# Stop the MPI Comm server
python3 -m tensorrt_llm.llmapi.mgmn_leader_node --action stop
mpi_exit_code=$?
echo "MPI Comm server exit code: $mpi_exit_code"
# Propagate task exit status
if [ $task_exit_code -ne 0 ]; then
exit $task_exit_code
else
exit $mpi_exit_code
fi
) 1>&2 &
# Turn off "exit on error" so the following lines always run
set +e
# Capture subshell PID
subshell_pid=$!
echo "Subshell PID: $subshell_pid"
log_stderr "rank${mpi_rank} run mgmn leader node with mpi_world_size: $(mpi_world_size) ..."
log_stderr "rank0 host: $HOSTNAME"
python3 -m tensorrt_llm.llmapi.mgmn_leader_node
mgmn_leader_node_exit_code=$?
echo "MGMN leader node exit code: $mgmn_leader_node_exit_code"
# Wait for subshell
wait $subshell_pid
# This is subshell's exit code
subshell_exit_code=$?
echo "Subshell exit code: $subshell_exit_code"
# Propagate subshell exit status
if [ $subshell_exit_code -ne 0 ]; then
exit $subshell_exit_code
else
exit $mgmn_leader_node_exit_code
fi
else
# Turn off "exit on error" so the following lines always run
set +e
log_stderr "rank${mpi_rank} run mgmn worker node with mpi_world_size: $(mpi_world_size) ..."
python3 -m tensorrt_llm.llmapi.mgmn_worker_node
mgmn_worker_node_exit_code=$?
echo "MGMN worker node exit code: $mgmn_worker_node_exit_code"
exit $mgmn_worker_node_exit_code
fi
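The central trick in this launcher is running the user command in a subshell whose environment has been scrubbed of MPI/Slurm-related variables, so the spawned process does not inherit the parent's MPI context and trigger a duplicate MPI_Init, while the parent keeps its own environment for the mgmn leader/worker modules; each branch then propagates the relevant exit code. A stripped-down, hypothetical sketch of just that scrub-and-propagate pattern (the prefix list mirrors the blacklist above; the command is whatever the caller passes in):

#!/bin/bash
# Minimal sketch: run "$@" in a subshell with MPI/Slurm-prefixed variables
# removed, then propagate its exit code to the caller.
mpi_blacklist=(OMPI_ PMIX_ PMI_ SLURM_ MPI_ UCX_ I_MPI_ HYDRA_ KMP_ MPICH_ MV2_ CRAY_)
(
    # Unset matching exported variables only inside this subshell; the parent's
    # environment is left untouched.
    for var in $(compgen -e); do
        for prefix in "${mpi_blacklist[@]}"; do
            if [[ "$var" == "$prefix"* ]]; then
                unset "$var"
                break
            fi
        done
    done
    exec "$@"   # replace the subshell with the cleaned-environment command
)
exit_code=$?
echo "command exited with $exit_code"
exit $exit_code

For example, running the sketch as ./scrub_mpi_env.sh env inside a Slurm allocation would print an environment with no SLURM_* or OMPI_* variables (scrub_mpi_env.sh is a hypothetical file name).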