[None][infra] Some improvements for Slurm execution path in the CI (#10316)

Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Yanchao Lu 2025-12-29 19:49:44 +08:00 committed by GitHub
parent 9cee32ab39
commit 965578ca21
4 changed files with 133 additions and 24 deletions

View File

@@ -694,9 +694,9 @@ def runLLMTestlistWithAgent(pipeline, platform, testList, config=VANILLA_CONFIG,
}
slurmRunner = null
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
if (cluster.containerRuntime.toString() == "DOCKER") {
slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, true)
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
} else if (cluster.containerRuntime.toString() == "ENROOT") {
slurmRunner = runInEnrootOnNode(nodeName)
} else {
throw new Exception("Unsupported container runtime: ${cluster.containerRuntime}")
@@ -799,7 +799,7 @@ def getPytestBaseCommandLine(
"LLM_BACKEND_ROOT=${llmSrc}/triton_backend",
"LLM_MODELS_ROOT=${MODEL_CACHE_DIR}",
"MODEL_CACHE_DIR=${MODEL_CACHE_DIR}",
"COLUMNS=200",
"COLUMNS=400",
extraInternalEnv,
portEnvVars,
pytestUtil,
@@ -860,11 +860,11 @@ def getMountListForSlurmTest(SlurmCluster cluster, boolean useSbatch = false)
}
// data/cache mounts
if (cluster.containerRuntime == ContainerRuntime.DOCKER) {
if (cluster.containerRuntime.toString() == "DOCKER") {
mounts += [
"/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro",
]
} else if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
} else if (cluster.containerRuntime.toString() == "ENROOT") {
if (!cluster.scratchPath) {
throw new Exception("Scratch path is not set for cluster: ${cluster.name}")
}
@@ -922,6 +922,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptRunPathNode = "${jobWorkspace}/${jobUID}-slurm_run.sh"
def scriptInstallLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_install.sh"
def scriptInstallPathNode = "${jobWorkspace}/${jobUID}-slurm_install.sh"
def scriptBashUtilsLocalPath = "${llmSrcLocal}/jenkins/scripts/bash_utils.sh"
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
@@ -956,6 +958,14 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptInstallPathNode,
true
)
Utils.exec(pipeline, script: "echo \"Script for Bash utilities: \" && cat ${scriptBashUtilsLocalPath}")
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptBashUtilsLocalPath,
scriptBashUtilsPathNode,
true
)
// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
@@ -1040,7 +1050,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def containerImageArg = container
def srunPrologue = ""
if (cluster.containerRuntime == ContainerRuntime.ENROOT) {
if (cluster.containerRuntime.toString() == "ENROOT") {
def enrootImagePath = "${cluster.scratchPath}/users/svc_tensorrt/containers/container-\${SLURM_JOB_ID}.sqsh"
containerImageArg = enrootImagePath
@@ -1127,9 +1137,7 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
set -xEeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
echo "Starting job \$SLURM_JOB_ID on \$SLURM_NODELIST"
echo \$SLURM_JOB_ID > "$jobWorkspace/slurm_job_id.txt"
echo "Starting Slurm job \$SLURM_JOB_ID on \$SLURM_NODELIST"
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
@@ -1219,10 +1227,11 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
touch "${outputPath}"
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Job submission failed, no job ID returned."
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted job \$jobId"
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
tail -f ${outputPath} &
tailPid=\$!
# Wait until sbatch job is done.
@@ -1232,9 +1241,28 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
# Kill tail -f process
kill \$tailPid
# Check if the job failed or not
sleep 5
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
sleep 10
# Retry getting status and exit code as sacct might be delayed
for i in {1..3}; do
STATUS=\$(sacct -j \$jobId --format=State --noheader | head -n 1 | awk '{print \$1}')
EXIT_CODE=\$(sacct -j \$jobId --format=ExitCode -Pn --allocations | awk -F: '{print \$1}')
if [ -n "\$STATUS" ] && [ -n "\$EXIT_CODE" ]; then
break
fi
echo "Waiting for sacct to update... attempt \$i"
sleep 10
done
if [ -z "\$EXIT_CODE" ]; then
echo "Error: Failed to get exit code from sacct after retries, defaulting to 1."
EXIT_CODE=1
fi
if [ -z "\$STATUS" ]; then
echo "Error: Failed to get status from sacct after retries, defaulting to UNKNOWN."
STATUS="UNKNOWN"
fi
if [[ "\$STATUS" == "COMPLETED" && \$EXIT_CODE -eq 0 ]]; then
echo "Pytest succeed in Slurm job \$jobId"
echo "Status: \$STATUS | Exit_code \$EXIT_CODE"

View File

@@ -0,0 +1,45 @@
#!/bin/bash
# Retry a command with a specified number of retries and interval.
# Arguments:
# max_retries (optional): The maximum number of times to retry the command. Default: 3.
# interval (optional): The time in seconds to wait between retries. Default: 60.
# command: The command to run and its arguments.
# Usage:
# retry_command [max_retries] [interval] command...
# If only one numeric argument is provided, it is treated as max_retries.
function retry_command() {
    local max_retries=3
    local interval=60
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        max_retries=$1
        shift
    fi
    if [[ "$1" =~ ^[0-9]+$ ]]; then
        interval=$1
        shift
    fi
    local cmd=("$@")
    local count=0
    local rc=0
    while [ $count -lt $max_retries ]; do
        if "${cmd[@]}"; then
            return 0
        else
            # Capture the failing command's exit code here; after a plain `fi`
            # with no executed branch, $? would read back as 0.
            rc=$?
        fi
        count=$((count + 1))
        echo "Command failed with exit code $rc. Attempt $count/$max_retries."
        if [ $count -lt $max_retries ]; then
            echo "Retrying in $interval seconds..."
            sleep $interval
        fi
    done
    echo "Command failed after $max_retries attempts."
    return $rc
}
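
A brief usage sketch for retry_command; the commands, URL, and values below are illustrative only, not taken from the CI scripts:

    source jenkins/scripts/bash_utils.sh

    # Retry up to 5 times, sleeping 30 seconds between attempts.
    retry_command 5 30 wget -nv https://example.com/artifact.tar.gz

    # With no numeric arguments the defaults (3 retries, 60 s) apply; compound
    # commands go through `bash -c`, as slurm_install.sh does below.
    retry_command bash -c "apt-get update && apt-get install -y libffi-dev"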

View File

@@ -4,22 +4,25 @@
set -xEeuo pipefail
trap 'rc=$?; echo "Error in file ${BASH_SOURCE[0]} on line $LINENO: $BASH_COMMAND (exit $rc)"; exit $rc' ERR
# Source utilities
bashUtilsPath="$(dirname "${BASH_SOURCE[0]}")/$(basename "${BASH_SOURCE[0]}" | sed 's/slurm_install\.sh/bash_utils.sh/')"
source "$bashUtilsPath"
slurm_install_setup() {
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
if [ $SLURM_LOCALID -eq 0 ]; then
wget -nv $llmTarfile
tar -zxf $tarName
retry_command bash -c "wget -nv $llmTarfile && tar -zxf $tarName"
which python3
python3 --version
apt-get install -y libffi-dev
retry_command apt-get install -y libffi-dev
nvidia-smi && nvidia-smi -q && nvidia-smi topo -m
if [[ $pytestCommand == *--run-ray* ]]; then
pip3 install --retries 10 ray[default]
retry_command pip3 install --retries 10 ray[default]
fi
cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
retry_command bash -c "cd $llmSrcNode && pip3 install --retries 10 -r requirements-dev.txt"
retry_command bash -c "cd $resourcePathNode && pip3 install --retries 10 --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
hostNodeName="${HOST_NODE_NAME:-$(hostname -f || hostname)}"
echo "HOST_NODE_NAME = $hostNodeName ; GPU_UUIDS = $gpuUuids ; STAGE_NAME = $stageName"

View File

@@ -63,9 +63,10 @@ pytestCommand=$(set_value_in_command "TRTLLM_WHL_PATH" "$trtllmWhlPath" "$pytest
# Only the first process will save the coverage config file
if [ $SLURM_PROCID -eq 0 ]; then
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
else
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
fi
# Sleep 10 seconds to wait for the coverage config file to be saved
sleep 10
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed 's/[[:space:]]+/_/g')
@@ -95,8 +96,17 @@ echo "Full Command: $pytestCommand"
done
fi
# Turn off "exit on error" so the following lines always run
set +e
pytest_exit_code=0
perf_check_exit_code=0
perf_report_exit_code=0
perf_sanity_check_exit_code=0
eval $pytestCommand
echo "Rank${SLURM_PROCID} Pytest finished execution"
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [[ "$stageName" == *PyTorch* ]]; then
@@ -109,15 +119,38 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
python3 $llmSrcNode/tests/integration/defs/perf/sanity_perf_check.py \
$stageName/perf_script_test_results.csv \
$basePerfPath
echo "Check Perf Result"
perf_check_exit_code=$?
echo "Create Perf Report"
python3 $llmSrcNode/tests/integration/defs/perf/create_perf_comparison_report.py \
--output_path $stageName/report.pdf \
--files $stageName/perf_script_test_results.csv \
$basePerfPath
perf_report_exit_code=$?
echo "Rank${SLURM_PROCID} Perf report finished execution with exit code $perf_report_exit_code"
if [ "$perf_check_exit_code" -eq 0 ] && [ "$perf_report_exit_code" -ne 0 ]; then
perf_check_exit_code=$perf_report_exit_code
fi
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then
final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_check_exit_code
elif [ "$perf_sanity_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_sanity_check_exit_code
else
final_exit_code=0
fi
echo "Rank${SLURM_PROCID} Final Slurm run finished execution with exit code $final_exit_code"
exit $final_exit_code