[None][infra] add retry logic to get slurm sbatch job log when ssh dropped (#9167)
Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
Parent: 0d1f5ad7a2
Commit: 5bd37ce41e
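In outline, the change replaces the single long-lived SSH session that used to submit the job and tail its log (the old scriptExec) with three small remote scripts: one that submits and records the job ID, one that tails the sbatch log while polling sacct, and one that only prints the current job state. The pipeline then drives a polling loop over short SSH calls, so a dropped connection loses at most one call that can be re-issued. A minimal sketch of that control flow, condensed from the hunks below (same Utils helpers and script-path variables as in the diff; not the exact pipeline code):

// Illustrative sketch of the new control flow, not the exact pipeline code.
// Submit once; the submit script is safe to retry because it cleans stale files first.
Utils.exec(pipeline, timeout: false, numRetries: 3,
    script: Utils.sshUserCmd(remote, scriptSubmitPathNode))

// Poll until sacct reports a terminal state for the recorded job ID.
while (true) {
    def state = Utils.exec(pipeline, returnStdout: true,
        script: Utils.sshUserCmd(remote, scriptStatusPathNode)).trim()
    if (!state || state in ["RUNNING", "PENDING", "CONFIGURING"]) {
        // Re-attach to the sbatch log; if this SSH call drops,
        // the next loop iteration simply re-attaches.
        Utils.exec(pipeline, timeout: false,
            script: Utils.sshUserCmd(remote, scriptTrackPathNode))
    } else {
        break
    }
}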
@@ -933,11 +933,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptBashUtilsPathNode = "${jobWorkspace}/${jobUID}-bash_utils.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def waivesListPathNode = "${jobWorkspace}/waives.txt"
def outputPath = "${jobWorkspace}/job-output.log"
def sbatchLogPath = "${jobWorkspace}/job-output.log"
def scriptLaunchPathLocal = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptLaunchPathNode = "${jobWorkspace}/${jobUID}-slurm_launch.sh"
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def scriptSubmitPathLocal = Utils.createTempLocation(pipeline, "./slurm_submit.sh")
def scriptSubmitPathNode = "${jobWorkspace}/${jobUID}-slurm_submit.sh"
def scriptTrackPathLocal = Utils.createTempLocation(pipeline, "./slurm_track.sh")
def scriptTrackPathNode = "${jobWorkspace}/${jobUID}-slurm_track.sh"
def scriptStatusPathLocal = Utils.createTempLocation(pipeline, "./slurm_status.sh")
def scriptStatusPathNode = "${jobWorkspace}/${jobUID}-slurm_status.sh"
def isAarch64 = config.contains("aarch64")
def coverageConfigFile = "${jobWorkspace}/.coveragerc"

stage("[${stageName}] Initializing Test") {
@@ -1133,10 +1138,9 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
"export ${varName}=\"${escapedValue}\""
}.join('\n')

// Save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
def scriptLaunchPrefix = """#!/bin/bash
#SBATCH ${exemptionComment}
#SBATCH --output=${outputPath}
#SBATCH --output=${sbatchLogPath}
${taskArgs.collect { "#SBATCH $it" }.join('\n')}
#SBATCH ${partition.additionalArgs}
${partition?.time ? "#SBATCH --time=${partition.time}" : "#SBATCH --time=${SlurmConfig.DEFAULT_TIMEOUT_SHORT}"}
@@ -1214,9 +1218,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
scriptLaunchPathNode,
true
)

def scriptExec = """#!/bin/bash
set -xEeuo pipefail
def scriptSubmit = """#!/bin/bash
set -Eeuo pipefail
trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR

# Clean up previous job intermediate files so that retry can work
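The new scriptSubmit drops the -x tracing that scriptExec used but keeps -Eeuo pipefail and adds an ERR trap; because of -E the trap also fires inside functions and subshells, so a failing command reports its file, line, command, and exit code before the script exits, which is what surfaces in the Jenkins log when a submit retry fires. A self-contained sketch of that prologue as the pipeline would assemble it (the variable name is illustrative):

// Illustrative only: strict-mode + ERR-trap prologue like the one in scriptSubmit.
// In a Groovy GString, \$ leaves the dollar sign for bash instead of interpolating.
def bashStrictPrologue = """#!/bin/bash
    set -Eeuo pipefail
    trap 'rc=\$?; echo "Error in file \${BASH_SOURCE[0]} on line \$LINENO: \$BASH_COMMAND (exit \$rc)"; exit \$rc' ERR
""".replaceAll("(?m)^\\s*", "").trim()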
@@ -1231,21 +1234,60 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
rm -rf "${jobWorkspace}/results.xml"
rm -rf "${jobWorkspace}/report.csv"
rm -rf "${jobWorkspace}/unfinished_test.txt"
rm -rf "${outputPath}"
rm -rf "${sbatchLogPath}"

touch "${outputPath}"
touch ${sbatchLogPath}
jobId=\$(sbatch ${scriptLaunchPathNode} | awk '{print \$4}')
if [ -z "\$jobId" ]; then
echo "Error: Slurm job submission failed, no job ID returned."
exit 1
fi
echo "Submitted Slurm job \$jobId"
echo "\$jobId" > "${jobWorkspace}/slurm_job_id.txt"
tail -f ${outputPath} &
# save job ID in $jobWorkspace/slurm_job_id.txt for later job to retrieve
echo \$jobId > $jobWorkspace/slurm_job_id.txt
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptSubmitPathLocal, text: scriptSubmit)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptSubmitPathLocal,
scriptSubmitPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
// Submit the sbatch job
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptSubmitPathNode
),
numRetries: 3
)
def sbatchJobId = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
"cat $jobWorkspace/slurm_job_id.txt"
)
).trim()
def scriptTrack = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
tail -f ${sbatchLogPath} &
tailPid=\$!
# Wait until sbatch job is done.
while squeue -j \$jobId -o %T >/dev/null 2>&1; do
sleep 300
while true; do
state=\$(sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}')
if [[ -z \$state || \$state == "RUNNING" || \$state == "PENDING" || \$state == "CONFIGURING" ]]; then
echo "job is still running"
sleep 300
else
echo "Job \$jobId finished with state: \$state"
break
fi
done
# Kill tail -f process
kill \$tailPid
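Two details in the submit and track scripts above are easy to miss: sbatch prints "Submitted batch job <id>", which is why the job ID is taken from the fourth awk field, and sacct lists step rows such as <id>.batch alongside the job itself, which is why the state query filters on JobIDRaw (the empty-string concatenation in the awk pattern forces a string comparison). A hedged sketch of how that state query could be built as a reusable Groovy helper (the closure name is hypothetical, not part of the change):

// Hypothetical helper mirroring the state query used by scriptTrack/scriptStatus.
// It prints a single state word (e.g. COMPLETED, FAILED) or nothing if accounting
// has no record for the job yet.
def sacctStateCmd = { String jobIdFile ->
    "jobId=\$(cat ${jobIdFile}) && " +
    "sacct -j \$jobId --format=JobIDRaw,State --noheader | " +
    "awk -v jobId=\$jobId '\"\"\$1\"\" == jobId {print \$2}'"
}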
@@ -1282,28 +1324,55 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
exit 1
fi
""".replaceAll("(?m)^\\s*", "").trim()
pipeline.writeFile(file: scriptExecPathLocal, text: scriptExec)
Utils.exec(pipeline, script: "echo \"Script to trigger Slurm submission job: \" && cat ${scriptExecPathLocal}")
pipeline.writeFile(file: scriptTrackPathLocal, text: scriptTrack)
Utils.copyFileToRemoteHost(
pipeline,
remote,
scriptExecPathLocal,
scriptExecPathNode,
scriptTrackPathLocal,
scriptTrackPathNode,
true
)
}
stage("[${stageName}] Run Pytest") {
Utils.exec(
def scriptStatus = """#!/bin/bash
jobId=\$(cat $jobWorkspace/slurm_job_id.txt)
sacct -j \$jobId --format=JobIDRaw,State --noheader | awk -v jobId=\$jobId '""\$1"" == jobId {print \$2}'
"""
pipeline.writeFile(file: scriptStatusPathLocal, text: scriptStatus)
Utils.copyFileToRemoteHost(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
"\"${scriptExecPathNode}\""
),
numRetries: 3
remote,
scriptStatusPathLocal,
scriptStatusPathNode,
true
)
}

sh "cat $scriptStatusPathLocal"
while (true) {
// Check if the job is done by running sacct via SSH
def result = Utils.exec(
pipeline,
returnStdout: true,
script: Utils.sshUserCmd(
remote,
scriptStatusPathNode
)
).trim()
if (!result || result == "RUNNING" || result == "PENDING" || result == "CONFIGURING") {
echo "Slurm job $sbatchJobId is still running, pulling the job log."
// Pulling the sbatch output log
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
scriptTrackPathNode
)
)
} else {
echo "Slurm job $sbatchJobId is done."
break
}
}
}
echo "Finished test stage execution."
}
} finally {
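Note that both the bash poll in scriptTrack and the Groovy loop above treat an empty sacct answer the same as a running job; right after submission sacct may not have a record yet, and classifying that window as "still in progress" avoids declaring the job finished prematurely. A small sketch of that classification (the helper name is hypothetical):

// Hypothetical helper matching the checks used above: anything other than a known
// in-progress answer is treated as a terminal state.
def stillInProgress = { String state ->
    !state || state in ["RUNNING", "PENDING", "CONFIGURING"]
}
assert stillInProgress("")          // no sacct record yet -> keep waiting
assert stillInProgress("PENDING")
assert !stillInProgress("COMPLETED")
assert !stillInProgress("FAILED")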