[feat] Multi-node CI testing support via Slurm (#4771)

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
Signed-off-by: yuanjingx87 <197832395+yuanjingx87@users.noreply.github.com>
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
Commit a1c5704055 (parent e5ee5c5352), authored by yuanjingx87 on 2025-06-18 10:11:12 -07:00, committed via GitHub.
3 changed files with 386 additions and 44 deletions


@@ -91,6 +91,63 @@ TESTER_MEMORY = "96Gi"
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
def uploadResults(def pipeline, SlurmCluster cluster, String nodeName, String stageName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def remote = [
ip : cluster.ip,
host : cluster.host,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
pipeline.stage('Submit Test Results') {
sh "mkdir -p ${stageName}"
def resultsFilePath = "/home/svc_tensorrt/bloom/scripts/${nodeName}/results/results.xml"
def downloadResultCmd = "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${remote.user}@${remote.host}:${resultsFilePath} ${stageName}/"
def downloadSucceed = sh(script: downloadResultCmd, returnStatus: true) == 0
if (downloadSucceed) {
sh "ls ${stageName}"
echo "Upload test results."
sh "tar -czvf results-${stageName}.tar.gz ${stageName}/"
trtllm_utils.uploadArtifacts(
"results-${stageName}.tar.gz",
"${UPLOAD_PATH}/test-results/"
)
junit(testResults: "${stageName}/results*.xml")
} else {
println("No results xml to submit")
}
}
}
}
// TODO: consolidate the Slurm-related code shared by the multi-node and single-node paths
def cleanUpNodeResourcesMultiNodes(def pipeline, SlurmCluster cluster, String jobUID){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def remote = [
ip : cluster.ip,
host : cluster.host,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
pipeline.stage('Clean up SLURM Agent Resources') {
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
"rm -rf /home/svc_tensorrt/bloom/scripts/${jobUID}"
)
)
}
}
}
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
def remote = [
@@ -98,7 +155,6 @@ def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
host : cluster.host,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
password : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
@@ -164,7 +220,6 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
host : cluster.host,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
password : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
@@ -211,6 +266,133 @@ def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, p
}
}
def getNodeArgs(int nodeCount, int gpuCount) {
int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
return [
"--nodes=${nodeCount}",
"--ntasks=${gpuCount}",
"--ntasks-per-node=${gpusPerNode}",
"--gpus-per-node=${gpusPerNode}",
].join(" ")
}
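A minimal standalone sketch (plain Groovy, outside Jenkins) of the arithmetic in getNodeArgs() above, assuming the node/GPU counts of the GB200 stage added further down: GPUs are spread evenly across nodes, rounding up.
// Sketch only: mirrors getNodeArgs() with nodeCount=2, gpuCount=8.
int nodeCount = 2
int gpuCount = 8
int gpusPerNode = ((gpuCount / nodeCount) as BigDecimal).setScale(0, BigDecimal.ROUND_CEILING).intValue()
def nodeArgs = [
    "--nodes=${nodeCount}",
    "--ntasks=${gpuCount}",
    "--ntasks-per-node=${gpusPerNode}",
    "--gpus-per-node=${gpusPerNode}",
].join(" ")
assert nodeArgs == "--nodes=2 --ntasks=8 --ntasks-per-node=4 --gpus-per-node=4"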
def runLLMTestlistOnSlurm_MultiNodes(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, gpuCount=1, nodeCount=2, skipInstallWheel=false, cpver="cp312")
{
SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]
def jobUID = "${cluster.host}-multi_node_test-${UUID.randomUUID().toString()}"
try {
// Run an SSH command to start a node in the desired cluster via Slurm
withCredentials([
usernamePassword(
credentialsId: 'svc_tensorrt',
usernameVariable: 'USERNAME',
passwordVariable: 'PASSWORD'
)
]) {
def remote = [
ip : cluster.ip,
host : cluster.host,
user : "${pipeline.USERNAME}",
passwd : "${pipeline.PASSWORD}",
allowAnyHosts: true,
]
Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
def tarName = BUILD_CONFIGS[config][TARNAME]
def llmTarfile = "https://urm.nvidia.com/artifactory/${ARTIFACT_PATH}/${tarName}"
def llmPath = sh (script: "realpath .", returnStdout: true).trim()
def jobWorkspace = "/home/svc_tensorrt/bloom/scripts/${jobUID}"
def resourcePathNode = "/tmp"
def llmSrcNode = "${resourcePathNode}/TensorRT-LLM/src"
def llmSrcLocal = "${llmPath}/TensorRT-LLM/src"
def scriptRunNode = "${jobWorkspace}/slurm_run.sh"
def testListPathNode = "${jobWorkspace}/${testList}.txt"
def isAarch64 = config.contains("aarch64")
def pytestTestTimeout = "7200"
stage('Prepare Testing') {
// Create Job Workspace folder in Frontend Node
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' ssh -oStrictHostKeyChecking=no ${remote.user}@${remote.host} 'mkdir ${jobWorkspace}'",)
// Download and Unzip Tar File
trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${llmPath} && wget -nv ${llmTarfile}")
sh "cd ${llmPath} && tar -zxf ${BUILD_CONFIGS[config][TARNAME]}"
// Upload slurm_run.sh to Frontend node
def scriptRunLocalPath = "${llmSrcLocal}/jenkins/scripts/slurm_run.sh"
Utils.exec(pipeline, script: "chmod +x ${scriptRunLocalPath}", returnStdout: true)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptRunLocalPath} ${remote.user}@${remote.host}:${scriptRunNode}",)
// Generate Test List and Upload to Frontend Node
def makoArgs = getMakoArgsFromStageName(stageName, true)
// TODO: options are only processed after a line that reads "Mako options:".
// This could be made more generic, e.g. simply ignore any line that cannot
// be split on "=".
def makoOptsJson = transformMakoArgsToJson(["Mako options:"] + makoArgs)
def testListPath = renderTestDB(testList, llmSrcLocal, stageName, makoOptsJson)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${testListPath} ${remote.user}@${remote.host}:${testListPathNode}",)
// Generate Multi Node Job Launch Script
def container = LLM_DOCKER_IMAGE.replace("urm.nvidia.com/", "urm.nvidia.com#")
def mounts = "/home/scratch.trt_llm_data:/scratch.trt_llm_data:ro,/home/svc_tensorrt/bloom/scripts:/home/svc_tensorrt/bloom/scripts"
String taskArgs = getNodeArgs(nodeCount, gpuCount)
if (taskArgs == null) {
error "Invalid multinode task stage name is set"
}
taskArgs = [
taskArgs,
"--exclusive",
"--container-image=${container}",
"--container-workdir=/home/svc_tensorrt/bloom/scripts",
"--container-mounts=${mounts}",
].join(" ")
def scriptLaunch = "/home/svc_tensorrt/bloom/scripts/${jobUID}/slurm_launch.sh"
def srunCmd = SlurmConfig.generateMultiNodeCommand(partition, taskArgs, scriptRunNode)
def scriptLaunchDestPath = Utils.createTempLocation(pipeline, "./slurm_launch.sh")
def scriptContent = """#!/bin/bash
export jobWorkspace=$jobWorkspace
export tarName=$tarName
export llmTarfile=$llmTarfile
export llmSrcNode=$llmSrcNode
export stageName=$stageName
export testList=$testList
export testListPathNode=$testListPathNode
export pytestTestTimeout=$pytestTestTimeout
export splits=$splits
export splitId=$splitId
export perfMode=$perfMode
export resourcePathNode=$resourcePathNode
export MODEL_CACHE_DIR=$MODEL_CACHE_DIR
chmod +x ${scriptRunNode}
${srunCmd}
""".stripIndent()
pipeline.writeFile(file: scriptLaunchDestPath, text: scriptContent)
Utils.exec(pipeline, script: "chmod +x ${scriptLaunchDestPath}", returnStdout: true)
Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${scriptLaunchDestPath} ${remote.user}@${remote.host}:${scriptLaunch}",)
}
stage('Run Test') {
def scriptLaunch = "${jobWorkspace}/slurm_launch.sh"
Utils.exec(
pipeline,
timeout: false,
script: Utils.sshUserCmd(
remote,
"""bash ${scriptLaunch}"""
)
)
}
}
} finally {
uploadResults(pipeline, cluster, jobUID, stageName)
cleanUpNodeResourcesMultiNodes(pipeline, cluster, jobUID)
}
}
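For orientation, a hypothetical direct call to the helper above, mirroring the "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1" entry registered in multiNodesSBSAConfigs further down (the pipeline itself builds this call via collectEntries rather than writing it out):
// Sketch only; argument order follows the function signature above.
runLLMTestlistOnSlurm_MultiNodes(
    pipeline,
    "gb200-multi-node",                              // platform
    "l0_gb200_multi_nodes",                          // testList
    LINUX_AARCH64_CONFIG,                            // config
    false,                                           // perfMode
    "GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1",   // stageName
    1,                                               // splitId
    1,                                               // splits
    8,                                               // gpuCount
    2                                                // nodeCount
)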
def trimForStageList(stageNameList)
{
if (stageNameList == null) {
@@ -686,9 +868,49 @@ def generateStageFailTestResultXml(stageName, subName, failureLog, resultPath) {
</failure></testcase></testsuite></testsuites>"""
}
def transformMakoArgsToJson(optList) {
def makoOpts = [:]
def startedMakoOpts = false
def param = null
def value = null
optList.each { val ->
if (startedMakoOpts) {
// Handle case where value is missing
param = null
value = null
try {
(param, value) = val.split("=")
} catch (ArrayIndexOutOfBoundsException ex) {
param = val.split("=")[0]
value = null
}
// Try to convert "none" and boolean strings into the correct types
if (value != null) {
if (value.toLowerCase() == "none") {
echo "Converted mako param '${param}' value '${value}' to 'null'"
value = null
} else if (value.toLowerCase() in ["true", "false"]) {
echo "Converted mako param '${param}' value '${value}' to Boolean '${value.toBoolean()}'"
value = value.toBoolean()
}
}
makoOpts[(param)] = value
}
if (val.equals("Mako options:")) {
startedMakoOpts = true
}
}
def makoOptsJson = JsonOutput.toJson(makoOpts)
// Print and return the Test DB Query as a JSON string
echo "Test DB Mako opts: ${makoOptsJson}"
return makoOptsJson
}
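A hedged sketch of the helper's behavior: lines before the "Mako options:" marker are ignored, "key=value" lines populate the map, "none"/"true"/"false" values are coerced, and a line without "=" is stored with a null value (the option names here are illustrative only).
// Sketch only: illustrative input and the resulting JSON string.
def optsJson = transformMakoArgsToJson([
    "Mako options:",
    "stage=post_merge",
    "backend=pytorch",
    "gpu=gb200",
    "system_gpu_count=8",
    "extra_flag",        // no "=", stored with a null value
])
// optsJson is e.g. {"stage":"post_merge","backend":"pytorch","gpu":"gb200","system_gpu_count":"8","extra_flag":null}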
def getMakoOpts(getMakoScript, makoArgs=[]) {
// We want to save a map for the Mako opts
def makoOpts = [:]
def turtleOutput = ""
// Echo the command
@@ -723,50 +945,25 @@ def getMakoOpts(getMakoScript, makoArgs=[]) {
// Split each line of turtle output into a list
def turtleOutList = turtleOutput.split("\n")
// Extract the mako opts
def startedMakoOpts = false
def param = null
def value = null
turtleOutList.each { val ->
if (startedMakoOpts) {
// Handle case where value is missing
param = null
value = null
try {
(param, value) = val.split("=")
} catch (ArrayIndexOutOfBoundsException ex) {
param = val.split("=")[0]
value = null
}
// Try to convert nulls, booleans, and floats into the correct type
if (value != null) {
if (value.toLowerCase() == "none") {
echo "Converted mako param '${param}' value '${value}' to 'null'"
value = null
} else if (value.toLowerCase() in ["true", "false"]) {
echo "Converted mako param '${param}' value '${value}' to Boolean '${value.toBoolean()}'"
value = value.toBoolean()
}
}
makoOpts[(param)] = value
}
if (val.equals("Mako options:")) {
startedMakoOpts = true
}
}
// Finally, convert the query to a json string
def makoOptsJson = JsonOutput.toJson(makoOpts)
// Print and return the Test DB Query as a JSON string
echo "Test DB Mako opts: ${makoOptsJson}"
def makoOptsJson = transformMakoArgsToJson(turtleOutList)
return makoOptsJson
}
def renderTestDB(testContext, llmSrc, stageName) {
def scriptPath = "${llmSrc}/tests/integration/defs/sysinfo/get_sysinfo.py"
def parseMultiNodeTaskConfigFromStageName(String stageName) {
def taskConfig = null
def matcher = (stageName =~ /([^-]+)-(\d+)_GPUs-(\d+)_Nodes/)
if (matcher.find()) {
taskConfig = [
gpu: "${matcher.group(1)}",
system_gpu_count: "${matcher.group(2)}",
node_count: "${matcher.group(3)}" // "node_count" might not be used currently
]
}
return taskConfig
}
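The stage-name convention the regex above assumes, sketched with the GB200 stage name added below plus a hypothetical non-matching name:
// Sketch only.
def cfg = parseMultiNodeTaskConfigFromStageName("GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1")
assert cfg.gpu == "GB200"
assert cfg.system_gpu_count == "8"
assert cfg.node_count == "2"
// Stage names without the "<gpu>-<N>_GPUs-<M>_Nodes" pattern yield null.
assert parseMultiNodeTaskConfigFromStageName("GB200-4_GPUs-PyTorch-[Post-Merge]-1") == null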
def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
def makoArgs = []
def isPostMerge = stageName.contains("Post-Merge")
makoArgs += [isPostMerge ? "stage=post_merge" : "stage=pre_merge"]
@@ -798,7 +995,27 @@ def renderTestDB(testContext, llmSrc, stageName) {
makoArgs += ["auto_trigger=others"]
}
def makoOpts = getMakoOpts(scriptPath, makoArgs)
if (parseSysinfo) {
def taskConfig = parseMultiNodeTaskConfigFromStageName(stageName)
if (taskConfig) {
makoArgs += [
"gpu=${taskConfig.gpu}",
"system_gpu_count=${taskConfig.system_gpu_count}"
]
}
}
return makoArgs
}
def renderTestDB(testContext, llmSrc, stageName, preDefinedMakoOpts=null) {
def makoOpts = preDefinedMakoOpts
if (!makoOpts) {
def scriptPath = "${llmSrc}/tests/integration/defs/sysinfo/get_sysinfo.py"
def makoArgs = getMakoArgsFromStageName(stageName)
makoOpts = getMakoOpts(scriptPath, makoArgs)
}
sh "pip3 install --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/sw-tensorrt-pypi/simple --ignore-installed trt-test-db==1.8.5+bc6df7"
def testDBPath = "${llmSrc}/tests/integration/test_lists/test-db"
@@ -1600,6 +1817,11 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
]
fullSet += SBSASlurmTestConfigs.keySet()
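// Each value list is [platform, testList, splitId, splits, gpuCount, nodeCount];
// see the runLLMTestlistOnSlurm_MultiNodes call below.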
multiNodesSBSAConfigs = [
"GB200-8_GPUs-2_Nodes-PyTorch-[Post-Merge]-1": ["gb200-multi-node", "l0_gb200_multi_nodes", 1, 1, 8, 2],
]
fullSet += multiNodesSBSAConfigs.keySet()
if (env.targetArch == AARCH64_TRIPLE) {
parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
@@ -1617,6 +1839,20 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1)
}]]}
parallelJobs += parallelSlurmJobs
// Add SBSA multi node Slurm jobs
parallelMultiNodesSBSAJobs = multiNodesSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
def config = LINUX_AARCH64_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
}
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm_MultiNodes(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2)
}]]}
parallelJobs += parallelMultiNodesSBSAJobs
}
docBuildSpec = createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10")

jenkins/scripts/slurm_run.sh (new executable file, 90 lines)

@@ -0,0 +1,90 @@
#!/bin/bash
cd $resourcePathNode
llmSrcNode=$resourcePathNode/TensorRT-LLM/src
# generate .coveragerc in workspace
cat << EOF > $jobWorkspace/.coveragerc
[run]
branch = True
data_file = $jobWorkspace/.coverage.$stageName
[paths]
source =
$llmSrcNode/tensorrt_llm/
---wheel_path---/tensorrt_llm/
EOF
# Coverage config generated above; referenced later when rewriting the wheel path.
coverageConfigFile=$jobWorkspace/.coveragerc
resultsPath=$jobWorkspace/results
mkdir -p $resultsPath
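# Only the first Slurm task on each node (SLURM_LOCALID == 0) installs the wheel and
# dependencies; the other tasks wait until the lock file appears.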
if [ $SLURM_LOCALID -eq 0 ]; then
wget -nv $llmTarfile
tar -zxf $tarName
which python3
python3 --version
apt-get install -y libffi-dev
nvidia-smi
cd $llmSrcNode && pip3 install --retries 1 -r requirements-dev.txt
cd $resourcePathNode && pip3 install --force-reinstall --no-deps TensorRT-LLM/tensorrt_llm-*.whl
git config --global --add safe.directory "*"
gpuUuids=$(nvidia-smi -q | grep "GPU UUID" | awk '{print $4}' | tr '\n' ',' || true)
echo "HOST_NODE_NAME = $HOST_NODE_NAME ; GPU_UUIDS = =$gpuUuids ; STAGE_NAME = $stageName"
touch install_lock.lock
else
while [ ! -f install_lock.lock ]; do
sleep 5
done
fi
testList="$testList_$splitId"
export CPP_TEST_TIMEOUT_OVERRIDDEN=7200
export LLM_ROOT=$llmSrcNode
export LLM_MODELS_ROOT=$MODEL_CACHE_DIR
export UCX_TLS=^gdr_copy
cd $llmSrcNode/tests/integration/defs
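# Assemble the pytest invocation; it is prefixed with trtllm-llmapi-launch, and the pieces
# are joined into a single command and eval'd at the end of this script.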
testCmdLines=(
"$llmSrcNode/tensorrt_llm/llmapi/trtllm-llmapi-launch"
"pytest"
"-v"
"--timeout=$pytestTestTimeout"
"--test-list=$testListPathNode"
"--rootdir $llmSrcNode/tests/integration/defs"
"--test-prefix=$stageName"
"--splits $splits"
"--group $splitId"
"--output-dir=$jobWorkspace/"
"--csv=$resultsPath/report.csv"
"--junit-xml $resultsPath/results.xml"
"-o junit_logging=out-err"
)
if [ "$perfMode" = "true" ]; then
testCmdLines+=(
"--perf"
"--perf-log-formats csv"
"--perf-log-formats yaml"
)
fi
trtllmWhlPath=$(pip3 show tensorrt_llm | grep Location | cut -d ' ' -f 2)
trtllmWhlPath=$(echo "$trtllmWhlPath" | sed 's/[[:space:]]+/_/g')
echo "TRTLLM WHEEL PATH: $trtllmWhlPath"
sed -i "s|---wheel_path---|$trtllmWhlPath|g" "$coverageConfigFile"
testCmdLines+=(
"--cov=$llmSrcNode/examples/"
"--cov=$llmSrcNode/tensorrt_llm/"
"--cov=$trtllmWhlPath/tensorrt_llm/"
"--cov-report="
"--cov-config=$coverageConfigFile"
)
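# Make sure the installed tensorrt_llm wheel's libs directory is on LD_LIBRARY_PATH.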
containerPipLLMLibPath=$(pip3 show tensorrt_llm | grep "Location" | awk -F ":" '{ gsub(/ /, "", $2); print $2"/tensorrt_llm/libs"}')
containerPipLLMLibPath=$(echo "$containerPipLLMLibPath" | sed -E 's/[[:space:]]+/_/g')
containerLDLibPath=$LD_LIBRARY_PATH
containerLDLibPath=$(echo "$containerLDLibPath" | sed -E 's/[[:space:]]+/_/g')
if [[ "$containerLDLibPath" != *"$containerPipLLMLibPath"* ]]; then
containerLDLibPath="$containerPipLLMLibPath:$containerLDLibPath"
containerLDLibPath="${containerLDLibPath%:}"
fi
export LD_LIBRARY_PATH=$containerLDLibPath
echo "Library Path:"
echo "$LD_LIBRARY_PATH"
env | sort
fullCmd="${testCmdLines[*]}"
echo "Running: $testCase"
echo "Full Command: $fullCmd"
eval $fullCmd


@@ -0,0 +1,16 @@
version: 0.0.1
l0_gb200_multi_nodes:
- condition:
ranges:
# 2 nodes, each with 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_8gpus[latency]