feat: Add Slurm support and enable RTX Pro 6000 testing pipeline in CI (#4019)

* Add Slurm support with RTX Pro 6000 post-merge tests

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>

* Remove the H100 post-merge test from testing

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>

---------

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
yuanjingx87 authored on 2025-05-08 00:15:36 -07:00, committed by GitHub
parent 179efd45d4
commit 6e1d2a1320
GPG Key ID: B5690EEEBB952194
2 changed files with 183 additions and 0 deletions

View File

@@ -6,6 +6,12 @@ import groovy.json.JsonSlurper
import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import com.nvidia.bloom.CloudManager
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.SlurmConfig
import com.nvidia.bloom.SlurmCluster
import com.nvidia.bloom.SlurmPartition
import com.nvidia.bloom.Utils
import org.jenkinsci.plugins.workflow.cps.CpsThread
import org.jsoup.Jsoup
import org.jenkinsci.plugins.pipeline.modeldefinition.Utils as jUtils
@@ -79,6 +85,121 @@ TESTER_MEMORY = "96Gi"
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName) {
    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
        def remote = [
            ip           : cluster.ip,
            host         : cluster.host,
            user         : "${pipeline.USERNAME}",
            passwd       : "${pipeline.PASSWORD}",
            password     : "${pipeline.PASSWORD}",
            allowAnyHosts: true,
        ]

        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

        pipeline.stage('Clean up SLURM Agent Resources') {
            Utils.exec(
                pipeline,
                timeout: false,
                script: Utils.sshUserCmd(
                    remote,
                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
                )
            )
            Utils.exec(pipeline, script: "echo done")
        }
    }
}
def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
{
    runner {
        // TODO: refactor the finallyRunner so it can be reused by both slurm and non-slurm jobs.
        cacheErrorAndUploadResult(stageName, {
            runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver)
        }, {
            // If no test cases were executed, remove the test result xml
            sh """
                ls -all ${stageName}/
                if ! grep -q '<testcase' ${stageName}/results.xml; then
                    rm ${stageName}/results.xml
                fi
            """
            def llmPath = sh (script: "realpath .", returnStdout: true).trim()
            def llmSrc = "${llmPath}/${LLM_ROOT}${config}/TensorRT-LLM/src"
            // CPP tests generate results in ${llmSrc}/cpp/build_backup/; move these files to the job result folder
            sh "ls -all ${llmSrc}/cpp/build_backup/ || true"
            sh "ls -all ${llmSrc}/cpp/build/ || true"
            // Sed for CPP test results
            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/\" classname=\"/\" classname=\"${stageName}./g' *.xml || true"
            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/testsuite name=\"[^\"]*\"/testsuite name=\"${stageName}\"/g' *.xml || true"
            // Sed for pytest results
            sh "cd ${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
            // Copy CPP test results
            sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
            sh "ls ${stageName}/ -all"
        })
    }
}
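The finally-runner above rewrites the JUnit XML so that CPP and pytest results are attributed to this stage when reports are merged. As a rough standalone illustration of what those sed commands do (plain Groovy string operations on an invented XML snippet, not pipeline code):

def stageName = "RTXPro6000-PyTorch-[Post-Merge]-1"
def xml = '<testsuite name="pytest"><testcase classname="test_demo" name="t0"/></testsuite>'
// Rename the testsuite to the stage name, then prefix every classname with it.
def fixed = xml.replaceAll(/testsuite name="[^"]*"/, 'testsuite name="' + stageName + '"')
fixed = fixed.replaceAll(/classname="/, 'classname="' + stageName + '.')
println fixed   // classnames now read "RTXPro6000-PyTorch-[Post-Merge]-1.test_demo"
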
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
{
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
    def nodeSecret = CloudManager.createNode(nodeName)

    try {
        // Run an ssh command to start the node in the desired cluster via SLURM
        withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
            def remote = [
                ip           : cluster.ip,
                host         : cluster.host,
                user         : "${pipeline.USERNAME}",
                passwd       : "${pipeline.PASSWORD}",
                password     : "${pipeline.PASSWORD}",
                allowAnyHosts: true,
            ]

            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

            stage('Request Node via SLURM') {
                println("Selected Cluster: ${cluster.name}")

                def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, "slurm_jenkins_agent_setup.sh")

                Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh")
                Utils.exec(
                    pipeline,
                    timeout: false,
                    script: Utils.sshUserCmd(
                        remote,
                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
                    )
                )
                Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
            }
        }

        if (!CloudManager.isNodeOnline(nodeName)) {
            catchError(buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
                error "Cannot find an idle node to run the test for ${stageName}"
            }
        }

        // TODO: pass in the GPU count instead of hard-coding it to 1
        def dockerArgs = "--gpus 1 --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
        def slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
        executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
    } finally {
        cleanUpNodeResources(pipeline, cluster, nodeName)
        CloudManager.destroyNode(nodeName)
    }
}
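runLLMTestlistOnSlurm follows a provision / use / always-tear-down shape: the Jenkins agent is brought up on the Slurm cluster first, the tests run inside Docker on that node, and the finally block removes the agent artifacts and destroys the node even if the test body throws. A minimal standalone sketch of that shape (illustrative names only, no Jenkins or Slurm APIs):

def provisionNode(String name) { println "request ${name} via the scheduler"; return name }
def releaseNode(String name)   { println "release ${name} and delete its agent files" }

def demoNode = "demo-node-${UUID.randomUUID()}"
try {
    provisionNode(demoNode)
    println "run the test list on ${demoNode}"   // stands in for executeLLMTestOnSlurm(...)
} finally {
    releaseNode(demoNode)                        // stands in for cleanUpNodeResources + CloudManager.destroyNode
}
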
def trimForStageList(stageNameList)
{
    if (stageNameList == null) {
@@ -727,6 +848,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
{
    // Step 1: create LLM_ROOT dir
    sh "pwd && ls -alh"
    // TODO: find a proper way to clean the workspace, e.g. save artifacts in a folder named with BUILD_ID,
    // so that this works with multiple jobs running on the same node.
    sh "rm -rf ./*"
    def llmRootConfig = "${LLM_ROOT}${config}"
    sh "mkdir ${llmRootConfig}"
@@ -1179,6 +1303,24 @@ def checkStageName(stageNames) {
    }
}
// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
{
    return {
        runner -> node(label) {
            if (needToDeleteDir) {
                deleteDir()
            }
            stage('Pull Docker Image') {
                docker.image(image).pull()
            }
            docker.image(image).inside(dockerArgs) {
                runner()
            }
        }
    }
}
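runInDockerOnNodeMultiStage returns a closure that takes the test body as another closure; that is why executeLLMTestOnSlurm accepts it as the runner argument and calls runner { ... }. A minimal standalone sketch of this nested-closure pattern (illustrative names, no Jenkins APIs):

def makeRunner(String label) {
    return { Closure body ->
        println "allocate ${label} and pull the image"   // stands in for node() / docker.image().pull()
        try {
            body()                                       // the test body supplied by the caller
        } finally {
            println "leave ${label}"
        }
    }
}

def demoRunner = makeRunner("demo-node")
demoRunner {
    println "test body runs inside the prepared environment"
}
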
def runInDockerOnNode(image, label, dockerArgs)
{
    return {
@@ -1290,6 +1432,25 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
    fullSet = parallelJobs.keySet()

    // Values: [platform, test list, split id, split count]
    turtleSlurmConfigs = [
        "RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
    ]

    // TODO: use a cpu pod to launch the slurm job
    parallelSlurmJobs = turtleSlurmConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10", "amd64", values[4] ?: 1, key.contains("Perf")), {
        def config = VANILLA_CONFIG
        if (key.contains("single-device")) {
            config = SINGLE_DEVICE_CONFIG
        }
        if (key.contains("llvm")) {
            config = LLVM_CONFIG
        }
        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
    }]]}
    fullSet += parallelSlurmJobs.keySet()
    parallelJobs += parallelSlurmJobs

    // Try to match what is being tested on x86 H100_PCIe.
    // The total machine time is scaled proportionally according to the number of each GPU.
    aarch64Configs = [

View File

@@ -0,0 +1,22 @@
version: 0.0.1
l0_rtx_pro_6000:
- condition:
    ranges:
      system_gpu_count:
        gte: 1
        lte: 1
    wildcards:
      gpu:
      - '*6000*'
      linux_distribution_name: ubuntu*
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  # ------------- PyTorch tests ---------------
  - unittest/_torch/modeling -k "modeling_mllama"
  - unittest/_torch/modeling -k "modeling_out_of_tree"
  # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
  - unittest/_torch/test_attention_mla.py
  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
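Read together, the condition block above selects machines with exactly one GPU whose name matches *6000*, an Ubuntu distribution, and the post_merge stage with the pytorch backend. The sketch below is a rough, hypothetical Groovy illustration of those ranges/wildcards/terms matching semantics; it is not the actual test-db implementation:

def matches = { Map condition, Map machine ->
    def rangesOk = condition.ranges.every { key, range ->
        def v = machine[key]
        (range.gte == null || v >= range.gte) && (range.lte == null || v <= range.lte)
    }
    def wildcardsOk = condition.wildcards.every { key, pattern ->
        def patterns = pattern instanceof List ? pattern : [pattern]
        patterns.any { p -> machine[key].toString().toLowerCase() ==~ p.toLowerCase().replace('*', '.*') }
    }
    def termsOk = condition.terms.every { key, value -> machine[key] == value }
    return rangesOk && wildcardsOk && termsOk
}

def condition = [
    ranges   : [system_gpu_count: [gte: 1, lte: 1]],
    wildcards: [gpu: ['*6000*'], linux_distribution_name: 'ubuntu*'],
    terms    : [stage: 'post_merge', backend: 'pytorch'],
]
// Example machine description (invented values for illustration only)
def machine = [system_gpu_count: 1, gpu: 'NVIDIA RTX PRO 6000', linux_distribution_name: 'ubuntu22.04',
               stage: 'post_merge', backend: 'pytorch']
println matches(condition, machine)   // true for the example machine above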