feat: Add Slurm support and enable RTX Pro 6000 testing pipeline in CI (#4019)

* Add Slurm support with RTX Pro 6000 post-merge tests

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>

* Remove the H100 post-merge test from testing

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>

---------

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
yuanjingx87 authored on 2025-05-08 00:15:36 -07:00, committed by GitHub
parent 179efd45d4
commit 6e1d2a1320
GPG Key ID: B5690EEEBB952194
2 changed files with 183 additions and 0 deletions

View File

@@ -6,6 +6,12 @@ import groovy.json.JsonSlurper
import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import com.nvidia.bloom.CloudManager
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.SlurmConfig
import com.nvidia.bloom.SlurmCluster
import com.nvidia.bloom.SlurmPartition
import com.nvidia.bloom.Utils
import org.jenkinsci.plugins.workflow.cps.CpsThread
import org.jsoup.Jsoup
import org.jenkinsci.plugins.pipeline.modeldefinition.Utils as jUtils
@@ -79,6 +85,121 @@ TESTER_MEMORY = "96Gi"
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"
def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName) {
    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
        def remote = [
            ip           : cluster.ip,
            host         : cluster.host,
            user         : "${pipeline.USERNAME}",
            passwd       : "${pipeline.PASSWORD}",
            password     : "${pipeline.PASSWORD}",
            allowAnyHosts: true,
        ]

        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

        pipeline.stage('Clean up SLURM Agent Resources') {
            Utils.exec(
                pipeline,
                timeout: false,
                script: Utils.sshUserCmd(
                    remote,
                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
                )
            )
            Utils.exec(pipeline, script: "echo done")
        }
    }
}
def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
{
    runner {
        // TODO: refactor the finallyRunner so it can be reused by both slurm and non-slurm jobs.
        cacheErrorAndUploadResult(stageName, {
            runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver)
        }, {
            // If no test cases were executed, remove the test result xml
            sh """
                ls -all ${stageName}/
                if ! grep -q '<testcase' ${stageName}/results.xml; then
                    rm ${stageName}/results.xml
                fi
            """
            def llmPath = sh (script: "realpath .", returnStdout: true).trim()
            def llmSrc = "${llmPath}/${LLM_ROOT}${config}/TensorRT-LLM/src"
            // CPP tests generate results in ${llmSrc}/cpp/build_backup/; move these files to the job result folder
            sh "ls -all ${llmSrc}/cpp/build_backup/ || true"
            sh "ls -all ${llmSrc}/cpp/build/ || true"
            // Sed for CPP test results
            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/\" classname=\"/\" classname=\"${stageName}./g' *.xml || true"
            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/testsuite name=\"[^\"]*\"/testsuite name=\"${stageName}\"/g' *.xml || true"
            // Sed for pytest results
            sh "cd ${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
            // Copy CPP test results
            sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
            sh "ls ${stageName}/ -all"
        })
    }
}
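The finally-runner above rewrites the JUnit XML so that CPP and pytest results are attributed to this stage when reports are merged. As a rough standalone illustration of what those sed commands do (plain Groovy string operations on an invented XML snippet, not pipeline code):

def stageName = "RTXPro6000-PyTorch-[Post-Merge]-1"
def xml = '<testsuite name="pytest"><testcase classname="test_demo" name="t0"/></testsuite>'
// Rename the testsuite to the stage name, then prefix every classname with it.
def fixed = xml.replaceAll(/testsuite name="[^"]*"/, 'testsuite name="' + stageName + '"')
fixed = fixed.replaceAll(/classname="/, 'classname="' + stageName + '.')
println fixed   // classnames now read "RTXPro6000-PyTorch-[Post-Merge]-1.test_demo"
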
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
{
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
    def nodeSecret = CloudManager.createNode(nodeName)

    try {
        // Run an ssh command to start the node in the desired cluster via SLURM
        withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
            def remote = [
                ip           : cluster.ip,
                host         : cluster.host,
                user         : "${pipeline.USERNAME}",
                passwd       : "${pipeline.PASSWORD}",
                password     : "${pipeline.PASSWORD}",
                allowAnyHosts: true,
            ]

            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")

            stage('Request Node via SLURM') {
                println("Selected Cluster: ${cluster.name}")

                def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, "slurm_jenkins_agent_setup.sh")

                Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)
                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh")
                Utils.exec(
                    pipeline,
                    timeout: false,
                    script: Utils.sshUserCmd(
                        remote,
                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
                    )
                )
                Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
            }
        }

        if (!CloudManager.isNodeOnline(nodeName)) {
            catchError(buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
                error "Cannot find an idle node to run the test for ${stageName}"
            }
        }

        // TODO: pass in the GPU count instead of hard-coding it to 1
        def dockerArgs = "--gpus 1 --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
        def slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
        executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
    } finally {
        cleanUpNodeResources(pipeline, cluster, nodeName)
        CloudManager.destroyNode(nodeName)
    }
}
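runLLMTestlistOnSlurm follows a provision / use / always-tear-down shape: the Jenkins agent is brought up on the Slurm cluster first, the tests run inside Docker on that node, and the finally block removes the agent artifacts and destroys the node even if the test body throws. A minimal standalone sketch of that shape (illustrative names only, no Jenkins or Slurm APIs):

def provisionNode(String name) { println "request ${name} via the scheduler"; return name }
def releaseNode(String name)   { println "release ${name} and delete its agent files" }

def demoNode = "demo-node-${UUID.randomUUID()}"
try {
    provisionNode(demoNode)
    println "run the test list on ${demoNode}"   // stands in for executeLLMTestOnSlurm(...)
} finally {
    releaseNode(demoNode)                        // stands in for cleanUpNodeResources + CloudManager.destroyNode
}
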
def trimForStageList(stageNameList)
{
    if (stageNameList == null) {
@@ -727,6 +848,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
{
    // Step 1: create LLM_ROOT dir
    sh "pwd && ls -alh"
    // TODO: find a proper way to clean the workspace, e.g. save artifacts in a folder named with BUILD_ID,
    // so that this works with multiple jobs running on the same node.
    sh "rm -rf ./*"
    def llmRootConfig = "${LLM_ROOT}${config}"
    sh "mkdir ${llmRootConfig}"
@@ -1179,6 +1303,24 @@ def checkStageName(stageNames) {
    }
}
// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
{
    return {
        runner -> node(label) {
            if (needToDeleteDir) {
                deleteDir()
            }
            stage('Pull Docker Image') {
                docker.image(image).pull()
            }
            docker.image(image).inside(dockerArgs) {
                runner()
            }
        }
    }
}
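runInDockerOnNodeMultiStage returns a closure that takes the test body as another closure; that is why executeLLMTestOnSlurm accepts it as the runner argument and calls runner { ... }. A minimal standalone sketch of this nested-closure pattern (illustrative names, no Jenkins APIs):

def makeRunner(String label) {
    return { Closure body ->
        println "allocate ${label} and pull the image"   // stands in for node() / docker.image().pull()
        try {
            body()                                       // the test body supplied by the caller
        } finally {
            println "leave ${label}"
        }
    }
}

def demoRunner = makeRunner("demo-node")
demoRunner {
    println "test body runs inside the prepared environment"
}
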
def runInDockerOnNode(image, label, dockerArgs)
{
    return {
@@ -1290,6 +1432,25 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
    fullSet = parallelJobs.keySet()

    // Values: [platform, test list, split id, split count]
    turtleSlurmConfigs = [
        "RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
    ]

    // TODO: use a cpu pod to launch the slurm job
    parallelSlurmJobs = turtleSlurmConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10", "amd64", values[4] ?: 1, key.contains("Perf")), {
        def config = VANILLA_CONFIG
        if (key.contains("single-device")) {
            config = SINGLE_DEVICE_CONFIG
        }
        if (key.contains("llvm")) {
            config = LLVM_CONFIG
        }
        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
    }]]}
    fullSet += parallelSlurmJobs.keySet()
    parallelJobs += parallelSlurmJobs

    // Try to match what is being tested on x86 H100_PCIe.
    // The total machine time is scaled proportionally according to the number of each GPU.
    aarch64Configs = [

View File

@@ -0,0 +1,22 @@
version: 0.0.1
l0_rtx_pro_6000:
- condition:
    ranges:
      system_gpu_count:
        gte: 1
        lte: 1
    wildcards:
      gpu:
      - '*6000*'
      linux_distribution_name: ubuntu*
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  # ------------- PyTorch tests ---------------
  - unittest/_torch/modeling -k "modeling_mllama"
  - unittest/_torch/modeling -k "modeling_out_of_tree"
  # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
  - unittest/_torch/test_attention_mla.py
  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
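Read together, the condition block above selects machines with exactly one GPU whose name matches *6000*, an Ubuntu distribution, and the post_merge stage with the pytorch backend. The sketch below is a rough, hypothetical Groovy illustration of those ranges/wildcards/terms matching semantics; it is not the actual test-db implementation:

def matches = { Map condition, Map machine ->
    def rangesOk = condition.ranges.every { key, range ->
        def v = machine[key]
        (range.gte == null || v >= range.gte) && (range.lte == null || v <= range.lte)
    }
    def wildcardsOk = condition.wildcards.every { key, pattern ->
        def patterns = pattern instanceof List ? pattern : [pattern]
        patterns.any { p -> machine[key].toString().toLowerCase() ==~ p.toLowerCase().replace('*', '.*') }
    }
    def termsOk = condition.terms.every { key, value -> machine[key] == value }
    return rangesOk && wildcardsOk && termsOk
}

def condition = [
    ranges   : [system_gpu_count: [gte: 1, lte: 1]],
    wildcards: [gpu: ['*6000*'], linux_distribution_name: 'ubuntu*'],
    terms    : [stage: 'post_merge', backend: 'pytorch'],
]
// Example machine description (invented values for illustration only)
def machine = [system_gpu_count: 1, gpu: 'NVIDIA RTX PRO 6000', linux_distribution_name: 'ubuntu22.04',
               stage: 'post_merge', backend: 'pytorch']
println matches(condition, machine)   // true for the example machine above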