Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
feat: Add Slurm support and enable RTX Pro 6000 testing pipeline in CI (#4019)
* Add slurm support with RTXPro6000 PostMerge Tests

  Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>

* remove H100 post merge test from testing

  Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>

---------

Signed-off-by: Yuanjing Xue <197832395+yuanjingx87@users.noreply.github.com>
This commit is contained in:
parent 179efd45d4
commit 6e1d2a1320
@@ -6,6 +6,12 @@ import groovy.json.JsonSlurper
import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import com.nvidia.bloom.CloudManager
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.SlurmConfig
import com.nvidia.bloom.SlurmCluster
import com.nvidia.bloom.SlurmPartition
import com.nvidia.bloom.Utils
import org.jenkinsci.plugins.workflow.cps.CpsThread
import org.jsoup.Jsoup
import org.jenkinsci.plugins.pipeline.modeldefinition.Utils as jUtils
@@ -79,6 +85,121 @@ TESTER_MEMORY = "96Gi"
CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"
MODEL_CACHE_DIR="/scratch.trt_llm_data/llm-models"

def cleanUpNodeResources(def pipeline, SlurmCluster cluster, String nodeName){
    withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
        def remote = [
            ip           : cluster.ip,
            host         : cluster.host,
            user         : "${pipeline.USERNAME}",
            passwd       : "${pipeline.PASSWORD}",
            password     : "${pipeline.PASSWORD}",
            allowAnyHosts: true,
        ]

        Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
        pipeline.stage('Clean up SLURM Agent Resources') {
            Utils.exec(
                pipeline,
                timeout: false,
                script: Utils.sshUserCmd(
                    remote,
                    "rm -rf /home/svc_tensorrt/bloom/scripts/agent-${nodeName}.jar /home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
                )
            )
            Utils.exec(pipeline, script: "echo done")
        }
    }
}

def executeLLMTestOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312", runner)
{
    runner {
        // TODO: refactor the finallyRunner to reuse within slurm or nonslurm job.
        cacheErrorAndUploadResult(stageName, {
            runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver)
        }, {
            // If the execution test list is null, remove the test result xml
            sh """
                ls -all ${stageName}/
                if ! grep -q '<testcase' ${stageName}/results.xml; then
                    rm ${stageName}/results.xml
                fi
            """
            def llmPath = sh (script: "realpath .", returnStdout: true).trim()
            def llmSrc = "${llmPath}/${LLM_ROOT}${config}/TensorRT-LLM/src"
            // CPP tests will generate test result in ${llmSrc}/cpp/build_backup/, move these files to job result folder
            sh "ls -all ${llmSrc}/cpp/build_backup/ || true"
            sh "ls -all ${llmSrc}/cpp/build/ || true"
            // Sed for CPP test result
            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/\" classname=\"/\" classname=\"${stageName}./g' *.xml || true"
            sh "cd ${llmSrc}/cpp/build_backup/ && sed -i 's/testsuite name=\"[^\"]*\"/testsuite name=\"${stageName}\"/g' *.xml || true"
            // Sed for Pytest result
            sh "cd ${stageName} && sed -i 's/testsuite name=\"pytest\"/testsuite name=\"${stageName}\"/g' *.xml || true"
            // Copy CPP test result
            sh "cp ${llmSrc}/cpp/build_backup/*.xml ${stageName} || true"
            sh "ls ${stageName}/ -all"
        })
    }
}

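The two sed passes in the finallyRunner above namespace every result file by Jenkins stage so that XML from different stages can be merged without collisions: pytest suites are renamed to the stage name, and C++ gtest results additionally get the stage prepended to each classname. A minimal illustration of the pytest case, with a made-up XML line rather than output from a real run:

// Illustration only, not part of this change: shows what the pytest sed rewrite amounts to.
def stageName = "RTXPro6000-PyTorch-[Post-Merge]-1"
def before = '<testsuite name="pytest" tests="6" errors="0">'
def after  = before.replace('testsuite name="pytest"', 'testsuite name="' + stageName + '"')
assert after == '<testsuite name="RTXPro6000-PyTorch-[Post-Merge]-1" tests="6" errors="0">'
// The extra pass for the C++ results turns classname="SomeGTestSuite" into
// classname="RTXPro6000-PyTorch-[Post-Merge]-1.SomeGTestSuite".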
def runLLMTestlistOnSlurm(pipeline, platform, testList, config=VANILLA_CONFIG, perfMode=false, stageName="Undefined", splitId=1, splits=1, skipInstallWheel=false, cpver="cp312")
{
    SlurmPartition partition = SlurmConfig.partitionConfig[platform] as SlurmPartition
    SlurmCluster cluster = SlurmConfig.clusterConfig[partition.clusterName]

    def nodeName = "${cluster.host}-test-${UUID.randomUUID().toString()}"
    def nodeSecret = CloudManager.createNode(nodeName)

    try {
        // Run ssh command to start node in desired cluster via SLURM
        withCredentials([usernamePassword(credentialsId: 'svc_tensorrt', usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
            def remote = [
                ip           : cluster.ip,
                host         : cluster.host,
                user         : "${pipeline.USERNAME}",
                passwd       : "${pipeline.PASSWORD}",
                password     : "${pipeline.PASSWORD}",
                allowAnyHosts: true,
            ]

            Utils.exec(pipeline, script: "apt-get update && apt-get install -y sshpass openssh-client")
            stage('Request Node via SLURM') {
                println("Selected Cluster: ${cluster.name}")

                def jenkinsSetupPath = Utils.copyLibraryResource(pipeline, "slurm_jenkins_agent_setup.sh")

                Utils.exec(pipeline, script: "chmod +x ${jenkinsSetupPath}", returnStdout: true)

                Utils.exec(pipeline, script: "sshpass -p '${remote.passwd}' scp -r -p -oStrictHostKeyChecking=no ${jenkinsSetupPath} ${remote.user}@${remote.host}:~/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh",)

                Utils.exec(
                    pipeline,
                    timeout: false,
                    script: Utils.sshUserCmd(
                        remote,
                        """${SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl)}"""
                    )
                )
                Utils.exec(pipeline, script: "echo Sleeping to allow agent initialization; sleep 30")
            }
        }

        if (!CloudManager.isNodeOnline(nodeName)){
            catchError(buildResult: 'SUCCESS', stageResult: 'UNSTABLE') {
                error "Cannot find a node that is idle to run the test for ${stageName}"
            }
        }

        // TODO: pass in the gpu numbers instead of hard code it to 1
        def dockerArgs = "--gpus 1 --cap-add=SYS_ADMIN --ipc=host --security-opt seccomp=unconfined -u root:root -v /home/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
        slurmRunner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, nodeName, dockerArgs, false)
        executeLLMTestOnSlurm(pipeline, platform, testList, config, perfMode, stageName, splitId, splits, skipInstallWheel, cpver, slurmRunner)
    } finally {
        cleanUpNodeResources(pipeline, cluster, nodeName)
        CloudManager.destroyNode(nodeName)
    }
}

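SlurmConfig.generateCommand comes from the internal com.nvidia.bloom shared library and its output is not shown in this diff. Judging only from what the surrounding code copies to the login node and later cleans up (the ${nodeName}-slurm_jenkins_agent_setup.sh script and an agent-${nodeName}.jar), it presumably submits a Slurm job that runs the setup script, which fetches the Jenkins inbound agent and registers it with the controller using the node secret. A rough, hypothetical sketch of that shape; the helper name, flags, and argument order are placeholders, not the real implementation:

// Hypothetical sketch only, not part of this change: the real command comes from
// SlurmConfig.generateCommand(cluster, partition, nodeSecret, nodeName, Jenkins.instance.rootUrl).
String sketchSlurmAgentCommand(String partitionName, String nodeName, String nodeSecret, String jenkinsUrl) {
    def setupScript = "/home/svc_tensorrt/bloom/scripts/${nodeName}-slurm_jenkins_agent_setup.sh"
    // Assumed: one GPU is requested and the setup script receives the controller URL, node name,
    // and secret so the agent it launches can connect back to Jenkins.
    return "sbatch --partition=${partitionName} --gres=gpu:1 --job-name=${nodeName} " +
           "--wrap=\"bash ${setupScript} ${jenkinsUrl} ${nodeName} ${nodeSecret}\""
}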
def trimForStageList(stageNameList)
{
    if (stageNameList == null) {
@@ -727,6 +848,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
{
    // Step 1: create LLM_ROOT dir
    sh "pwd && ls -alh"
    // TODO: proper way to clean workspace, maybe save in a folder named with BUILD_ID.
    // So that it can work with multiple job running in same node
    sh "rm -rf ./*"
    def llmRootConfig = "${LLM_ROOT}${config}"
    sh "mkdir ${llmRootConfig}"

@@ -1179,6 +1303,24 @@ def checkStageName(stageNames) {
    }
}

// TODO: Update existing functions to use runInDockerOnNodeMultiStage and get rid of runInDockerOnNode
def runInDockerOnNodeMultiStage(image, label, dockerArgs, needToDeleteDir=true)
{
    return {
        runner -> node(label) {
            if (needToDeleteDir) {
                deleteDir()
            }
            stage('Pull Docker Image') {
                docker.image(image).pull()
            }
            docker.image(image).inside(dockerArgs) {
                runner()
            }
        }
    }
}

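runInDockerOnNodeMultiStage does not execute anything by itself: it returns a closure that, given a body, allocates node(label), pulls the image once, and runs the body inside the container. That is why runLLMTestlistOnSlurm above can pass the result straight to executeLLMTestOnSlurm as the runner argument. A minimal usage sketch; the label, docker args, and body are examples only:

// Usage sketch only, not part of this change.
def exampleArgs = "--gpus 1 --cap-add=SYS_ADMIN --ipc=host"   // trimmed example docker args
def runner = runInDockerOnNodeMultiStage(LLM_DOCKER_IMAGE, "example-node-label", exampleArgs, false)
runner {
    // Runs on node("example-node-label"), inside docker.image(LLM_DOCKER_IMAGE).inside(exampleArgs).
    sh "nvidia-smi || true"
}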
def runInDockerOnNode(image, label, dockerArgs)
{
    return {
@@ -1290,6 +1432,25 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)

    fullSet = parallelJobs.keySet()

    turtleSlurmConfigs = [
        "RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
    ]

    // TODO: use cpu pod to launch slurm job
    parallelSlurmJobs = turtleSlurmConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "a10", "amd64", values[4] ?: 1, key.contains("Perf")), {
        def config = VANILLA_CONFIG
        if (key.contains("single-device")) {
            config = SINGLE_DEVICE_CONFIG
        }
        if (key.contains("llvm")) {
            config = LLVM_CONFIG
        }
        runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
    }]]}

    fullSet += parallelSlurmJobs.keySet()
    parallelJobs += parallelSlurmJobs

    // Try to match what are being tested on x86 H100_PCIe.
    // The total machine time is scaled proportionally according to the number of each GPU.
    aarch64Configs = [
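The list value in turtleSlurmConfigs is positional; the collectEntries block above reads it as platform, test list, splitId, splits, with an optional fifth element for the launcher pod's GPU count (values[4] ?: 1). A hypothetical illustration of how a second shard of the same list could be declared; this entry is not part of the change:

// Illustration only, not part of this change.
//   key        -> Jenkins stage name (also selects VANILLA/SINGLE_DEVICE/LLVM config by substring)
//   values[0]  -> Slurm platform key looked up in SlurmConfig.partitionConfig
//   values[1]  -> test-db list name (tests/integration/test_lists/test-db/<name>.yml)
//   values[2]  -> splitId, values[3] -> splits, values[4] -> optional pod GPU count (default 1)
def exampleSlurmConfigs = [
    "RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 2],
    "RTXPro6000-PyTorch-[Post-Merge]-2": ["rtx-pro-6000", "l0_rtx_pro_6000", 2, 2],
]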
tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml (new file, 22 lines)
@@ -0,0 +1,22 @@
version: 0.0.1
l0_rtx_pro_6000:
- condition:
    ranges:
      system_gpu_count:
        gte: 1
        lte: 1
    wildcards:
      gpu:
      - '*6000*'
      linux_distribution_name: ubuntu*
    terms:
      stage: post_merge
      backend: pytorch
  tests:
  # ------------- PyTorch tests ---------------
  - unittest/_torch/modeling -k "modeling_mllama"
  - unittest/_torch/modeling -k "modeling_out_of_tree"
  # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
  - unittest/_torch/test_attention_mla.py
  - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
  - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]