infra: [TRTLLM-5879] Spilt single GPU test and multi GPU test into 2 pipelines (#5199)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
Zhanrui Sun 2025-07-16 18:04:04 +08:00 committed by GitHub
parent fc2347eaf5
commit e42f5a9581
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 132 additions and 128 deletions

View File

@ -878,6 +878,45 @@ def triggerJob(jobName, parameters, jenkinsUrl = "", credentials = "")
return status
}
def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64", additionalParameters = [:]) {
def parameters = getCommonParameters()
String globalVarsJson = writeJSON returnText: true, json: globalVars
parameters += [
'enableFailFast': enableFailFast,
'globalVars': globalVarsJson,
] + additionalParameters
if (env.alternativeTRT && platform == "x86_64") {
parameters += [
'alternativeTRT': env.alternativeTRT,
]
}
if (env.alternativeTrtSBSA && platform == "SBSA") {
parameters += [
'alternativeTRT': env.alternativeTrtSBSA,
]
}
if (env.testPhase2StageName) {
parameters += [
'testPhase2StageName': env.testPhase2StageName,
]
}
if (reuseBuild) {
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
}
echo "Trigger ${jobName} job, params: ${parameters}"
def status = triggerJob(jobName, parameters)
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
return status
}
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
{
stages = [
@ -889,78 +928,88 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
"x86_64-linux": {
script {
stage("Build") {
def parameters = getCommonParameters()
String globalVarsJson = writeJSON returnText: true, json: globalVars
parameters += [
'enableFailFast': enableFailFast,
def additionalParameters = [
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
'globalVars': globalVarsJson,
]
if (env.alternativeTRT) {
parameters += [
'alternativeTRT': env.alternativeTRT,
]
}
if (reuseBuild) {
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
}
echo "trigger x86_64 build job, params: ${parameters}"
def status = triggerJob("/LLM/helpers/Build-x86_64", parameters)
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
}
def testStageName = "[Test-x86_64] Run"
if (env.localJobCredentials) {
testStageName = "[Test-x86_64] Remote Run"
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
}
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
def singleGpuTestFailed = false
stage(testStageName) {
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
echo "x86_64 test job is skipped due to Jenkins configuration"
return
}
try {
parameters = getCommonParameters()
String testFilterJson = writeJSON returnText: true, json: testFilter
String globalVarsJson = writeJSON returnText: true, json: globalVars
parameters += [
'enableFailFast': enableFailFast,
def additionalParameters = [
'testFilter': testFilterJson,
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
'globalVars': globalVarsJson,
]
if (env.alternativeTRT) {
parameters += [
'alternativeTRT': env.alternativeTRT,
]
launchJob("L0_Test-x86_64-Single-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
} catch (InterruptedException e) {
throw e
} catch (Exception e) {
if (X86_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
catchError(
buildResult: 'SUCCESS',
stageResult: 'FAILURE') {
error "x86_64 test failed but ignored due to Jenkins configuration"
}
} else {
catchError(
buildResult: 'FAILURE',
stageResult: 'FAILURE') {
error "x86_64 single-GPU test failed"
}
singleGpuTestFailed = true
}
}
}
if (env.testPhase2StageName) {
parameters += [
'testPhase2StageName': env.testPhase2StageName,
]
def requireMultiGpuTesting = currentBuild.description?.contains("Require Multi-GPU Testing") ?: false
echo "requireMultiGpuTesting: ${requireMultiGpuTesting}"
if (!requireMultiGpuTesting) {
return
}
if (singleGpuTestFailed) {
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
echo "In the official post-merge pipeline, single-GPU test failed, whereas multi-GPU test is still kept running."
} else {
stage("[Test-x86_64-Multi-GPU] Blocked") {
catchError(
buildResult: 'FAILURE',
stageResult: 'FAILURE') {
error "This pipeline requires running multi-GPU test, but single-GPU test has failed."
}
}
return
}
}
echo "trigger x86_64 test job, params: ${parameters}"
testStageName = "[Test-x86_64-Multi-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
stage(testStageName) {
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
echo "x86_64 test job is skipped due to Jenkins configuration"
return
}
try {
def testFilterJson = writeJSON returnText: true, json: testFilter
def additionalParameters = [
'testFilter': testFilterJson,
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
]
def status = triggerJob(
"L0_Test-x86_64",
parameters,
)
launchJob("L0_Test-x86_64-Multi-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
} catch (InterruptedException e) {
throw e
} catch (Exception e) {
@ -991,38 +1040,11 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
return
}
def stageName = "Build"
stage(stageName) {
def parameters = getCommonParameters()
String globalVarsJson = writeJSON returnText: true, json: globalVars
parameters += [
'enableFailFast': enableFailFast,
stage("Build") {
def additionalParameters = [
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
'globalVars': globalVarsJson,
]
if (env.alternativeTrtSBSA) {
parameters += [
"alternativeTRT": env.alternativeTrtSBSA,
]
}
if (reuseBuild) {
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
}
echo "trigger SBSA build job, params: ${parameters}"
def status = triggerJob(
"/LLM/helpers/Build-SBSA",
parameters,
jenkinsUrl,
credentials,
)
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
}
stage(testStageName) {
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
@ -1030,40 +1052,14 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
return
}
try {
def parameters = getCommonParameters()
String testFilterJson = writeJSON returnText: true, json: testFilter
String globalVarsJson = writeJSON returnText: true, json: globalVars
parameters += [
'enableFailFast': enableFailFast,
def testFilterJson = writeJSON returnText: true, json: testFilter
def additionalParameters = [
'testFilter': testFilterJson,
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
'globalVars': globalVarsJson,
]
if (env.alternativeTrtSBSA) {
parameters += [
"alternativeTRT": env.alternativeTrtSBSA,
]
}
launchJob("L0_Test-SBSA", false, enableFailFast, globalVars, "SBSA", additionalParameters)
if (env.testPhase2StageName) {
parameters += [
'testPhase2StageName': env.testPhase2StageName,
]
}
echo "trigger SBSA test job, params: ${parameters}"
def status = triggerJob(
"L0_Test-SBSA",
parameters,
jenkinsUrl,
credentials,
)
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
} catch (InterruptedException e) {
throw e
} catch (Exception e) {
@ -1085,31 +1081,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
"Build-Docker-Images": {
script {
stage("[Build-Docker-Images] Remote Run") {
def parameters = getCommonParameters()
String globalVarsJson = writeJSON returnText: true, json: globalVars
def branch = env.gitlabBranch ? env.gitlabBranch : "main"
if (globalVars[GITHUB_PR_API_URL]) {
branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
}
parameters += [
'enableFailFast': enableFailFast,
def additionalParameters = [
'branch': branch,
'action': "push",
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
'globalVars': globalVarsJson,
]
echo "trigger BuildDockerImages job, params: ${parameters}"
def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
}
}
}
]
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
stages += dockerBuildJob
}

View File

@ -2274,7 +2274,7 @@ pipeline {
when {
expression {
// Only run the test list validation when necessary
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
}
}
steps
@ -2299,17 +2299,33 @@ pipeline {
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
}
if (singleGpuJobs.size() > 0) {
singleGpuJobs.failFast = params.enableFailFast
parallel singleGpuJobs
} else {
echo "Skip single-GPU testing. No test to run."
}
if (dgxJobs.size() > 0) {
stage(testPhase2StageName) {
if (env.JOB_NAME ==~ /.*Single-GPU.*/) {
echo "Only run single-GPU tests."
if (dgxJobs.size() > 0) {
if (globalVars[ACTION_INFO]['parents'].size() > 0) {
// We add a special marker to the parent job's description.
// This will be used to decide whether to run multi-GPU test stage.
def parentJob = globalVars[ACTION_INFO]['parents'][-2]
trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require Multi-GPU Testing====<br/>")
} else {
echo "No parent job found to add the special marker for executing multi-GPU test stage."
}
} else {
echo "Skip multi-GPU testing. No test to run."
}
if (singleGpuJobs.size() > 0) {
singleGpuJobs.failFast = params.enableFailFast
parallel singleGpuJobs
} else {
echo "Skip single-GPU testing. No test to run."
}
} else if (env.JOB_NAME ==~ /.*Multi-GPU.*/) {
echo "Only run multi-GPU tests."
if (dgxJobs.size() > 0) {
dgxJobs.failFast = params.enableFailFast
parallel dgxJobs
} else {
error "Skip multi-GPU testing. No test to run."
}
}
}