mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
infra: [TRTLLM-5879] Split single GPU test and multi GPU test into 2 pipelines (#5199)
Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com> Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com> Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
fc2347eaf5
commit
e42f5a9581
@ -878,6 +878,45 @@ def triggerJob(jobName, parameters, jenkinsUrl = "", credentials = "")
|
||||
return status
|
||||
}
|
||||
|
||||
/**
 * Assemble the parameter map for a downstream job, trigger it, and fail the
 * current step unless the downstream job reports "SUCCESS".
 *
 * @param jobName              name/path of the downstream job handed to triggerJob
 * @param reuseBuild           when truthy, used as the last path component of 'reuseArtifactPath'
 * @param enableFailFast       forwarded unchanged as the 'enableFailFast' parameter
 * @param globalVars           serialized to JSON and forwarded as the 'globalVars' parameter
 * @param platform             "x86_64" (default) or "SBSA"; decides which alternative-TRT
 *                             environment variable (if any) is forwarded as 'alternativeTRT'
 * @param additionalParameters extra entries merged on top of the common parameters
 * @return the status returned by triggerJob (only reached when it equals "SUCCESS")
 */
def launchJob(jobName, reuseBuild, enableFailFast, globalVars, platform="x86_64", additionalParameters = [:]) {
    def jobParams = getCommonParameters()
    String serializedGlobalVars = writeJSON returnText: true, json: globalVars

    // Later maps win on key collisions: common < fixed entries < caller-supplied extras.
    def fixedEntries = [
        'enableFailFast': enableFailFast,
        'globalVars': serializedGlobalVars,
    ]
    jobParams = jobParams + fixedEntries + additionalParameters

    // Choose the platform-specific TensorRT override; compared with == (not a map
    // lookup) so the match does not depend on the concrete string type of `platform`.
    def alternativeTrtValue = null
    if (platform == "x86_64") {
        alternativeTrtValue = env.alternativeTRT
    } else if (platform == "SBSA") {
        alternativeTrtValue = env.alternativeTrtSBSA
    }
    if (alternativeTrtValue) {
        jobParams['alternativeTRT'] = alternativeTrtValue
    }

    def phase2StageName = env.testPhase2StageName
    if (phase2StageName) {
        jobParams['testPhase2StageName'] = phase2StageName
    }

    if (reuseBuild) {
        jobParams['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
    }

    echo "Trigger ${jobName} job, params: ${jobParams}"

    def downstreamStatus = triggerJob(jobName, jobParams)
    if (downstreamStatus == "SUCCESS") {
        return downstreamStatus
    }
    error "Downstream job did not succeed"
    return downstreamStatus
}
|
||||
|
||||
def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
{
|
||||
stages = [
|
||||
@ -889,78 +928,88 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
"x86_64-linux": {
|
||||
script {
|
||||
stage("Build") {
|
||||
def parameters = getCommonParameters()
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
parameters += [
|
||||
'enableFailFast': enableFailFast,
|
||||
def additionalParameters = [
|
||||
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
|
||||
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
|
||||
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
|
||||
if (env.alternativeTRT) {
|
||||
parameters += [
|
||||
'alternativeTRT': env.alternativeTRT,
|
||||
]
|
||||
}
|
||||
|
||||
if (reuseBuild) {
|
||||
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
|
||||
}
|
||||
|
||||
echo "trigger x86_64 build job, params: ${parameters}"
|
||||
|
||||
def status = triggerJob("/LLM/helpers/Build-x86_64", parameters)
|
||||
if (status != "SUCCESS") {
|
||||
error "Downstream job did not succeed"
|
||||
}
|
||||
|
||||
}
|
||||
def testStageName = "[Test-x86_64] Run"
|
||||
if (env.localJobCredentials) {
|
||||
testStageName = "[Test-x86_64] Remote Run"
|
||||
launchJob("/LLM/helpers/Build-x86_64", reuseBuild, enableFailFast, globalVars, "x86_64", additionalParameters)
|
||||
}
|
||||
def testStageName = "[Test-x86_64-Single-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
|
||||
def singleGpuTestFailed = false
|
||||
stage(testStageName) {
|
||||
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
|
||||
echo "x86_64 test job is skipped due to Jenkins configuration"
|
||||
return
|
||||
}
|
||||
try {
|
||||
parameters = getCommonParameters()
|
||||
String testFilterJson = writeJSON returnText: true, json: testFilter
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
parameters += [
|
||||
'enableFailFast': enableFailFast,
|
||||
def additionalParameters = [
|
||||
'testFilter': testFilterJson,
|
||||
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
|
||||
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
|
||||
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
|
||||
if (env.alternativeTRT) {
|
||||
parameters += [
|
||||
'alternativeTRT': env.alternativeTRT,
|
||||
]
|
||||
launchJob("L0_Test-x86_64-Single-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
|
||||
} catch (InterruptedException e) {
|
||||
throw e
|
||||
} catch (Exception e) {
|
||||
if (X86_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
|
||||
catchError(
|
||||
buildResult: 'SUCCESS',
|
||||
stageResult: 'FAILURE') {
|
||||
error "x86_64 test failed but ignored due to Jenkins configuration"
|
||||
}
|
||||
} else {
|
||||
catchError(
|
||||
buildResult: 'FAILURE',
|
||||
stageResult: 'FAILURE') {
|
||||
error "x86_64 single-GPU test failed"
|
||||
}
|
||||
singleGpuTestFailed = true
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (env.testPhase2StageName) {
|
||||
parameters += [
|
||||
'testPhase2StageName': env.testPhase2StageName,
|
||||
]
|
||||
def requireMultiGpuTesting = currentBuild.description?.contains("Require Multi-GPU Testing") ?: false
|
||||
echo "requireMultiGpuTesting: ${requireMultiGpuTesting}"
|
||||
if (!requireMultiGpuTesting) {
|
||||
return
|
||||
}
|
||||
|
||||
if (singleGpuTestFailed) {
|
||||
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
|
||||
echo "In the official post-merge pipeline, single-GPU test failed, whereas multi-GPU test is still kept running."
|
||||
} else {
|
||||
stage("[Test-x86_64-Multi-GPU] Blocked") {
|
||||
catchError(
|
||||
buildResult: 'FAILURE',
|
||||
stageResult: 'FAILURE') {
|
||||
error "This pipeline requires running multi-GPU test, but single-GPU test has failed."
|
||||
}
|
||||
}
|
||||
return
|
||||
}
|
||||
}
|
||||
|
||||
echo "trigger x86_64 test job, params: ${parameters}"
|
||||
testStageName = "[Test-x86_64-Multi-GPU] ${env.localJobCredentials ? "Remote Run" : "Run"}"
|
||||
stage(testStageName) {
|
||||
if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
|
||||
echo "x86_64 test job is skipped due to Jenkins configuration"
|
||||
return
|
||||
}
|
||||
try {
|
||||
def testFilterJson = writeJSON returnText: true, json: testFilter
|
||||
def additionalParameters = [
|
||||
'testFilter': testFilterJson,
|
||||
'dockerImage': globalVars["LLM_DOCKER_IMAGE"],
|
||||
'wheelDockerImagePy310': globalVars["LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE"],
|
||||
'wheelDockerImagePy312': globalVars["LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE"],
|
||||
]
|
||||
|
||||
def status = triggerJob(
|
||||
"L0_Test-x86_64",
|
||||
parameters,
|
||||
)
|
||||
launchJob("L0_Test-x86_64-Multi-GPU", false, enableFailFast, globalVars, "x86_64", additionalParameters)
|
||||
|
||||
if (status != "SUCCESS") {
|
||||
error "Downstream job did not succeed"
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw e
|
||||
} catch (Exception e) {
|
||||
@ -991,38 +1040,11 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
return
|
||||
}
|
||||
|
||||
def stageName = "Build"
|
||||
stage(stageName) {
|
||||
def parameters = getCommonParameters()
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
parameters += [
|
||||
'enableFailFast': enableFailFast,
|
||||
stage("Build") {
|
||||
def additionalParameters = [
|
||||
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
|
||||
if (env.alternativeTrtSBSA) {
|
||||
parameters += [
|
||||
"alternativeTRT": env.alternativeTrtSBSA,
|
||||
]
|
||||
}
|
||||
|
||||
if (reuseBuild) {
|
||||
parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
|
||||
}
|
||||
|
||||
echo "trigger SBSA build job, params: ${parameters}"
|
||||
|
||||
def status = triggerJob(
|
||||
"/LLM/helpers/Build-SBSA",
|
||||
parameters,
|
||||
jenkinsUrl,
|
||||
credentials,
|
||||
)
|
||||
|
||||
if (status != "SUCCESS") {
|
||||
error "Downstream job did not succeed"
|
||||
}
|
||||
launchJob("/LLM/helpers/Build-SBSA", reuseBuild, enableFailFast, globalVars, "SBSA", additionalParameters)
|
||||
}
|
||||
stage(testStageName) {
|
||||
if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
|
||||
@ -1030,40 +1052,14 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
return
|
||||
}
|
||||
try {
|
||||
def parameters = getCommonParameters()
|
||||
String testFilterJson = writeJSON returnText: true, json: testFilter
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
parameters += [
|
||||
'enableFailFast': enableFailFast,
|
||||
def testFilterJson = writeJSON returnText: true, json: testFilter
|
||||
def additionalParameters = [
|
||||
'testFilter': testFilterJson,
|
||||
"dockerImage": globalVars["LLM_SBSA_DOCKER_IMAGE"],
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
|
||||
if (env.alternativeTrtSBSA) {
|
||||
parameters += [
|
||||
"alternativeTRT": env.alternativeTrtSBSA,
|
||||
]
|
||||
}
|
||||
launchJob("L0_Test-SBSA", false, enableFailFast, globalVars, "SBSA", additionalParameters)
|
||||
|
||||
if (env.testPhase2StageName) {
|
||||
parameters += [
|
||||
'testPhase2StageName': env.testPhase2StageName,
|
||||
]
|
||||
}
|
||||
|
||||
echo "trigger SBSA test job, params: ${parameters}"
|
||||
|
||||
def status = triggerJob(
|
||||
"L0_Test-SBSA",
|
||||
parameters,
|
||||
jenkinsUrl,
|
||||
credentials,
|
||||
)
|
||||
|
||||
if (status != "SUCCESS") {
|
||||
error "Downstream job did not succeed"
|
||||
}
|
||||
} catch (InterruptedException e) {
|
||||
throw e
|
||||
} catch (Exception e) {
|
||||
@ -1085,31 +1081,23 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
"Build-Docker-Images": {
|
||||
script {
|
||||
stage("[Build-Docker-Images] Remote Run") {
|
||||
def parameters = getCommonParameters()
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
def branch = env.gitlabBranch ? env.gitlabBranch : "main"
|
||||
if (globalVars[GITHUB_PR_API_URL]) {
|
||||
branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
|
||||
}
|
||||
|
||||
parameters += [
|
||||
'enableFailFast': enableFailFast,
|
||||
def additionalParameters = [
|
||||
'branch': branch,
|
||||
'action': "push",
|
||||
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
|
||||
echo "trigger BuildDockerImages job, params: ${parameters}"
|
||||
|
||||
def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
|
||||
if (status != "SUCCESS") {
|
||||
error "Downstream job did not succeed"
|
||||
}
|
||||
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
|
||||
if (env.JOB_NAME ==~ /.*PostMerge.*/) {
|
||||
stages += dockerBuildJob
|
||||
}
|
||||
|
||||
@ -2274,7 +2274,7 @@ pipeline {
|
||||
when {
|
||||
expression {
|
||||
// Only run the test list validation when necessary
|
||||
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false
|
||||
env.targetArch == X86_64_TRIPLE && testFilter[ONLY_DOCS_FILE_CHANGED] == false && !(env.JOB_NAME ==~ /.*Multi-GPU.*/)
|
||||
}
|
||||
}
|
||||
steps
|
||||
@ -2299,17 +2299,33 @@ pipeline {
|
||||
dgxJobs = parallelJobs.findAll{dgxSigns.any{sign -> it.key.contains(sign)}}
|
||||
}
|
||||
|
||||
if (singleGpuJobs.size() > 0) {
|
||||
singleGpuJobs.failFast = params.enableFailFast
|
||||
parallel singleGpuJobs
|
||||
} else {
|
||||
echo "Skip single-GPU testing. No test to run."
|
||||
}
|
||||
|
||||
if (dgxJobs.size() > 0) {
|
||||
stage(testPhase2StageName) {
|
||||
if (env.JOB_NAME ==~ /.*Single-GPU.*/) {
|
||||
echo "Only run single-GPU tests."
|
||||
if (dgxJobs.size() > 0) {
|
||||
if (globalVars[ACTION_INFO]['parents'].size() > 0) {
|
||||
// We add a special marker to the parent job's description.
|
||||
// This will be used to decide whether to run multi-GPU test stage.
|
||||
def parentJob = globalVars[ACTION_INFO]['parents'][-2]
|
||||
trtllm_utils.appendBuildDescription(this, parentJob['name'], parentJob['build_number'], "====Require Multi-GPU Testing====<br/>")
|
||||
} else {
|
||||
echo "No parent job found to add the special marker for executing multi-GPU test stage."
|
||||
}
|
||||
} else {
|
||||
echo "Skip multi-GPU testing. No test to run."
|
||||
}
|
||||
if (singleGpuJobs.size() > 0) {
|
||||
singleGpuJobs.failFast = params.enableFailFast
|
||||
parallel singleGpuJobs
|
||||
} else {
|
||||
echo "Skip single-GPU testing. No test to run."
|
||||
}
|
||||
} else if (env.JOB_NAME ==~ /.*Multi-GPU.*/) {
|
||||
echo "Only run multi-GPU tests."
|
||||
if (dgxJobs.size() > 0) {
|
||||
dgxJobs.failFast = params.enableFailFast
|
||||
parallel dgxJobs
|
||||
} else {
|
||||
error "Skip multi-GPU testing. No test to run."
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user