[TRTLLM-6893][infra] Disable the x86 / SBSA build stage when running BuildDockerImage (#6729)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
Zhanrui Sun 2025-09-04 19:20:15 +08:00 committed by GitHub
parent cce9556858
commit 0de3f83805
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 61 additions and 56 deletions

View File

@ -563,53 +563,55 @@ pipeline {
}
steps {
script {
container("python3") {
// Install wget
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
container("python3") {
// Install wget
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")
// Poll for build artifacts
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
def requiredFiles = [
"TensorRT-LLM-GH200.tar.gz",
"TensorRT-LLM.tar.gz"
]
def maxWaitMinutes = 60
def pollIntervalSeconds = 60
// Poll for build artifacts
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
def requiredFiles = [
"TensorRT-LLM-GH200.tar.gz",
"TensorRT-LLM.tar.gz"
]
def maxWaitMinutes = 60
def pollIntervalSeconds = 60
echo "Waiting for build artifacts..."
echo "Required files: ${requiredFiles}"
echo "Waiting for build artifacts..."
echo "Required files: ${requiredFiles}"
def startTime = System.currentTimeMillis()
def maxWaitMs = maxWaitMinutes * 60 * 1000
def startTime = System.currentTimeMillis()
def maxWaitMs = maxWaitMinutes * 60 * 1000
while ((System.currentTimeMillis() - startTime) < maxWaitMs) {
def missingFiles = []
while ((System.currentTimeMillis() - startTime) < maxWaitMs) {
def missingFiles = []
for (file in requiredFiles) {
def fileUrl = "${artifactBaseUrl}${file}"
def exitCode = sh(
script: "wget --spider --quiet --timeout=30 --tries=1 '${fileUrl}'",
returnStatus: true
)
for (file in requiredFiles) {
def fileUrl = "${artifactBaseUrl}${file}"
def exitCode = sh(
script: "wget --spider --quiet --timeout=30 --tries=1 '${fileUrl}'",
returnStatus: true
)
if (exitCode != 0) {
missingFiles.add(file)
if (exitCode != 0) {
missingFiles.add(file)
}
}
}
if (missingFiles.isEmpty()) {
echo "All build artifacts are ready!"
return
if (missingFiles.isEmpty()) {
echo "All build artifacts are ready!"
return
}
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
echo "Waiting... (${elapsedMinutes.intValue()} minutes elapsed)"
echo "Missing files: ${missingFiles}"
sleep(pollIntervalSeconds)
}
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
echo "Waiting... (${elapsedMinutes.intValue()} minutes elapsed)"
echo "Missing files: ${missingFiles}"
sleep(pollIntervalSeconds)
error "Timeout waiting for build artifacts (${elapsedMinutes.intValue()} minutes)"
}
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
error "Timeout waiting for build artifacts (${elapsedMinutes.intValue()} minutes)"
}
}
}
@ -622,28 +624,28 @@ pipeline {
}
steps {
script {
globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
String globalVarsJson = writeJSON returnText: true, json: globalVars
def parameters = getCommonParameters()
parameters += [
'enableFailFast': false,
'globalVars': globalVarsJson,
]
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
String globalVarsJson = writeJSON returnText: true, json: globalVars
def parameters = getCommonParameters()
parameters += [
'enableFailFast': false,
'globalVars': globalVarsJson,
]
echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"
echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"
def status = ""
def jobName = "/LLM/helpers/BuildDockerImageSanityTest"
def handle = build(
job: jobName,
parameters: trtllm_utils.toBuildParameters(parameters),
propagate: false,
)
echo "Triggered job: ${handle.absoluteUrl}"
status = handle.result
def status = ""
def jobName = "/LLM/helpers/BuildDockerImageSanityTest"
def handle = build(
job: jobName,
parameters: trtllm_utils.toBuildParameters(parameters),
propagate: false,
)
echo "Triggered job: ${handle.absoluteUrl}"
status = handle.result
if (status != "SUCCESS") {
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
if (status != "SUCCESS") {
error "Downstream job did not succeed"
}
}

View File

@ -1229,7 +1229,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
'branch': branch,
'action': "push",
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
'runSanityCheck': true,
'runSanityCheck': env.JOB_NAME ==~ /.*PostMerge.*/ ? true : false,
]
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
@ -1246,6 +1246,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
echo "Will run Build-Docker-Images job"
stages.remove("x86_64-linux")
stages.remove("SBSA-linux")
echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled."
}
parallelJobs = stages.collectEntries{key, value -> [key, {

View File

@ -2348,7 +2348,7 @@ def launchTestJobs(pipeline, testFilter)
parallelJobsFiltered += multiGpuJobs
}
if (testFilter[(AUTO_TRIGGER_TAG_LIST)] != null) {
if (testFilter[(AUTO_TRIGGER_TAG_LIST)]) {
echo "AUTO_TRIGGER_TAG_LIST mode is true. Auto trigger tags: ${testFilter[(AUTO_TRIGGER_TAG_LIST)].join(', ')}."
def autoTriggerTagStages = [:]
for (tag in testFilter[(AUTO_TRIGGER_TAG_LIST)]) {