mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[TRTLLM-6893][infra] Disable the x86 / SBSA build stage when running BuildDockerImage (#6729)
Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
cce9556858
commit
0de3f83805
@ -563,53 +563,55 @@ pipeline {
|
||||
}
|
||||
steps {
|
||||
script {
|
||||
container("python3") {
|
||||
// Install wget
|
||||
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")
|
||||
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
|
||||
container("python3") {
|
||||
// Install wget
|
||||
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get -y install wget")
|
||||
|
||||
// Poll for build artifacts
|
||||
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
|
||||
def requiredFiles = [
|
||||
"TensorRT-LLM-GH200.tar.gz",
|
||||
"TensorRT-LLM.tar.gz"
|
||||
]
|
||||
def maxWaitMinutes = 60
|
||||
def pollIntervalSeconds = 60
|
||||
// Poll for build artifacts
|
||||
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
|
||||
def requiredFiles = [
|
||||
"TensorRT-LLM-GH200.tar.gz",
|
||||
"TensorRT-LLM.tar.gz"
|
||||
]
|
||||
def maxWaitMinutes = 60
|
||||
def pollIntervalSeconds = 60
|
||||
|
||||
echo "Waiting for build artifacts..."
|
||||
echo "Required files: ${requiredFiles}"
|
||||
echo "Waiting for build artifacts..."
|
||||
echo "Required files: ${requiredFiles}"
|
||||
|
||||
def startTime = System.currentTimeMillis()
|
||||
def maxWaitMs = maxWaitMinutes * 60 * 1000
|
||||
def startTime = System.currentTimeMillis()
|
||||
def maxWaitMs = maxWaitMinutes * 60 * 1000
|
||||
|
||||
while ((System.currentTimeMillis() - startTime) < maxWaitMs) {
|
||||
def missingFiles = []
|
||||
while ((System.currentTimeMillis() - startTime) < maxWaitMs) {
|
||||
def missingFiles = []
|
||||
|
||||
for (file in requiredFiles) {
|
||||
def fileUrl = "${artifactBaseUrl}${file}"
|
||||
def exitCode = sh(
|
||||
script: "wget --spider --quiet --timeout=30 --tries=1 '${fileUrl}'",
|
||||
returnStatus: true
|
||||
)
|
||||
for (file in requiredFiles) {
|
||||
def fileUrl = "${artifactBaseUrl}${file}"
|
||||
def exitCode = sh(
|
||||
script: "wget --spider --quiet --timeout=30 --tries=1 '${fileUrl}'",
|
||||
returnStatus: true
|
||||
)
|
||||
|
||||
if (exitCode != 0) {
|
||||
missingFiles.add(file)
|
||||
if (exitCode != 0) {
|
||||
missingFiles.add(file)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (missingFiles.isEmpty()) {
|
||||
echo "All build artifacts are ready!"
|
||||
return
|
||||
if (missingFiles.isEmpty()) {
|
||||
echo "All build artifacts are ready!"
|
||||
return
|
||||
}
|
||||
|
||||
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
|
||||
echo "Waiting... (${elapsedMinutes.intValue()} minutes elapsed)"
|
||||
echo "Missing files: ${missingFiles}"
|
||||
sleep(pollIntervalSeconds)
|
||||
}
|
||||
|
||||
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
|
||||
echo "Waiting... (${elapsedMinutes.intValue()} minutes elapsed)"
|
||||
echo "Missing files: ${missingFiles}"
|
||||
sleep(pollIntervalSeconds)
|
||||
error "Timeout waiting for build artifacts (${elapsedMinutes.intValue()} minutes)"
|
||||
}
|
||||
|
||||
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
|
||||
error "Timeout waiting for build artifacts (${elapsedMinutes.intValue()} minutes)"
|
||||
}
|
||||
}
|
||||
}
|
||||
@ -622,28 +624,28 @@ pipeline {
|
||||
}
|
||||
steps {
|
||||
script {
|
||||
globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
def parameters = getCommonParameters()
|
||||
parameters += [
|
||||
'enableFailFast': false,
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
|
||||
globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
|
||||
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
||||
def parameters = getCommonParameters()
|
||||
parameters += [
|
||||
'enableFailFast': false,
|
||||
'globalVars': globalVarsJson,
|
||||
]
|
||||
|
||||
echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"
|
||||
echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"
|
||||
|
||||
def status = ""
|
||||
def jobName = "/LLM/helpers/BuildDockerImageSanityTest"
|
||||
def handle = build(
|
||||
job: jobName,
|
||||
parameters: trtllm_utils.toBuildParameters(parameters),
|
||||
propagate: false,
|
||||
)
|
||||
echo "Triggered job: ${handle.absoluteUrl}"
|
||||
status = handle.result
|
||||
def status = ""
|
||||
def jobName = "/LLM/helpers/BuildDockerImageSanityTest"
|
||||
def handle = build(
|
||||
job: jobName,
|
||||
parameters: trtllm_utils.toBuildParameters(parameters),
|
||||
propagate: false,
|
||||
)
|
||||
echo "Triggered job: ${handle.absoluteUrl}"
|
||||
status = handle.result
|
||||
|
||||
if (status != "SUCCESS") {
|
||||
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
|
||||
if (status != "SUCCESS") {
|
||||
error "Downstream job did not succeed"
|
||||
}
|
||||
}
|
||||
|
||||
@ -1229,7 +1229,7 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
'branch': branch,
|
||||
'action': "push",
|
||||
'triggerType': env.JOB_NAME ==~ /.*PostMerge.*/ ? "post-merge" : "pre-merge",
|
||||
'runSanityCheck': true,
|
||||
'runSanityCheck': env.JOB_NAME ==~ /.*PostMerge.*/ ? true : false,
|
||||
]
|
||||
|
||||
launchJob("/LLM/helpers/BuildDockerImages", false, enableFailFast, globalVars, "x86_64", additionalParameters)
|
||||
@ -1246,6 +1246,9 @@ def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
|
||||
testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
|
||||
testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
|
||||
echo "Will run Build-Docker-Images job"
|
||||
stages.remove("x86_64-linux")
|
||||
stages.remove("SBSA-linux")
|
||||
echo "Build-Docker-Images job is set explicitly. Both x86_64-linux and SBSA-linux sub-pipelines will be disabled."
|
||||
}
|
||||
|
||||
parallelJobs = stages.collectEntries{key, value -> [key, {
|
||||
|
||||
@ -2348,7 +2348,7 @@ def launchTestJobs(pipeline, testFilter)
|
||||
parallelJobsFiltered += multiGpuJobs
|
||||
}
|
||||
|
||||
if (testFilter[(AUTO_TRIGGER_TAG_LIST)] != null) {
|
||||
if (testFilter[(AUTO_TRIGGER_TAG_LIST)]) {
|
||||
echo "AUTO_TRIGGER_TAG_LIST mode is true. Auto trigger tags: ${testFilter[(AUTO_TRIGGER_TAG_LIST)].join(', ')}."
|
||||
def autoTriggerTagStages = [:]
|
||||
for (tag in testFilter[(AUTO_TRIGGER_TAG_LIST)]) {
|
||||
|
||||
Loading…
Reference in New Issue
Block a user