[None][infra] Add fallback when get wheel from build stage is fail (#9290)

Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
This commit is contained in:
Zhanrui Sun 2025-11-21 13:26:20 +08:00 committed by GitHub
parent e2a372a3b1
commit 5138ef3227
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -333,7 +333,7 @@ def buildImage(config, imageKeyToTag)
}
}
args += prepareWheelFromBuildStage(dockerfileStage, arch)
def buildWheelArgs = prepareWheelFromBuildStage(dockerfileStage, arch)
// Avoid the frequency of OOM issue when building the wheel
if (target == "trtllm") {
if (arch == "x86_64") {
@ -346,15 +346,34 @@ def buildImage(config, imageKeyToTag)
sh "env | sort"
def randomSleep = (Math.random() * 600 + 600).toInteger()
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
trtllm_utils.llmExecStepWithRetry(this, script: """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
TRITON_IMAGE=${TRITON_IMAGE} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_WITH_TAG=${imageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
try {
trtllm_utils.llmExecStepWithRetry(this, script: """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
TRITON_IMAGE=${TRITON_IMAGE} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_WITH_TAG=${imageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
} catch (InterruptedException ex) {
throw ex
} catch (Exception ex) {
if (buildWheelArgs.trim().isEmpty()) {
throw ex
}
echo "Build failed with wheel arguments, retrying without them"
buildWheelArgs = ""
trtllm_utils.llmExecStepWithRetry(this, script: """
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
BASE_IMAGE=${BASE_IMAGE} \
TRITON_IMAGE=${TRITON_IMAGE} \
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_WITH_TAG=${imageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
""", sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
}
if (target == "ngc-release") {
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
}
@ -369,7 +388,7 @@ def buildImage(config, imageKeyToTag)
TORCH_INSTALL_TYPE=${torchInstallType} \
IMAGE_WITH_TAG=${customImageWithTag} \
STAGE=${dockerfileStage} \
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
"""
}
}