mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[None][infra] Add fallback when get wheel from build stage is fail (#9290)
Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
This commit is contained in:
parent
e2a372a3b1
commit
5138ef3227
@ -333,7 +333,7 @@ def buildImage(config, imageKeyToTag)
|
||||
}
|
||||
}
|
||||
|
||||
args += prepareWheelFromBuildStage(dockerfileStage, arch)
|
||||
def buildWheelArgs = prepareWheelFromBuildStage(dockerfileStage, arch)
|
||||
// Avoid the frequency of OOM issue when building the wheel
|
||||
if (target == "trtllm") {
|
||||
if (arch == "x86_64") {
|
||||
@ -346,15 +346,34 @@ def buildImage(config, imageKeyToTag)
|
||||
sh "env | sort"
|
||||
def randomSleep = (Math.random() * 600 + 600).toInteger()
|
||||
trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
|
||||
trtllm_utils.llmExecStepWithRetry(this, script: """
|
||||
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
|
||||
BASE_IMAGE=${BASE_IMAGE} \
|
||||
TRITON_IMAGE=${TRITON_IMAGE} \
|
||||
TORCH_INSTALL_TYPE=${torchInstallType} \
|
||||
IMAGE_WITH_TAG=${imageWithTag} \
|
||||
STAGE=${dockerfileStage} \
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
|
||||
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
|
||||
try {
|
||||
trtllm_utils.llmExecStepWithRetry(this, script: """
|
||||
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
|
||||
BASE_IMAGE=${BASE_IMAGE} \
|
||||
TRITON_IMAGE=${TRITON_IMAGE} \
|
||||
TORCH_INSTALL_TYPE=${torchInstallType} \
|
||||
IMAGE_WITH_TAG=${imageWithTag} \
|
||||
STAGE=${dockerfileStage} \
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
|
||||
""", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
|
||||
} catch (InterruptedException ex) {
|
||||
throw ex
|
||||
} catch (Exception ex) {
|
||||
if (buildWheelArgs.trim().isEmpty()) {
|
||||
throw ex
|
||||
}
|
||||
echo "Build failed with wheel arguments, retrying without them"
|
||||
buildWheelArgs = ""
|
||||
trtllm_utils.llmExecStepWithRetry(this, script: """
|
||||
cd ${LLM_ROOT} && make -C docker ${target}_${action} \
|
||||
BASE_IMAGE=${BASE_IMAGE} \
|
||||
TRITON_IMAGE=${TRITON_IMAGE} \
|
||||
TORCH_INSTALL_TYPE=${torchInstallType} \
|
||||
IMAGE_WITH_TAG=${imageWithTag} \
|
||||
STAGE=${dockerfileStage} \
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
|
||||
""", sleepInSecs: randomSleep, numRetries: 2, shortCommondRunTimeMax: 7200)
|
||||
}
|
||||
if (target == "ngc-release") {
|
||||
imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
|
||||
}
|
||||
@ -369,7 +388,7 @@ def buildImage(config, imageKeyToTag)
|
||||
TORCH_INSTALL_TYPE=${torchInstallType} \
|
||||
IMAGE_WITH_TAG=${customImageWithTag} \
|
||||
STAGE=${dockerfileStage} \
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
|
||||
BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
|
||||
"""
|
||||
}
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user