@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _

import java.lang.Exception
import groovy.transform.Field

// Docker image registry
IMAGE_NAME = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging"
NGC_IMAGE_NAME = "${IMAGE_NAME}/ngc"

// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}

ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"

LLM_ROOT = "llm"

LLM_BRANCH = env.gitlabBranch ?: params.branch
LLM_BRANCH_TAG = LLM_BRANCH.replaceAll('/', '_')

LLM_COMMIT_OR_BRANCH = env.gitlabCommit ?: LLM_BRANCH

LLM_SHORT_COMMIT = env.gitlabCommit ? env.gitlabCommit.substring(0, 7) : "undefined"

LLM_DEFAULT_TAG = env.defaultTag ?: "${LLM_SHORT_COMMIT}-${LLM_BRANCH_TAG}-${BUILD_NUMBER}"

RUN_SANITY_CHECK = params.runSanityCheck ?: false
TRIGGER_TYPE = env.triggerType ?: "manual"

ENABLE_USE_WHEEL_FROM_BUILD_STAGE = params.useWheelFromBuildStage ?: false

WAIT_TIME_FOR_BUILD_STAGE = 60 // minutes

BUILD_JOBS = "32"
BUILD_JOBS_RELEASE_X86_64 = "32"
BUILD_JOBS_RELEASE_SBSA = "32"

CCACHE_DIR="/mnt/sw-tensorrt-pvc/scratch.trt_ccache/llm_ccache"

@Field
def GITHUB_PR_API_URL = "github_pr_api_url"
@Field
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
@Field
def ACTION_INFO = "action_info"
@Field
def IMAGE_KEY_TO_TAG = "image_key_to_tag"
def globalVars = [
    (GITHUB_PR_API_URL): null,
    (CACHED_CHANGED_FILE_LIST): null,
    (ACTION_INFO): null,
    (IMAGE_KEY_TO_TAG): [:],
]

@Field
def imageKeyToTag = [:]

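// Builds the Kubernetes pod spec used by this job. "agent" pods run a lightweight
// python3 container for orchestration steps, while "build" pods run a privileged
// docker:dind container for the image builds. `arch` selects amd64 or arm64 nodes, and
// `build_wheel` pins arm64 wheel builds to a fixed host list (see the ucxx note below).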
def createKubernetesPodConfig(type, arch = "amd64", build_wheel = false)
{
    def targetCould = "kubernetes-cpu"
    def containerConfig = ""
    def selectors = """
                  nodeSelector:
                    nvidia.com/node_type: builder
                    kubernetes.io/os: linux
                    kubernetes.io/arch: ${arch}"""

    if (build_wheel && arch == "arm64") {
        // For aarch64, we need to use hostname to fix the ucxx issue when building wheels
        selectors += """
                  affinity:
                    nodeAffinity:
                      requiredDuringSchedulingIgnoredDuringExecution:
                        nodeSelectorTerms:
                        - matchExpressions:
                          - key: "kubernetes.io/hostname"
                            operator: In
                            values:
                            - "rl300-0008.ipp2u1.colossus"
                            - "rl300-0014.ipp2u1.colossus"
                            - "rl300-0023.ipp2u1.colossus"
                            - "rl300-0024.ipp2u1.colossus"
                            - "rl300-0030.ipp2u1.colossus"
                            - "rl300-0040.ipp2u1.colossus"
                            - "rl300-0041.ipp2u1.colossus"
                            - "rl300-0042.ipp2u1.colossus"
                            - "rl300-0043.ipp2u1.colossus"
                            - "rl300-0044.ipp2u1.colossus"
                            - "rl300-0045.ipp2u1.colossus"
                            - "rl300-0046.ipp2u1.colossus"
                            - "rl300-0047.ipp2u1.colossus"
        """
    }

    if (arch == "amd64") {
        // For x86_64, we block some nodes to avoid unstable network access.
        selectors += """
                  affinity:
                    nodeAffinity:
                      requiredDuringSchedulingIgnoredDuringExecution:
                        nodeSelectorTerms:
                        - matchExpressions:
                          - key: "kubernetes.io/hostname"
                            operator: NotIn
                            values:
                            - "sc-ipp-blossom-prod-k8w-105"
                            - "sc-ipp-blossom-prod-k8w-114"
                            - "sc-ipp-blossom-prod-k8w-115"
                            - "sc-ipp-blossom-prod-k8w-121"
                            - "sc-ipp-blossom-prod-k8w-123"
                            - "sc-ipp-blossom-prod-k8w-124"
        """
    }

    def archSuffix = arch == "arm64" ? "arm" : "amd"
    def jnlpImage = "urm.nvidia.com/sw-ipp-blossom-sre-docker-local/lambda/custom_jnlp_images_${archSuffix}_linux:jdk17"

    switch(type)
    {
    case "agent":
        containerConfig = """
                  - name: python3
                    image: urm.nvidia.com/docker/python:3.12-slim
                    command: ['cat']
                    tty: true
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        break
    case "build":
        // Use a customized docker:dind image with essential dependencies
        containerConfig = """
                  - name: docker
                    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:202505221445_docker_dind_withbash
                    tty: true
                    resources:
                      requests:
                        cpu: 16
                        memory: 72Gi
                        ephemeral-storage: 200Gi
                      limits:
                        cpu: 16
                        memory: 256Gi
                        ephemeral-storage: 200Gi
                    imagePullPolicy: Always
                    securityContext:
                      privileged: true
                      capabilities:
                        add:
                        - SYS_ADMIN"""
        break
    }
    def pvcVolume = """
                - name: sw-tensorrt-pvc
                  persistentVolumeClaim:
                    claimName: sw-tensorrt-pvc
    """
    if (arch == "arm64") {
        // PVC mount isn't supported on the aarch64 platform. Use NFS as a workaround (WAR).
        pvcVolume = """
                - name: sw-tensorrt-pvc
                  nfs:
                    server: 10.117.145.13
                    path: /vol/scratch1/scratch.svc_tensorrt_blossom
    """
    }
    def nodeLabelPrefix = "cpu"
    def jobName = "llm-build-images"
    def buildID = env.BUILD_ID
    def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}")
    def podConfig = [
        cloud: targetCould,
        namespace: "sw-tensorrt",
        label: nodeLabel,
        yaml: """
            apiVersion: v1
            kind: Pod
            spec:
                qosClass: Guaranteed
                ${selectors}
                containers:
                  ${containerConfig}
                  - name: jnlp
                    image: ${jnlpImage}
                    args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    volumeMounts:
                    - name: sw-tensorrt-pvc
                      mountPath: "/mnt/sw-tensorrt-pvc"
                      readOnly: false
                volumes:
                ${pvcVolume}
        """.stripIndent(),
    ]

    return podConfig
}

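// Returns extra make variables that let the "release" stage reuse a wheel already
// produced by the build stage (post-merge builds only). Returns an empty string whenever
// the feature is disabled, the trigger is not post-merge, or the arguments are
// incomplete, in which case the wheel is built inside the Docker build as usual.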
def prepareWheelFromBuildStage(dockerfileStage, arch) {
    if (!ENABLE_USE_WHEEL_FROM_BUILD_STAGE) {
        echo "useWheelFromBuildStage is false, skip preparing wheel from build stage"
        return ""
    }

    if (TRIGGER_TYPE != "post-merge") {
        echo "Trigger type is not post-merge, skip preparing wheel from build stage"
        return ""
    }

    if (!dockerfileStage || !arch) {
        echo "Error: dockerfileStage and arch are required parameters"
        return ""
    }

    if (dockerfileStage != "release") {
        echo "prepareWheelFromBuildStage: ${dockerfileStage} is not release"
        return ""
    }

    def wheelScript = 'scripts/get_wheel_from_package.py'
    def wheelArgs = "--arch ${arch} --timeout ${WAIT_TIME_FOR_BUILD_STAGE} --artifact_path " + env.uploadPath
    return " BUILD_WHEEL_SCRIPT=${wheelScript} BUILD_WHEEL_ARGS='${wheelArgs}'"
}

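// Builds (and optionally pushes) one Docker image variant inside a dind container:
// checks out the LLM source, logs in to the registries, resolves the base/Triton images
// from docker/Dockerfile.multi (rerouted to the internal mirror), builds the optional
// dependent "devel" image first, then runs `make -C docker <target>_<action>`. NGC image
// tags are recorded in imageKeyToTag for the downstream stages.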
def buildImage(config, imageKeyToTag)
{
    def target = config.target
    def action = config.action
    def torchInstallType = config.torchInstallType
    def args = config.args ?: ""
    def customTag = config.customTag
    def postTag = config.postTag
    def dependent = config.dependent
    def arch = config.arch == 'arm64' ? 'sbsa' : 'x86_64'
    def dockerfileStage = config.dockerfileStage

    def tag = "${arch}-${target}-torch_${torchInstallType}${postTag}-${LLM_DEFAULT_TAG}"

    def dependentTag = tag.replace("${arch}-${target}-", "${arch}-${dependent.target}-")

    def imageWithTag = "${IMAGE_NAME}/${dockerfileStage}:${tag}"
    def dependentImageWithTag = "${IMAGE_NAME}/${dependent.dockerfileStage}:${dependentTag}"
    def customImageWithTag = "${IMAGE_NAME}/${dockerfileStage}:${customTag}"

    if (target == "ngc-release" && TRIGGER_TYPE == "post-merge") {
        echo "Use NGC artifacts for post merge build"
        dependentImageWithTag = "${NGC_IMAGE_NAME}:${dependentTag}"
        imageWithTag = "${NGC_IMAGE_NAME}:${tag}"
        customImageWithTag = "${NGC_IMAGE_NAME}:${customTag}"
    }

    args += " GITHUB_MIRROR=https://urm.nvidia.com/artifactory/github-go-remote"

    stage (config.stageName) {
        // Step 1: Clone the TRT-LLM source code.
        // If using a forked repo, svc_tensorrt needs access to the forked repo.
        trtllm_utils.checkoutSource(LLM_REPO, LLM_COMMIT_OR_BRANCH, LLM_ROOT, true, true)
    }

    // Step 2: Build the images
    stage ("Install Package") {
        sh "pwd && ls -alh"
        sh "env | sort"
        sh "apk add make git"
        sh "git config --global --add safe.directory '*'"

        withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'USERNAME', passwordVariable: 'PASSWORD')]) {
            trtllm_utils.llmExecStepWithRetry(this, script: "docker login urm.nvidia.com -u ${USERNAME} -p ${PASSWORD}")
        }

        withCredentials([
            usernamePassword(
                credentialsId: "svc_tensorrt_gitlab_read_api_token",
                usernameVariable: 'USERNAME',
                passwordVariable: 'PASSWORD'
            ),
            string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
        ]) {
            trtllm_utils.llmExecStepWithRetry(this, script: "docker login ${DEFAULT_GIT_URL}:5005 -u ${USERNAME} -p ${PASSWORD}")
        }
    }
    def containerGenFailure = null
    try {
        def build_jobs = BUILD_JOBS
        // Resolve the base and Triton images from Dockerfile.multi; the Triton image is
        // pre-pulled with retries below to work around pull timeouts.
        def BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG BASE_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
        def TRITON_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_IMAGE=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
        def TRITON_BASE_TAG = sh(script: "cd ${LLM_ROOT} && grep '^ARG TRITON_BASE_TAG=' docker/Dockerfile.multi | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()

        if (target == "rockylinux8") {
            BASE_IMAGE = sh(script: "cd ${LLM_ROOT} && grep '^jenkins-rockylinux8_%: BASE_IMAGE =' docker/Makefile | grep -o '=.*' | tr -d '=\"'", returnStdout: true).trim()
        }

        // Replace the base image and triton image with the internal mirror
        BASE_IMAGE = BASE_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/")
        TRITON_IMAGE = TRITON_IMAGE.replace("nvcr.io/", "urm.nvidia.com/docker/")

        if (dependent) {
            stage ("make ${dependent.target}_${action} (${arch})") {
                def randomSleep = (Math.random() * 600 + 600).toInteger()
                trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
                trtllm_utils.llmExecStepWithRetry(this, script: """
                    cd ${LLM_ROOT} && make -C docker ${dependent.target}_${action} \
                        BASE_IMAGE=${BASE_IMAGE} \
                        TRITON_IMAGE=${TRITON_IMAGE} \
                        TORCH_INSTALL_TYPE=${torchInstallType} \
                        IMAGE_WITH_TAG=${dependentImageWithTag} \
                        STAGE=${dependent.dockerfileStage} \
                        BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args}
                """, sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
                args += " DEVEL_IMAGE=${dependentImageWithTag}"
                if (target == "ngc-release") {
                    imageKeyToTag["NGC Devel Image ${config.arch}"] = dependentImageWithTag
                }
            }
        }

        def buildWheelArgs = prepareWheelFromBuildStage(dockerfileStage, arch)
        // Reduce the frequency of OOM issues when building the wheel
        if (target == "trtllm") {
            if (arch == "x86_64") {
                build_jobs = BUILD_JOBS_RELEASE_X86_64
            } else {
                build_jobs = BUILD_JOBS_RELEASE_SBSA
            }
        }
        stage ("make ${target}_${action} (${arch})") {
            sh "env | sort"
            def randomSleep = (Math.random() * 600 + 600).toInteger()
            trtllm_utils.llmExecStepWithRetry(this, script: "docker pull ${TRITON_IMAGE}:${TRITON_BASE_TAG}", sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
            try {
                trtllm_utils.llmExecStepWithRetry(this, script: """
                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
                        BASE_IMAGE=${BASE_IMAGE} \
                        TRITON_IMAGE=${TRITON_IMAGE} \
                        TORCH_INSTALL_TYPE=${torchInstallType} \
                        IMAGE_WITH_TAG=${imageWithTag} \
                        STAGE=${dockerfileStage} \
                        BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
                """, sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
            } catch (InterruptedException ex) {
                throw ex
            } catch (Exception ex) {
                if (buildWheelArgs.trim().isEmpty()) {
                    throw ex
                }
                echo "Build failed with wheel arguments, retrying without them"
                buildWheelArgs = ""
                trtllm_utils.llmExecStepWithRetry(this, script: """
                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
                        BASE_IMAGE=${BASE_IMAGE} \
                        TRITON_IMAGE=${TRITON_IMAGE} \
                        TORCH_INSTALL_TYPE=${torchInstallType} \
                        IMAGE_WITH_TAG=${imageWithTag} \
                        STAGE=${dockerfileStage} \
                        BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
                """, sleepInSecs: randomSleep, numRetries: 6, shortCommondRunTimeMax: 7200)
            }
            if (target == "ngc-release") {
                imageKeyToTag["NGC Release Image ${config.arch}"] = imageWithTag
            }
        }

        if (customTag) {
            stage ("Custom Tag: ${customTag} (${arch})") {
                sh """
                    cd ${LLM_ROOT} && make -C docker ${target}_${action} \
                        BASE_IMAGE=${BASE_IMAGE} \
                        TRITON_IMAGE=${TRITON_IMAGE} \
                        TORCH_INSTALL_TYPE=${torchInstallType} \
                        IMAGE_WITH_TAG=${customImageWithTag} \
                        STAGE=${dockerfileStage} \
                        BUILD_WHEEL_OPTS='-j ${build_jobs}' ${args} ${buildWheelArgs}
                """
            }
        }
    } catch (Exception ex) {
        containerGenFailure = ex
    } finally {
        stage ("Docker Logout") {
            withCredentials([string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')]) {
                sh "docker logout urm.nvidia.com"
                sh "docker logout ${DEFAULT_GIT_URL}:5005"
            }
        }
        if (containerGenFailure != null) {
            throw containerGenFailure
        }
    }
}

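// Assembles the per-image build configurations (internal release, CI, RockyLinux8, and
// NGC devel/release variants for x86_64 and SBSA), fills unset fields from the defaults,
// attaches a pod config to each, and runs all image builds in parallel.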
def launchBuildJobs(pipeline, globalVars, imageKeyToTag) {
    def defaultBuildConfig = [
        target: "tritondevel",
        action: params.action,
        customTag: "",
        postTag: "",
        args: "",
        torchInstallType: "skip",
        arch: "amd64",
        build_wheel: false,
        dependent: [:],
        dockerfileStage: "tritondevel",
    ]

    def release_action = params.action
    def buildConfigs = [
        "Build Internal release (x86_64 trtllm)": [
            target: "trtllm",
            action: release_action,
            customTag: LLM_BRANCH_TAG + "-x86_64",
            build_wheel: true,
            dockerfileStage: "release",
        ],
        "Build Internal release (SBSA trtllm)": [
            target: "trtllm",
            action: release_action,
            customTag: LLM_BRANCH_TAG + "-sbsa",
            build_wheel: true,
            arch: "arm64",
            dockerfileStage: "release",
        ],
        "Build CI Image (x86_64 tritondevel)": [:],
        "Build CI Image (SBSA tritondevel)": [
            arch: "arm64",
        ],
        "Build CI Image (RockyLinux8 Python310)": [
            target: "rockylinux8",
            args: "PYTHON_VERSION=3.10.12",
            postTag: "-py310",
        ],
        "Build CI Image (RockyLinux8 Python312)": [
            target: "rockylinux8",
            args: "PYTHON_VERSION=3.12.3",
            postTag: "-py312",
        ],
        "Build NGC devel And release (x86_64)": [
            target: "ngc-release",
            action: release_action,
            args: "DOCKER_BUILD_OPTS='--load --platform linux/amd64'",
            build_wheel: true,
            dependent: [
                target: "ngc-devel",
                dockerfileStage: "devel",
            ],
            dockerfileStage: "release",
        ],
        "Build NGC devel And release (SBSA)": [
            target: "ngc-release",
            action: release_action,
            args: "DOCKER_BUILD_OPTS='--load --platform linux/arm64'",
            arch: "arm64",
            build_wheel: true,
            dependent: [
                target: "ngc-devel",
                dockerfileStage: "devel",
            ],
            dockerfileStage: "release",
        ],
    ]
    // Fill in any unset fields of each build config with the default values
    buildConfigs.each { key, config ->
        defaultBuildConfig.each { defaultKey, defaultValue ->
            if (!(defaultKey in config)) {
                config[defaultKey] = defaultValue
            }
        }
        config.podConfig = createKubernetesPodConfig("build", config.arch, config.build_wheel)
    }
    echo "Build configs:"
    println buildConfigs

    def buildJobs = buildConfigs.collectEntries { key, config ->
        [key, {
            script {
                stage(key) {
                    config.stageName = key
                    try {
                        trtllm_utils.launchKubernetesPod(pipeline, config.podConfig, "docker") {
                            buildImage(config, imageKeyToTag)
                        }
                    } catch (InterruptedException e) {
                        throw e
                    } catch (Exception e) {
                        catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
                            echo "Build ${key} failed."
                            throw e
                        }
                    }
                }
            }
        }]
    }

    echo "enableFailFast is: ${params.enableFailFast}, but we currently don't use it due to random ucxx issue"
    // pipeline.failFast = params.enableFailFast
    pipeline.parallel buildJobs

}

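// Common parameters forwarded to downstream jobs (e.g. the sanity-test job triggered below).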
def getCommonParameters()
{
    return [
        'gitlabSourceRepoHttpUrl': LLM_REPO,
        'gitlabCommit': env.gitlabCommit,
        'artifactPath': ARTIFACT_PATH,
        'uploadPath': UPLOAD_PATH,
    ]
}

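// Declarative pipeline: set up the environment, build all image variants in parallel,
// publish imageKeyToTag.json, and, when requested, wait for the wheel artifacts, run the
// NGC image sanity check, and register the NGC images for security scanning. The uploaded
// map holds entries such as "NGC Release Image amd64" -> "<registry>/<stage>:<tag>"
// (illustrative; the exact keys and values come from buildImage() above).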
pipeline {
    agent {
        kubernetes createKubernetesPodConfig("agent")
    }

    parameters {
        string(
            name: "branch",
            defaultValue: "main",
            description: "Branch to launch job."
        )
        choice(
            name: "action",
            choices: ["build", "push"],
            description: "Docker image generation action. build: only perform image build step; push: build docker image and push it to artifacts"
        )
    }
    options {
        // Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
        // Some stages, such as results analysis, do not need to check out the source code.
        skipDefaultCheckout()
        // Timestamps make it easier to analyze the time spent in each step/test.
        timestamps()
        timeout(time: 24, unit: 'HOURS')
    }
    environment {
        CCACHE_DIR="${CCACHE_DIR}"
        PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
    }
    stages {
        stage("Setup Environment") {
            steps {
                script {
                    echo "branch is: ${LLM_BRANCH}"
                    echo "env.gitlabBranch is: ${env.gitlabBranch}"
                    echo "params.branch is: ${params.branch}"
                    echo "params.action is: ${params.action}"
                    echo "env.defaultTag is: ${env.defaultTag}"
                    echo "env.gitlabCommit is: ${env.gitlabCommit}"
                    echo "LLM_REPO is: ${LLM_REPO}"
                    echo "env.globalVars is: ${env.globalVars}"
                    sh "env | sort"
                    globalVars = trtllm_utils.updateMapWithJson(this, globalVars, env.globalVars, "globalVars")
                    globalVars[ACTION_INFO] = trtllm_utils.setupPipelineDescription(this, globalVars[ACTION_INFO])
                }
            }
        }
        stage("Build") {
            steps {
                script {
                    launchBuildJobs(this, globalVars, imageKeyToTag)
                }
            }
        }
        stage("Upload Artifact") {
            steps {
                script {
                    String imageKeyToTagJson = writeJSON returnText: true, json: imageKeyToTag
                    echo "imageKeyToTag is: ${imageKeyToTagJson}"
                    writeFile file: "imageKeyToTag.json", text: imageKeyToTagJson
                    archiveArtifacts artifacts: 'imageKeyToTag.json', fingerprint: true
                    trtllm_utils.uploadArtifacts("imageKeyToTag.json", "${UPLOAD_PATH}/")
                }
            }
        }
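        // The two sanity stages below only run when runSanityCheck is set. This one polls
        // Artifactory under UPLOAD_PATH until the TensorRT-LLM build tarballs appear
        // (presumably published by the corresponding build job), or times out after 60 minutes.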
stage("Wait For Build Job Complete") {
|
|
when {
|
|
expression {
|
|
RUN_SANITY_CHECK
|
|
}
|
|
}
|
|
steps {
|
|
script {
|
|
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
|
|
container("python3") {
|
|
// Install wget
|
|
trtllm_utils.llmExecStepWithRetry(this, script: "apt-get update && apt-get install -y wget")
|
|
|
|
// Poll for build artifacts
|
|
def artifactBaseUrl = "https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/"
|
|
def requiredFiles = [
|
|
"TensorRT-LLM-GH200.tar.gz",
|
|
"TensorRT-LLM.tar.gz"
|
|
]
|
|
def maxWaitMinutes = 60
|
|
def pollIntervalSeconds = 60
|
|
|
|
echo "Waiting for build artifacts..."
|
|
echo "Required files: ${requiredFiles}"
|
|
|
|
def startTime = System.currentTimeMillis()
|
|
def maxWaitMs = maxWaitMinutes * 60 * 1000
|
|
|
|
while ((System.currentTimeMillis() - startTime) < maxWaitMs) {
|
|
def missingFiles = []
|
|
|
|
for (file in requiredFiles) {
|
|
def fileUrl = "${artifactBaseUrl}${file}"
|
|
def exitCode = sh(
|
|
script: "wget --spider --quiet --timeout=30 --tries=1 '${fileUrl}'",
|
|
returnStatus: true
|
|
)
|
|
|
|
if (exitCode != 0) {
|
|
missingFiles.add(file)
|
|
}
|
|
}
|
|
|
|
if (missingFiles.isEmpty()) {
|
|
echo "All build artifacts are ready!"
|
|
return
|
|
}
|
|
|
|
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
|
|
echo "Waiting... (${elapsedMinutes.intValue()} minutes elapsed)"
|
|
echo "Missing files: ${missingFiles}"
|
|
sleep(pollIntervalSeconds)
|
|
}
|
|
|
|
def elapsedMinutes = (System.currentTimeMillis() - startTime) / (60 * 1000)
|
|
error "Timeout waiting for build artifacts (${elapsedMinutes.intValue()} minutes)"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
stage("Sanity Check For NGC Image") {
|
|
when {
|
|
expression {
|
|
RUN_SANITY_CHECK
|
|
}
|
|
}
|
|
steps {
|
|
script {
|
|
catchError(buildResult: 'FAILURE', stageResult: 'FAILURE') {
|
|
globalVars[IMAGE_KEY_TO_TAG] = imageKeyToTag
|
|
String globalVarsJson = writeJSON returnText: true, json: globalVars
|
|
def parameters = getCommonParameters()
|
|
parameters += [
|
|
'enableFailFast': false,
|
|
'globalVars': globalVarsJson,
|
|
]
|
|
|
|
echo "Trigger BuildDockerImageSanityTest job, params: ${parameters}"
|
|
|
|
def status = ""
|
|
def jobName = "/LLM/helpers/BuildDockerImageSanityTest"
|
|
def handle = build(
|
|
job: jobName,
|
|
parameters: trtllm_utils.toBuildParameters(parameters),
|
|
propagate: false,
|
|
)
|
|
echo "Triggered job: ${handle.absoluteUrl}"
|
|
status = handle.result
|
|
|
|
if (status != "SUCCESS") {
|
|
error "Downstream job did not succeed"
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
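        // Registers the pushed NGC images with the NSpect security-scanning service; only
        // runs when params.nspect_id is set and the action is "push".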
stage("Register NGC Image For Security Check") {
|
|
when {
|
|
expression {
|
|
return params.nspect_id && params.action == "push"
|
|
}
|
|
}
|
|
steps {
|
|
script {
|
|
container("python3") {
|
|
trtllm_utils.llmExecStepWithRetry(this, script: "pip3 install --upgrade pip")
|
|
trtllm_utils.llmExecStepWithRetry(this, script: "pip3 install --upgrade requests")
|
|
def nspect_commit = "4cb9c0c42d44ebeeba1e40d2c3eb6aab6fb90173"
|
|
withCredentials([string(credentialsId: "TRTLLM_NSPECT_REPO", variable: "NSPECT_REPO")]) {
|
|
trtllm_utils.checkoutSource("${NSPECT_REPO}", nspect_commit, "nspect")
|
|
}
|
|
def nspect_env = params.nspect_env ? params.nspect_env : "prod"
|
|
def program_version_name = params.program_version_name ? params.program_version_name : "PostMerge"
|
|
def cmd = """./nspect/nspect.py \
|
|
--env ${nspect_env} \
|
|
--nspect_id ${params.nspect_id} \
|
|
--program_version_name '${program_version_name}' \
|
|
"""
|
|
if (params.register_images) {
|
|
cmd += "--register "
|
|
}
|
|
if (params.osrb_ticket) {
|
|
cmd += "--osrb_ticket ${params.osrb_ticket} "
|
|
}
|
|
if (params.wait_success_seconds) {
|
|
cmd += "--check_launch_api "
|
|
cmd += "--wait_success ${params.wait_success_seconds} "
|
|
}
|
|
cmd += "--image "
|
|
cmd += imageKeyToTag.values().join(" ")
|
|
withCredentials([usernamePassword(credentialsId: "NSPECT_CLIENT-${nspect_env}", usernameVariable: 'NSPECT_CLIENT_ID', passwordVariable: 'NSPECT_CLIENT_SECRET')]) {
|
|
trtllm_utils.llmExecStepWithRetry(this, script: cmd, sleepInSecs: 600, numRetries: 6, shortCommondRunTimeMax: 7200)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
} // stages
|
|
} // pipeline
|