@Library(['bloom-jenkins-shared-lib@main', 'trtllm-jenkins-shared-lib@main']) _

import java.lang.InterruptedException
import groovy.transform.Field
import groovy.json.JsonOutput
import com.nvidia.bloom.KubernetesManager
import com.nvidia.bloom.Constants
import org.jenkinsci.plugins.workflow.cps.CpsThread
import org.jsoup.Jsoup
import org.jenkinsci.plugins.pipeline.modeldefinition.Utils as jUtils

// LLM repository configuration
withCredentials([string(credentialsId: 'default-llm-repo', variable: 'DEFAULT_LLM_REPO')]) {
    LLM_REPO = env.gitlabSourceRepoHttpUrl ? env.gitlabSourceRepoHttpUrl : "${DEFAULT_LLM_REPO}"
}
LLM_ROOT = "llm"

// LLM repository configuration
withCredentials([string(credentialsId: 'default-scan-repo', variable: 'DEFAULT_SCAN_REPO')]) {
    SCAN_REPO = "${DEFAULT_SCAN_REPO}"
}
SCAN_COMMIT = "main"
SCAN_ROOT = "scan"

ARTIFACT_PATH = env.artifactPath ? env.artifactPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"
UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}"

// Container configuration
// available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
// [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505211401-4539"
LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505211401-4539"
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202505211401-4539"
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202505211401-4539"

// TODO: Move common variables to an unified location
BUILD_CORES_REQUEST = "8"
BUILD_CORES_LIMIT = "8"
BUILD_MEMORY_REQUEST = "48Gi"
BUILD_MEMORY_LIMIT = "48Gi"

// Stage choices
STAGE_CHOICE_NORMAL = "normal"
STAGE_CHOICE_SKIP = "skip"
STAGE_CHOICE_IGNORE = "ignore"

RELESE_CHECK_CHOICE = env.releaseCheckChoice ? env.releaseCheckChoice : STAGE_CHOICE_NORMAL
X86_TEST_CHOICE = env.x86TestChoice ? env.x86TestChoice : STAGE_CHOICE_NORMAL
SBSA_TEST_CHOICE = env.SBSATestChoice ? env.SBSATestChoice : STAGE_CHOICE_NORMAL

def gitlabParamsFromBot = [:]

if (env.gitlabTriggerPhrase)
{
    gitlabParamsFromBot = readJSON text: env.gitlabTriggerPhrase, returnPojo: true
}

// "Fail Fast" feature is enabled by default for the pre-merge pipeline.
// "Fail Fast" feature is always disabled for the post-merge pipeline.
boolean enableFailFast = !(env.JOB_NAME ==~ /.*PostMerge.*/ || env.JOB_NAME ==~ /.*Dependency_Testing_TRT.*/) && !gitlabParamsFromBot.get("disable_fail_fast", false)

boolean isReleaseCheckMode = (gitlabParamsFromBot.get("run_mode", "full") == "release_check")

BUILD_STATUS_NAME = isReleaseCheckMode ? "Jenkins Release Check" : "Jenkins Full Build"

def trimForStageList(stageNameList)
{
    if (stageNameList == null) {
        return null
    }
    trimedList = []
    stageNameList.each { stageName ->
        trimedList.add(stageName.trim().replaceAll('\\\\', ''))
    }
    return trimedList
}

@Field
def REUSE_STAGE_LIST = "reuse_stage_list"
@Field
def ENABLE_SKIP_TEST = "skip_test"
@Field
def TEST_STAGE_LIST = "stage_list"
@Field
def GPU_TYPE_LIST = "gpu_type"
@Field
def TEST_BACKEND = "test_backend"
@Field
def IS_POST_MERGE = "post_merge"
@Field
def ADD_MULTI_GPU_TEST = "add_multi_gpu_test"
@Field
def ENABLE_MULTI_GPU_TEST = "multi_gpu_test"
@Field
def ONLY_MULTI_GPU_TEST = "only_multi_gpu_test"
@Field
def DISABLE_MULTI_GPU_TEST = "disable_multi_gpu_test"
@Field
def EXTRA_STAGE_LIST = "extra_stage"
@Field
def MULTI_GPU_FILE_CHANGED = "multi_gpu_file_changed"
@Field
def ONLY_PYTORCH_FILE_CHANGED = "only_pytorch_file_changed"
@Field
def AUTO_TRIGGER_TAG_LIST = "auto_trigger_tag_list"
@Field
def DEBUG_MODE = "debug"

def testFilter = [
    (REUSE_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get(REUSE_STAGE_LIST, null)?.tokenize(',')),
    (ENABLE_SKIP_TEST): gitlabParamsFromBot.get((ENABLE_SKIP_TEST), false),
    (TEST_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get((TEST_STAGE_LIST), null)?.tokenize(',')),
    (GPU_TYPE_LIST): trimForStageList(gitlabParamsFromBot.get((GPU_TYPE_LIST), null)?.tokenize(',')),
    (TEST_BACKEND): trimForStageList(gitlabParamsFromBot.get((TEST_BACKEND), null)?.tokenize(',')),
    (IS_POST_MERGE): (env.JOB_NAME ==~ /.*PostMerge.*/) || gitlabParamsFromBot.get((IS_POST_MERGE), false),
    (ADD_MULTI_GPU_TEST): gitlabParamsFromBot.get((ADD_MULTI_GPU_TEST), false),
    (ONLY_MULTI_GPU_TEST): gitlabParamsFromBot.get((ONLY_MULTI_GPU_TEST), false) || gitlabParamsFromBot.get((ENABLE_MULTI_GPU_TEST), false),
    (DISABLE_MULTI_GPU_TEST): gitlabParamsFromBot.get((DISABLE_MULTI_GPU_TEST), false),
    (EXTRA_STAGE_LIST): trimForStageList(gitlabParamsFromBot.get((EXTRA_STAGE_LIST), null)?.tokenize(',')),
    (MULTI_GPU_FILE_CHANGED): false,
    (ONLY_PYTORCH_FILE_CHANGED): false,
    (DEBUG_MODE): gitlabParamsFromBot.get(DEBUG_MODE, false),
    (AUTO_TRIGGER_TAG_LIST): [],
]

String reuseBuild = gitlabParamsFromBot.get('reuse_build', null)

@Field
def GITHUB_PR_API_URL = "github_pr_api_url"
@Field
def CACHED_CHANGED_FILE_LIST = "cached_changed_file_list"
@Field
def ACTION_INFO = "action_info"
def globalVars = [
    (GITHUB_PR_API_URL): gitlabParamsFromBot.get('github_pr_api_url', null),
    (CACHED_CHANGED_FILE_LIST): null,
    (ACTION_INFO): gitlabParamsFromBot.get('action_info', null),
]

// If not running all test stages in the L0 pre-merge, we will not update the GitLab status at the end.
boolean enableUpdateGitlabStatus =
    !testFilter[ENABLE_SKIP_TEST] &&
    !testFilter[ONLY_MULTI_GPU_TEST] &&
    testFilter[GPU_TYPE_LIST] == null &&
    testFilter[TEST_STAGE_LIST] == null &&
    testFilter[TEST_BACKEND] == null

String getShortenedJobName(String path)
{
    static final nameMapping = [
        "L0_MergeRequest": "l0-mr",
        "L0_Custom": "l0-cus",
        "L0_PostMerge": "l0-pm",
        "L0_PostMergeDocker": "l0-pmd",
        "L1_Custom": "l1-cus",
        "L1_Nightly": "l1-nt",
        "L1_Stable": "l1-stb",
    ]
    def parts = path.split('/')
    // Apply nameMapping to the last part (jobName)
    def jobName = parts[-1]
    boolean replaced = false
    nameMapping.each { key, value ->
        if (jobName.contains(key)) {
            jobName = jobName.replace(key, value)
            replaced = true
        }
    }
    if (!replaced) {
        jobName = jobName.length() > 7 ? jobName.substring(0, 7) : jobName
    }
    // Replace the last part with the transformed jobName
    parts[-1] = jobName
    // Rejoin the parts with '-', convert to lowercase
    return parts.join('-').toLowerCase()
}

def createKubernetesPodConfig(image, type)
{
    def targetCould = "kubernetes-cpu"
    def selectors = """
                  nvidia.com/node_type: builder
                  kubernetes.io/os: linux"""
    def containerConfig = ""
    def nodeLabelPrefix = ""
    def jobName = getShortenedJobName(env.JOB_NAME)
    def buildID = env.BUILD_ID

    switch(type)
    {
    case "agent":
        containerConfig = """
                  - name: alpine
                    image: urm.nvidia.com/docker/alpine:latest
                    command: ['cat']
                    tty: true
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    case "build":
        containerConfig = """
                  - name: trt-llm
                    image: ${image}
                    command: ['cat']
                    volumeMounts:
                    - name: sw-tensorrt-pvc
                      mountPath: "/mnt/sw-tensorrt-pvc"
                      readOnly: false
                    tty: true
                    resources:
                      requests:
                        cpu: ${BUILD_CORES_REQUEST}
                        memory: ${BUILD_MEMORY_REQUEST}
                        ephemeral-storage: 200Gi
                      limits:
                        cpu: ${BUILD_CORES_LIMIT}
                        memory: ${BUILD_MEMORY_LIMIT}
                        ephemeral-storage: 200Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    case "package":
        containerConfig = """
                  - name: trt-llm
                    image: ${image}
                    command: ['cat']
                    tty: true
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                    imagePullPolicy: Always"""
        nodeLabelPrefix = "cpu"
        break
    }
    def nodeLabel = trtllm_utils.appendRandomPostfix("${nodeLabelPrefix}---tensorrt-${jobName}-${buildID}")
    def podConfig = [
        cloud: targetCould,
        namespace: "sw-tensorrt",
        label: nodeLabel,
        yaml: """
            apiVersion: v1
            kind: Pod
            spec:
                qosClass: Guaranteed
                affinity:
                    nodeAffinity:
                        requiredDuringSchedulingIgnoredDuringExecution:
                            nodeSelectorTerms:
                            - matchExpressions:
                              - key: "tensorrt/taints"
                                operator: DoesNotExist
                              - key: "tensorrt/affinity"
                                operator: NotIn
                                values:
                                - "core"
                nodeSelector: ${selectors}
                containers:
                  ${containerConfig}
                    env:
                    - name: HOST_NODE_NAME
                      valueFrom:
                        fieldRef:
                          fieldPath: spec.nodeName
                  - name: jnlp
                    image: urm.nvidia.com/docker/jenkins/inbound-agent:4.11-1-jdk11
                    args: ['\$(JENKINS_SECRET)', '\$(JENKINS_NAME)']
                    resources:
                      requests:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                      limits:
                        cpu: '2'
                        memory: 10Gi
                        ephemeral-storage: 25Gi
                qosClass: Guaranteed
                volumes:
                - name: sw-tensorrt-pvc
                  persistentVolumeClaim:
                    claimName: sw-tensorrt-pvc

        """.stripIndent(),
    ]

    return podConfig
}

def echoNodeAndGpuInfo(pipeline, stageName)
{
    String hostNodeName = sh(script: 'echo $HOST_NODE_NAME', returnStdout: true)
    String gpuUuids = pipeline.sh(script: "nvidia-smi -q | grep \"GPU UUID\" | awk '{print \$4}' | tr '\n' ',' || true", returnStdout: true)
    pipeline.echo "HOST_NODE_NAME = ${hostNodeName} ; GPU_UUIDS = ${gpuUuids} ; STAGE_NAME = ${stageName}"
}

def setupPipelineEnvironment(pipeline, testFilter, globalVars)
{
    setupPipelineSpec = createKubernetesPodConfig(LLM_DOCKER_IMAGE, "build")
    trtllm_utils.launchKubernetesPod(pipeline, setupPipelineSpec, "trt-llm", {
        sh "env | sort"
        updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: 'running'
        echo "Using GitLab repo: ${LLM_REPO}."
        sh "git config --global --add safe.directory \"*\""
        if (env.gitlabMergeRequestLastCommit) {
            env.gitlabCommit = env.gitlabMergeRequestLastCommit
        } else {
            branch = env.gitlabBranch ? env.gitlabBranch : "main"
            trtllm_utils.checkoutSource(LLM_REPO, branch, LLM_ROOT, true, true)
            checkoutCommit = sh (script: "cd ${LLM_ROOT} && git rev-parse HEAD",returnStdout: true).trim()
            env.gitlabCommit = checkoutCommit
        }
        echo "Env.gitlabMergeRequestLastCommit: ${env.gitlabMergeRequestLastCommit}."
        echo "Freeze GitLab commit. Branch: ${env.gitlabBranch}. Commit: ${env.gitlabCommit}."
        testFilter[(MULTI_GPU_FILE_CHANGED)] = getMultiGpuFileChanged(pipeline, testFilter, globalVars)
        testFilter[(ONLY_PYTORCH_FILE_CHANGED)] = getOnlyPytorchFileChanged(pipeline, testFilter, globalVars)
        testFilter[(AUTO_TRIGGER_TAG_LIST)] = getAutoTriggerTagList(pipeline, testFilter, globalVars)
    })
}

def launchReleaseCheck(pipeline)
{
    stages = {
        trtllm_utils.llmExecStepWithRetry(pipeline, script: """apt-get update && apt-get install \
            python3-pip \
            -y""")
        sh "pip3 config set global.break-system-packages true"
        sh "git config --global --add safe.directory \"*\""
        // Step 1: cloning tekit source code
        trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
        sh "cd ${LLM_ROOT} && git config --unset-all core.hooksPath"
        trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${LLM_ROOT} && python3 -u scripts/release_check.py || (git restore . && false)")

        // Step 2: build tools
        withEnv(['GONOSUMDB=*.nvidia.com']) {
            withCredentials([
                gitUsernamePassword(
                    credentialsId: 'svc_tensorrt_gitlab_read_api_token',
                    gitToolName: 'git-tool'
                ),
                string(
                    credentialsId: 'default-git-url',
                    variable: 'DEFAULT_GIT_URL'
                )
            ]) {
                sh "go install ${DEFAULT_GIT_URL}/TensorRT/Infrastructure/licensechecker/cmd/license_checker@v0.3.0"
            }
        }
        // Step 3: Run license check
        sh "cd ${LLM_ROOT}/cpp && /go/bin/license_checker -config ../jenkins/license_cpp.json include tensorrt_llm"

        // Step 4: Run guardwords scan
        def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
        if (env.alternativeTRT || isOfficialPostMergeJob) {
            trtllm_utils.checkoutSource(SCAN_REPO, SCAN_COMMIT, SCAN_ROOT, true, true)
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "cd ${SCAN_ROOT} && pip3 install -e .")
            try {
                ignoreList = [
                    "*/.git/*",
                    "*/3rdparty/*",
                    "*/examples/scaffolding/contrib/mcp/weather/weather.py",
                    "*/tensorrt_llm_internal_cutlass_kernels_static.tar.xz"
                ]
                sh "cd ${LLM_ROOT} && confidentiality-scan \$(find . -type f ${ignoreList.collect { "-not -path \"${it}\"" }.join(' ')}) 2>&1 | tee scan.log"
                def lastLine = sh(script: "tail -n 1 ${LLM_ROOT}/scan.log", returnStdout: true).trim()
                if (lastLine.toLowerCase().contains("error")) {
                    error "Guardwords Scan Failed."
                }
            } catch (Exception e) {
                throw e
            } finally {
                trtllm_utils.uploadArtifacts("${LLM_ROOT}/scan.log", "${UPLOAD_PATH}/guardwords-scan-results/")
                echo "Guardwords Scan Results: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/guardwords-scan-results/scan.log"
            }
        }
    }

    def image = "urm.nvidia.com/docker/golang:1.22"
    stageName = "Release Check"
    trtllm_utils.launchKubernetesPod(pipeline, createKubernetesPodConfig(image, "build"), "trt-llm", {
        stage("[${stageName}] Run") {
            if (RELESE_CHECK_CHOICE == STAGE_CHOICE_SKIP) {
                echo "Release Check job is skipped due to Jenkins configuration"
                return
            }
            try {
                echoNodeAndGpuInfo(pipeline, stageName)
                stages()
            } catch (InterruptedException e) {
                throw e
            } catch (Exception e) {
                if (RELESE_CHECK_CHOICE == STAGE_CHOICE_IGNORE) {
                    catchError(
                        buildResult: 'SUCCESS',
                        stageResult: 'FAILURE') {
                        error "Release Check failed but ignored due to Jenkins configuration"
                    }
                } else {
                    throw e
                }
            }
        }
    })
}

def getMergeRequestChangedFileListGitlab(pipeline) {
    def changedFileList = []
    def pageId = 0
    withCredentials([
        usernamePassword(
            credentialsId: 'svc_tensorrt_gitlab_read_api_token',
            usernameVariable: 'GITLAB_API_USER',
            passwordVariable: 'GITLAB_API_TOKEN'
        ),
        string(credentialsId: 'default-git-url', variable: 'DEFAULT_GIT_URL')
    ]) {
        while(true) {
            pageId += 1
            def rawDataJson = pipeline.sh(
                script: """
                    curl --header "PRIVATE-TOKEN: $GITLAB_API_TOKEN" \
                         --url "https://${DEFAULT_GIT_URL}/api/v4/projects/${env.gitlabMergeRequestTargetProjectId}/merge_requests/${env.gitlabMergeRequestIid}/diffs?page=${pageId}&per_page=20"
                """,
                returnStdout: true
            )
            def rawDataList = readJSON text: rawDataJson, returnPojo: true
            rawDataList.each { rawData ->
                changedFileList += [rawData.get("old_path"), rawData.get("new_path")]
            }
            if (!rawDataList) { break }
        }
    }
    def changedFileListStr = changedFileList.join(",\n")
    pipeline.echo("The changeset of this MR is: ${changedFileListStr}.")
    return changedFileList
}

def getMergeRequestChangedFileListGithub(pipeline, githubPrApiUrl) {
    def changedFileList = []
    def pageId = 0
    withCredentials([
        string(
            credentialsId: 'github-token-trtllm-ci',
            variable: 'GITHUB_API_TOKEN'
        ),
    ]) {
        while(true) {
            pageId += 1
            def rawDataJson = pipeline.sh(
                script: """
                    curl --header "Authorization: Bearer $GITHUB_API_TOKEN" \
                         --url "${githubPrApiUrl}/files?page=${pageId}&per_page=20"
                """,
                returnStdout: true
            )
            echo "rawDataJson: ${rawDataJson}"
            def rawDataList = readJSON text: rawDataJson, returnPojo: true
            rawDataList.each { rawData ->
                changedFileList += [rawData.get("filename"), rawData.get("previous_filename")].findAll { it }
            }
            if (!rawDataList) { break }
        }
    }
    def changedFileListStr = changedFileList.join(",\n")
    pipeline.echo("The changeset of this PR is: ${changedFileListStr}.")
    return changedFileList
}

def getMergeRequestChangedFileList(pipeline, globalVars) {
    def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
    if (env.alternativeTRT || isOfficialPostMergeJob) {
        pipeline.echo("Force set changed file list to empty list.")
        return []
    }

    def githubPrApiUrl = globalVars[GITHUB_PR_API_URL]

    if (globalVars[CACHED_CHANGED_FILE_LIST] != null) {
        return globalVars[CACHED_CHANGED_FILE_LIST]
    }
    try {
        if (githubPrApiUrl != null) {
            globalVars[CACHED_CHANGED_FILE_LIST] = getMergeRequestChangedFileListGithub(pipeline, githubPrApiUrl)
        } else {
            globalVars[CACHED_CHANGED_FILE_LIST] = getMergeRequestChangedFileListGitlab(pipeline)
        }
        return globalVars[CACHED_CHANGED_FILE_LIST]
    } catch (InterruptedException e) {
        throw e
    } catch (Exception e) {
        pipeline.echo("Get merge request changed file list failed. Error: ${e.toString()}")
        globalVars[CACHED_CHANGED_FILE_LIST] = []
        return globalVars[CACHED_CHANGED_FILE_LIST]
    }
}

def getAutoTriggerTagList(pipeline, testFilter, globalVars) {
    def autoTriggerTagList = []
    def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
    if (env.alternativeTRT || isOfficialPostMergeJob) {
        pipeline.echo("Force set auto trigger tags to empty list.")
        return autoTriggerTagList
    }
    def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars)
    if (!changedFileList || changedFileList.isEmpty()) {
        return autoTriggerTagList
    }
    def specialFileToTagMap = [
        "tensorrt_llm/_torch/models/modeling_deepseekv3.py": ["-DeepSeek-"],
    ]
    for (file in changedFileList) {
        for (String key : specialFileToTagMap.keySet()) {
            if (file.startsWith(key)) {
                autoTriggerTagList += specialFileToTagMap[key]
            }
        }
    }
    autoTriggerTagList = autoTriggerTagList.unique()
    if (!autoTriggerTagList.isEmpty()) {
        pipeline.echo("Auto trigger tags detected: ${autoTriggerTagList.join(', ')}")
    }
    return autoTriggerTagList
}

def getMultiGpuFileChanged(pipeline, testFilter, globalVars)
{
    if (testFilter[(DISABLE_MULTI_GPU_TEST)]) {
        pipeline.echo("Force not run multi-GPU testing.")
        return false
    }
    if (env.alternativeTRT || testFilter[(ADD_MULTI_GPU_TEST)] || testFilter[(ONLY_MULTI_GPU_TEST)] || testFilter[(IS_POST_MERGE)]) {
        pipeline.echo("Force run multi-GPU testing.")
        return true
    }

    def relatedFileList = [
        "cpp/include/tensorrt_llm/runtime/gptJsonConfig.h",
        "cpp/include/tensorrt_llm/runtime/worldConfig.h",
        "cpp/include/tensorrt_llm/runtime/utils/mpiUtils.h",
        "cpp/include/tensorrt_llm/runtime/utils/multiDeviceUtils.h",
        "cpp/tensorrt_llm/runtime/utils/mpiUtils.cpp",
        "cpp/tests/runtime/mpiUtilsTest.cpp",
        "cpp/tensorrt_llm/batch_manager/trtGptModelFactory.h",
        "cpp/tensorrt_llm/runtime/worldConfig.cpp",
        "cpp/tensorrt_llm/runtime/ncclCommunicator.cpp",
        "cpp/tensorrt_llm/runtime/workerPool.h",
        "cpp/tensorrt_llm/executor_worker/executorWorker.cpp",
        "cpp/tensorrt_llm/runtime/ipcUtils.cpp",
        "cpp/tensorrt_llm/executor/executor.cpp",
        "cpp/tensorrt_llm/executor/executorImpl.cpp",
        "cpp/tensorrt_llm/executor/executorImpl.h",
        "cpp/tensorrt_llm/runtime/ncclCommunicator.cpp",
        "cpp/tensorrt_llm/kernels/allReduceFusionKernels.h",
        "cpp/tensorrt_llm/kernels/allReduceFusionKernels.cu",
        "cpp/tensorrt_llm/kernels/customAllReduceKernels.h",
        "cpp/tensorrt_llm/kernels/customAllReduceKernels.cu",
        "cpp/tensorrt_llm/kernels/gptKernels.h",
        "cpp/tensorrt_llm/kernels/gptKernels.cu",
        "cpp/tensorrt_llm/kernels/unfusedAttentionKernels.h",
        "cpp/tensorrt_llm/kernels/unfusedAttentionKernels.cu",
        "cpp/tensorrt_llm/kernels/userbuffers/",
        "cpp/tensorrt_llm/pybind/",
        "cpp/tests/kernels/allReduce/",
        "cpp/tensorrt_llm/plugins/cpSplitPlugin/cpSplitPlugin.h",
        "cpp/tensorrt_llm/plugins/cpSplitPlugin/cpSplitPlugin.cpp",
        "cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.h",
        "cpp/tensorrt_llm/plugins/gptAttentionCommon/gptAttentionCommon.cpp",
        "cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.h",
        "cpp/tensorrt_llm/plugins/gptAttentionPlugin/gptAttentionPlugin.cpp",
        "cpp/tests/runtime/mpiUtilsTest.cpp",
        "cpp/tensorrt_llm/plugins/ncclPlugin/",
        "tensorrt_llm/functional.py",
        "tensorrt_llm/mapping.py",
        "tensorrt_llm/llmapi/",
        "tensorrt_llm/executor/",
        "tensorrt_llm/_ipc_utils.py",
        "tensorrt_llm/parameter.py",
        "tensorrt_llm/models/llama/",
        "tensorrt_llm/_torch/compilation/patterns/ar_residual_norm.py",
        "tensorrt_llm/_torch/compilation/patterns/ub_allreduce.py",
        "tensorrt_llm/_torch/custom_ops/userbuffers_custom_ops.py",
        "tensorrt_llm/_torch/pyexecutor/model_engine.py",
        "tensorrt_llm/_torch/pyexecutor/py_executor.py",
        "tensorrt_llm/_torch/pyexecutor/_util.py",
        "tensorrt_llm/_torch/models/modeling_llama.py",
        "tests/integration/defs/cpp/test_multi_gpu.py",
        "tests/integration/test_lists/test-db/l0_dgx_h100.yml",
        "tests/integration/test_lists/test-db/l0_dgx_h200.yml",
        "tests/unittest/_torch/multi_gpu/",
        "tests/unittest/_torch/multi_gpu_modeling/",
        "tests/unittest/llmapi/test_llm_multi_gpu.py",
        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
        "jenkins/L0_Test.groovy",
    ]

    def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars)
    if (!changedFileList || changedFileList.isEmpty()) {
        return false
    }

    def changedFileListStr = ","
    def relatedFileChanged = false
    try {
        changedFileListStr = changedFileList.join(", ")
        relatedFileChanged = relatedFileList.any { it ->
            if (changedFileListStr.contains(it)) {
                return true
            }
        }
    }
    catch (InterruptedException e)
    {
        throw e
    }
    catch (Exception e)
    {
        pipeline.echo("getMultiGpuFileChanged failed execution. Error: ${e.toString()}")
    }
    if (relatedFileChanged) {
        pipeline.echo("Detect multi-GPU related files changed.")
    }
    return relatedFileChanged
}

def getOnlyPytorchFileChanged(pipeline, testFilter, globalVars) {
    def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
    if (env.alternativeTRT || isOfficialPostMergeJob) {
        pipeline.echo("Force set ONLY_PYTORCH_FILE_CHANGED false.")
        return false
    }
    def pytorchOnlyList = [
        "tensorrt_llm/_torch/",
        "tensorrt_llm/scaffolding/",
        "tests/unittest/_torch/",
        "tests/unittest/scaffolding/",
        "tests/unittest/llmapi/test_llm_pytorch.py",
        "tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py",
        "tests/integration/defs/accuracy/test_llm_api_pytorch.py",
        "tests/integration/defs/disaggregated/",
        "examples/auto_deploy",
        "examples/disaggregated",
        "examples/pytorch/",
        "examples/scaffolding/",
        "docs/"
    ]

    def changedFileList = getMergeRequestChangedFileList(pipeline, globalVars)

    if (!changedFileList || changedFileList.isEmpty()) {
        return false
    }

    def result = true
    for (file in changedFileList) {
        def isPytorchFile = false
        for (prefix in pytorchOnlyList) {
            if (file.startsWith(prefix)) {
                isPytorchFile = true
                break
            }
        }
        if (!isPytorchFile) {
            pipeline.echo("Found non-PyTorch file: ${file}")
            result = false
            break
        }
    }

    pipeline.echo("Only PyTorch files changed: ${result}")
    return result
}

def collectTestResults(pipeline, testFilter)
{
    collectResultPodSpec = createKubernetesPodConfig("", "agent")
    trtllm_utils.launchKubernetesPod(pipeline, collectResultPodSpec, "alpine", {
        stage ("Collect test result") {
            sh "rm -rf **/*.xml *.tar.gz"

            testResultLink = "https://urm.nvidia.com/artifactory/sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${BUILD_NUMBER}/test-results"

            trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add --no-cache curl")
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add python3")
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget ${testResultLink}/", allowStepFailed: true)
            sh "cat index.html | grep \"tar.gz\" | cut -d \"\\\"\" -f 2 > result_file_names.txt"
            sh "cat result_file_names.txt"
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "cat result_file_names.txt | xargs -n1 -I {} wget -c -nv ${testResultLink}/{}", allowStepFailed: true)
            sh "ls -l | grep \"tar.gz\" || true"
            resultFileNumber = sh(script: "cat result_file_names.txt | wc -l", returnStdout: true)
            resultFileDownloadedNumber = sh(script: "ls -l | grep \"tar.gz\" | wc -l", returnStdout: true)
            echo "Result File Number: ${resultFileNumber}, Downloaded: ${resultFileDownloadedNumber}"

            sh "find . -name results-\\*.tar.gz -type f -exec tar -zxvf {} \\; || true"
            trtllm_utils.checkoutSource(LLM_REPO, env.gitlabCommit, LLM_ROOT, true, true)
            if (testFilter[(IS_POST_MERGE)]) {
                try {
                    sh "python3 llm/scripts/generate_duration.py --duration-file=new_test_duration.json"
                    trtllm_utils.uploadArtifacts("new_test_duration.json", "${UPLOAD_PATH}/test-results/")
                } catch (Exception e) {
                    // No need to fail the stage if the duration file generation fails
                    echo "An error occurred while generating or uploading the duration file: ${e.toString()}"
                }
            }

            junit(testResults: '**/results*.xml', allowEmptyResults : true)
        } // Collect test result stage
        stage("Rerun report") {
            sh "rm -rf rerun && mkdir -p rerun"
            sh "find . -type f -wholename '*/rerun_results.xml' -exec sh -c 'mv \"{}\" \"rerun/\$(basename \$(dirname \"{}\"))_rerun_results.xml\"' \\; || true"
            sh "find rerun -type f"
            def rerunFileCount = sh(returnStdout: true, script: 'find rerun -type f | wc -l').replaceAll("\\s","").toInteger()
            if (rerunFileCount == 0) {
                echo "Rerun report is skipped because there is no rerun test data file."
                return
            }
            def xmlFiles = findFiles(glob: 'rerun/**/*.xml')
            def xmlFileList = xmlFiles.collect { it.path }
            def inputfiles = xmlFileList.join(',')
            echo "inputfiles: ${inputfiles}"
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add python3")
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add py3-pip")
            trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true")
            sh """
                python3 llm/tests/integration/defs/test_rerun.py \
                generate_rerun_report \
                --output-file=rerun/rerun_report.xml \
                --input-files=${inputfiles}
            """
            trtllm_utils.uploadArtifacts("rerun/rerun_report.html", "${UPLOAD_PATH}/test-results/")
            echo "Rerun report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/test-results/rerun_report.html"
            def isOfficialPostMergeJob = (env.JOB_NAME ==~ /.*PostMerge.*/)
            if (env.alternativeTRT || isOfficialPostMergeJob) {
                catchError(
                    buildResult: 'FAILURE',
                    stageResult: 'FAILURE') {
                    error "Some failed tests were reruned, please check the rerun report."
                }
            } else {
                catchError(
                    buildResult: 'SUCCESS',
                    stageResult: 'UNSTABLE') {
                    error "Some failed tests were reruned, please check the rerun report."
                }
            }
        } // Rerun report stage
        try {
            stage("Test coverage") {
                sh "ls"
                def CUR_PATH = sh(returnStdout: true, script: 'pwd').replaceAll("\\s","")
                sh "echo ${CUR_PATH}"
                sh "rm -rf cov && mkdir -p cov"
                sh "find . -type f -wholename '*/.coverage.*' -exec mv {} cov/ \\; || true"
                sh "cd cov && find . -type f"
                def fileCount = sh(returnStdout: true, script: 'find cov -type f | wc -l').replaceAll("\\s","").toInteger()
                if (fileCount == 0) {
                    echo "Test coverage is skipped because there is no test data file."
                    return
                }
                trtllm_utils.llmExecStepWithRetry(pipeline, script: "apk add py3-pip")
                trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 config set global.break-system-packages true")
                trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install coverage")
                sh "coverage --version"

                sh "cp llm/examples/openai_triton/manual_plugin/fmha_triton.py llm/examples/openai_triton/plugin_autogen/"
                def coverageConfigFile = "cov/.coveragerc"
                sh """
                    echo '[paths]' > ${coverageConfigFile}
                    echo 'source1=\n    ${CUR_PATH}/llm/examples/\n    */TensorRT-LLM/src/examples/' >> ${coverageConfigFile}
                    echo 'source2=\n    ${CUR_PATH}/llm/tensorrt_llm/\n    */tensorrt_llm/' >> ${coverageConfigFile}
                    cat ${coverageConfigFile}
                """

                sh "cd cov && coverage combine"
                sh "cd cov && find . -type f"
                sh "cd cov && coverage report"
                sh "cd cov && coverage html -d test_coverage_html"
                trtllm_utils.uploadArtifacts("cov/test_coverage_html/*", "${UPLOAD_PATH}/test-results/coverage-report/")
                echo "Test coverage report: https://urm.nvidia.com/artifactory/${UPLOAD_PATH}/test-results/coverage-report/index.html"
            } // Test coverage
        }
        catch (InterruptedException e)
        {
            throw e
        }
        catch (Exception e)
        {
            pipeline.echo("Test coverage failed execution.")
        }
    })
}

def getCommonParameters()
{
    return [
        'gitlabSourceRepoHttpUrl': LLM_REPO,
        'gitlabCommit': env.gitlabCommit,
        'artifactPath': UPLOAD_PATH,
        'uploadPath': UPLOAD_PATH,
    ]
}

def triggerJob(jobName, parameters, jenkinsUrl = "", credentials = "")
{
    if (jenkinsUrl == "" && env.localJobCredentials) {
        jenkinsUrl = env.JENKINS_URL
        credentials = env.localJobCredentials
    }
    def status = ""
    if (jenkinsUrl != "") {
        def jobPath = trtllm_utils.resolveFullJobName(jobName).replace('/', '/job/').substring(1)
        def handle = triggerRemoteJob(
            job: "${jenkinsUrl}${jobPath}/",
            auth: CredentialsAuth(credentials: credentials),
            parameters: trtllm_utils.toRemoteBuildParameters(parameters),
            pollInterval: 60,
            abortTriggeredJob: true,
        )
        status = handle.getBuildResult().toString()
    } else {
        def handle = build(
            job: jobName,
            parameters: trtllm_utils.toBuildParameters(parameters),
            propagate: false,
        )
        echo "Triggered job: ${handle.absoluteUrl}"
        status = handle.result
    }
    return status
}

def launchStages(pipeline, reuseBuild, testFilter, enableFailFast, globalVars)
{
    stages = [
        "Release Check": {
            script {
                launchReleaseCheck(this)
            }
        },
        "x86_64-linux": {
            script {
                stage("Build") {
                    def parameters = getCommonParameters()
                    String globalVarsJson = writeJSON returnText: true, json: globalVars
                    parameters += [
                        'enableFailFast': enableFailFast,
                        'dockerImage': LLM_DOCKER_IMAGE,
                        'wheelDockerImagePy310': LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE,
                        'wheelDockerImagePy312': LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
                        'globalVars': globalVarsJson,
                    ]

                    if (env.alternativeTRT) {
                        parameters += [
                            'alternativeTRT': env.alternativeTRT,
                        ]
                    }

                    if (reuseBuild) {
                        parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
                    }

                    echo "trigger x86_64 build job, params: ${parameters}"

                    def status = triggerJob("/LLM/helpers/Build-x86_64", parameters)
                    if (status != "SUCCESS") {
                        error "Downstream job did not succeed"
                    }

                }
                def testStageName = "[Test-x86_64] Run"
                if (env.localJobCredentials) {
                    testStageName = "[Test-x86_64] Remote Run"
                }
                stage(testStageName) {
                    if (X86_TEST_CHOICE == STAGE_CHOICE_SKIP) {
                        echo "x86_64 test job is skipped due to Jenkins configuration"
                        return
                    }
                    try {
                        parameters = getCommonParameters()

                        String testFilterJson = writeJSON returnText: true, json: testFilter
                        String globalVarsJson = writeJSON returnText: true, json: globalVars
                        parameters += [
                            'enableFailFast': enableFailFast,
                            'testFilter': testFilterJson,
                            'dockerImage': LLM_DOCKER_IMAGE,
                            'wheelDockerImagePy310': LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE,
                            'wheelDockerImagePy312': LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
                            'globalVars': globalVarsJson,
                        ]

                        if (env.alternativeTRT) {
                            parameters += [
                                'alternativeTRT': env.alternativeTRT,
                            ]
                        }

                        if (env.testPhase2StageName) {
                            parameters += [
                                'testPhase2StageName': env.testPhase2StageName,
                            ]
                        }

                        echo "trigger x86_64 test job, params: ${parameters}"

                        def status = triggerJob(
                            "L0_Test-x86_64",
                            parameters,
                        )

                        if (status != "SUCCESS") {
                            error "Downstream job did not succeed"
                        }
                    } catch (InterruptedException e) {
                        throw e
                    } catch (Exception e) {
                        if (X86_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
                            catchError(
                                buildResult: 'SUCCESS',
                                stageResult: 'FAILURE') {
                                error "x86_64 test failed but ignored due to Jenkins configuration"
                            }
                        } else {
                            throw e
                        }
                    }
                }
            }
        },
        "SBSA-linux": {
            script {
                def jenkinsUrl = ""
                def credentials = ""
                def testStageName = "[Test-SBSA] Run"
                if (env.localJobCredentials) {
                    testStageName = "[Test-SBSA] Remote Run"
                }

                def stageName = "Build"
                stage(stageName) {
                    def parameters = getCommonParameters()
                    String globalVarsJson = writeJSON returnText: true, json: globalVars
                    parameters += [
                        'enableFailFast': enableFailFast,
                        "dockerImage": LLM_SBSA_DOCKER_IMAGE,
                        'globalVars': globalVarsJson,
                    ]

                    if (env.alternativeTrtSBSA) {
                        parameters += [
                            "alternativeTRT": env.alternativeTrtSBSA,
                        ]
                    }

                    if (reuseBuild) {
                        parameters['reuseArtifactPath'] = "sw-tensorrt-generic/llm-artifacts/${JOB_NAME}/${reuseBuild}"
                    }

                    echo "trigger SBSA build job, params: ${parameters}"

                    def status = triggerJob(
                        "/LLM/helpers/Build-SBSA",
                        parameters,
                        jenkinsUrl,
                        credentials,
                    )

                    if (status != "SUCCESS") {
                        error "Downstream job did not succeed"
                    }
                }
                stage(testStageName) {
                    if (SBSA_TEST_CHOICE == STAGE_CHOICE_SKIP) {
                        echo "SBSA test job is skipped due to Jenkins configuration"
                        return
                    }
                    try {
                        def parameters = getCommonParameters()

                        String testFilterJson = writeJSON returnText: true, json: testFilter
                        String globalVarsJson = writeJSON returnText: true, json: globalVars
                        parameters += [
                            'enableFailFast': enableFailFast,
                            'testFilter': testFilterJson,
                            "dockerImage": LLM_SBSA_DOCKER_IMAGE,
                            'globalVars': globalVarsJson,
                        ]

                        if (env.alternativeTrtSBSA) {
                            parameters += [
                                "alternativeTRT": env.alternativeTrtSBSA,
                            ]
                        }

                        if (env.testPhase2StageName) {
                            parameters += [
                                'testPhase2StageName': env.testPhase2StageName,
                            ]
                        }

                        echo "trigger SBSA test job, params: ${parameters}"

                        def status = triggerJob(
                            "L0_Test-SBSA",
                            parameters,
                            jenkinsUrl,
                            credentials,
                        )

                        if (status != "SUCCESS") {
                            error "Downstream job did not succeed"
                        }
                    } catch (InterruptedException e) {
                        throw e
                    } catch (Exception e) {
                        if (SBSA_TEST_CHOICE == STAGE_CHOICE_IGNORE) {
                            catchError(
                                buildResult: 'SUCCESS',
                                stageResult: 'FAILURE') {
                                error "SBSA test failed but ignored due to Jenkins configuration"
                            }
                        } else {
                            throw e
                        }
                    }
                }
            }
        },
    ]
    def dockerBuildJob = [
        "Build-Docker-Images": {
            script {
                stage("[Build-Docker-Images] Remote Run") {
                    def parameters = getCommonParameters()
                    String globalVarsJson = writeJSON returnText: true, json: globalVars
                    def branch = env.gitlabBranch ? env.gitlabBranch : "main"
                    if (globalVars[GITHUB_PR_API_URL]) {
                        branch = "github-pr-" + globalVars[GITHUB_PR_API_URL].split('/').last()
                    }

                    parameters += [
                        'enableFailFast': enableFailFast,
                        'branch': branch,
                        'action': "push",
                        'globalVars': globalVarsJson,
                    ]

                    echo "trigger BuildDockerImages job, params: ${parameters}"

                    def status = triggerJob("/LLM/helpers/BuildDockerImages", parameters)
                    if (status != "SUCCESS") {
                        error "Downstream job did not succeed"
                    }
                }
            }
        }
    ]
    if (env.JOB_NAME ==~ /.*PostMerge.*/) {
        stages += dockerBuildJob
    }
    if (testFilter[(TEST_STAGE_LIST)]?.contains("Build-Docker-Images") || testFilter[(EXTRA_STAGE_LIST)]?.contains("Build-Docker-Images")) {
        stages += dockerBuildJob
        testFilter[(TEST_STAGE_LIST)]?.remove("Build-Docker-Images")
        testFilter[(EXTRA_STAGE_LIST)]?.remove("Build-Docker-Images")
        echo "Will run Build-Docker-Images job"
    }

    parallelJobs = stages.collectEntries{key, value -> [key, {
        script {
            stage(key) {
                value()
            }
        }
    }]}

    parallelJobs.failFast = enableFailFast
    pipeline.parallel parallelJobs
}

pipeline {
    agent {
        kubernetes createKubernetesPodConfig("", "agent")
    }
    options {
        // Check the valid options at: https://www.jenkins.io/doc/book/pipeline/syntax/
        // some step like results analysis stage, does not need to check out source code
        skipDefaultCheckout()
        // to better analyze the time for each step/test
        timestamps()
        timeout(time: 24, unit: 'HOURS')
    }
    post {
        unsuccessful {
            updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: "failed"
        }
        success {
            script {
                if (enableUpdateGitlabStatus) {
                    updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: "success"
                } else {
                    updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: "canceled"
                    updateGitlabCommitStatus name: "Custom Jenkins build", state: "success"
                }
            }
        }
        aborted {
            updateGitlabCommitStatus name: "${BUILD_STATUS_NAME}", state: 'canceled'
        }
        always {
            script {
                if (!isReleaseCheckMode) {
                    collectTestResults(this, testFilter)
                }
            }
        }
    }
    stages {
        stage("Setup environment")
        {
            steps
            {
                script {
                    setupPipelineEnvironment(this, testFilter, globalVars)
                    println globalVars
                    globalVars[ACTION_INFO] = trtllm_utils.setupPipelineDescription(this, globalVars[ACTION_INFO])
                    echo "enableFailFast is: ${enableFailFast}"
                    echo "env.gitlabTriggerPhrase is: ${env.gitlabTriggerPhrase}"
                    println testFilter
                    echo "Check the passed GitLab bot testFilter parameters."
                }
            }
        }
        stage("Build and Test") {
            steps {
                script {
                    if (isReleaseCheckMode) {
                        stage("Release Check") {
                            script {
                                launchReleaseCheck(this)
                            }
                        }
                    } else {
                        // globalVars[CACHED_CHANGED_FILE_LIST] is only used in setupPipelineEnvironment
                        // Reset it to null to workaround the "Argument list too long" error
                        globalVars[CACHED_CHANGED_FILE_LIST] = null
                        launchStages(this, reuseBuild, testFilter, enableFailFast, globalVars)
                    }
                }
            }
        }
    } // stages
} // pipeline