Support DLFW sanity check using the CU13 image

Signed-off-by: Zhanrui Sun <zhanruis@nvidia.com>
2026-01-23 12:12:39 +08:00 · 2025-09-05 00:04:22 -07:00 · 2025-09-05 00:04:22 -07:00 · 5ca3376d6f
commit 5ca3376d6f
parent 1978227bb7
3 changed files with 37 additions and 14 deletions
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@ -34,6 +34,9 @@ def TARNAME = "tarName"
@Field
 def WHEEL_ARCHS = "wheelArchs"

+@Field
+def BUILD_JOBS_FOR_CONFIG = "buildJobsForConfig"
+
@Field
 def CONFIG_LINUX_X86_64_VANILLA = "linux_x86_64_Vanilla"

@ -109,6 +112,7 @@ def BUILD_CONFIGS = [
    (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
    (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
    (WHEEL_ARCHS): "90-real;100-real;120-real",
+    (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fixing the build OOM issue on SBSA
  ],
 ]

@ -457,8 +461,10 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
        trtllm_utils.replaceWithAlternativeTRT(env.alternativeTRT, "cp312")
    }

+    def buildJobs = buildFlags[BUILD_JOBS_FOR_CONFIG] ?: BUILD_JOBS
+
    withCredentials([usernamePassword(credentialsId: "urm-artifactory-creds", usernameVariable: 'CONAN_LOGIN_USERNAME', passwordVariable: 'CONAN_PASSWORD')]) {
-        sh "cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -G Ninja -j ${BUILD_JOBS} -a '${buildFlags[WHEEL_ARCHS]}' ${buildFlags[WHEEL_EXTRA_ARGS]} --benchmarks"
+        sh "cd ${LLM_ROOT} && python3 scripts/build_wheel.py --use_ccache -G Ninja -j ${buildJobs} -a '${buildFlags[WHEEL_ARCHS]}' ${buildFlags[WHEEL_EXTRA_ARGS]} --benchmarks"
    }
    if (is_linux_x86_64) {
        sh "cd ${LLM_ROOT} && python3 scripts/build_cpp_examples.py"
@ -472,7 +478,7 @@ def runLLMBuild(pipeline, buildFlags, tarName, is_linux_x86_64)
    if (tarName.contains("CU12")) {
        tritonShortTag = "r25.06"
    }
-    sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. -DTRTLLM_DIR=${llmPath} -DTRITON_COMMON_REPO_TAG=${tritonShortTag} -DTRITON_CORE_REPO_TAG=${tritonShortTag} -DTRITON_THIRD_PARTY_REPO_TAG=${tritonShortTag} -DTRITON_BACKEND_REPO_TAG=${tritonShortTag} -DUSE_CXX11_ABI=ON && make -j${BUILD_JOBS} install"
+    sh "cd ${LLM_ROOT}/triton_backend/inflight_batcher_llm && mkdir build && cd build && cmake .. -DTRTLLM_DIR=${llmPath} -DTRITON_COMMON_REPO_TAG=${tritonShortTag} -DTRITON_CORE_REPO_TAG=${tritonShortTag} -DTRITON_THIRD_PARTY_REPO_TAG=${tritonShortTag} -DTRITON_BACKEND_REPO_TAG=${tritonShortTag} -DUSE_CXX11_ABI=ON && make -j${buildJobs} install"

    // Step 3: packaging wheels into tarfile
    sh "cp ${LLM_ROOT}/build/tensorrt_llm-*.whl TensorRT-LLM/"
@ -579,8 +585,6 @@ def launchStages(pipeline, cpu_arch, enableFailFast, globalVars)
        "Build TRT-LLM SingleDevice": [LLM_DOCKER_IMAGE] + prepareLLMBuild(
            pipeline, CONFIG_LINUX_X86_64_SINGLE_DEVICE),
        ]
-    } else {
-        buildConfigs.remove("Build TRT-LLM LLVM") // TODO: Remove after LLVM is supported on AArch64
    }

    rtServer (
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@ -904,7 +904,8 @@ def createKubernetesPodConfig(image, type, arch = "amd64", gpuCount = 1, perfMod
                              - key: "kubernetes.io/hostname"
                                operator: In
                                values:
-                                - "lego-cg1-qct-066.ipp3a2.colossus\""""
+                                - "lego-cg1-qct-066.ipp3a2.colossus"
+                                - "lego-cg1-qct-069.ipp3a2.colossus\""""
    }

    def podConfig = [
@ -2160,13 +2161,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)

    // Python version and OS for sanity check
    x86SanityCheckConfigs = [
-        "PY312-DLFW-CU12": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE_12_9,
-            "B200_PCIe",
+        "PY312-DLFW": [
+            LLM_DOCKER_IMAGE,
+            "A10",
            X86_64_TRIPLE,
-            true,
+            false,
            "dlfw/",
-            DLFW_IMAGE_12_9,
+            DLFW_IMAGE,
            false,
        ],
        "PY310-UB2204-CU12": [
@ -2199,13 +2200,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
            UBUNTU_24_04_IMAGE,
            true, // Extra PyTorch CUDA 12.8 install
        ],
-        "PY312-DLFW-CU12": [
-            LLM_SBSA_DOCKER_IMAGE_12_9,
+        "PY312-DLFW": [
+            LLM_DOCKER_IMAGE,
            "GH200",
            AARCH64_TRIPLE,
            false,
            "dlfw/",
-            DLFW_IMAGE_12_9,
+            DLFW_IMAGE,
            false,
        ],
    ]
@ -2321,6 +2322,24 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                            }
                        }

+                        // TODO: Remove this after public triton supports CUDA 13.
+                        if (key == "PY312-DLFW" && values[2] == X86_64_TRIPLE) {
+                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install https://download.pytorch.org/whl/nightly/pytorch_triton-3.3.1%2Bgitc8757738-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl")
+                            sh """
+                                cd /usr/local/lib/python3.12/dist-packages/ && \
+                                ls -la | grep pytorch_triton && \
+                                mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
+                                cd triton-3.3.1+gitc8757738.dist-info && \
+                                echo "Current directory: \$(pwd)" && \
+                                echo "Files in directory:" && \
+                                ls -la && \
+                                sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
+                                sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
+                                echo "METADATA after update:" && \
+                                grep "^Name:" METADATA
+                            """
+                        }
+
                        def libEnv = []
                        if (env.alternativeTRT) {
                            stage("Replace TensorRT") {
--- a/requirements.txt
+++ b/requirements.txt
@ -70,7 +70,7 @@ ninja
 etcd3
 blake3
 soundfile
-triton>=3.3.1,<3.4.0; platform_machine == "x86_64"
+triton==3.3.1; platform_machine == "x86_64"
 tiktoken
 blobfile
 openai-harmony==0.0.4