infra: upgrade to DLFW 25.08-pre and TRT 10.13.2.4

Signed-off-by: Zhanrui Sun <zhanruis@nvidia.com>
Zhanrui Sun 2025-08-11 19:27:09 -07:00
parent 97a3788dcf
commit ebec4ea5ee
10 changed files with 134 additions and 98 deletions

View File

@@ -1,46 +1,49 @@
-ARCH=$(uname -m)
-if [ $ARCH == "x86_64" ]; then
+# ARCH=$(uname -m)
+# if [ $ARCH == "x86_64" ]; then
-wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-x86_64/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb && \
-dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb && \
-rm cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb
+# wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-x86_64/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb && \
+# dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb && \
+# rm cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb
-wget https://github.com/Kitware/CMake/releases/download/v4.0.3/cmake-4.0.3-linux-x86_64.sh && \
-bash cmake-4.0.3-linux-x86_64.sh --skip-license --prefix=/usr/local/cmake --exclude-subdir
+# wget https://github.com/Kitware/CMake/releases/download/v4.0.3/cmake-4.0.3-linux-x86_64.sh && \
+# bash cmake-4.0.3-linux-x86_64.sh --skip-license --prefix=/usr/local/cmake --exclude-subdir
-apt update
-apt install -y libstdc++-14-dev
+# apt update
+# apt install -y libstdc++-14-dev
-elif [ $ARCH == "aarch64" ]; then
+# elif [ $ARCH == "aarch64" ]; then
-# to be moved to docker/common/ scripts
-wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-sbsa/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb && \
-dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb && \
-rm cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb
+# # to be moved to docker/common/ scripts
+# wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-sbsa/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb && \
+# dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb && \
+# rm cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb
-wget https://github.com/Kitware/CMake/releases/download/v4.0.3/cmake-4.0.3-linux-aarch64.sh && \
-bash cmake-4.0.3-linux-aarch64.sh --skip-license --prefix=/usr/local/cmake --exclude-subdir
+# wget https://github.com/Kitware/CMake/releases/download/v4.0.3/cmake-4.0.3-linux-aarch64.sh && \
+# bash cmake-4.0.3-linux-aarch64.sh --skip-license --prefix=/usr/local/cmake --exclude-subdir
-apt update
-# fix LLVM build
-apt install -y libstdc++-14-dev
+# apt update
+# # fix LLVM build
+# apt install -y libstdc++-14-dev
-else
-echo "Unsupported architecture: $ARCH"
-exit 1
-fi
+# else
+# echo "Unsupported architecture: $ARCH"
+# exit 1
+# fi
+# # wait for https://github.com/NVIDIA/TensorRT-LLM/pull/6588
+# pip install deep_gemm@git+https://github.com/VALLIS-NERIA/DeepGEMM.git@97d97a20c2ecd53a248ab64242219d780cf822b8 --no-build-isolation
-# wait for new triton to be published
-cd /usr/local/lib/python3.12/dist-packages/ && \
-ls -la | grep pytorch_triton && \
-mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
-cd triton-3.3.1+gitc8757738.dist-info && \
-echo "Current directory: $(pwd)" && \
-echo "Files in directory:" && \
-ls -la && \
-sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
-sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
-echo "METADATA after update:" && \
-grep "^Name:" METADATA
-# pip install git+https://github.com/triton-lang/triton.git@main
+# # wait for new triton to be published
+# cd /usr/local/lib/python3.12/dist-packages/ && \
+# ls -la | grep pytorch_triton && \
+# mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
+# cd triton-3.3.1+gitc8757738.dist-info && \
+# echo "Current directory: $(pwd)" && \
+# echo "Files in directory:" && \
+# ls -la && \
+# sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
+# sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
+# echo "METADATA after update:" && \
+# grep "^Name:" METADATA
+# # pip install git+https://github.com/triton-lang/triton.git@main
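For context, the effect of the dist-info rename above (now disabled along with the rest of the script) is to make pip report the NGC image's pytorch_triton build under the distribution name triton, so packages that declare a triton dependency resolve against the preinstalled copy. A minimal post-rename sanity check, as a sketch assuming the Python 3.12 dist-packages layout used by the script:

# Confirm pip resolves the renamed distribution as "triton"
pip3 show triton | grep -E '^(Name|Version):'
# Confirm the module itself still imports and reports its version
python3 -c "import triton; print(triton.__version__)"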

View File

@@ -1,8 +1,8 @@
 # Multi-stage Dockerfile
 ARG BASE_IMAGE=gitlab-master.nvidia.com:5005/dl/dgx/pytorch
 ARG TRITON_IMAGE=gitlab-master.nvidia.com:5005/dl/dgx/tritonserver
-ARG BASE_TAG=25.08-py3.32224057-base
-ARG TRITON_BASE_TAG=25.08-RC-py3.32078257
+ARG BASE_TAG=25.08-py3.32674667-devel
+ARG TRITON_BASE_TAG=25.08-py3.32978230
 ARG DEVEL_IMAGE=devel
 FROM ${BASE_IMAGE}:${BASE_TAG} AS base
@@ -74,13 +74,29 @@ ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
 RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
 RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
-COPY bringup_fix.sh bringup_fix.sh
-RUN bash ./bringup_fix.sh && rm bringup_fix.sh
+# COPY bringup_fix.sh bringup_fix.sh
+# RUN bash ./bringup_fix.sh && rm bringup_fix.sh
 # WARs against security issues inherited from pytorch:25.06
 # * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
 RUN pip3 install --upgrade --no-cache-dir \
 "protobuf>=4.25.8"
+# wait for new triton to be published
+# Rename pytorch_triton package to triton
+RUN cd /usr/local/lib/python3.12/dist-packages/ && \
+ls -la | grep pytorch_triton && \
+mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
+cd triton-3.3.1+gitc8757738.dist-info && \
+echo "Current directory: $(pwd)" && \
+echo "Files in directory:" && \
+ls -la && \
+sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
+sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
+echo "METADATA after update:" && \
+grep "^Name:" METADATA
 FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton
 FROM devel AS tritondevel
@@ -90,6 +106,8 @@ COPY --from=triton /opt/tritonserver/lib /opt/tritonserver/lib
 COPY --from=triton /opt/tritonserver/include /opt/tritonserver/include
 COPY --from=triton /opt/tritonserver/bin /opt/tritonserver/bin
 COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches
+# WAR: datacenter-gpu-manager 4.4.0 is not available in the apt repository
+COPY --from=triton /usr/lib/*-linux-gnu/libdcgm.so* /tmp/
 COPY docker/common/install_triton.sh install_triton.sh
 RUN bash ./install_triton.sh && rm install_triton.sh
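A quick way to confirm the protobuf WAR took effect in a built image is to assert the installed version against the >=4.25.8 floor pinned above; a sketch, not part of the Dockerfile:

# Assert the GHSA-8qvm-5x2c-j2w7 floor from the Dockerfile is satisfied
python3 - <<'EOF'
from importlib.metadata import version
v = version("protobuf")
assert tuple(map(int, v.split(".")[:3])) >= (4, 25, 8), f"protobuf too old: {v}"
print(f"protobuf {v} OK")
EOF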

View File

@@ -56,6 +56,7 @@ init_ubuntu() {
 llvm \
 libclang-rt-dev \
 libffi-dev \
+libstdc++-14-dev \
 libnuma1 \
 libnuma-dev \
 python3-dev \
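The new libstdc++-14-dev entry mirrors the "fix LLVM build" workaround in the bring-up script: clang picks the highest-versioned GCC installation it can find, so the matching libstdc++ headers have to be present. A quick check, as a sketch that assumes clang is installed in the image:

# Confirm the package landed
dpkg -s libstdc++-14-dev | grep -E '^(Package|Version):'
# clang reports which GCC installation it selected; it should now find GCC 14
clang++ -v -E -x c++ /dev/null 2>&1 | grep 'Selected GCC installation'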

View File

@@ -3,7 +3,7 @@
 set -ex
 ARCH=$(uname -m)
-CMAKE_VERSION="3.30.2"
+CMAKE_VERSION="4.0.3"
 GITHUB_URL="https://github.com"
 if [ -n "${GITHUB_MIRROR}" ]; then
 GITHUB_URL=${GITHUB_MIRROR}
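CMake 4.x drops compatibility with cmake_minimum_required(VERSION) values below 3.5, so the jump from 3.30.2 can surface errors in older third-party subprojects, and a post-install version check is a cheap guard. A sketch, where the /usr/local/cmake prefix is borrowed from the bring-up script above and is an assumption for this installer:

# Confirm the expected CMake is first on PATH (prefix assumed, see lead-in)
export PATH=/usr/local/cmake/bin:$PATH
cmake --version | head -n1   # expect: cmake version 4.0.3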

View File

@@ -5,7 +5,7 @@ set -ex
 # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
 # CUDA version is usually aligned with the latest NGC CUDA image tag.
 # Only use when public CUDA image is not ready.
-CUDA_VER="12.9.1_575.57.08"
+CUDA_VER="13.0.0_580.65.06"
 CUDA_VER_SHORT="${CUDA_VER%_*}"
 NVCC_VERSION_OUTPUT=$(nvcc --version)
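The ${CUDA_VER%_*} expansion just below the bumped line strips the driver suffix from the combined version string, which is the toolkit version the run-file path needs; for example:

# %_* removes the shortest trailing "_..." suffix
CUDA_VER="13.0.0_580.65.06"
echo "${CUDA_VER%_*}"   # prints 13.0.0, the toolkit version without the 580.65.06 driver part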

View File

@@ -2,24 +2,23 @@
 set -ex
-TRT_VER="10.11.0.33"
+TRT_VER="10.13.2.6"
 # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-06.html#rel-25-06
-CUDA_VER="12.9" # 12.9.1
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-08.html#rel-25-08
+CUDA_VER="13.0" # 13.0.0
 # Keep the installation for cuDNN if users want to install PyTorch with source codes.
 # PyTorch 2.x can compile with cuDNN v9.
-CUDNN_VER="9.10.2.21-1"
-# NGC PyTorch 25.06 image uses NCCL 2.27.3, while NCCL 2.27.5 resolves a perf regression issue.
-# Use NCCL version 2.27.5 instead.
-NCCL_VER="2.27.5-1+cuda12.9"
-# NGC PyTorch 25.06 image uses cuBLAS 12.9.1.4, but which leads to failures with MoE Lora (see https://nvbugs/5376270).
-# Continue using cuBLAS 12.9.0.13 until this issue is resolved.
-CUBLAS_VER="12.9.0.13-1"
+CUDNN_VER="9.12.0.42-1"
+# The NGC PyTorch 25.05 image shipped NCCL 2.26.x, which has a performance regression.
+# Use NCCL 2.27.6, which carries the fix.
+NCCL_VER="2.27.6-1+cuda13.0"
+# Use cuBLAS 13.0.0.19 to match CUDA 13.0.
+CUBLAS_VER="13.0.0.19-1"
 # Align with the pre-installed CUDA / NVCC / NVRTC versions from
 # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
-NVRTC_VER="12.9.86-1"
-CUDA_RUNTIME="12.9.79-1"
-CUDA_DRIVER_VERSION="575.57.08-1.el8"
+NVRTC_VER="13.0.48-1"
+CUDA_RUNTIME="13.0.37-1"
+CUDA_DRIVER_VERSION="580.65.06-1.el8"
 for i in "$@"; do
 case $i in
@@ -41,39 +40,44 @@ fi
 install_ubuntu_requirements() {
 apt-get update && apt-get install -y --no-install-recommends gnupg2 curl ca-certificates
 ARCH=$(uname -m)
-if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
-if [ "$ARCH" = "aarch64" ];then ARCH="sbsa";fi
+ARCH2="amd64"
+if [ "$ARCH" = "amd64" ];then ARCH="x86_64" && ARCH2="amd64";fi
+if [ "$ARCH" = "aarch64" ];then ARCH="sbsa" && ARCH2="arm64";fi
 curl -fsSLO https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2404/${ARCH}/cuda-keyring_1.1-1_all.deb
 dpkg -i cuda-keyring_1.1-1_all.deb
 rm cuda-keyring_1.1-1_all.deb
-apt-get update
-# if [[ $(apt list --installed | grep libcudnn9) ]]; then
-# apt-get remove --purge -y libcudnn9*
-# fi
-# if [[ $(apt list --installed | grep libnccl) ]]; then
-# apt-get remove --purge -y --allow-change-held-packages libnccl*
-# fi
-# if [[ $(apt list --installed | grep libcublas) ]]; then
-# apt-get remove --purge -y --allow-change-held-packages libcublas*
-# fi
-# if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
-# apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
-# fi
+wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-${ARCH}/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_${ARCH2}.deb && \
+dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_${ARCH2}.deb && \
+rm cuda-nvrtc-dev-13-0_13.0.48-1_${ARCH2}.deb
-# CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
-# NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
+# apt-get update
+# if [[ $(apt list --installed | grep libcudnn9) ]]; then
+# apt-get remove --purge -y libcudnn9*
+# fi
+# if [[ $(apt list --installed | grep libnccl) ]]; then
+# apt-get remove --purge -y --allow-change-held-packages libnccl*
+# fi
+# if [[ $(apt list --installed | grep libcublas) ]]; then
+# apt-get remove --purge -y --allow-change-held-packages libcublas*
+# fi
+# if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
+# apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
+# fi
-# apt-get install -y --no-install-recommends \
-# libcudnn9-cuda-12=${CUDNN_VER} \
-# libcudnn9-dev-cuda-12=${CUDNN_VER} \
-# libcudnn9-headers-cuda-12=${CUDNN_VER} \
-# libnccl2=${NCCL_VER} \
-# libnccl-dev=${NCCL_VER} \
-# libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
-# libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
-# cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
+# CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
+# NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
+# apt-get install -y --no-install-recommends \
+# libcudnn9-cuda-13=${CUDNN_VER} \
+# libcudnn9-dev-cuda-13=${CUDNN_VER} \
+# libcudnn9-headers-cuda-13=${CUDNN_VER} \
+# libnccl2=${NCCL_VER} \
+# libnccl-dev=${NCCL_VER} \
+# libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
+# libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
+# cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
 apt-get clean
 rm -rf /var/lib/apt/lists/*
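The new ARCH/ARCH2 pair exists because the same machine is named differently in each namespace this function touches: uname -m reports x86_64 or aarch64, NVIDIA's repo paths use x86_64 or sbsa, and Debian package filenames use amd64 or arm64. A standalone sketch of that mapping:

# Map uname -m output onto the two naming schemes used above:
#   ARCH  -> NVIDIA repo directory name (x86_64 / sbsa)
#   ARCH2 -> Debian package architecture (amd64 / arm64)
case "$(uname -m)" in
x86_64|amd64)  ARCH="x86_64"; ARCH2="amd64" ;;
aarch64|arm64) ARCH="sbsa";   ARCH2="arm64" ;;
*) echo "Unsupported architecture: $(uname -m)" >&2; exit 1 ;;
esac
echo "repo arch: ${ARCH}, deb arch: ${ARCH2}"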
@@ -92,7 +96,7 @@ install_rockylinux_requirements() {
 "libnccl-devel-${NCCL_VER}.${ARCH1}" \
 "cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.${ARCH1}" \
 "cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch" \
-"cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch" \
+"cuda-toolkit-13-config-common-${CUDA_RUNTIME}.noarch" \
 "cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch" \
 "libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.${ARCH1}" \
 "libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.${ARCH1}"; do
@@ -108,7 +112,7 @@ install_rockylinux_requirements() {
 libnccl-devel-${NCCL_VER}.${ARCH1}.rpm \
 cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.${ARCH1}.rpm \
 cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm \
-cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm \
+cuda-toolkit-13-config-common-${CUDA_RUNTIME}.noarch.rpm \
 cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm \
 libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.${ARCH1}.rpm \
 libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.${ARCH1}.rpm
@@ -130,15 +134,16 @@ install_tensorrt() {
 if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
 if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
 if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
+if [ "$ARCH" = "x86_64" ]; then
+RELEASE_URL_TRT="http://cuda-repo/release-candidates/Libraries/TensorRT/v10.14/10.14.0.19-6374d0f7/13.0-r580/Linux-x64-manylinux_2_28/tar/TensorRT-10.14.0.19.Linux.x86_64-gnu.cuda-13.0.tar.gz"
+else
+RELEASE_URL_TRT="http://cuda-repo/release-candidates/Libraries/TensorRT/v10.14/10.14.0.19-6374d0f7/13.0-r580/Linux-aarch64-manylinux_2_35/tar/TensorRT-10.14.0.19.Ubuntu-22.04.aarch64-gnu.cuda-13.0.tar.gz"
+fi
-RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_SHORT}/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
 fi
-wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
+# Download TensorRT (6GB file, needs longer timeout)
+echo "Downloading TensorRT from: ${RELEASE_URL_TRT}"
+if [ "$ARCH" = "x86_64" ];then
+curl -L --insecure --connect-timeout 600 --max-time 3600 --retry 3 -o /tmp/TensorRT.tar "${RELEASE_URL_TRT}"
+else
+wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
+fi
 tar -xf /tmp/TensorRT.tar -C /usr/local/
 mv /usr/local/TensorRT-* /usr/local/tensorrt
 pip3 install --no-cache-dir /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
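The x86_64 path switches to curl mainly for its timeout and retry knobs: --connect-timeout 600 bounds connection setup, --max-time 3600 bounds the whole multi-gigabyte transfer, and --retry 3 re-attempts transient failures. A variant that can also resume an interrupted transfer; the -C - and --retry-delay flags are additions for illustration, not part of the script above:

# Resume-capable download sketch; -C - continues a partial /tmp/TensorRT.tar
curl -L --insecure --connect-timeout 600 --max-time 3600 \
--retry 3 --retry-delay 10 -C - \
-o /tmp/TensorRT.tar "${RELEASE_URL_TRT}"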
@@ -162,7 +167,7 @@ case "$ID" in
 install_tensorrt
 ;;
 rocky)
-install_rockylinux_requirements
+# install_rockylinux_requirements
 install_tensorrt
 ;;
 *)

View File

@@ -21,10 +21,17 @@ install_triton_deps() {
 python3-build \
 libb64-dev \
 libarchive-dev \
-datacenter-gpu-manager=1:3.3.6 \
 && install_boost \
 && apt-get clean \
 && rm -rf /var/lib/apt/lists/*
+# Copy /tmp/libdcgm.so* files back to /usr/lib/<arch>-linux-gnu/
+if [ -d /usr/lib/x86_64-linux-gnu ]; then
+cp -f /tmp/libdcgm.so* /usr/lib/x86_64-linux-gnu/ || true
+elif [ -d /usr/lib/aarch64-linux-gnu ]; then
+cp -f /tmp/libdcgm.so* /usr/lib/aarch64-linux-gnu/ || true
+else
+echo "Target /usr/lib directory for architecture not found, skipping libdcgm.so* copy"
+fi
 }
# Install Triton only if base image is Ubuntu
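Since datacenter-gpu-manager can no longer be apt-installed here (the 4.4.0 WAR noted in the Dockerfile), the library travels through /tmp between build stages instead. A post-copy check that the dynamic loader can actually resolve it, as a sketch:

# Refresh the linker cache and confirm libdcgm is resolvable
ldconfig
ldconfig -p | grep libdcgm || echo "libdcgm not registered with the loader" >&2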

View File

@@ -2097,7 +2097,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb")
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb")
 trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update")
-trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get -y install cuda-toolkit-12-9")
+trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get -y install cuda-toolkit-13-0")
 }
 // Extra PyTorch CUDA 12.8 install for SBSA platform and Blackwell GPUs bare-metal environments
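Before the retry wrapper spends attempts on the install step, it can be worth confirming the keyring-configured repo actually serves the new toolkit metapackage for the platform; a manual pre-flight on a test node might look like this sketch:

# Does the configured CUDA repo offer the 13.0 toolkit for this platform?
apt-get update
apt-cache policy cuda-toolkit-13-0   # a Candidate other than "(none)" means the repo serves it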

View File

@@ -11,7 +11,9 @@
 #
 # NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
 # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
-LLM_DOCKER_IMAGE=gitlab-master.nvidia.com:5005/xiweny/images:gb110_bringup_x86_64
-LLM_SBSA_DOCKER_IMAGE=gitlab-master.nvidia.com:5005/xiweny/images:gb110_bringup_sbsa
+LLM_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090
+LLM_SBSA_DOCKER_IMAGE_12_9=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090
+LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:x86_64-tritondevel-torch_skip-a9bc5c5-user_zhanruis_update_dlfw_and_cu13-656
+LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm-staging/tritondevel:sbsa-tritondevel-torch_skip-a9bc5c5-user_zhanruis_update_dlfw_and_cu13-656
 LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508051130-6090
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508051130-6090

View File

@@ -3,7 +3,7 @@
 accelerate>=0.25.0
 build
 colored
-cuda-python # Do not override the custom version of cuda-python installed in the NGC PyTorch image.
+cuda-python>=12,<13
 diffusers>=0.27.0
 lark
 mpi4py
@@ -26,8 +26,8 @@ tensorrt
 torch>=2.7.1,<=2.8.0a0
 torchvision
 nvidia-modelopt[torch]~=0.33.0
-nvidia-nccl-cu12
-nvidia-cuda-nvrtc-cu12
+nvidia-nccl-cu13
+nvidia-cuda-nvrtc-cu13
 transformers==4.53.1
 pydantic>=2.9.1
 pydantic-settings[yaml]
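The -cu12 to -cu13 suffix change follows NVIDIA's convention of publishing separate wheels per CUDA major version, so an upgraded environment can silently keep stale cu12 wheels alongside the new ones. A small audit sketch:

# Flag any CUDA-12-flavored NVIDIA wheels left behind after the cu13 migration
pip3 list 2>/dev/null | grep -E 'nvidia-.*-cu12' \
&& echo "stale cu12 wheels found" >&2 \
|| echo "no cu12 wheels present"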