[Infra] - Update dependencies with NGC PyTorch 25.05 and TRT 10.11 (#4885)

Signed-off-by: qqiao <qqiao@nvidia.com> Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com> Signed-off-by: Erin Ho <14718778+hchings@users.noreply.github.com> Signed-off-by: Emma Qiao <qqiao@nvidia.com> Co-authored-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com> Co-authored-by: Erin Ho <14718778+hchings@users.noreply.github.com> Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
2026-01-13 22:18:36 +08:00 · 2025-06-17 23:48:34 +08:00 · 2025-06-17 23:48:34 +08:00 · ff32caf4d7
commit ff32caf4d7
parent dcf18c4bcf
17 changed files with 65 additions and 50 deletions
--- a/.devcontainer/docker-compose.yml
+++ b/.devcontainer/docker-compose.yml
@ -1,7 +1,7 @@
 version: "3.9"
 services:
  tensorrt_llm-dev:
-    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792
+    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885
    network_mode: host
    ipc: host

--- a/README.md
+++ b/README.md
@ -8,7 +8,7 @@ TensorRT-LLM
 [![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
 [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
 [![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
-[![trt](https://img.shields.io/badge/TRT-10.10.0-green)](https://developer.nvidia.com/tensorrt)
+[![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
 [![version](https://img.shields.io/badge/release-0.21.0rc2-green)](./tensorrt_llm/version.py)
 [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@ -384,20 +384,22 @@ print(os.path.dirname(torch.__file__),end='');"
  set(USE_SYSTEM_NVTX ON)
  set(nvtx3_dir ${3RDPARTY_DIR}/NVTX/include)
  find_package(Torch REQUIRED)
-
-  message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
-  add_compile_options(${TORCH_CXX_FLAGS})
  add_compile_definitions(TORCH_CUDA=1)

-  if(DEFINED USE_CXX11_ABI)
-    parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
-    if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
-       AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL ${USE_CXX11_ABI})
-      message(
-        WARNING
-          "The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
-          "found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
-          "setting will be discarded.")
+  if(DEFINED TORCH_CXX_FLAGS)
+    message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
+    add_compile_options(${TORCH_CXX_FLAGS})
+    if(DEFINED USE_CXX11_ABI)
+      parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
+      if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
+         AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL
+             ${USE_CXX11_ABI})
+        message(
+          WARNING
+            "The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
+            "found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
+            "setting will be discarded.")
+      endif()
    endif()
  endif()

--- a/docker/Dockerfile.multi
+++ b/docker/Dockerfile.multi
@ -1,8 +1,8 @@
 # Multi-stage Dockerfile
 ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
 ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
-ARG BASE_TAG=25.04-py3
-ARG TRITON_BASE_TAG=25.04-py3
+ARG BASE_TAG=25.05-py3
+ARG TRITON_BASE_TAG=25.05-py3
 ARG DEVEL_IMAGE=devel

 FROM ${BASE_IMAGE}:${BASE_TAG} AS base
--- a/docker/common/install_pytorch.sh
+++ b/docker/common/install_pytorch.sh
@ -4,8 +4,8 @@ set -ex

 # Use latest stable version from https://pypi.org/project/torch/#history
 # and closest to the version specified in
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04
-TORCH_VERSION="2.7.0"
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05
+TORCH_VERSION="2.7.1"
 SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')

 prepare_environment() {
--- a/docker/common/install_tensorrt.sh
+++ b/docker/common/install_tensorrt.sh
@ -2,17 +2,16 @@

 set -ex

-TRT_VER="10.10.0.31"
+TRT_VER="10.11.0.33"
 # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05
 CUDA_VER="12.9" # 12.9.0
 # Keep the installation for cuDNN if users want to install PyTorch with source codes.
 # PyTorch 2.x can compile with cuDNN v9.
-CUDNN_VER="9.9.0.52-1"
-# NCCL version 2.26.3 used in the NGC PyTorch 25.04 image but not existing in public.
-# Use NCCL version 2.26.5 instead.
-NCCL_VER="2.26.5-1+cuda12.9"
-# cuBLAS version 12.9.0.2 used in the NGC PyTorch 25.04 image but not existing in public.
+CUDNN_VER="9.10.1.4-1"
+# NCCL version 2.26.x is used in the NGC PyTorch 25.05 image but has a performance regression issue.
+# Use NCCL version 2.25.1 instead.
+NCCL_VER="2.25.1-1+cuda12.8"
 # Use cuBLAS version 12.9.0.13 instead.
 CUBLAS_VER="12.9.0.13-1"
 # Align with the pre-installed CUDA / NVCC / NVRTC versions from
@ -68,6 +67,7 @@ install_ubuntu_requirements() {
    apt-get install -y --no-install-recommends \
        libcudnn9-cuda-12=${CUDNN_VER} \
        libcudnn9-dev-cuda-12=${CUDNN_VER} \
+	libcudnn9-headers-cuda-12=${CUDNN_VER} \
        libnccl2=${NCCL_VER} \
        libnccl-dev=${NCCL_VER} \
        libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
--- a/docs/source/installation/build-from-source-linux.md
+++ b/docs/source/installation/build-from-source-linux.md
@ -2,7 +2,7 @@

 # Building from Source Code on Linux

-This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0, which uses the new CXX11 ABI.
+This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which use the new CXX11 ABI.


 ## Prerequisites
--- a/docs/source/installation/grace-hopper.md
+++ b/docs/source/installation/grace-hopper.md
@ -5,7 +5,7 @@
 1. Install TensorRT-LLM (tested on Ubuntu 24.04).

    ```bash
-    pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+    pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

    sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
    ```
--- a/docs/source/installation/linux.md
+++ b/docs/source/installation/linux.md
@ -5,7 +5,7 @@
 1. Install TensorRT-LLM (tested on Ubuntu 24.04).

    ```bash
-    (Optional) pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
+    (Optional) pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128

    sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
    ```
--- a/docs/source/reference/support-matrix.md
+++ b/docs/source/reference/support-matrix.md
@ -142,9 +142,9 @@ The following table shows the supported software for TensorRT-LLM.
 * -
  - Software Compatibility
 * - Container
-  - [25.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
+  - [25.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
 * - TensorRT
-  - [10.10](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
+  - [10.11](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
 * - Precision
  -
    - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4
--- a/jenkins/L0_MergeRequest.groovy
+++ b/jenkins/L0_MergeRequest.groovy
@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac
 // Container configuration
 // available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
 // [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
-LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
-LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506111045-4792"
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
+LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202506051650-4885"
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202506051650-4885"

 // TODO: Move common variables to an unified location
 BUILD_CORES_REQUEST = "8"
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312

 // DLFW torch image
-DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.04-py3"
+DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.05-py3"

 //Ubuntu base image
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@ -1775,7 +1775,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
                        // Extra PyTorch CUDA 12.8 install
                        if (values[6]) {
                            echo "###### Extra PyTorch CUDA 12.8 install Start ######"
-                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
+                            trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
                        }

                        def libEnv = []
--- a/jenkins/controlCCache.groovy
+++ b/jenkins/controlCCache.groovy
@ -1,7 +1,7 @@

 import java.lang.InterruptedException

-DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
+DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"

 def createKubernetesPodConfig(image, arch = "amd64")
 {
--- a/requirements.txt
+++ b/requirements.txt
@ -21,9 +21,9 @@ pandas
 h5py==3.12.1
 StrEnum
 sentencepiece>=0.1.99
-tensorrt~=10.10.0
-# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04 uses 2.7.0a0.
-torch>=2.7.0a0,<=2.7.0
+tensorrt~=10.11.0
+# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0.
+torch>=2.7.1,<=2.8.0a0
 torchvision
 nvidia-modelopt[torch]~=0.31.0
 nvidia-nccl-cu12
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@ -425,3 +425,10 @@ triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-en
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5336321)
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1] SKIP (https://nvbugs/5344070)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8] SKIP (https://nvbugs/5343850)
+examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849)
+examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
+examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
+triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818)
+examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
+accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552)
+accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5345215)
--- a/tests/unittest/conftest.py
+++ b/tests/unittest/conftest.py
@ -73,3 +73,9 @@ def pytest_collection_modifyitems(session, config, items):
        # it into the appropriate test suite.
        for item in items:
            item._nodeid = f"{test_prefix}/{item._nodeid}"
+
+
+def pytest_sessionstart(session):
+    # Import eagerly to counter TransformerEngine v2.3's lazy_compile deferral,
+    # which otherwise causes pytest to think there is a thread leak.
+    import torch._inductor.async_compile  # noqa: F401
--- a/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py
+++ b/triton_backend/inflight_batcher_llm/client/inflight_batcher_llm_client.py
@ -809,20 +809,20 @@ if __name__ == "__main__":
                               input_ids[0]) + [
                                   21221,
                                   290,
+                                   373,
                                   257,
-                                   4255,
-                                   379,
+                                   2888,
+                                   286,
                                   262,
-                                   1957,
-                                   7072,
-                                   11,
-                                   4689,
-                                   347,
-                                   2852,
-                                   2564,
-                                   494,
+                                   4141,
+                                   2351,
+                                   10006,
                                   13,
                                   679,
+                                   373,
+                                   7018,
+                                   284,
+                                   262,
                               ]

    if FLAGS.num_return_sequences is None: