[Infra] - Update dependencies with NGC PyTorch 25.05 and TRT 10.11 (#4885)

Signed-off-by: qqiao <qqiao@nvidia.com>
Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Signed-off-by: Erin Ho <14718778+hchings@users.noreply.github.com>
Signed-off-by: Emma Qiao <qqiao@nvidia.com>
Co-authored-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com>
Co-authored-by: Erin Ho <14718778+hchings@users.noreply.github.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
Author: Emma Qiao
Date: 2025-06-17 23:48:34 +08:00, committed by GitHub
Commit: ff32caf4d7
Parent: dcf18c4bcf
17 changed files with 65 additions and 50 deletions

View File

@@ -1,7 +1,7 @@
version: "3.9"
services:
tensorrt_llm-dev:
image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792
image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885
network_mode: host
ipc: host
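
A quick way to exercise the updated image is to pull it and open a shell through the same compose service. A minimal sketch, assuming access to the internal urm.nvidia.com registry and that the service keeps the name `tensorrt_llm-dev` (the compose file path may differ in your checkout):

```bash
# Pull the 25.05 / TRT 10.11 dev image referenced by the compose file.
docker compose pull tensorrt_llm-dev
# Open an interactive shell in the updated container.
docker compose run --rm tensorrt_llm-dev bash
```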

View File

@@ -8,7 +8,7 @@ TensorRT-LLM
[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.9.0-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.10.0-green)](https://developer.nvidia.com/tensorrt)
[![trt](https://img.shields.io/badge/TRT-10.11.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.21.0rc2-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

View File

@@ -384,20 +384,22 @@ print(os.path.dirname(torch.__file__),end='');"
set(USE_SYSTEM_NVTX ON)
set(nvtx3_dir ${3RDPARTY_DIR}/NVTX/include)
find_package(Torch REQUIRED)
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
add_compile_options(${TORCH_CXX_FLAGS})
add_compile_definitions(TORCH_CUDA=1)
if(DEFINED USE_CXX11_ABI)
  parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
  if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
     AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL ${USE_CXX11_ABI})
    message(
      WARNING
        "The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
        "found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
        "setting will be discarded.")
if(DEFINED TORCH_CXX_FLAGS)
  message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
  add_compile_options(${TORCH_CXX_FLAGS})
  if(DEFINED USE_CXX11_ABI)
    parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
    if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
       AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL
           ${USE_CXX11_ABI})
      message(
        WARNING
          "The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
          "found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
          "setting will be discarded.")
    endif()
  endif()
endif()
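
The new `if(DEFINED TORCH_CXX_FLAGS)` guard only applies the flags when the installed libtorch actually reports them, so it helps to know what your local torch build exposes before configuring. A minimal sketch, assuming a pip- or conda-installed torch (`torch.compiled_with_cxx11_abi` and `torch.utils.cmake_prefix_path` are standard PyTorch attributes):

```bash
# CXX11 ABI setting baked into the installed libtorch (True/False).
python3 -c "import torch; print(torch.compiled_with_cxx11_abi())"
# CMake prefix path that find_package(Torch) can use.
python3 -c "import torch.utils; print(torch.utils.cmake_prefix_path)"
```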

View File

@@ -1,8 +1,8 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_TAG=25.04-py3
ARG TRITON_BASE_TAG=25.04-py3
ARG BASE_TAG=25.05-py3
ARG TRITON_BASE_TAG=25.05-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} AS base
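
Because the NGC tags are plain build args, a different release can be pinned at build time without editing the Dockerfile. A hedged sketch; the Dockerfile path and `devel` target are assumptions, and only `BASE_TAG` and `TRITON_BASE_TAG` come from the ARGs above:

```bash
# Hypothetical invocation; adjust the -f path and --target to your checkout.
docker build \
  --build-arg BASE_TAG=25.05-py3 \
  --build-arg TRITON_BASE_TAG=25.05-py3 \
  -f docker/Dockerfile.multi \
  --target devel \
  -t tensorrt-llm-devel:local .
```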

View File

@@ -4,8 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04
TORCH_VERSION="2.7.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05
TORCH_VERSION="2.7.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {
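
Once the script has run, a one-liner confirms that the installed wheel matches the pinned `TORCH_VERSION`; a minimal check:

```bash
# Expect a 2.7.1 build; wheel versions may carry a suffix such as +cu128.
python3 -c "import torch; print(torch.__version__, torch.version.cuda)"
```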

View File

@@ -2,17 +2,16 @@
set -ex
TRT_VER="10.10.0.31"
TRT_VER="10.11.0.33"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05
CUDA_VER="12.9" # 12.9.0
# Keep the cuDNN installation in case users want to build PyTorch from source.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.9.0.52-1"
# NCCL version 2.26.3 is used in the NGC PyTorch 25.04 image but is not publicly available.
# Use NCCL version 2.26.5 instead.
NCCL_VER="2.26.5-1+cuda12.9"
# cuBLAS version 12.9.0.2 is used in the NGC PyTorch 25.04 image but is not publicly available.
CUDNN_VER="9.10.1.4-1"
# NCCL version 2.26.x is used in the NGC PyTorch 25.05 image but has a performance regression.
# Use NCCL version 2.25.1 instead.
NCCL_VER="2.25.1-1+cuda12.8"
# Use cuBLAS version 12.9.0.13 instead.
CUBLAS_VER="12.9.0.13-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
@@ -68,6 +67,7 @@ install_ubuntu_requirements() {
apt-get install -y --no-install-recommends \
    libcudnn9-cuda-12=${CUDNN_VER} \
    libcudnn9-dev-cuda-12=${CUDNN_VER} \
    libcudnn9-headers-cuda-12=${CUDNN_VER} \
    libnccl2=${NCCL_VER} \
    libnccl-dev=${NCCL_VER} \
    libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
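
Since NCCL is deliberately pinned back to 2.25.1 while the rest of the stack moves to CUDA 12.9, it is worth confirming what actually landed on the system. A hedged sketch using the package names from the apt-get call above:

```bash
# Installed versions of the pinned libraries.
dpkg-query -W -f='${Package} ${Version}\n' libnccl2 libcudnn9-cuda-12
# NCCL version as seen by PyTorch (a tuple such as (2, 25, 1)).
python3 -c "import torch; print(torch.cuda.nccl.version())"
```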

View File

@@ -2,7 +2,7 @@
# Building from Source Code on Linux
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0, which uses the new CXX11 ABI.
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and later, which use the new CXX11 ABI.
## Prerequisites

View File

@@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
```bash
pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```
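
A short post-install sanity check (hedged; `tensorrt_llm.__version__` is populated from the `version.py` referenced by the release badge):

```bash
python3 -c "import tensorrt_llm; print(tensorrt_llm.__version__)"
```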

View File

@@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
```bash
# (Optional) pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
# (Optional) pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```

View File

@@ -142,9 +142,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
  - Software Compatibility
* - Container
  - [25.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
  - [25.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
  - [10.10](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
  - [10.11](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
  -
    - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

View File

@@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac
// Container configuration
// available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
// [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506111045-4792"
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506111045-4792"
LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202506051650-4885"
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202506051650-4885"
// TODO: Move common variables to a unified location
BUILD_CORES_REQUEST = "8"

View File

@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
// DLFW torch image
DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.04-py3"
DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.05-py3"
//Ubuntu base image
UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -1775,7 +1775,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
// Extra PyTorch CUDA 12.8 install
if (values[6]) {
    echo "###### Extra PyTorch CUDA 12.8 install Start ######"
    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
    trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
}
def libEnv = []

View File

@@ -1,7 +1,7 @@
import java.lang.InterruptedException
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
def createKubernetesPodConfig(image, arch = "amd64")
{

View File

@@ -21,9 +21,9 @@ pandas
h5py==3.12.1
StrEnum
sentencepiece>=0.1.99
tensorrt~=10.10.0
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04 uses 2.7.0a0.
torch>=2.7.0a0,<=2.7.0
tensorrt~=10.11.0
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0.
torch>=2.7.1,<=2.8.0a0
torchvision
nvidia-modelopt[torch]~=0.31.0
nvidia-nccl-cu12
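
To check whether an existing environment already satisfies the new pins before rebuilding, a minimal sketch (assumes `tensorrt` and `torch` are importable):

```bash
# Expect tensorrt 10.11.x and torch within [2.7.1, 2.8.0a0].
python3 -c "import tensorrt, torch; print(tensorrt.__version__, torch.__version__)"
```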

View File

@@ -425,3 +425,10 @@ triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-en
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5336321)
examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1] SKIP (https://nvbugs/5344070)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8] SKIP (https://nvbugs/5343850)
examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849)
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818)
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5345215)

View File

@@ -73,3 +73,9 @@ def pytest_collection_modifyitems(session, config, items):
    # it into the appropriate test suite.
    for item in items:
        item._nodeid = f"{test_prefix}/{item._nodeid}"

def pytest_sessionstart(session):
    # Eagerly import to counter TransformerEngine v2.3's lazy_compile deferral,
    # which would otherwise make pytest think a thread had leaked.
    import torch._inductor.async_compile  # noqa: F401
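
The eager import presumably initializes torch.compile's async-compile machinery (and any worker threads it spawns) during session start rather than mid-test, where pytest's leak accounting would flag the threads. Whether the module path exists in a given torch build can be checked directly; a minimal probe:

```bash
python3 -c "import torch._inductor.async_compile; print('ok')"
```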

View File

@@ -809,20 +809,20 @@ if __name__ == "__main__":
input_ids[0]) + [
21221,
290,
373,
257,
4255,
379,
2888,
286,
262,
1957,
7072,
11,
4689,
347,
2852,
2564,
494,
4141,
2351,
10006,
13,
679,
373,
7018,
284,
262,
]
if FLAGS.num_return_sequences is None: