mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[Infra] - Update dependencies with NGC PyTorch 25.05 and TRT 10.11 (#4885)
Signed-off-by: qqiao <qqiao@nvidia.com> Signed-off-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com> Signed-off-by: Erin Ho <14718778+hchings@users.noreply.github.com> Signed-off-by: Emma Qiao <qqiao@nvidia.com> Co-authored-by: Iman Tabrizian <10105175+tabrizian@users.noreply.github.com> Co-authored-by: Erin Ho <14718778+hchings@users.noreply.github.com> Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
This commit is contained in:
parent
dcf18c4bcf
commit
ff32caf4d7
@ -1,7 +1,7 @@
|
||||
version: "3.9"
|
||||
services:
|
||||
tensorrt_llm-dev:
|
||||
image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792
|
||||
image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885
|
||||
network_mode: host
|
||||
ipc: host
|
||||
|
||||
|
||||
@ -8,7 +8,7 @@ TensorRT-LLM
|
||||
[](https://www.python.org/downloads/release/python-3123/)
|
||||
[](https://www.python.org/downloads/release/python-31012/)
|
||||
[](https://developer.nvidia.com/cuda-downloads)
|
||||
[](https://developer.nvidia.com/tensorrt)
|
||||
[](https://developer.nvidia.com/tensorrt)
|
||||
[](./tensorrt_llm/version.py)
|
||||
[](./LICENSE)
|
||||
|
||||
|
||||
@ -384,20 +384,22 @@ print(os.path.dirname(torch.__file__),end='');"
|
||||
set(USE_SYSTEM_NVTX ON)
|
||||
set(nvtx3_dir ${3RDPARTY_DIR}/NVTX/include)
|
||||
find_package(Torch REQUIRED)
|
||||
|
||||
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
|
||||
add_compile_options(${TORCH_CXX_FLAGS})
|
||||
add_compile_definitions(TORCH_CUDA=1)
|
||||
|
||||
if(DEFINED USE_CXX11_ABI)
|
||||
parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
|
||||
if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
|
||||
AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL ${USE_CXX11_ABI})
|
||||
message(
|
||||
WARNING
|
||||
"The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
|
||||
"found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
|
||||
"setting will be discarded.")
|
||||
if(DEFINED TORCH_CXX_FLAGS)
|
||||
message(STATUS "TORCH_CXX_FLAGS: ${TORCH_CXX_FLAGS}")
|
||||
add_compile_options(${TORCH_CXX_FLAGS})
|
||||
if(DEFINED USE_CXX11_ABI)
|
||||
parse_make_options(${TORCH_CXX_FLAGS} "TORCH_CXX_FLAGS")
|
||||
if(DEFINED TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI
|
||||
AND NOT ${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} EQUAL
|
||||
${USE_CXX11_ABI})
|
||||
message(
|
||||
WARNING
|
||||
"The libtorch compilation options _GLIBCXX_USE_CXX11_ABI=${TORCH_CXX_FLAGS__GLIBCXX_USE_CXX11_ABI} "
|
||||
"found by CMake conflict with the project setting USE_CXX11_ABI=${USE_CXX11_ABI}, and the project "
|
||||
"setting will be discarded.")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
|
||||
|
||||
@ -1,8 +1,8 @@
|
||||
# Multi-stage Dockerfile
|
||||
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
|
||||
ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
|
||||
ARG BASE_TAG=25.04-py3
|
||||
ARG TRITON_BASE_TAG=25.04-py3
|
||||
ARG BASE_TAG=25.05-py3
|
||||
ARG TRITON_BASE_TAG=25.05-py3
|
||||
ARG DEVEL_IMAGE=devel
|
||||
|
||||
FROM ${BASE_IMAGE}:${BASE_TAG} AS base
|
||||
|
||||
@ -4,8 +4,8 @@ set -ex
|
||||
|
||||
# Use latest stable version from https://pypi.org/project/torch/#history
|
||||
# and closest to the version specified in
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04
|
||||
TORCH_VERSION="2.7.0"
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05
|
||||
TORCH_VERSION="2.7.1"
|
||||
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
|
||||
|
||||
prepare_environment() {
|
||||
|
||||
@ -2,17 +2,16 @@
|
||||
|
||||
set -ex
|
||||
|
||||
TRT_VER="10.10.0.31"
|
||||
TRT_VER="10.11.0.33"
|
||||
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05
|
||||
CUDA_VER="12.9" # 12.9.0
|
||||
# Keep the cuDNN installation in case users want to build PyTorch from source code.
|
||||
# PyTorch 2.x can compile with cuDNN v9.
|
||||
CUDNN_VER="9.9.0.52-1"
|
||||
# NCCL version 2.26.3 used in the NGC PyTorch 25.04 image but not existing in public.
|
||||
# Use NCCL version 2.26.5 instead.
|
||||
NCCL_VER="2.26.5-1+cuda12.9"
|
||||
# cuBLAS version 12.9.0.2 used in the NGC PyTorch 25.04 image but not existing in public.
|
||||
CUDNN_VER="9.10.1.4-1"
|
||||
# NCCL version 2.26.x is used in the NGC PyTorch 25.05 image but has a performance regression issue.
|
||||
# Use NCCL version 2.25.1 instead.
|
||||
NCCL_VER="2.25.1-1+cuda12.8"
|
||||
# Use cuBLAS version 12.9.0.13 instead.
|
||||
CUBLAS_VER="12.9.0.13-1"
|
||||
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
|
||||
@ -68,6 +67,7 @@ install_ubuntu_requirements() {
|
||||
apt-get install -y --no-install-recommends \
|
||||
libcudnn9-cuda-12=${CUDNN_VER} \
|
||||
libcudnn9-dev-cuda-12=${CUDNN_VER} \
|
||||
libcudnn9-headers-cuda-12=${CUDNN_VER} \
|
||||
libnccl2=${NCCL_VER} \
|
||||
libnccl-dev=${NCCL_VER} \
|
||||
libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
|
||||
|
||||
@ -2,7 +2,7 @@
|
||||
|
||||
# Building from Source Code on Linux
|
||||
|
||||
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0, which uses the new CXX11 ABI.
|
||||
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which use the new CXX11 ABI.
|
||||
|
||||
|
||||
## Prerequisites
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
|
||||
|
||||
```bash
|
||||
pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
||||
pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
||||
|
||||
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
|
||||
```
|
||||
|
||||
@ -5,7 +5,7 @@
|
||||
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
|
||||
|
||||
```bash
|
||||
(Optional) pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
||||
(Optional) pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
|
||||
|
||||
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
|
||||
```
|
||||
|
||||
@ -142,9 +142,9 @@ The following table shows the supported software for TensorRT-LLM.
|
||||
* -
|
||||
- Software Compatibility
|
||||
* - Container
|
||||
- [25.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
|
||||
- [25.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
|
||||
* - TensorRT
|
||||
- [10.10](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
|
||||
- [10.11](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
|
||||
* - Precision
|
||||
-
|
||||
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4
|
||||
|
||||
@ -28,10 +28,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac
|
||||
// Container configuration
|
||||
// available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
|
||||
// [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
|
||||
LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
|
||||
LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
|
||||
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202506111045-4792"
|
||||
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202506111045-4792"
|
||||
LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
|
||||
LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
|
||||
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202506051650-4885"
|
||||
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202506051650-4885"
|
||||
|
||||
// TODO: Move common variables to a unified location
|
||||
BUILD_CORES_REQUEST = "8"
|
||||
|
||||
@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
|
||||
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312
|
||||
|
||||
// DLFW torch image
|
||||
DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.04-py3"
|
||||
DLFW_IMAGE = "nvcr.io/nvidia/pytorch:25.05-py3"
|
||||
|
||||
//Ubuntu base image
|
||||
UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
|
||||
@ -1775,7 +1775,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
|
||||
// Extra PyTorch CUDA 12.8 install
|
||||
if (values[6]) {
|
||||
echo "###### Extra PyTorch CUDA 12.8 install Start ######"
|
||||
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
|
||||
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.7.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128")
|
||||
}
|
||||
|
||||
def libEnv = []
|
||||
|
||||
@ -1,7 +1,7 @@
|
||||
|
||||
import java.lang.InterruptedException
|
||||
|
||||
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202506111045-4792"
|
||||
DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.05-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202506051650-4885"
|
||||
|
||||
def createKubernetesPodConfig(image, arch = "amd64")
|
||||
{
|
||||
|
||||
@ -21,9 +21,9 @@ pandas
|
||||
h5py==3.12.1
|
||||
StrEnum
|
||||
sentencepiece>=0.1.99
|
||||
tensorrt~=10.10.0
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-04.html#rel-25-04 uses 2.7.0a0.
|
||||
torch>=2.7.0a0,<=2.7.0
|
||||
tensorrt~=10.11.0
|
||||
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-05.html#rel-25-05 uses 2.8.0a0.
|
||||
torch>=2.7.1,<=2.8.0a0
|
||||
torchvision
|
||||
nvidia-modelopt[torch]~=0.31.0
|
||||
nvidia-nccl-cu12
|
||||
|
||||
@ -425,3 +425,10 @@ triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-en
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[ep4-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False] SKIP (https://nvbugs/5336321)
|
||||
examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1] SKIP (https://nvbugs/5344070)
|
||||
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8] SKIP (https://nvbugs/5343850)
|
||||
examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (https://nvbugs/5333849)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
|
||||
triton_server/test_triton.py::test_mllama[mllama] SKIP (https://nvbugs/5333818)
|
||||
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5333818)
|
||||
accuracy/test_cli_flow.py::TestGpt2::test_weight_streaming_ootb SKIP (https://nvbugs/5338552)
|
||||
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5345215)
|
||||
|
||||
@ -73,3 +73,9 @@ def pytest_collection_modifyitems(session, config, items):
|
||||
# it into the appropriate test suite.
|
||||
for item in items:
|
||||
item._nodeid = f"{test_prefix}/{item._nodeid}"
|
||||
|
||||
|
||||
def pytest_sessionstart(session):
|
||||
# To counter TransformerEngine v2.3's lazy_compile deferral,
|
||||
# which causes pytest to think there is a thread leak.
|
||||
import torch._inductor.async_compile # noqa: F401
|
||||
|
||||
@ -809,20 +809,20 @@ if __name__ == "__main__":
|
||||
input_ids[0]) + [
|
||||
21221,
|
||||
290,
|
||||
373,
|
||||
257,
|
||||
4255,
|
||||
379,
|
||||
2888,
|
||||
286,
|
||||
262,
|
||||
1957,
|
||||
7072,
|
||||
11,
|
||||
4689,
|
||||
347,
|
||||
2852,
|
||||
2564,
|
||||
494,
|
||||
4141,
|
||||
2351,
|
||||
10006,
|
||||
13,
|
||||
679,
|
||||
373,
|
||||
7018,
|
||||
284,
|
||||
262,
|
||||
]
|
||||
|
||||
if FLAGS.num_return_sequences is None:
|
||||
|
||||
Loading…
Reference in New Issue
Block a user