Upgrade to new base image and new TRT; fix many dependency issues

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Xiwen Yu 2025-06-17 17:16:57 +08:00
parent 3036d49071
commit 303604f82d
11 changed files with 93 additions and 39 deletions

3rdparty/json vendored

@@ -1 +1 @@
Subproject commit bc889afb4c5bf1c0d8ee29ef35eaaf4c8bef8a5d
Subproject commit 55f93686c01528224f448c19128836e7df245f72
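
A minimal sketch for syncing this submodule bump locally (plain git, nothing repo-specific assumed):

# Check out the new nlohmann/json pointer referenced above
git submodule sync 3rdparty/json
git submodule update --init 3rdparty/json
git -C 3rdparty/json rev-parse HEAD   # expect 55f93686c01528224f448c19128836e7df245f72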

bringup_fix.sh Normal file

@@ -0,0 +1,47 @@
ARCH=$(uname -m)
if [ "$ARCH" = "x86_64" ]; then
wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-x86_64/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb && \
dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb && \
rm cuda-nvrtc-dev-13-0_13.0.48-1_amd64.deb
wget https://github.com/Kitware/CMake/releases/download/v4.0.3/cmake-4.0.3-linux-x86_64.sh && \
bash cmake-4.0.3-linux-x86_64.sh --skip-license --prefix=/usr/local/cmake --exclude-subdir
apt update
apt remove -y ibverbs-providers libibverbs1
apt install -y libibverbs-dev
apt install -y libstdc++-14-dev
elif [ "$ARCH" = "aarch64" ]; then
wget https://urm.nvidia.com/artifactory/sw-gpu-cuda-installer-generic-local/packaging/r13.0/cuda_nvrtc/linux-sbsa/13.0.48/cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb && \
dpkg -i cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb && \
rm cuda-nvrtc-dev-13-0_13.0.48-1_arm64.deb
wget https://github.com/Kitware/CMake/releases/download/v4.0.3/cmake-4.0.3-linux-aarch64.sh && \
bash cmake-4.0.3-linux-aarch64.sh --skip-license --prefix=/usr/local/cmake --exclude-subdir
apt update
apt remove -y ibverbs-providers libibverbs1
apt install -y libibverbs-dev
apt install -y libstdc++-14-dev
else
echo "Unsupported architecture: $ARCH"
exit 1
fi
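# The base image ships Triton under the "pytorch_triton" project name; the block
# below renames the dist-info directory and patches METADATA/RECORD so pip
# reports the package as "triton".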
cd /usr/local/lib/python3.12/dist-packages/ && \
ls -la | grep pytorch_triton && \
mv pytorch_triton-3.3.1+gitc8757738.dist-info triton-3.3.1+gitc8757738.dist-info && \
cd triton-3.3.1+gitc8757738.dist-info && \
echo "Current directory: $(pwd)" && \
echo "Files in directory:" && \
ls -la && \
sed -i 's/^Name: pytorch-triton/Name: triton/' METADATA && \
sed -i 's|pytorch_triton-3.3.1+gitc8757738.dist-info/|triton-3.3.1+gitc8757738.dist-info/|g' RECORD && \
echo "METADATA after update:" && \
grep "^Name:" METADATA
# pip install git+https://github.com/triton-lang/triton.git@main
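
A quick sanity check after the script runs, assuming the same python3.12 environment that the hard-coded dist-packages path above targets (a hedged sketch, not from the commit itself):

# pip should now resolve the package under its canonical name
pip3 show triton | grep -E '^(Name|Version):'
# and the module should still import from the original install
python3 -c "import triton; print(triton.__version__)"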


@@ -472,7 +472,6 @@ print(os.path.dirname(torch.__file__),end='');"
endif()
endif()
endif()
else()
if(NOT WIN32)
if(NOT USE_CXX11_ABI)


@@ -127,6 +127,8 @@ ExternalProject_Add(
${DEEP_EP_SOURCE_DIR}/third-party/nvshmem.patch
COMMAND sed "s/TRANSPORT_VERSION_MAJOR 3/TRANSPORT_VERSION_MAJOR 103/" -i
src/CMakeLists.txt
COMMAND sed "s/_STANDARD 11/_STANDARD 17/" -i src/device/CMakeLists.txt
COMMAND sed "s/_STANDARD 11/_STANDARD 17/" -i src/CMakeLists.txt
COMMAND patch -p1 --forward --batch -i
${CMAKE_CURRENT_SOURCE_DIR}/nvshmem_fast_build.patch
CMAKE_CACHE_ARGS
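
The two added sed lines bump nvshmem's CMake C++ standard from 11 to 17 in both listed files. A hedged way to confirm the in-tree edits landed before the build, run from the nvshmem source directory:

grep -n "_STANDARD" src/CMakeLists.txt src/device/CMakeLists.txt
grep -n "TRANSPORT_VERSION_MAJOR" src/CMakeLists.txt   # should read 103 after the earlier sed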


@@ -63,9 +63,9 @@ DataType Tensor::getDataType() const
case nvinfer1::DataType::kBF16: return DataType::kBF16;
case nvinfer1::DataType::kINT64: return DataType::kINT64;
case nvinfer1::DataType::kINT4: [[fallthrough]] /* do nothing */;
case nvinfer1::DataType::kFP4: /* do nothing */;
case nvinfer1::DataType::kFP4: [[fallthrough]] /* do nothing */;
default: TLLM_THROW("Unsupported data type");
}
TLLM_THROW("Unsupported data type");
}
MemoryType Tensor::getMemoryType() const


@@ -101,9 +101,9 @@ char const* IBuffer::getDataTypeName(DataType dataType)
case nvinfer1::DataType::kINT8: return DataTypeTraits<nvinfer1::DataType::kINT8>::name;
case nvinfer1::DataType::kFP8: return DataTypeTraits<nvinfer1::DataType::kFP8>::name;
case nvinfer1::DataType::kINT4: [[fallthrough]] /* do nothing */;
case nvinfer1::DataType::kFP4: /* do nothing */;
case nvinfer1::DataType::kFP4: [[fallthrough]] /* do nothing */;
default: TLLM_THROW("Unknown data type");
}
TLLM_THROW("Unknown data type");
}
char const* IBuffer::getDataTypeName() const


@@ -1,7 +1,7 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_IMAGE=gitlab-master.nvidia.com:5005/dl/dgx/pytorch
ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_TAG=25.06-py3
ARG BASE_TAG=25.08-py3.32224057-base
ARG TRITON_BASE_TAG=25.06-py3
ARG DEVEL_IMAGE=devel
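
Since the image coordinates stay behind build args, the previous NGC base remains selectable at build time. A sketch (the Dockerfile path and target name are assumptions, since this view dropped the filename):

# Build against the old nvcr.io base instead of the GitLab bring-up image
# (Dockerfile path and target assumed for illustration)
docker build -f docker/Dockerfile.multi \
    --build-arg BASE_IMAGE=nvcr.io/nvidia/pytorch \
    --build-arg BASE_TAG=25.06-py3 \
    --target devel -t tensorrt-llm:devel .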
@@ -74,8 +74,10 @@ ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
COPY bringup_fix.sh bringup_fix.sh
RUN bash ./bringup_fix.sh && rm bringup_fix.sh
# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
RUN pip3 install --upgrade --no-cache-dir \
"protobuf>=4.25.8"
@@ -103,7 +105,6 @@ RUN bash ./install_nixl.sh && rm install_nixl.sh
COPY docker/common/install_etcd.sh install_etcd.sh
RUN bash ./install_etcd.sh && rm install_etcd.sh
FROM ${DEVEL_IMAGE} AS wheel
WORKDIR /src/tensorrt_llm
COPY benchmarks benchmarks


@@ -49,31 +49,31 @@ install_ubuntu_requirements() {
rm cuda-keyring_1.1-1_all.deb
apt-get update
if [[ $(apt list --installed | grep libcudnn9) ]]; then
apt-get remove --purge -y libcudnn9*
fi
if [[ $(apt list --installed | grep libnccl) ]]; then
apt-get remove --purge -y --allow-change-held-packages libnccl*
fi
if [[ $(apt list --installed | grep libcublas) ]]; then
apt-get remove --purge -y --allow-change-held-packages libcublas*
fi
if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
fi
# if [[ $(apt list --installed | grep libcudnn9) ]]; then
# apt-get remove --purge -y libcudnn9*
# fi
# if [[ $(apt list --installed | grep libnccl) ]]; then
# apt-get remove --purge -y --allow-change-held-packages libnccl*
# fi
# if [[ $(apt list --installed | grep libcublas) ]]; then
# apt-get remove --purge -y --allow-change-held-packages libcublas*
# fi
# if [[ $(apt list --installed | grep cuda-nvrtc-dev) ]]; then
# apt-get remove --purge -y --allow-change-held-packages cuda-nvrtc-dev*
# fi
CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
# CUBLAS_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
# NVRTC_CUDA_VERSION=$(echo $CUDA_VER | sed 's/\./-/g')
apt-get install -y --no-install-recommends \
libcudnn9-cuda-12=${CUDNN_VER} \
libcudnn9-dev-cuda-12=${CUDNN_VER} \
libcudnn9-headers-cuda-12=${CUDNN_VER} \
libnccl2=${NCCL_VER} \
libnccl-dev=${NCCL_VER} \
libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
# apt-get install -y --no-install-recommends \
# libcudnn9-cuda-12=${CUDNN_VER} \
# libcudnn9-dev-cuda-12=${CUDNN_VER} \
# libcudnn9-headers-cuda-12=${CUDNN_VER} \
# libnccl2=${NCCL_VER} \
# libnccl-dev=${NCCL_VER} \
# libcublas-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
# libcublas-dev-${CUBLAS_CUDA_VERSION}=${CUBLAS_VER} \
# cuda-nvrtc-dev-${NVRTC_CUDA_VERSION}=${NVRTC_VER}
apt-get clean
rm -rf /var/lib/apt/lists/*
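
The purge/reinstall of cuDNN, NCCL, cuBLAS, and NVRTC is commented out rather than deleted, implying the new base image is expected to ship usable versions of these. A quick audit of what the base actually provides (hedged sketch):

# List the CUDA math/runtime libraries inherited from the base image
dpkg -l | grep -E 'libcudnn|libnccl|libcublas|cuda-nvrtc' | awk '{print $2, $3}'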
@@ -130,12 +130,17 @@ install_tensorrt() {
if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/${TRT_VER_SHORT}/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
if [ "$ARCH" = "x86_64" ]; then
RELEASE_URL_TRT="http://cuda-repo/release-candidates/Libraries/TensorRT/v10.14/10.14.0.19-6374d0f7/13.0-r580/Linux-x64-manylinux_2_28/tar/TensorRT-10.14.0.19.Linux.x86_64-gnu.cuda-13.0.tar.gz"
else
RELEASE_URL_TRT="http://cuda-repo/release-candidates/Libraries/TensorRT/v10.14/10.14.0.19-6374d0f7/13.0-r580/Linux-aarch64-manylinux_2_35/tar/TensorRT-10.14.0.19.Ubuntu-22.04.aarch64-gnu.cuda-13.0.tar.gz"
fi
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/
mv /usr/local/TensorRT-${TRT_VER} /usr/local/tensorrt
mv /usr/local/TensorRT-* /usr/local/tensorrt
pip3 install --no-cache-dir /usr/local/tensorrt/python/tensorrt-*-cp${PARSED_PY_VERSION}-*.whl
rm -rf /tmp/TensorRT.tar
echo 'export LD_LIBRARY_PATH=/usr/local/tensorrt/lib:$LD_LIBRARY_PATH' >> "${ENV}"
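
The new "mv /usr/local/TensorRT-*" relies on the tarball extracting exactly one top-level directory; with zero or multiple matches the glob misbehaves. A defensive variant (a sketch, equivalent when exactly one match exists):

# Fail loudly unless the tarball produced exactly one TensorRT-* directory
trt_dirs=(/usr/local/TensorRT-*)
if [ "${#trt_dirs[@]}" -ne 1 ] || [ ! -d "${trt_dirs[0]}" ]; then
    echo "expected exactly one TensorRT-* directory, found: ${trt_dirs[*]}" >&2
    exit 1
fi
mv "${trt_dirs[0]}" /usr/local/tensorrt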


@@ -11,7 +11,7 @@
#
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508051130-6090
LLM_DOCKER_IMAGE=gitlab-master.nvidia.com:5005/xiweny/images:gb110_bringup_x86_64
LLM_SBSA_DOCKER_IMAGE=gitlab-master.nvidia.com:5005/xiweny/images:gb110_bringup_sbsa
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202508051130-6090
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202508051130-6090
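
The CI images now come from a personal GitLab registry rather than urm; pulling them locally will typically require registry authentication first (hedged sketch):

docker login gitlab-master.nvidia.com:5005
docker pull gitlab-master.nvidia.com:5005/xiweny/images:gb110_bringup_x86_64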


@@ -21,7 +21,7 @@ pandas
h5py==3.12.1
StrEnum
sentencepiece>=0.1.99
tensorrt~=10.11.0
tensorrt
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-06.html#rel-25-06 uses 2.8.0a0.
torch>=2.7.1,<=2.8.0a0
torchvision
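
With tensorrt unpinned here, the resolved version is whatever wheel install_tensorrt.sh installed from the internal tarball (10.14.0.19 above). To confirm which build actually satisfies the requirement:

pip3 show tensorrt | grep -E '^(Version|Location):'
python3 -c "import tensorrt; print(tensorrt.__version__)"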


@@ -289,7 +289,7 @@ target_link_libraries(
FetchContent_Declare(
json
GIT_REPOSITORY https://github.com/nlohmann/json.git
GIT_TAG v3.11.2)
GIT_TAG v3.12.0)
FetchContent_MakeAvailable(json)