# Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
# Synced 2026-01-13 22:18:36 +08:00
# Signed-off-by: ZhanruiSunCh <184402041+ZhanruiSunCh@users.noreply.github.com>
# Signed-off-by: Zhanrui Sun <184402041+ZhanruiSunCh@users.noreply.github.com>
# syntax=docker/dockerfile:1
# Multi-stage Dockerfile
# The syntax directive pins the BuildKit Dockerfile frontend; this file
# depends on BuildKit features (RUN --mount=type=cache / type=bind below).

# Base images and tags. Override via --build-arg to track new NGC releases.
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_TAG=25.10-py3
ARG TRITON_BASE_TAG=25.10-py3

# Stage the wheel/release stages build on top of ("devel" or "tritondevel").
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} AS base

# NVIDIA EULA and AI product terms labels (single LABEL instruction;
# identical metadata, one fewer layer).
LABEL com.nvidia.eula="https://www.nvidia.com/en-us/agreements/enterprise-software/nvidia-software-license-agreement/" \
      com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-software/product-specific-terms-for-ai-products/"

# Point both non-interactive sh (ENV) and bash (BASH_ENV) at shared startup
# files so environment tweaks apply to every shell invocation.
# https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html
ARG SH_ENV="/etc/shinit_v2"
ARG BASH_ENV="/etc/bash.bashrc"
ENV ENV=${SH_ENV} \
    BASH_ENV=${BASH_ENV}

# Optional mirror prefix for GitHub downloads (useful behind restricted networks).
ARG GITHUB_MIRROR=""
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"

ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"

# Subsequent RUN instructions use bash; the install scripts rely on bashisms.
SHELL ["/bin/bash", "-c"]
FROM base AS devel

# FIX: ARG values do not cross stage boundaries — an ARG declared in `base`
# goes out of scope at the end of that stage. Without these redeclarations,
# ${GITHUB_MIRROR} and ${PYTHON_VERSION} expand to empty strings in the RUN
# instructions below and --build-arg values never reach the install scripts.
# (Sibling stages `tritondevel` and `wheel` already redeclare GITHUB_MIRROR.)
ARG GITHUB_MIRROR=""
ARG PYTHON_VERSION="3.12.3"

#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_ALLOC_CONF="garbage_collection_threshold:0.99999"

# Copy all installation scripts at once to reduce layers
COPY docker/common/install.sh \
     docker/common/install_base.sh \
     docker/common/install_cmake.sh \
     docker/common/install_ccache.sh \
     docker/common/install_cuda_toolkit.sh \
     docker/common/install_tensorrt.sh \
     docker/common/install_polygraphy.sh \
     docker/common/install_mpi4py.sh \
     docker/common/install_pytorch.sh \
     docker/common/install_ucx.sh \
     docker/common/install_nixl.sh \
     docker/common/install_etcd.sh \
     ./

# Component versions forwarded to install.sh; TORCH_INSTALL_TYPE defaults to
# "skip" because the NGC base image already ships PyTorch.
ARG TRT_VER
ARG CUDA_VER
ARG CUDNN_VER
ARG NCCL_VER
ARG CUBLAS_VER
ARG TORCH_INSTALL_TYPE="skip"
# Run the combined installer, then remove only the scripts consumed here;
# install_ucx/nixl/etcd.sh are still needed by a later layer.
RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
    PYTHON_VERSION=${PYTHON_VERSION} \
    TRT_VER=${TRT_VER} \
    CUDA_VER=${CUDA_VER} \
    CUDNN_VER=${CUDNN_VER} \
    NCCL_VER=${NCCL_VER} \
    CUBLAS_VER=${CUBLAS_VER} \
    TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} \
    bash ./install.sh --base --cmake --ccache --cuda_toolkit --tensorrt --polygraphy --mpi4py --pytorch --opencv && \
    rm install_base.sh && \
    rm install_cmake.sh && \
    rm install_ccache.sh && \
    rm install_cuda_toolkit.sh && \
    rm install_tensorrt.sh && \
    rm install_polygraphy.sh && \
    rm install_mpi4py.sh && \
    rm install_pytorch.sh && \
    rm install.sh

# Copy and install dependencies from constraints.txt
COPY constraints.txt /tmp/constraints.txt
RUN pip3 install --no-cache-dir -r /tmp/constraints.txt && rm /tmp/constraints.txt

# Install UCX, NIXL, etcd
# TODO: Combine these into the main install.sh script
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_ucx.sh && \
    GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_nixl.sh && \
    bash ./install_etcd.sh && \
    rm install_ucx.sh && \
    rm install_nixl.sh && \
    rm install_etcd.sh

# Generate OSS attribution file for devel image
ARG TRT_LLM_VER
ARG TARGETARCH
COPY scripts/generate_container_oss_attribution.sh /tmp/generate_container_oss_attribution.sh
RUN bash /tmp/generate_container_oss_attribution.sh "devel" "${TRT_LLM_VER}" "${TARGETARCH}" && \
    rm /tmp/generate_container_oss_attribution.sh
# Donor stage: only used as a COPY --from source for Triton server artifacts.
FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton

FROM devel AS tritondevel

ARG GITHUB_MIRROR=""

# Pull the Triton server runtime pieces out of the donor stage.
COPY --from=triton /opt/tritonserver/backends/python /opt/tritonserver/backends/python
COPY --from=triton /opt/tritonserver/lib /opt/tritonserver/lib
COPY --from=triton /opt/tritonserver/include /opt/tritonserver/include
COPY --from=triton /opt/tritonserver/bin /opt/tritonserver/bin
COPY --from=triton /opt/tritonserver/caches /opt/tritonserver/caches

# Copy both installation scripts in a single layer.
COPY docker/common/install_triton.sh \
     docker/common/install_mooncake.sh \
     ./

# Install Triton first (it satisfies Mooncake's boost requirement), then
# Mooncake — except on RHEL-family distros, where Mooncake is skipped.
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install_triton.sh && \
    if [ ! -f /etc/redhat-release ]; then \
        bash ./install_mooncake.sh; \
    else \
        echo "Rocky8 detected, skipping mooncake installation"; \
    fi && \
    rm install_triton.sh install_mooncake.sh
FROM ${DEVEL_IMAGE} AS wheel

WORKDIR /src/tensorrt_llm

# Source trees required by the wheel build (sorted; final filesystem is
# identical regardless of copy order).
COPY 3rdparty 3rdparty
COPY benchmarks benchmarks
COPY cpp cpp
COPY docker docker
COPY scripts scripts
COPY tensorrt_llm tensorrt_llm
COPY .gitmodules setup.py requirements.txt requirements-dev.txt constraints.txt README.md ./

# Pre-create pip/ccache cache dirs; both are backed by BuildKit cache mounts
# below, so their contents persist across builds without bloating the image.
RUN mkdir -p /root/.cache/pip /root/.cache/ccache
ENV CCACHE_DIR=/root/.cache/ccache

# Build the TRT-LLM wheel.
ARG GITHUB_MIRROR=""
ARG BUILD_WHEEL_ARGS="--clean --benchmarks"
ARG BUILD_WHEEL_SCRIPT="scripts/build_wheel.py"
RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=${CCACHE_DIR} \
    GITHUB_MIRROR=$GITHUB_MIRROR python3 ${BUILD_WHEEL_SCRIPT} ${BUILD_WHEEL_ARGS}
FROM ${DEVEL_IMAGE} AS release

# Create a cache directory for pip
RUN mkdir -p /root/.cache/pip

WORKDIR /app/tensorrt_llm
# Install the wheel built in the `wheel` stage; the bind mount exposes its
# build/ directory without copying it into a layer, and the pip cache mount
# keeps downloaded deps out of the image.
RUN --mount=type=cache,target=/root/.cache/pip --mount=type=bind,from=wheel,source=/src/tensorrt_llm/build,target=/tmp/wheel \
    pip install /tmp/wheel/tensorrt_llm*.whl

COPY README.md ./
COPY docs docs
COPY cpp/include include

# Expose the wheel's bundled binaries/libraries at stable paths and verify
# the install: symlink site-packages bin/ and libs/ into /app/tensorrt_llm,
# sanity-check two key artifacts exist, register lib/ with the dynamic
# linker, then fail the build if executorWorker has any unresolved
# tensorrt_llm shared-library dependency.
RUN ln -sv $(python3 -c 'import site; print(f"{site.getsitepackages()[0]}/tensorrt_llm/bin")') bin && \
    test -f bin/executorWorker && \
    ln -sv $(python3 -c 'import site; print(f"{site.getsitepackages()[0]}/tensorrt_llm/libs")') lib && \
    test -f lib/libnvinfer_plugin_tensorrt_llm.so && \
    echo "/app/tensorrt_llm/lib" > /etc/ld.so.conf.d/tensorrt_llm.conf && \
    ldconfig && \
    ! ( ldd -v bin/executorWorker | grep tensorrt_llm | grep -q "not found" )

# Bring over benchmark sources plus the three prebuilt C++ benchmark binaries
# from the wheel stage's cpp build tree.
ARG SRC_DIR=/src/tensorrt_llm
COPY --from=wheel ${SRC_DIR}/benchmarks benchmarks
ARG CPP_BUILD_DIR=${SRC_DIR}/cpp/build
COPY --from=wheel \
     ${CPP_BUILD_DIR}/benchmarks/bertBenchmark \
     ${CPP_BUILD_DIR}/benchmarks/gptManagerBenchmark \
     ${CPP_BUILD_DIR}/benchmarks/disaggServerBenchmark \
     benchmarks/cpp/

COPY examples examples
# Make examples writable for end users, drop the C++ sources that duplicate
# the prebuilt binaries above, and clear the pip cache dir.
RUN chmod -R a+w examples && \
    rm -v \
       benchmarks/cpp/bertBenchmark.cpp \
       benchmarks/cpp/gptManagerBenchmark.cpp \
       benchmarks/cpp/disaggServerBenchmark.cpp \
       benchmarks/cpp/CMakeLists.txt && \
    rm -rf /root/.cache/pip

# Stamp version/commit metadata into the runtime environment.
ARG GIT_COMMIT
ARG TRT_LLM_VER
ARG TARGETARCH
ENV TRT_LLM_GIT_COMMIT=${GIT_COMMIT} \
    TRT_LLM_VERSION=${TRT_LLM_VER}

# Generate OSS attribution file for release image
COPY scripts/generate_container_oss_attribution.sh /tmp/generate_container_oss_attribution.sh
RUN bash /tmp/generate_container_oss_attribution.sh "release" "${TRT_LLM_VER}" "${TARGETARCH}" && rm /tmp/generate_container_oss_attribution.sh
# Build the Triton TensorRT-LLM backend on top of the wheel stage; only
# /opt/tritonserver/backends/tensorrtllm is copied into tritonrelease.
FROM wheel AS tritonbuild

WORKDIR /src/tensorrt_llm
# --no-cache-dir (hadolint DL3042): no reason to persist pip's download
# cache in this layer.
RUN pip install --no-cache-dir /src/tensorrt_llm/build/tensorrt_llm*.whl
COPY ./triton_backend/ ./triton_backend/
ARG TRITON_BASE_TAG
# Strip the "-py3" suffix to derive the Triton release branch, e.g.
# 25.10-py3 -> r25.10.
RUN bash ./triton_backend/inflight_batcher_llm/scripts/build.sh -s "r${TRITON_BASE_TAG%-py3}"
# Release image variant that bundles the Triton TensorRT-LLM backend.
FROM release AS tritonrelease

WORKDIR /app/tensorrt_llm
# Model templates, helper scripts, tooling and example clients.
COPY ./triton_backend/all_models ./triton_backend/all_models
COPY ./triton_backend/scripts ./triton_backend/scripts
COPY ./triton_backend/tools ./triton_backend/tools
COPY ./triton_backend/inflight_batcher_llm/scripts ./triton_backend/inflight_batcher_llm/scripts
COPY ./triton_backend/inflight_batcher_llm/client ./triton_backend/inflight_batcher_llm/client
# Backend shared library built in the tritonbuild stage.
COPY --from=tritonbuild /opt/tritonserver/backends/tensorrtllm /opt/tritonserver/backends/tensorrtllm