Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][feat] Use a shell context to install dependencies (#7383)

Signed-off-by: Shobhit Verma <shobhitv@nvidia.com>
Signed-off-by: v-shobhit <161510941+v-shobhit@users.noreply.github.com>
Co-authored-by: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com>
Parent: 222e01662c
Commit: 0652514c6d
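The gist of the change: the per-component `install_*.sh` scripts under `docker/common/` are now fronted by a single dispatcher, `docker/common/install.sh`, which takes one flag per component (`--base`, `--cmake`, `--ccache`, `--cuda_toolkit`, `--tensorrt`, `--polygraphy`, `--mpi4py`, `--pytorch`, `--opencv`, `--protobuf`, or `--all`) and reads its configuration from environment variables. The devel Dockerfile stage (this view dropped the file header; judging by the Makefile and Jenkins references it is presumably `docker/Dockerfile.multi`) keeps one `RUN` layer per component, while the new enroot path installs everything in one pass. A minimal sketch of the two call sites, with flag names and paths taken from the diff below; the Python version shown is just the default used in this commit:

```bash
# Inside the Dockerfile stage, one component per layer (mirrors the RUN lines below):
#   RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh

# Inside an enroot/srun shell context, everything in a single invocation:
PYTHON_VERSION=3.12.3 TORCH_INSTALL_TYPE=skip GITHUB_MIRROR="" \
    bash docker/common/install.sh --all
```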
.gitignore (vendored): 3 changes
@@ -76,3 +76,6 @@ compile_commands.json
.dir-locals.el
.devcontainer/devcontainer.env.user
.devcontainer/docker-compose.override.yml

# Enroot sqsh files
enroot/tensorrt_llm.devel.sqsh
@@ -15,70 +15,61 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so
# The default values come from `nvcr.io/nvidia/pytorch`
ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
ENV ENV=${ENV:-/etc/shinit_v2}

ARG GITHUB_MIRROR=""
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]

# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true

FROM base AS devel

ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"

SHELL ["/bin/bash", "-c"]

FROM base AS devel

#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"

COPY docker/common/install.sh install.sh

COPY docker/common/install_base.sh install_base.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
    PYTHON_VERSION=${PYTHON_VERSION} \
    bash ./install.sh --base && rm install_base.sh

COPY docker/common/install_cmake.sh install_cmake.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh

COPY docker/common/install_ccache.sh install_ccache.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache && rm install_ccache.sh

# Only take effect when the base image is Rocky Linux 8 with old CUDA version.
COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh
RUN bash ./install_cuda_toolkit.sh && rm install_cuda_toolkit.sh
RUN bash ./install.sh --cuda_toolkit && rm install_cuda_toolkit.sh

# Download & install latest TRT release
ARG TRT_VER
ARG CUDA_VER
ARG CUDNN_VER
ARG NCCL_VER
ARG CUBLAS_VER
COPY docker/common/install_tensorrt.sh install_tensorrt.sh
RUN bash ./install_tensorrt.sh \
    --TRT_VER=${TRT_VER} \
    --CUDA_VER=${CUDA_VER} \
    --CUDNN_VER=${CUDNN_VER} \
    --NCCL_VER=${NCCL_VER} \
    --CUBLAS_VER=${CUBLAS_VER} && \
    rm install_tensorrt.sh
RUN TRT_VER=${TRT_VER} \
    CUDA_VER=${CUDA_VER} \
    CUDNN_VER=${CUDNN_VER} \
    NCCL_VER=${NCCL_VER} \
    CUBLAS_VER=${CUBLAS_VER} \
    bash ./install.sh --tensorrt && rm install_tensorrt.sh

# Install latest Polygraphy
COPY docker/common/install_polygraphy.sh install_polygraphy.sh
RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh
RUN bash ./install.sh --polygraphy && rm install_polygraphy.sh

# Install mpi4py
COPY docker/common/install_mpi4py.sh install_mpi4py.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4py.sh

# Install PyTorch
ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh

# Install OpenCV with FFMPEG support
RUN pip3 uninstall -y opencv && \
    rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
RUN pip3 install --upgrade --no-cache-dir \
    "protobuf>=4.25.8"
RUN bash ./install.sh --opencv && bash ./install.sh --protobuf && rm install.sh

FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton
docker/common/install.sh (new executable file): 144 lines

@@ -0,0 +1,144 @@
#!/bin/bash
set -Eeo pipefail
shopt -s nullglob
trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR

# Resolve script directory for robust relative pathing
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

# Default values
base=0
cmake=0
ccache=0
cuda_toolkit=0
tensorrt=0
polygraphy=0
mpi4py=0
pytorch=0
opencv=0
protobuf=0

while [[ $# -gt 0 ]]; do
    case $1 in
        --base)
            base=1
            shift 1
            ;;
        --cmake)
            cmake=1
            shift 1
            ;;
        --ccache)
            ccache=1
            shift 1
            ;;
        --cuda_toolkit)
            cuda_toolkit=1
            shift 1
            ;;
        --tensorrt)
            tensorrt=1
            shift 1
            ;;
        --polygraphy)
            polygraphy=1
            shift 1
            ;;
        --mpi4py)
            mpi4py=1
            shift 1
            ;;
        --pytorch)
            pytorch=1
            shift 1
            ;;
        --opencv)
            opencv=1
            shift 1
            ;;
        --protobuf)
            protobuf=1
            shift 1
            ;;
        --all)
            base=1
            cmake=1
            ccache=1
            cuda_toolkit=1
            tensorrt=1
            polygraphy=1
            mpi4py=1
            pytorch=1
            opencv=1
            protobuf=1
            shift 1
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

if [ $base -eq 1 ]; then
    echo "Installing base dependencies..."
    # Clean up the pip constraint file from the base NGC PyTorch image.
    [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true

    echo "Using Python version: $PYTHON_VERSION"
    GITHUB_MIRROR=$GITHUB_MIRROR bash $SCRIPT_DIR/install_base.sh $PYTHON_VERSION
fi

if [ $cmake -eq 1 ]; then
    echo "Installing CMake..."
    GITHUB_MIRROR=$GITHUB_MIRROR bash $SCRIPT_DIR/install_cmake.sh
fi

if [ $ccache -eq 1 ]; then
    echo "Installing ccache..."
    GITHUB_MIRROR=$GITHUB_MIRROR bash $SCRIPT_DIR/install_ccache.sh
fi

if [ $cuda_toolkit -eq 1 ]; then
    echo "Installing CUDA toolkit..."
    bash $SCRIPT_DIR/install_cuda_toolkit.sh
fi

if [ $tensorrt -eq 1 ]; then
    echo "Installing TensorRT..."
    bash $SCRIPT_DIR/install_tensorrt.sh \
        --TRT_VER=${TRT_VER} \
        --CUDA_VER=${CUDA_VER} \
        --CUDNN_VER=${CUDNN_VER} \
        --NCCL_VER=${NCCL_VER} \
        --CUBLAS_VER=${CUBLAS_VER}
fi

if [ $polygraphy -eq 1 ]; then
    echo "Installing Polygraphy..."
    bash $SCRIPT_DIR/install_polygraphy.sh
fi

if [ $mpi4py -eq 1 ]; then
    echo "Installing mpi4py..."
    GITHUB_MIRROR=$GITHUB_MIRROR bash $SCRIPT_DIR/install_mpi4py.sh
fi

if [ $pytorch -eq 1 ]; then
    echo "Installing PyTorch..."
    bash $SCRIPT_DIR/install_pytorch.sh $TORCH_INSTALL_TYPE
fi

if [ $opencv -eq 1 ]; then
    echo "Installing OpenCV..."
    pip3 uninstall -y opencv
    rm -rf /usr/local/lib/python3*/dist-packages/cv2/
    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
fi

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
if [ $protobuf -eq 1 ]; then
    pip3 install --upgrade --no-cache-dir \
        "protobuf>=4.25.8"
fi
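For reference, a hedged sketch of how this dispatcher might be invoked outside the Dockerfile, for example from a development shell. The flag names and environment variables come from the script above; the concrete version values and the empty mirror setting are illustrative placeholders, not values mandated by this commit:

```bash
# Install only a subset of components; each flag simply flips the matching
# switch in the argument loop above, so flags can be combined freely.
# NOTE: the version values below are placeholders for illustration only.
export GITHUB_MIRROR=""        # empty string means plain github.com
export TRT_VER=10.11.0.33 CUDA_VER=12.9 CUDNN_VER=9.10 NCCL_VER=2.27 CUBLAS_VER=12.9
bash docker/common/install.sh --cmake --ccache --tensorrt
```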
@@ -88,6 +88,33 @@ Follow the linked catalog entry to enter a new container based on the pre-built
make -C docker run LOCAL_USER=1
```

If you wish to use enroot instead of Docker, you can build a sqsh file with an environment identical to the development image `tensorrt_llm/devel:latest` as follows.

1. Allocate a compute node:
```bash
salloc --nodes=1
```

2. Create a sqsh file with essential TensorRT LLM dependencies installed:
```bash
# Using default sqsh filename (enroot/tensorrt_llm.devel.sqsh)
make -C enroot build_sqsh

# Or specify a custom path (optional)
make -C enroot build_sqsh SQSH_PATH=/path/to/dev_trtllm_image.sqsh
```

3. Once this squash file is ready, you can follow the steps under [Build TensorRT LLM](#build-tensorrt-llm) by launching an enroot sandbox from `dev_trtllm_image.sqsh`. To do this, proceed as follows:
```bash
export SQSH_PATH=/path/to/dev_trtllm_image.sqsh

# Start a pseudo terminal for interactive session
make -C enroot run_sqsh

# Or, you could run commands directly
make -C enroot run_sqsh RUN_CMD="python3 scripts/build_wheel.py"
```

**On systems without GNU `make`**

1. Create a Docker image for development.
enroot/Makefile (new file): 45 lines

@@ -0,0 +1,45 @@
ifndef MAKEFILE_PYXIS_INCLUDED
MAKEFILE_PYXIS_INCLUDED := 1

BASE_IMAGE ?= $(shell grep '^ARG BASE_IMAGE=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
BASE_TAG ?= $(shell grep '^ARG BASE_TAG=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
SQSH_PATH ?= tensorrt_llm.devel.sqsh
SOURCE_DIR ?= $(shell readlink -f ..)
CODE_DIR ?= /code/tensorrt_llm
RUN_CMD ?= --pty bash

PYTHON_VERSION ?= 3.12.3
TORCH_INSTALL_TYPE ?= skip
GITHUB_MIRROR ?=
CUDA_VERSION ?=
CUDNN_VERSION ?=
NCCL_VERSION ?=
CUBLAS_VERSION ?=
TRT_VERSION ?=

build_sqsh:
	@echo "Building trtllm sqsh image."
	@echo "Base image: $(BASE_IMAGE):$(BASE_TAG)"
	@echo "Location: $(SQSH_PATH)"

	srun \
		--container-image "$(BASE_IMAGE):$(BASE_TAG)" \
		--container-save "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir $(CODE_DIR)/docker/common \
		--container-mount-home --container-remap-root \
		--export PYTHON_VERSION=$(PYTHON_VERSION),GITHUB_MIRROR=$(GITHUB_MIRROR),TORCH_INSTALL_TYPE=$(TORCH_INSTALL_TYPE),CUDA_VER=$(CUDA_VERSION),CUDNN_VER=$(CUDNN_VERSION),NCCL_VER=$(NCCL_VERSION),CUBLAS_VER=$(CUBLAS_VERSION),TRT_VER=$(TRT_VERSION) \
		./install.sh --all

run_sqsh:
	@echo "Running srun job step with:"
	@echo " sqsh image: $(SQSH_PATH)"
	@echo " run command: $(RUN_CMD)"

	srun \
		--container-image "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir $(CODE_DIR) \
		--container-mount-home --container-remap-root \
		--export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \
		$(RUN_CMD)

endif
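All of the `?=` assignments above can be overridden on the `make` command line. A hypothetical invocation from a login node, writing the sqsh image to a scratch path and going through an internal GitHub mirror; both values below are placeholders, not defaults from the Makefile:

```bash
# SQSH_PATH and GITHUB_MIRROR are illustrative overrides;
# every other variable keeps its ?= default from enroot/Makefile.
make -C enroot build_sqsh \
    SQSH_PATH=/scratch/$USER/tensorrt_llm.devel.sqsh \
    GITHUB_MIRROR=https://github-mirror.example.com
```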
@@ -12,7 +12,7 @@
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383