[None][feat] Use a shell context to install dependencies (#7383)

Signed-off-by: Shobhit Verma <shobhitv@nvidia.com>
Signed-off-by: v-shobhit <161510941+v-shobhit@users.noreply.github.com>
Co-authored-by: Zhihan Jiang <68881590+nvzhihanj@users.noreply.github.com>
This commit is contained in:
v-shobhit 2025-09-10 09:57:37 -07:00 committed by GitHub
parent 222e01662c
commit 0652514c6d
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 251 additions and 41 deletions

3
.gitignore vendored
View File

@ -76,3 +76,6 @@ compile_commands.json
.dir-locals.el
.devcontainer/devcontainer.env.user
.devcontainer/docker-compose.override.yml
# Enroot sqsh files
enroot/tensorrt_llm.devel.sqsh

View File

@ -15,70 +15,61 @@ LABEL com.nvidia.ai-terms="https://www.nvidia.com/en-us/agreements/enterprise-so
# The default values come from `nvcr.io/nvidia/pytorch`
ENV BASH_ENV=${BASH_ENV:-/etc/bash.bashrc}
ENV ENV=${ENV:-/etc/shinit_v2}
ARG GITHUB_MIRROR=""
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]
# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true
FROM base AS devel
ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"
SHELL ["/bin/bash", "-c"]
FROM base AS devel
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
COPY docker/common/install.sh install.sh
COPY docker/common/install_base.sh install_base.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_base.sh $PYTHON_VERSION && rm install_base.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} \
PYTHON_VERSION=${PYTHON_VERSION} \
bash ./install.sh --base && rm install_base.sh
COPY docker/common/install_cmake.sh install_cmake.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_cmake.sh && rm install_cmake.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --cmake && rm install_cmake.sh
COPY docker/common/install_ccache.sh install_ccache.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_ccache.sh && rm install_ccache.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --ccache && rm install_ccache.sh
# Only takes effect when the base image is Rocky Linux 8 with an old CUDA version.
COPY docker/common/install_cuda_toolkit.sh install_cuda_toolkit.sh
RUN bash ./install_cuda_toolkit.sh && rm install_cuda_toolkit.sh
RUN bash ./install.sh --cuda_toolkit && rm install_cuda_toolkit.sh
# Download & install latest TRT release
ARG TRT_VER
ARG CUDA_VER
ARG CUDNN_VER
ARG NCCL_VER
ARG CUBLAS_VER
COPY docker/common/install_tensorrt.sh install_tensorrt.sh
RUN bash ./install_tensorrt.sh \
--TRT_VER=${TRT_VER} \
--CUDA_VER=${CUDA_VER} \
--CUDNN_VER=${CUDNN_VER} \
--NCCL_VER=${NCCL_VER} \
--CUBLAS_VER=${CUBLAS_VER} && \
rm install_tensorrt.sh
RUN TRT_VER=${TRT_VER} \
CUDA_VER=${CUDA_VER} \
CUDNN_VER=${CUDNN_VER} \
NCCL_VER=${NCCL_VER} \
CUBLAS_VER=${CUBLAS_VER} \
bash ./install.sh --tensorrt && rm install_tensorrt.sh
# Install latest Polygraphy
COPY docker/common/install_polygraphy.sh install_polygraphy.sh
RUN bash ./install_polygraphy.sh && rm install_polygraphy.sh
RUN bash ./install.sh --polygraphy && rm install_polygraphy.sh
# Install mpi4py
COPY docker/common/install_mpi4py.sh install_mpi4py.sh
RUN GITHUB_MIRROR=$GITHUB_MIRROR bash ./install_mpi4py.sh && rm install_mpi4py.sh
RUN GITHUB_MIRROR=${GITHUB_MIRROR} bash ./install.sh --mpi4py && rm install_mpi4py.sh
# Install PyTorch
ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
#
# NB: PyTorch requires this to be < 1.0
ENV PYTORCH_CUDA_ALLOC_CONF="garbage_collection_threshold:0.99999"
RUN TORCH_INSTALL_TYPE=${TORCH_INSTALL_TYPE} bash ./install.sh --pytorch && rm install_pytorch.sh
# Install OpenCV with FFMPEG support
RUN pip3 uninstall -y opencv && \
rm -rf /usr/local/lib/python3*/dist-packages/cv2/ && \
pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
RUN pip3 install --upgrade --no-cache-dir \
"protobuf>=4.25.8"
RUN bash ./install.sh --opencv && bash ./install.sh --protobuf && rm install.sh
FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton

144
docker/common/install.sh Executable file
View File

@ -0,0 +1,144 @@
#!/bin/bash
# Strict mode: stop on the first failing command (-e) or failing pipeline
# stage (pipefail); -E makes the ERR trap below apply inside functions and
# subshells too.
set -Eeo pipefail
# A glob that matches nothing expands to nothing rather than to itself.
shopt -s nullglob
# Report the line number of whichever command triggered the failure.
trap 'echo "[install.sh] Error on line $LINENO" >&2' ERR

# Absolute path of the directory holding this script, so the sibling
# install_*.sh helpers can be located regardless of the caller's cwd.
SCRIPT_DIR="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" >/dev/null 2>&1 && pwd)"

# Component selection flags: 0 = skip, 1 = install. Everything is off until
# the matching command-line option (or --all) switches it on.
base=0         cmake=0     ccache=0
cuda_toolkit=0 tensorrt=0  polygraphy=0
mpi4py=0       pytorch=0   opencv=0
protobuf=0
# Translate command-line options into the component selection flags.
#
# Each recognised option sets the flag of the same name to 1; --all sets every
# flag. Options may be combined and repeated. An unrecognised option is
# reported on stderr (like the rest of this script's diagnostics) and
# terminates the script with exit status 1.
parse_args() {
    while [[ $# -gt 0 ]]; do
        case "$1" in
            --base)         base=1 ;;
            --cmake)        cmake=1 ;;
            --ccache)       ccache=1 ;;
            --cuda_toolkit) cuda_toolkit=1 ;;
            --tensorrt)     tensorrt=1 ;;
            --polygraphy)   polygraphy=1 ;;
            --mpi4py)       mpi4py=1 ;;
            --pytorch)      pytorch=1 ;;
            --opencv)       opencv=1 ;;
            --protobuf)     protobuf=1 ;;
            --all)
                base=1
                cmake=1
                ccache=1
                cuda_toolkit=1
                tensorrt=1
                polygraphy=1
                mpi4py=1
                pytorch=1
                opencv=1
                protobuf=1
                ;;
            *)
                echo "Unknown option: $1" >&2
                exit 1
                ;;
        esac
        # Every recognised option consumes exactly one argument.
        shift
    done
}

parse_args "$@"
# Run the selected installation steps, always in this fixed order.
#
# Inputs are read from the environment: GITHUB_MIRROR, PYTHON_VERSION,
# TRT_VER, CUDA_VER, CUDNN_VER, NCCL_VER, CUBLAS_VER and TORCH_INSTALL_TYPE.
# All paths and arguments are quoted so that a checkout path or value
# containing whitespace or glob characters is passed through intact.
# Optional positional arguments use ${VAR:+"$VAR"} so an empty/unset value
# is still omitted entirely instead of being passed as an empty string.

if [ "$base" -eq 1 ]; then
    echo "Installing base dependencies..."
    # Clean up the pip constraint file from the base NGC PyTorch image.
    [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true
    echo "Using Python version: $PYTHON_VERSION"
    GITHUB_MIRROR="$GITHUB_MIRROR" bash "$SCRIPT_DIR/install_base.sh" ${PYTHON_VERSION:+"$PYTHON_VERSION"}
fi

if [ "$cmake" -eq 1 ]; then
    echo "Installing CMake..."
    GITHUB_MIRROR="$GITHUB_MIRROR" bash "$SCRIPT_DIR/install_cmake.sh"
fi

if [ "$ccache" -eq 1 ]; then
    echo "Installing ccache..."
    GITHUB_MIRROR="$GITHUB_MIRROR" bash "$SCRIPT_DIR/install_ccache.sh"
fi

if [ "$cuda_toolkit" -eq 1 ]; then
    echo "Installing CUDA toolkit..."
    bash "$SCRIPT_DIR/install_cuda_toolkit.sh"
fi

if [ "$tensorrt" -eq 1 ]; then
    echo "Installing TensorRT..."
    bash "$SCRIPT_DIR/install_tensorrt.sh" \
        "--TRT_VER=${TRT_VER}" \
        "--CUDA_VER=${CUDA_VER}" \
        "--CUDNN_VER=${CUDNN_VER}" \
        "--NCCL_VER=${NCCL_VER}" \
        "--CUBLAS_VER=${CUBLAS_VER}"
fi

if [ "$polygraphy" -eq 1 ]; then
    echo "Installing Polygraphy..."
    bash "$SCRIPT_DIR/install_polygraphy.sh"
fi

if [ "$mpi4py" -eq 1 ]; then
    echo "Installing mpi4py..."
    GITHUB_MIRROR="$GITHUB_MIRROR" bash "$SCRIPT_DIR/install_mpi4py.sh"
fi

if [ "$pytorch" -eq 1 ]; then
    echo "Installing PyTorch..."
    bash "$SCRIPT_DIR/install_pytorch.sh" ${TORCH_INSTALL_TYPE:+"$TORCH_INSTALL_TYPE"}
fi

if [ "$opencv" -eq 1 ]; then
    echo "Installing OpenCV..."
    # Remove any existing cv2 package directory, then install the headless
    # wheel without touching its dependencies.
    pip3 uninstall -y opencv
    rm -rf /usr/local/lib/python3*/dist-packages/cv2/
    pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
fi

# WARs against security issues inherited from pytorch:25.06
# * https://github.com/advisories/GHSA-8qvm-5x2c-j2w7
if [ "$protobuf" -eq 1 ]; then
    echo "Installing protobuf..."
    pip3 install --upgrade --no-cache-dir \
        "protobuf>=4.25.8"
fi

View File

@ -88,6 +88,33 @@ Follow the linked catalog entry to enter a new container based on the pre-built
make -C docker run LOCAL_USER=1
```
If you prefer enroot to Docker, you can build a sqsh file that provides the same environment as the development image `tensorrt_llm/devel:latest`, as follows.
1. Allocate a compute node:
```bash
salloc --nodes=1
```
2. Create a sqsh file with the essential TensorRT LLM dependencies installed:
```bash
# Using default sqsh filename (enroot/tensorrt_llm.devel.sqsh)
make -C enroot build_sqsh
# Or specify a custom path (optional)
make -C enroot build_sqsh SQSH_PATH=/path/to/dev_trtllm_image.sqsh
```
3. Once the sqsh file is ready, you can follow the steps under [Build TensorRT LLM](#build-tensorrt-llm) by launching an enroot sandbox from `dev_trtllm_image.sqsh`. To do this, proceed as follows:
```bash
export SQSH_PATH=/path/to/dev_trtllm_image.sqsh
# Start a pseudo terminal for interactive session
make -C enroot run_sqsh
# Or, you could run commands directly
make -C enroot run_sqsh RUN_CMD="python3 scripts/build_wheel.py"
```
**On systems without GNU `make`**
1. Create a Docker image for development.

45
enroot/Makefile Normal file
View File

@ -0,0 +1,45 @@
ifndef MAKEFILE_PYXIS_INCLUDED
MAKEFILE_PYXIS_INCLUDED := 1
# --- User-tunable settings (override on the command line or via environment) ---

# Base container image and tag. Unless the caller supplies them, they are read
# from the `ARG BASE_IMAGE=` / `ARG BASE_TAG=` lines of ../docker/Dockerfile.multi.
# The origin guard keeps the "only if not already set" semantics of `?=` while
# `:=` runs each shell pipeline once at parse time instead of on every expansion.
ifeq ($(origin BASE_IMAGE),undefined)
BASE_IMAGE := $(shell grep '^ARG BASE_IMAGE=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
endif
ifeq ($(origin BASE_TAG),undefined)
BASE_TAG := $(shell grep '^ARG BASE_TAG=' ../docker/Dockerfile.multi | grep -o '=.*' | tr -d '="')
endif

# Where the sqsh image is written (build_sqsh) and read from (run_sqsh).
SQSH_PATH ?= tensorrt_llm.devel.sqsh
# Host directory mounted into the container: the canonical path of the parent
# of the current working directory (built-in function, no subprocess needed).
SOURCE_DIR ?= $(realpath ..)
# Mount point of SOURCE_DIR inside the container.
CODE_DIR ?= /code/tensorrt_llm
# Trailing arguments handed to srun by run_sqsh.
RUN_CMD ?= --pty bash

# Values exported into the container for install.sh (see build_sqsh).
PYTHON_VERSION ?= 3.12.3
TORCH_INSTALL_TYPE ?= skip
GITHUB_MIRROR ?=
CUDA_VERSION ?=
CUDNN_VERSION ?=
NCCL_VERSION ?=
CUBLAS_VERSION ?=
TRT_VERSION ?=
# build_sqsh: start a container from $(BASE_IMAGE):$(BASE_TAG) via srun, run
# docker/common/install.sh --all inside it, and save the resulting container
# to $(SQSH_PATH). The *_VERSION make variables are exported under the *_VER
# names that install.sh reads.
# Declared phony so a stray file named `build_sqsh` cannot suppress the recipe.
.PHONY: build_sqsh
build_sqsh:
	@echo "Building trtllm sqsh image."
	@echo "Base image: $(BASE_IMAGE):$(BASE_TAG)"
	@echo "Location: $(SQSH_PATH)"
	srun \
		--container-image "$(BASE_IMAGE):$(BASE_TAG)" \
		--container-save "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)/docker/common" \
		--container-mount-home --container-remap-root \
		--export "PYTHON_VERSION=$(PYTHON_VERSION),GITHUB_MIRROR=$(GITHUB_MIRROR),TORCH_INSTALL_TYPE=$(TORCH_INSTALL_TYPE),CUDA_VER=$(CUDA_VERSION),CUDNN_VER=$(CUDNN_VERSION),NCCL_VER=$(NCCL_VERSION),CUBLAS_VER=$(CUBLAS_VERSION),TRT_VER=$(TRT_VERSION)" \
		./install.sh --all
# run_sqsh: launch an srun job step inside the container image at $(SQSH_PATH),
# with $(SOURCE_DIR) mounted at $(CODE_DIR) as the working directory.
# $(RUN_CMD) is appended verbatim and deliberately left unquoted: its default
# (`--pty bash`) is an srun option followed by the command to run.
# Declared phony so a stray file named `run_sqsh` cannot suppress the recipe.
.PHONY: run_sqsh
run_sqsh:
	@echo "Running srun job step with:"
	@echo " sqsh image: $(SQSH_PATH)"
	@echo " run command: $(RUN_CMD)"
	srun \
		--container-image "$(SQSH_PATH)" \
		--container-mounts "$(SOURCE_DIR):$(CODE_DIR)" --container-workdir "$(CODE_DIR)" \
		--container-mount-home --container-remap-root \
		--export PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.99999 \
		$(RUN_CMD)
endif

View File

@ -12,7 +12,7 @@
# NB: Typically, the suffix indicates the PR whose CI pipeline generated the images. In case that
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202508201630-pre-test
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509081850-5980
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-x86_64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.06-py3-aarch64-ubuntu24.04-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py310-trt10.11.0.33-skip-tritondevel-202509091430-7383
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.1-devel-rocky8-x86_64-rocky8-py312-trt10.11.0.33-skip-tritondevel-202509091430-7383