doc: [TRTLLM-325] Integrate the NGC image in Makefile automation and documentation (#4400)

* doc: [TRTLLM-325] Integrate the NGC image in Makefile automation and documentation

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>

* WAR against https://github.com/advisories/GHSA-vqfr-h8mv-ghfj

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>

* Fix default assignment for CUDA architectures in SBSA build

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>

* Push new docker images

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>

* Handle constraints.txt in setup.py

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>

---------

Signed-off-by: Martin Marciniszyn Mehringer <11665257+MartinMarciniszyn@users.noreply.github.com>
Authored by Martin Marciniszyn Mehringer on 2025-05-20 08:45:01 +02:00; committed by GitHub
parent f2c0565577
commit 3485347584
9 changed files with 53 additions and 26 deletions


@@ -1,7 +1,7 @@
 version: "3.9"
 services:
   tensorrt_llm-dev:
-    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505160532-3934
+    image: urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505191345-4400
     network_mode: host
     ipc: host
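To sanity-check the bumped development image, the service above can be started directly with Docker Compose. A minimal sketch, assuming the file shown here lives at `docker/docker-compose.yml` (the path is an assumption):

```bash
# Start an interactive shell in the tensorrt_llm-dev service with the updated image.
docker compose -f docker/docker-compose.yml run --rm tensorrt_llm-dev bash
```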

constraints.txt (new file)

@@ -0,0 +1,2 @@
+# WAR against https://github.com/advisories/GHSA-vqfr-h8mv-ghfj
+h11>=0.16.0
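A pip constraints file caps or floors versions during dependency resolution but never installs anything by itself. A minimal sketch of how this pin is consumed (the `-c` line added to `requirements.txt` later in this diff makes it automatic):

```bash
# Resolve requirements while forcing any h11 that gets pulled in to be >= 0.16.0.
pip install -r requirements.txt -c constraints.txt
```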


@@ -72,6 +72,9 @@ RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
 RUN pip3 uninstall -y opencv && rm -rf /usr/local/lib/python3*/dist-packages/cv2/
 RUN pip3 install opencv-python-headless --force-reinstall --no-deps --no-cache-dir
+
+# WAR against https://github.com/advisories/GHSA-vqfr-h8mv-ghfj
+RUN pip3 install --upgrade "h11>=0.16" --no-cache-dir
 
 FROM ${TRITON_IMAGE}:${TRITON_BASE_TAG} AS triton
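A quick check that the workaround actually lands in a built image; a hedged sketch in which the image tag is illustrative:

```bash
# h11 exposes __version__; 0.16.0 or newer carries the fix for GHSA-vqfr-h8mv-ghfj.
docker run --rm tensorrt-llm-devel:latest python3 -c "import h11; print(h11.__version__)"
```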
@@ -102,7 +105,7 @@ COPY benchmarks benchmarks
 COPY scripts scripts
 COPY tensorrt_llm tensorrt_llm
 COPY 3rdparty 3rdparty
-COPY .gitmodules setup.py requirements.txt requirements-dev.txt ./
+COPY .gitmodules setup.py requirements.txt requirements-dev.txt constraints.txt ./
 
 # Create cache directories for pip and ccache
 RUN mkdir -p /root/.cache/pip /root/.cache/ccache


@@ -28,12 +28,8 @@ PUSH_TO_STAGING ?= 1
 DOCKER_BUILD_OPTS ?= --pull --load
 DOCKER_BUILD_ARGS ?=
 DOCKER_PROGRESS ?= auto
-CUDA_ARCHS ?=
 PLATFORM ?= $(shell uname -m | grep -q 'aarch64' && echo "arm64" || echo "amd64")
-ifeq ($(PLATFORM), arm64)
-CUDA_ARCHS = '90-real;100-real;120-real'
-endif
+CUDA_ARCHS ?= $(if $(filter arm64,$(PLATFORM)),'90-real;100-real;120-real',)
 BUILD_WHEEL_OPTS ?=
 BUILD_WHEEL_ARGS ?= $(shell grep 'ARG BUILD_WHEEL_ARGS=' Dockerfile.multi | grep -o '=.*' | tr -d '="')$(if $(CUDA_ARCHS), --cuda_architectures $(CUDA_ARCHS))$(if $(BUILD_WHEEL_OPTS), $(BUILD_WHEEL_OPTS))
 TORCH_INSTALL_TYPE ?= skip
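The switch from a forced assignment to a conditional default is the actual SBSA fix: previously the `ifeq` block overwrote a `CUDA_ARCHS` value inherited from the environment on arm64, whereas `?=` only fills the variable when it is unset. A hedged sketch of the resulting behavior:

```bash
# On an aarch64 (SBSA) host the default applies only when CUDA_ARCHS is unset:
make -C docker release_build                          # defaults to '90-real;100-real;120-real'
# A value inherited from the environment is no longer clobbered:
CUDA_ARCHS="90-real" make -C docker release_build     # builds for Hopper only
```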
@@ -47,6 +43,8 @@ TRT_LLM_VERSION ?= $(shell grep '^__version__' ../tensorrt_llm/version.py | g
 GITHUB_MIRROR ?=
 PYTHON_VERSION ?=
 NGC_STAGING_REPO ?= nvcr.io/nvstaging/tensorrt-llm
+NGC_REPO ?= nvcr.io/nvidia/tensorrt-llm
+NGC_USE_STAGING ?= 0
 
 define add_local_user
 docker build \
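`NGC_USE_STAGING` steers the `ngc-*_run` targets between the two registries defined above; a hedged sketch of both invocations:

```bash
# Default: pull the published image from nvcr.io/nvidia/tensorrt-llm.
make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1
# Opt in to the pre-release image from nvcr.io/nvstaging/tensorrt-llm.
make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1 NGC_USE_STAGING=1
```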
@@ -201,22 +199,29 @@ ngc-devel_%: IMAGE_TAG = $(TRT_LLM_VERSION)
 ngc-devel_push: DOCKER_BUILD_ARGS = --push
 ngc-devel_push: ngc-devel_build ;
+ngc-devel_run: IMAGE_NAME = $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO))
 
 ngc-release_%: STAGE = release
+ngc-release_%: DOCKER_BUILD_OPTS = --pull --load --platform linux/$(PLATFORM)
 ngc-release_%: DEVEL_IMAGE = $(NGC_STAGING_REPO)/devel:$(TRT_LLM_VERSION)
-ngc-release_%: IMAGE_NAME = nvcr.io/nvstaging/tensorrt-llm
+ngc-release_%: IMAGE_NAME = $(NGC_STAGING_REPO)
+ngc-release_%: IMAGE_TAG = $(TRT_LLM_VERSION)-$(PLATFORM)
+ngc-release_run: IMAGE_NAME = $(if $(filter 1,$(NGC_USE_STAGING)),$(NGC_STAGING_REPO),$(NGC_REPO))
+ngc-release_run: WORK_DIR = /app/tensorrt_llm
 
 ngc-manifest_%: STAGE = release
 ngc-manifest_%: IMAGE_NAME = $(NGC_STAGING_REPO)
 ngc-manifest_%: IMAGE_TAG = $(TRT_LLM_VERSION)
 
 ngc-manifest_create:
+	docker pull $(IMAGE_WITH_TAG)-amd64
+	docker pull $(IMAGE_WITH_TAG)-arm64
 	docker manifest create $(IMAGE_WITH_TAG) \
 		--amend $(IMAGE_WITH_TAG)-amd64 \
 		--amend $(IMAGE_WITH_TAG)-arm64
 
-ngc-manifest_push:
+ngc-manifest_push: ngc-manifest_create
 	docker manifest push $(IMAGE_WITH_TAG)
 
 build: devel_build ;
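For reference, `ngc-manifest_push` now depends on `ngc-manifest_create`, so a push always pulls both per-arch images and stitches them into one multi-arch tag first. Expanded by hand, and assuming `IMAGE_WITH_TAG` resolves to the staging release image with an illustrative `1.0.0` version:

```bash
# Hand-expanded equivalent of `make -C docker ngc-manifest_push`; image path and tag are assumptions.
docker pull nvcr.io/nvstaging/tensorrt-llm/release:1.0.0-amd64
docker pull nvcr.io/nvstaging/tensorrt-llm/release:1.0.0-arm64
docker manifest create nvcr.io/nvstaging/tensorrt-llm/release:1.0.0 \
    --amend nvcr.io/nvstaging/tensorrt-llm/release:1.0.0-amd64 \
    --amend nvcr.io/nvstaging/tensorrt-llm/release:1.0.0-arm64
docker manifest push nvcr.io/nvstaging/tensorrt-llm/release:1.0.0
```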


@@ -52,6 +52,28 @@ make -C docker release_build CUDA_ARCHS="80-real;90-real"
 
 For more build options, see the variables defined in [`Makefile`](Makefile).
 
+### NGC Integration
+
+When building from source, one can conveniently download a Docker image for development from
+the [NVIDIA NGC Catalog](https://catalog.ngc.nvidia.com/) and start it like so:
+
+```bash
+make -C docker ngc-devel_run LOCAL_USER=1 DOCKER_PULL=1
+```
+
+As before, specifying `LOCAL_USER=1` runs the container under the local user's identity. Specifying `DOCKER_PULL=1`
+is optional; it pulls the latest image from the NGC Catalog before starting. The command maps the source code into the
+container at `/code/tensorrt_llm`.
+
+We also provide an image with pre-installed release binaries. It can be used like so:
+
+```bash
+make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1
+```
+
+To deploy a specific version of TensorRT-LLM, select it with
+`TRT_LLM_VERSION=<version_tag>`. The application examples and benchmarks are installed in `/app/tensorrt_llm`.
+
 ### Jenkins Integration
 
 [`Makefile`](Makefile) has special targets for building, pushing and running the Docker build image used on Jenkins.
### Jenkins Integration
[`Makefile`](Makefile) has special targets for building, pushing and running the Docker build image used on Jenkins.
@@ -91,14 +113,3 @@ make -C docker trtllm_run LOCAL_USER=1 DOCKER_PULL=1
 
 The argument `DOCKER_PULL=1` instructs `make` to pull the latest version of the image before deploying it in the container.
 By default, images are tagged by their `git` branch name and may be frequently updated.
-
-### Binary Compatible Environment
-
-Currently, `BatchManager` is released as a closed source binary library. In order to make it deployable in a wider
-scope, the compilation environment needs to be constructed in the following way.
-
-The compilation environment for x86_64 architecture
-
-```bash
-make -C docker centos7_push
-```
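Putting the new README instructions together: a hedged sketch of deploying a pinned release, where the version tag is illustrative:

```bash
# Run a specific released version instead of the latest one (1.0.0 is a placeholder tag).
make -C docker ngc-release_run LOCAL_USER=1 DOCKER_PULL=1 TRT_LLM_VERSION=1.0.0
# Inside the container, the preinstalled examples and benchmarks live under /app/tensorrt_llm.
```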


@@ -21,10 +21,10 @@ UPLOAD_PATH = env.uploadPath ? env.uploadPath : "sw-tensorrt-generic/llm-artifac
 
 // Container configuration
 // available tags can be found in: https://urm.nvidia.com/artifactory/sw-tensorrt-docker/tensorrt-llm/
 // [base_image_name]-[arch]-[os](-[python_version])-[trt_version]-[torch_install_type]-[stage]-[date]-[mr_id]
-LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505160532-3934"
-LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505160532-3934"
-LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202505160532-3934"
-LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202505160532-3934"
+LLM_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505191345-4400"
+LLM_SBSA_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-aarch64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505191345-4400"
+LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py310-trt10.10.0.31-skip-tritondevel-202505191345-4400"
+LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-12.9.0-devel-rocky8-x86_64-rocky8-py312-trt10.10.0.31-skip-tritondevel-202505191345-4400"
 
 // TODO: Move common variables to an unified location
 BUILD_CORES_REQUEST = "8"


@@ -1,7 +1,7 @@
 import java.lang.InterruptedException
 
-DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505121727-4049"
+DOCKER_IMAGE = "urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.04-py3-x86_64-ubuntu24.04-trt10.10.0.31-skip-tritondevel-202505191345-4400"
 
 def createKubernetesPodConfig(image)
 {


@@ -1,4 +1,5 @@
 --extra-index-url https://download.pytorch.org/whl/cu128
+-c constraints.txt
 accelerate>=0.25.0
 build
 colored
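pip honors a `-c` line embedded in a requirements file, so the constraint is applied without any extra flag; a minimal sketch:

```bash
# The embedded "-c constraints.txt" is picked up automatically during resolution.
pip install -r requirements.txt
```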


@@ -31,7 +31,8 @@ def parse_requirements(filename: os.PathLike):
     extra_URLs = []
     deps = []
     for line in requirements:
-        if line.startswith("#") or line.startswith("-r"):
+        if line.startswith("#") or line.startswith("-r") or line.startswith(
+                "-c"):
             continue
 
         # handle -i and --extra-index-url options
@@ -87,6 +88,10 @@ required_deps, extra_URLs = parse_requirements(
 devel_deps, _ = parse_requirements(
     Path("requirements-dev-windows.txt"
          if on_windows else "requirements-dev.txt"))
+constraints_file = Path("constraints.txt")
+if constraints_file.exists():
+    constraints, _ = parse_requirements(constraints_file)
+    required_deps.extend(constraints)
 
 if on_windows:
     package_data = [
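With the constraints merged into `required_deps`, the h11 floor also lands in the wheel metadata. A hedged way to confirm, assuming a wheel built into `dist/`:

```bash
# Build the wheel without resolving dependencies, then grep its metadata for the pin.
pip wheel --no-deps -w dist .
unzip -p dist/tensorrt_llm-*.whl '*.dist-info/METADATA' | grep -i h11
# Expected (illustrative): Requires-Dist: h11>=0.16.0
```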