diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 42eaed7ddaa..e0ef7d59242 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -6,6 +6,58 @@ steps: timeout_in_minutes: 600 commands: - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG $IMAGE_TAG_LATEST; else .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $IMAGE_TAG; fi + # Non-root smoke 1: the default (root) image must still be importable + # under a non-root UID via `--user 2000:0`. Validates the `vllm` passwd + # entry + group-0-writable /home/vllm + uv path cleanup from #31959. + # Uses `import vllm` rather than `vllm serve --help` because the latter + # instantiates `VllmConfig` which requires a GPU attached to the + # container. + - docker run --rm --user 2000:0 --entrypoint python3 "$IMAGE_TAG" -c "import vllm; print(vllm.__version__)" + # Non-root smoke 2: assert the non-root enabling invariants are baked + # into the image. Runs as UID 2000:0 via a shell so we can verify + # filesystem perms + passwd/group file state + wrapper presence without + # triggering vLLM's GPU-requiring config-init path. The opt-in + # `vllm-openai-nonroot` target adds only `USER vllm`, `WORKDIR + # /home/vllm`, and an `ENTRYPOINT` override on top of these invariants; + # its build correctness is reviewed at the Dockerfile level. Wrapper + # logic is covered separately by the pre-commit hook + # `test-nonroot-entrypoint` (see .pre-commit-config.yaml). + - | + docker run --rm --user 2000:0 --entrypoint /bin/sh "$IMAGE_TAG" -ec ' + if ! getent passwd 2000 | grep -q ^vllm:; then + echo FAIL: UID 2000 != vllm + exit 1 + fi + if ! 
id -gn 2>/dev/null | grep -qx root; then + echo FAIL: GID 0 not root group + exit 1 + fi + # `set -e` ignores a failing non-final command of an && list, so + # each write probe gets its own `if` instead of `touch ... && rm ...`. + if ! touch /home/vllm/.smoke; then + echo FAIL: /home/vllm not writable + exit 1 + fi + rm /home/vllm/.smoke + if ! touch /opt/uv/cache/.smoke; then + echo FAIL: /opt/uv/cache not writable + exit 1 + fi + rm /opt/uv/cache/.smoke + if ! test -x /usr/local/bin/vllm-nonroot-entrypoint.sh; then + echo FAIL: wrapper missing + exit 1 + fi + if ! test -w /etc/passwd; then + echo FAIL: /etc/passwd not group-writable + exit 1 + fi + if ! test -w /etc/group; then + echo FAIL: /etc/group not group-writable + exit 1 + fi + echo non-root invariants OK + ' retry: automatic: - exit_status: -1 # Agent was lost diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c6658ff735e..05625e8f667 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -222,6 +222,12 @@ repos: name: Update Dockerfile dependency graph entry: tools/pre_commit/update-dockerfile-graph.sh language: script + - id: test-nonroot-entrypoint + name: Test non-root entrypoint wrapper + entry: bash docker/entrypoints/test_vllm_nonroot_entrypoint.sh + language: system + pass_filenames: false + files: ^docker/entrypoints/(vllm-nonroot-entrypoint|test_vllm_nonroot_entrypoint)\.sh$ - id: check-forbidden-imports name: Check for forbidden imports entry: python tools/pre_commit/check_forbidden_imports.py diff --git a/docker/Dockerfile b/docker/Dockerfile index 6b6c4bdfba5..cae909862b5 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -105,6 +105,23 @@ ARG BUILD_OS ENV DEBIAN_FRONTEND=noninteractive +# Environment for uv +# Declared BEFORE the installer + `uv venv` invocations below so the uv +# binary, managed Python, download cache, and /opt/venv all land under +# /opt/uv instead of /root/.local/. Without this, the venv created at +# build time hardlinks back to /root/.local/share/uv/python and +# descendants of this stage (`build`, `dev`, `csrc-build`, +# `extensions-build`) inherit a root-owned, non-root-unreadable layout. +# See #15174, #15359, #31959. 
Child stages inherit these via Dockerfile +# `ENV` unless they override them explicitly. +ENV UV_HTTP_TIMEOUT=500 +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +ENV UV_CACHE_DIR=/opt/uv/cache +ENV UV_INSTALL_DIR=/opt/uv/bin +ENV PATH="/opt/venv/bin:/opt/uv/bin:$PATH" +ENV VIRTUAL_ENV="/opt/venv" + # Install system dependencies including build tools. # The Ubuntu path uses apt + deadsnakes-via-uv for Python; the manylinux path # (AlmaLinux 8, e.g. pytorch/manylinux2_28-builder) uses dnf and the Python @@ -145,15 +162,21 @@ RUN if [ "${BUILD_OS}" = "manylinux" ]; then \ # Install uv and bootstrap /opt/venv. Both paths converge on /opt/venv so all # downstream stages stay distro-agnostic. -RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ +RUN mkdir -p "${UV_PYTHON_INSTALL_DIR}" "${UV_CACHE_DIR}" "${UV_INSTALL_DIR}" \ + && chmod -R a+rX /opt/uv \ + && curl -LsSf https://astral.sh/uv/install.sh | sh \ + # `--seed` installs pip/setuptools/wheel into the venv so `python3 -m + # pip` works regardless of how uv happens to link the venv back to the + # managed Python install (which, at a non-default UV_PYTHON_INSTALL_DIR, + # doesn't always expose ensurepip via the default venv layout). && if [ "${BUILD_OS}" = "manylinux" ]; then \ # manylinux images ship Python at /opt/python/cpXY-cpXY/; point uv # at the matching interpreter rather than letting it fetch one. 
PYV_NODOT=$(echo ${PYTHON_VERSION} | tr -d '.') \ && MANYLINUX_PY=/opt/python/cp${PYV_NODOT}-cp${PYV_NODOT}/bin/python${PYTHON_VERSION} \ - && $HOME/.local/bin/uv venv /opt/venv --python "$MANYLINUX_PY"; \ + && uv venv --seed /opt/venv --python "$MANYLINUX_PY"; \ else \ - $HOME/.local/bin/uv venv /opt/venv --python ${PYTHON_VERSION}; \ + uv venv --seed /opt/venv --python ${PYTHON_VERSION}; \ fi \ && rm -f /usr/bin/python3 /usr/bin/python3-config /usr/bin/pip \ && ln -sf /opt/venv/bin/python3 /usr/bin/python3 \ @@ -161,13 +184,10 @@ RUN curl -LsSf https://astral.sh/uv/install.sh | sh \ && ln -sf /opt/venv/bin/pip /usr/bin/pip \ && python3 --version && python3 -m pip --version -# Activate virtual environment and add uv to PATH -ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" -ENV VIRTUAL_ENV="/opt/venv" - -# Environment for uv -ENV UV_HTTP_TIMEOUT=500 -ENV UV_INDEX_STRATEGY="unsafe-best-match" +# UV_LINK_MODE=copy applies to subsequent `uv pip install` RUNs (avoids +# hardlink failures with BuildKit cache mounts); it must not be set during +# `uv venv` above, which relies on hardlinking /opt/venv back to the +# managed Python source so ensurepip / `python3 -m pip` still resolve. ENV UV_LINK_MODE=copy # Verify GCC version @@ -198,7 +218,7 @@ COPY requirements/common.txt requirements/common.txt COPY requirements/cuda.txt requirements/cuda.txt COPY use_existing_torch.py use_existing_torch.py COPY pyproject.toml pyproject.toml -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \ sed -i 's/^nvidia-cutlass-dsl\[cu13\]>=/nvidia-cutlass-dsl>=/' requirements/cuda.txt; \ fi \ @@ -218,7 +238,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Track PyTorch lib versions used during build and match in downstream instances. # We do this for both nightly and release so we can strip dependencies/*.txt as needed. 
# Otherwise library dependencies can upgrade/downgrade torch incorrectly. -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ uv pip freeze | grep -i "^torch=\|^torchvision=\|^torchaudio=" > torch_lib_versions.txt \ && TORCH_LIB_VERSIONS=$(cat torch_lib_versions.txt | xargs) \ && echo "Installed torch libs: ${TORCH_LIB_VERSIONS}" @@ -304,7 +324,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ echo "Installing build requirements without torch..." \ && python3 use_existing_torch.py --prefix \ @@ -349,7 +369,7 @@ ARG VLLM_MAIN_CUDA_VERSION="" ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+csrc.build" # Use existing torch for nightly builds -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ python3 use_existing_torch.py --prefix; \ fi @@ -365,7 +385,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # Build the vLLM wheel # if USE_SCCACHE is set, use sccache to speed up compilation # AWS credentials mounted at ~/.aws/credentials for sccache S3 auth (optional) -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ --mount=type=secret,id=aws-credentials,target=/root/.aws/credentials,required=false \ if [ "$USE_SCCACHE" = "1" ]; then \ echo "Installing sccache..." 
\ @@ -399,7 +419,7 @@ ARG vllm_target_device="cuda" ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV CCACHE_DIR=/root/.cache/ccache RUN --mount=type=cache,target=/root/.cache/ccache \ - --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/opt/uv/cache \ if [ "$USE_SCCACHE" != "1" ]; then \ # Clean any existing CMake artifacts rm -rf .deps && \ @@ -431,7 +451,7 @@ COPY tools/ep_kernels/install_python_libraries.sh /tmp/install_python_libraries. # Defaults moved here from tools/ep_kernels/install_python_libraries.sh for centralized version management ARG DEEPEP_COMMIT_HASH=73b6ea4 ARG NVSHMEM_VER -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ mkdir -p /tmp/ep_kernels_workspace/dist && \ export TORCH_CUDA_ARCH_LIST='9.0a 10.0a' && \ /tmp/install_python_libraries.sh \ @@ -465,7 +485,7 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ echo "Installing build requirements without torch..." 
\ && python3 use_existing_torch.py --prefix \ @@ -500,13 +520,13 @@ ENV VLLM_TARGET_DEVICE=${vllm_target_device} ENV VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX=1 # Use existing torch for nightly builds -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ python3 use_existing_torch.py --prefix; \ fi # Build the vLLM wheel -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ --mount=type=bind,source=.git,target=.git \ if [ "${vllm_target_device}" = "cuda" ]; then \ export VLLM_USE_PRECOMPILED=1; \ @@ -564,7 +584,7 @@ COPY requirements/test/cuda.txt requirements/test/cuda.txt COPY requirements/dev.txt requirements/dev.txt COPY use_existing_torch.py use_existing_torch.py COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ echo "Installing dev requirements plus torch nightly..." \ && python3 use_existing_torch.py --prefix \ @@ -664,9 +684,50 @@ RUN CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-') && \ RUN python3 -m pip install uv # Environment for uv +# Redirect uv's managed Python and download cache out of /root/ so downstream +# images (`FROM vllm/vllm-openai` + `USER <uid>`) and direct non-root runs +# (`docker run --user <uid>:<gid>`) can read and execute them. See #15174, +# #15359, #31959. 
ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" ENV UV_LINK_MODE=copy +ENV UV_PYTHON_INSTALL_DIR=/opt/uv/python +ENV UV_CACHE_DIR=/opt/uv/cache +RUN mkdir -p "${UV_PYTHON_INSTALL_DIR}" "${UV_CACHE_DIR}" \ + && chgrp -R 0 /opt/uv \ + && chmod -R g+rwX,a+rX /opt/uv + +# ---------------------------------------------------------------------- +# Non-root support (opt-in) +# ---------------------------------------------------------------------- +# Create a conventional `vllm` user (UID 2000, GID 0) so the image can be +# run under `--user 2000:0` or the opt-in `vllm-openai-nonroot` target. +# +# Design notes: +# * GID 0 + group-writable cache dirs follow the OpenShift arbitrary-UID +# pattern, so any UID that is a member of group 0 at runtime can write +# to /home/vllm and /opt/uv without additional chown work. +# * The default `vllm-openai` image keeps `USER root`, so every existing +# `docker run vllm/vllm-openai ...` / K8s manifest / `FROM vllm/vllm-openai` +# + `RUN uv pip install --system ...` flow is unchanged. +# * The entrypoint wrapper below is only used by `vllm-openai-nonroot`; it +# handles the OpenShift arbitrary-UID case (UID not in /etc/passwd). +# See #31959 and docs/deployment/docker.md. +RUN useradd --uid 2000 --gid 0 --create-home --home-dir /home/vllm \ + --shell /bin/bash vllm \ + && mkdir -p /home/vllm/.cache /home/vllm/.config \ + && chown -R 2000:0 /home/vllm \ + && chmod -R g+rwX /home/vllm \ + # Allow the entrypoint wrapper to append a /etc/passwd entry for an + # arbitrary runtime UID that shares GID 0. Without this, `whoami`, bash's + # `\u` prompt, `id -un`, and anything else that calls `getpwuid()` + # directly return "I have no name!" for OpenShift-style arbitrary UIDs. + # This matches the convention used by Red Hat UBI base images. 
+ && chgrp 0 /etc/passwd /etc/group \ + && chmod g=u /etc/passwd /etc/group +COPY docker/entrypoints/vllm-nonroot-entrypoint.sh \ + /usr/local/bin/vllm-nonroot-entrypoint.sh +RUN chmod 0755 /usr/local/bin/vllm-nonroot-entrypoint.sh # Enable CUDA forward compatibility by setting '-e VLLM_ENABLE_CUDA_COMPATIBILITY=1' # Only needed for datacenter/professional GPUs with older drivers. @@ -683,7 +744,7 @@ ENV VLLM_ENABLE_CUDA_COMPATIBILITY=0 ARG PYTORCH_CUDA_INDEX_BASE_URL COPY requirements/common.txt /tmp/common.txt COPY requirements/cuda.txt /tmp/requirements-cuda.txt -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \ sed -i 's/^nvidia-cutlass-dsl\[cu13\]>=/nvidia-cutlass-dsl>=/' /tmp/requirements-cuda.txt; \ fi && \ @@ -695,7 +756,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ # https://docs.flashinfer.ai/installation.html # From versions.json: .flashinfer.version ARG FLASHINFER_VERSION=0.6.11.post2 -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ uv pip install --system flashinfer-jit-cache==${FLASHINFER_VERSION} \ --extra-index-url https://flashinfer.ai/whl/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') @@ -727,7 +788,7 @@ ARG BITSANDBYTES_VERSION_X86=0.46.1 ARG BITSANDBYTES_VERSION_ARM64=0.42.0 ARG TIMM_VERSION=">=1.0.17" ARG RUNAI_MODEL_STREAMER_VERSION=">=0.15.7" -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ if [ "$TARGETPLATFORM" = "linux/arm64" ]; then \ BITSANDBYTES_VERSION="${BITSANDBYTES_VERSION_ARM64}"; \ else \ @@ -752,7 +813,7 @@ ARG PYTORCH_NIGHTLY # Check whether to install torch nightly instead of release for this build. 
COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist \ - --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/opt/uv/cache \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ echo "Installing torch nightly..." \ && uv pip install --system $(cat torch_lib_versions.txt | xargs) --pre \ @@ -766,7 +827,7 @@ RUN --mount=type=bind,from=build,src=/workspace/dist,target=/vllm-workspace/dist --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.'); \ fi -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ . /etc/environment && \ uv pip list @@ -775,7 +836,7 @@ ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH # Install EP kernels wheels (DeepEP) that have been built in the `build` stage RUN --mount=type=bind,from=build,src=/tmp/ep_kernels_workspace/dist,target=/vllm-workspace/ep_kernels/dist \ - --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/opt/uv/cache \ uv pip install --system ep_kernels/dist/*.whl --verbose \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. 
-f1,2 | tr -d '.') @@ -830,7 +891,7 @@ COPY requirements/test/cuda.txt requirements/test/cuda.txt COPY requirements/dev.txt requirements/dev.txt COPY use_existing_torch.py use_existing_torch.py COPY --from=base /workspace/torch_lib_versions.txt torch_lib_versions.txt -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ if [ "$CUDA_MAJOR" -ge 12 ]; then \ if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ @@ -850,7 +911,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \ fi # install development dependencies (for testing) -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ uv pip install --system -e tests/vllm_test_utils # enable fast downloads from hf (for testing) @@ -890,7 +951,7 @@ ENV UV_HTTP_TIMEOUT=500 # install kv_connectors if requested ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX' ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list} -RUN --mount=type=cache,target=/root/.cache/uv \ +RUN --mount=type=cache,target=/opt/uv/cache \ --mount=type=bind,source=requirements/kv_connectors.txt,target=/tmp/kv_connectors.txt,ro \ CUDA_MAJOR="${CUDA_VERSION%%.*}"; \ CUDA_VERSION_DASH=$(echo $CUDA_VERSION | cut -d. -f1,2 | tr '.' '-'); \ @@ -958,5 +1019,32 @@ ENTRYPOINT ["./sagemaker-entrypoint.sh"] FROM vllm-openai-base AS vllm-openai +# To run the image as non-root, either build the `vllm-openai-nonroot` target +# below, or in a derived Dockerfile uncomment the following line and ensure +# any additional layers chgrp-0 / chmod-g+rwX paths they write to. The `vllm` +# user (UID 2000, GID 0) is already created in the `vllm-base` stage. +# See docs/deployment/docker.md. +# USER vllm ENTRYPOINT ["vllm", "serve"] #################### OPENAI API SERVER #################### + +#################### OPENAI API SERVER (NON-ROOT, OPT-IN) #################### +# Non-root-ready variant of `vllm-openai`. 
Built via: +# docker build --target vllm-openai-nonroot -t vllm:openai-nonroot \ +# -f docker/Dockerfile . +# +# Runtime behavior: +# * Default USER is `vllm` (UID 2000, GID 0) created in `vllm-base`. +# * HOME is /home/vllm, pre-created group-0-writable so arbitrary UIDs in +# group 0 (OpenShift / `--user <uid>:0`) can also use the image. +# * Entrypoint wrapper handles the "UID not in /etc/passwd" case for truly +# arbitrary UIDs by falling back HOME/USER to sane writable defaults. +# * All cache/config envs (HF_HOME, VLLM_CACHE_ROOT, TRITON_CACHE_DIR, ...) +# remain unset so their library defaults resolve to $HOME/.cache/... , +# which is writable. +FROM vllm-openai AS vllm-openai-nonroot + +USER vllm +WORKDIR /home/vllm +ENTRYPOINT ["/usr/local/bin/vllm-nonroot-entrypoint.sh"] +#################### OPENAI API SERVER (NON-ROOT, OPT-IN) #################### diff --git a/docker/entrypoints/test_vllm_nonroot_entrypoint.sh b/docker/entrypoints/test_vllm_nonroot_entrypoint.sh new file mode 100755 index 00000000000..c136f054919 --- /dev/null +++ b/docker/entrypoints/test_vllm_nonroot_entrypoint.sh @@ -0,0 +1,283 @@ +#!/bin/sh +# Shell-level unit test for vllm-nonroot-entrypoint.sh. +# +# Runs on the host (no Docker, no GPU) by stubbing `vllm` with a shim that +# dumps its env + argv instead of actually serving. Exercises the wrapper's +# HOME/USER fallback behavior that can't be easily tested from buildkite +# (which would need a GPU to run `vllm serve --help`). +# +# Usage: +# bash docker/entrypoints/test_vllm_nonroot_entrypoint.sh +# Exits non-zero on the first failed assertion. + +set -eu + +SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)" +WRAPPER="${SCRIPT_DIR}/vllm-nonroot-entrypoint.sh" + +if [ ! -x "$WRAPPER" ]; then + echo "FAIL: wrapper not found or not executable: $WRAPPER" >&2 + exit 1 +fi + +WORKDIR="$(mktemp -d)" +trap 'rm -rf "$WORKDIR"' EXIT + +# Stub `vllm` on PATH. It dumps env + argv + cwd to stdout so we can assert. 
+mkdir -p "$WORKDIR/bin" +cat > "$WORKDIR/bin/vllm" <<'EOF' +#!/bin/sh +echo "ARGV=$*" +echo "HOME=${HOME-__unset__}" +echo "USER=${USER-__unset__}" +echo "LOGNAME=${LOGNAME-__unset__}" +echo "PWD=$(pwd)" +EOF +chmod +x "$WORKDIR/bin/vllm" + +run_wrapper() { + # Usage: run_wrapper OUT_FILE [NAME=VALUE ...] -- [WRAPPER_ARG ...] + # POSIX sh has no arrays, so rotate "$@" in place: every word is + # re-appended verbatim and the first `--` is swapped for "$WRAPPER". + # This keeps NAME=VALUE words intact even when $WORKDIR (from + # $TMPDIR) contains whitespace or glob characters. + _out="$1"; shift + _left=$# + _swapped=0 + while [ "$_left" -gt 0 ]; do + _arg="$1"; shift + _left=$((_left - 1)) + if [ "$_swapped" = "0" ] && [ "$_arg" = "--" ]; then + _arg="$WRAPPER" + _swapped=1 + fi + set -- "$@" "$_arg" + done + [ "$_swapped" = "1" ] || { echo "run_wrapper: missing -- separator" >&2; return 2; } + env -i PATH="$WORKDIR/bin:/usr/bin:/bin" "$@" > "$_out" +} + +fail() { _fail_out="$1"; shift; echo "FAIL: $*" >&2; echo "--- stdout ---" >&2; cat "$_fail_out" >&2; exit 1; } + +expect_default_home() { + _out="$1" + _case="$2" + if [ -w /home/vllm ]; then + expected_home="/home/vllm" + grep -q "^HOME=$expected_home\$" "$_out" \ + || fail "$_out" "$_case: HOME not set to $expected_home" + else + expected_home="/tmp/vllm-home.XXXXXX" + grep -Eq '^HOME=/tmp/vllm-home\.[^/]+$' "$_out" \ + || fail "$_out" "$_case: HOME not set to $expected_home" + fi +} + +# ----------------------------------------------------------------------------- +# Case 1: writable HOME and USER both set -> wrapper must leave them alone. +# ----------------------------------------------------------------------------- +case1_home="$WORKDIR/case1-home" +mkdir -p "$case1_home" +out="$WORKDIR/case1.out" +run_wrapper "$out" "HOME=$case1_home" "USER=alice" "LOGNAME=alice" -- --model foo +grep -q "^HOME=$case1_home\$" "$out" || fail "$out" "case1: HOME not preserved" +grep -q "^USER=alice\$" "$out" || fail "$out" "case1: USER not preserved" +grep -q "^LOGNAME=alice\$" "$out" || fail "$out" "case1: LOGNAME not preserved" +grep -q "^ARGV=serve --model foo\$" "$out" || fail "$out" "case1: ARGV wrong" +echo "PASS: case1 (writable HOME + USER preserved)" + +# ----------------------------------------------------------------------------- +# Case 2: HOME unset -> falls back to /home/vllm if writable, else +# /tmp/vllm-home.XXXXXX. 
+# ----------------------------------------------------------------------------- +# The wrapper checks whether the real /home/vllm exists and is writable. On +# dev machines /home/vllm typically does NOT exist, so the +# wrapper should fall to /tmp/vllm-home.XXXXXX. +out="$WORKDIR/case2.out" +run_wrapper "$out" -- --model bar +expect_default_home "$out" "case2" +grep -q "^USER=vllm\$" "$out" || fail "$out" "case2: USER not defaulted to vllm" +grep -q "^LOGNAME=vllm\$" "$out" || fail "$out" "case2: LOGNAME not defaulted to vllm" +grep -q "^ARGV=serve --model bar\$" "$out" || fail "$out" "case2: ARGV wrong" +echo "PASS: case2 (unset HOME falls back to $expected_home, USER defaulted)" + +# ----------------------------------------------------------------------------- +# Case 3: HOME set but unwritable -> must also fall back. +# Skipped when running as root: DAC override makes `[ -w ... ]` true for a +# 0500 directory, so the wrapper keeps the caller's HOME there. +# ----------------------------------------------------------------------------- +if [ "$(id -u)" = "0" ]; then + echo "SKIP: case3 (running as root; DAC override makes unwritable HOME untestable)" +else + ro_home="$WORKDIR/ro-home" + mkdir -p "$ro_home" + chmod 0500 "$ro_home" + out="$WORKDIR/case3.out" + run_wrapper "$out" "HOME=$ro_home" -- --model baz + expect_default_home "$out" "case3" + grep -q "^USER=vllm\$" "$out" || fail "$out" "case3: USER not defaulted" + chmod 0700 "$ro_home" + echo "PASS: case3 (unwritable HOME overridden)" +fi + +# ----------------------------------------------------------------------------- +# Case 4: USER set but LOGNAME unset -> LOGNAME mirrors USER. 
+# ----------------------------------------------------------------------------- +case4_home="$WORKDIR/case4-home" +mkdir -p "$case4_home" +out="$WORKDIR/case4.out" +run_wrapper "$out" "HOME=$case4_home" "USER=carol" -- --model qux +grep -q "^USER=carol\$" "$out" || fail "$out" "case4: USER not preserved" +grep -q "^LOGNAME=carol\$" "$out" || fail "$out" "case4: LOGNAME not mirrored from USER" +echo "PASS: case4 (LOGNAME mirrors USER when unset)" + +# ----------------------------------------------------------------------------- +# Case 5: /etc/passwd is writable AND the current UID is not in it -> wrapper +# appends a synthetic entry. Uses the VLLM_PASSWD_FILE test hook so we don't +# touch the real /etc/passwd. +# ----------------------------------------------------------------------------- +fake_passwd="$WORKDIR/fake-passwd" +: > "$fake_passwd" # empty file, current UID definitely not present +case5_home="$WORKDIR/case5-home" +mkdir -p "$case5_home" +out="$WORKDIR/case5.out" +run_wrapper "$out" "HOME=$case5_home" "VLLM_PASSWD_FILE=$fake_passwd" -- --model foo +current_uid="$(id -u)" +current_gid="$(id -g)" +expected_line="vllm:x:${current_uid}:${current_gid}:vllm:${case5_home}:/bin/bash" +grep -Fx "$expected_line" "$fake_passwd" > /dev/null \ + || { echo "FAIL: case5: expected line not found in fake passwd:"; echo " expected: $expected_line"; echo " file contents:"; cat "$fake_passwd"; exit 1; } +echo "PASS: case5 (passwd entry appended for arbitrary UID)" + +# ----------------------------------------------------------------------------- +# Case 6: /etc/passwd is writable but current UID already has an entry -> +# wrapper must NOT duplicate the entry. 
+# ----------------------------------------------------------------------------- +fake_passwd="$WORKDIR/fake-passwd-prepopulated" +printf 'vllm:x:%s:%s:vllm:/home/vllm:/bin/bash\n' "$current_uid" "$current_gid" > "$fake_passwd" +out="$WORKDIR/case6.out" +run_wrapper "$out" "HOME=$case5_home" "VLLM_PASSWD_FILE=$fake_passwd" -- --model foo +line_count="$(wc -l < "$fake_passwd")" +# NOTE: wc may count 0 or 1 depending on trailing newline; accept 1. +# More robust: count lines matching our UID. +uid_lines="$(grep -c ":${current_uid}:" "$fake_passwd" || true)" +[ "$uid_lines" = "1" ] \ + || { echo "FAIL: case6: expected exactly one entry for UID $current_uid, got $uid_lines"; cat "$fake_passwd"; exit 1; } +echo "PASS: case6 (existing passwd entry not duplicated)" + +# ----------------------------------------------------------------------------- +# Case 7: /etc/passwd is NOT writable -> wrapper must NOT crash, just skip. +# Skipped when running as root, because root's DAC override means [ -w ... ] +# is always true regardless of mode bits -- the case can't be simulated. +# In the real deployment (non-root UID inside the container) this IS the +# relevant behavior and is what `_passwd_file is not writable` encodes. +# ----------------------------------------------------------------------------- +if [ "$(id -u)" = "0" ]; then + echo "SKIP: case7 (running as root; DAC override makes unwritable check meaningless)" +else + fake_passwd="$WORKDIR/ro-passwd" + : > "$fake_passwd" + chmod 0444 "$fake_passwd" + out="$WORKDIR/case7.out" + run_wrapper "$out" "HOME=$case5_home" "VLLM_PASSWD_FILE=$fake_passwd" -- --model foo + # File must remain empty (no write happened) and the wrapper exec'd + # `vllm serve` successfully (stdout contains ARGV line). + [ ! 
-s "$fake_passwd" ] \ + || { echo "FAIL: case7: RO passwd file was modified"; cat "$fake_passwd"; exit 1; } + grep -q "^ARGV=serve --model foo\$" "$out" || fail "$out" "case7: wrapper didn't exec vllm" + chmod 0600 "$fake_passwd" + echo "PASS: case7 (unwritable passwd file tolerated)" +fi + +# ----------------------------------------------------------------------------- +# Case 8: caller's writable CWD is preserved — wrapper must NOT chdir to HOME +# when cwd is usable. Protects relative-path workflows like +# `docker run -w /models ... --model ./llama.gguf`. +# ----------------------------------------------------------------------------- +case8_home="$WORKDIR/case8-home" +mkdir -p "$case8_home" +case8_cwd="$WORKDIR/case8-cwd" +mkdir -p "$case8_cwd" +out="$WORKDIR/case8.out" +(cd "$case8_cwd" && run_wrapper "$out" "HOME=$case8_home" "USER=alice" "LOGNAME=alice" -- --model ./relpath) +grep -q "^PWD=$case8_cwd\$" "$out" \ + || fail "$out" "case8: writable cwd not preserved (got $(grep '^PWD=' "$out"))" +grep -q "^ARGV=serve --model \\./relpath\$" "$out" \ + || fail "$out" "case8: relative argv not preserved" +echo "PASS: case8 (writable cwd preserved; relative argv still resolves from caller's cwd)" + +# ----------------------------------------------------------------------------- +# Case 9: read-only cwd is ALSO preserved. A caller who mounts a read-only +# model directory at the container's cwd (e.g. `docker run -w /models` with +# /models bind-mounted ro) expects relative argv like `--model ./foo.gguf` +# to resolve against /models. An earlier version of this wrapper rewrote +# read-only cwd to $HOME and broke that workflow; this case guards against +# the regression returning. 
+# ----------------------------------------------------------------------------- +case9_home="$WORKDIR/case9-home" +mkdir -p "$case9_home" +case9_ro="$WORKDIR/case9-ro" +mkdir -p "$case9_ro" +chmod 0555 "$case9_ro" +out="$WORKDIR/case9.out" +(cd "$case9_ro" && run_wrapper "$out" "HOME=$case9_home" "USER=alice" "LOGNAME=alice" -- --model ./foo) +grep -q "^PWD=$case9_ro\$" "$out" \ + || fail "$out" "case9: read-only cwd was rewritten (got $(grep '^PWD=' "$out"))" +grep -q "^ARGV=serve --model \\./foo\$" "$out" \ + || fail "$out" "case9: relative argv not preserved" +chmod 0700 "$case9_ro" +echo "PASS: case9 (read-only cwd preserved; relative argv still resolves from caller's cwd)" + +# ----------------------------------------------------------------------------- +# Case 10: truly inaccessible cwd (no search bit) DOES fall back to $HOME. +# Skipped as root because DAC override lets root cd into 0000 directories. +# ----------------------------------------------------------------------------- +if [ "$(id -u)" = "0" ]; then + echo "SKIP: case10 (running as root; DAC override makes inaccessible cwd untestable)" +else + case10_home="$WORKDIR/case10-home" + mkdir -p "$case10_home" + case10_cwd="$WORKDIR/case10-cwd" + mkdir -p "$case10_cwd" + out="$WORKDIR/case10.out" + # Make cwd genuinely inaccessible (mode 0000 = no search bit -> cd . + # fails with EACCES). Use absolute paths for chmod so our own test + # cleanup still works without needing search perm on the dir. 
+ ( + cd "$case10_cwd" + chmod 0000 "$case10_cwd" + run_wrapper "$out" "HOME=$case10_home" "USER=alice" "LOGNAME=alice" -- --model foo + ) + chmod 0700 "$case10_cwd" + grep -q "^PWD=$case10_home\$" "$out" \ + || fail "$out" "case10: inaccessible cwd not overridden to HOME (got $(grep '^PWD=' "$out"))" + echo "PASS: case10 (inaccessible cwd falls back to \$HOME)" +fi + +# ----------------------------------------------------------------------------- +# Case 11: if /tmp cannot create a private fallback dir, wrapper uses /tmp as +# the last-resort HOME instead of leaving HOME empty under set -eu. +# ----------------------------------------------------------------------------- +if [ -w /home/vllm ]; then + echo "SKIP: case11 (/home/vllm is writable; mktemp fallback path is not used)" +else + cat > "$WORKDIR/bin/mktemp" <<'EOF' +#!/bin/sh +exit 1 +EOF + chmod +x "$WORKDIR/bin/mktemp" + + out="$WORKDIR/case11.out" + run_wrapper "$out" -- --model no-mktemp + rm -f "$WORKDIR/bin/mktemp" + + grep -q "^HOME=/tmp\$" "$out" \ + || fail "$out" "case11: mktemp failure did not fall back to /tmp" + grep -q "^USER=vllm\$" "$out" || fail "$out" "case11: USER not defaulted" + grep -q "^LOGNAME=vllm\$" "$out" || fail "$out" "case11: LOGNAME not defaulted" + grep -q "^ARGV=serve --model no-mktemp\$" "$out" || fail "$out" "case11: ARGV wrong" + echo "PASS: case11 (mktemp failure falls back to /tmp)" +fi + +echo "" +echo "ALL CASES PASSED." diff --git a/docker/entrypoints/vllm-nonroot-entrypoint.sh b/docker/entrypoints/vllm-nonroot-entrypoint.sh new file mode 100755 index 00000000000..0972ed99097 --- /dev/null +++ b/docker/entrypoints/vllm-nonroot-entrypoint.sh @@ -0,0 +1,87 @@ +#!/bin/sh +# Entrypoint wrapper for the opt-in `vllm-openai-nonroot` image. +# +# The image also ships a `vllm` user (UID 2000, GID 0) with HOME /home/vllm +# and a group-0-writable home directory. 
When the container is launched with +# `--user 2000:0` (or any other UID in group 0) the passwd entry is enough on +# its own: Docker picks up HOME=/home/vllm, getpass.getuser() resolves to +# "vllm", and every cache dir (HF, Triton, Inductor, vLLM, Numba, Outlines) +# that defaults to `$HOME/.cache/...` lands in a writable location. +# +# This wrapper exists for the *arbitrary-UID* case (e.g. OpenShift's +# `runAsUser: 1000540000` Restricted Pod Security Standard) where the caller +# UID is not in /etc/passwd at all. In that case: +# * $HOME may be unset or resolve to "/" (unwritable). +# * getpass.getuser() falls back to pwd.getpwuid() -> KeyError. +# +# The wrapper re-points $HOME to /home/vllm when writable, /tmp/vllm-home.XXXXXX +# otherwise, and defaults $USER to "vllm" so the pwd-lookup path is never +# taken. Everything else is forwarded to `vllm serve`. +# +# Non-empty caller-set env vars (HOME, USER, LOGNAME) are preserved, so +# existing K8s manifests and `docker run -e ...` keep working unchanged. +# Unset or empty values fall through to the wrapper's defaults, matching +# what shell code typically expects from "unset". + +set -eu + +if [ -z "${HOME:-}" ] || [ ! -w "${HOME}" ]; then + if [ -w /home/vllm ]; then + export HOME=/home/vllm + else + if _h="$(mktemp -d /tmp/vllm-home.XXXXXX 2>/dev/null)"; then + export HOME="$_h" + chmod 0700 "$HOME" 2>/dev/null || true + else + export HOME=/tmp + fi + unset _h + fi +fi + +# Preserve the caller's cwd whenever it's still usable. A read-only mount +# (e.g. `docker run -w /models ... --model ./llama.gguf` where /models is +# the user's model share) is a legitimate, usable cwd — vllm only needs to +# *read* relative paths from there. We only fall back to $HOME when the +# cwd itself is truly inaccessible (no search bit, deleted inode, mount +# gone, etc.), which is when `cd .` actually fails. 
#
# This is the accessibility check, not a writability check; the latter
# would silently rewrite cwd for any read-only workflow and break relative
# argv like `--model ./llama.gguf`, `--chat-template ./t.jinja`, relative
# TLS cert paths, etc.
if ! cd . 2>/dev/null; then
  # Prefer $HOME, but never let an un-enterable $HOME abort startup: this
  # script runs under `set -e`, so a bare failing `cd` would kill the
  # container. "/" is always enterable and is the last resort.
  cd "$HOME" 2>/dev/null || cd /
fi

# getpass.getuser() prefers $USER/$LOGNAME/etc. before hitting getpwuid();
# setting it here makes the "UID not in passwd" path a no-op for everything
# in the process tree. Non-empty caller-provided values win; LOGNAME follows
# whatever USER ends up being.
export USER="${USER:-vllm}"
export LOGNAME="${LOGNAME:-$USER}"

# Shell-level tooling (`whoami`, bash's `\u` prompt, `id -un`, `sudo`) does
# NOT consult $USER; it calls getpwuid(geteuid()) directly. For arbitrary
# runtime UIDs in OpenShift-style deploys that lookup fails (bash then shows
# "I have no name!", `whoami` reports it cannot find a name for the UID).
# If /etc/passwd is group-0 writable (set at build time) and doesn't yet
# have an entry for this UID, append a synthetic one so every downstream
# consumer sees a consistent "vllm" identity.
#
# We parse the passwd file directly instead of calling `getent` because
# the container's NSS is typically just files anyway, and this lets us
# unit-test via the VLLM_PASSWD_FILE hook (undocumented; production uses
# /etc/passwd).
_passwd_file="${VLLM_PASSWD_FILE:-/etc/passwd}"
_uid="$(id -u)"
_nl='
'
if [ -w "$_passwd_file" ] \
   && ! awk -F: -v u="$_uid" '$3==u {found=1; exit} END {exit !found}' "$_passwd_file" 2>/dev/null; then
  case "$HOME" in
    *:* | *"$_nl"*)
      # passwd records are ':'-separated, one per line. A HOME containing
      # either character would corrupt the database for every reader, so
      # skip the (purely cosmetic) entry instead of writing a broken one.
      printf '%s\n' \
        "vllm-nonroot-entrypoint: HOME contains ':' or a newline; not adding a passwd entry for UID $_uid" >&2
      ;;
    *)
      # If the file's last byte is not a newline, a plain append would fuse
      # the new record onto the previous one. `$(...)` strips a trailing
      # newline, so a non-empty result means the separator is missing.
      _sep=''
      if [ -n "$(tail -c 1 "$_passwd_file" 2>/dev/null)" ]; then
        _sep="$_nl"
      fi
      printf '%svllm:x:%s:%s:vllm:%s:/bin/bash\n' \
        "$_sep" "$_uid" "$(id -g)" "$HOME" >> "$_passwd_file"
      unset _sep
      ;;
  esac
fi
unset _uid _nl _passwd_file

# Replace this shell with the server so signals reach vllm directly; all
# wrapper arguments are forwarded verbatim.
exec vllm serve "$@"
+
+docker run --rm --gpus all \
+    -p 8000:8000 \
+    vllm-openai-nonroot:local \
+    meta-llama/Llama-3.1-8B-Instruct
+```
+
+The `vllm-openai-nonroot` target also supports OpenShift-style arbitrary UIDs
+when the runtime UID is a member of group 0. In Kubernetes manifests, set the
+pod-level `securityContext` accordingly (`fsGroup` is only valid at the pod
+level, not in a container-level `securityContext`) and keep mounted
+cache/model paths writable by group 0:
+
+```yaml
+securityContext:
+  runAsNonRoot: true
+  runAsUser: 1000540000
+  runAsGroup: 0
+  fsGroup: 0
+```
+
+Runtime UIDs outside group 0 are not part of the documented support matrix
+because they may be unable to write to `/home/vllm` or `/opt/uv/cache`.
+
 ## Build image from source
 
 --8<-- "docs/getting_started/installation/gpu.md:build-image-from-source"