# vllm/docker/Dockerfile.rocm

# Global build arguments. They are visible to FROM lines; a stage that needs
# one of them must redeclare it with a bare `ARG NAME`.

# Source selection, shared working directory and default base images.
ARG REMOTE_VLLM=0
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base

# NIC backend for MoRI RDMA support. Accepted values:
#   all    (default) install drivers and userspace libraries for every
#          supported NIC type (ainic and bnxt); MoRI selects the appropriate
#          one at runtime.
#   ainic  AMD AINIC (Pensando) only
#   bnxt   Broadcom Thor-2 only
#   none   install nothing
# Example: --build-arg NIC_BACKEND=ainic
ARG NIC_BACKEND=all

# AMD AINIC apt repo settings.
# A custom version compatible with the host drivers may be supplied; the
# default has been tested with ioinic-dkms=25.11.1.001.
ARG AINIC_VERSION=1.117.3-hydra
ARG UBUNTU_CODENAME=jammy

# sccache configuration. Release builds use this today; CI can opt in when a
# shared S3-compatible cache backend is available.
ARG SCCACHE_BUCKET_NAME=vllm-build-sccache
ARG SCCACHE_REGION_NAME=us-west-2
ARG SCCACHE_S3_NO_CREDENTIALS=0
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT

FROM ${BASE_IMAGE} AS base

# GPU architectures to build for. An explicit ARG_PYTORCH_ROCM_ARCH build arg
# wins; otherwise the PYTORCH_ROCM_ARCH value already defined in the stage
# environment (presumably by the base image) is kept.
ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

# Install build dependencies and utilities.
# The apt package lists are removed in the same layer so they do not bloat
# every image derived from this stage; the later apt-get install steps in this
# file each run their own apt-get update first.
RUN apt-get update -q -y && apt-get install -q -y \
    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
    apt-transport-https ca-certificates wget curl \
    libnuma-dev ccache mold \
    && rm -rf /var/lib/apt/lists/*
RUN --mount=type=cache,target=/root/.cache/pip \
    python3 -m pip install --upgrade pip
# Note: mold is installed but not set as the system default linker because
# some packages use JIT compilation at runtime with flags mold does not support.
# Build stages opt in via LDFLAGS="-fuse-ld=mold".
# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
# Each removal is best-effort (|| true): sccache may have been installed via
# apt, via pip, or as a plain binary, so any of the three may find nothing.
ARG USE_SCCACHE
RUN if [ "$USE_SCCACHE" != "1" ]; then \
        apt-get purge -y sccache || true; \
        python3 -m pip uninstall -y sccache || true; \
        rm -f "$(which sccache)" || true; \
    fi

# Install uv. The installer is saved to disk before it is executed so that a
# failed download aborts the build instead of being hidden behind a pipe.
RUN curl -LsSf --retry 3 --retry-delay 5 -o /tmp/uv-install.sh https://astral.sh/uv/install.sh \
    && UV_INSTALL_DIR="/usr/local/bin" sh /tmp/uv-install.sh \
    && rm -f /tmp/uv-install.sh \
    && uv --version

# uv settings:
#   UV_HTTP_TIMEOUT    timeout in seconds; some dependency downloads are likely
#                      to time out otherwise
#                      (reference: https://github.com/astral-sh/uv/pull/1694)
#   UV_INDEX_STRATEGY  index resolution strategy
#   UV_LINK_MODE       copy mode avoids hardlink failures with Docker cache mounts
ENV UV_HTTP_TIMEOUT=500 \
    UV_INDEX_STRATEGY="unsafe-best-match" \
    UV_LINK_MODE=copy

# ccache directory - persisted across layer rebuilds via cache mounts.
ENV CCACHE_DIR=/root/.cache/ccache \
    CCACHE_COMPILERCHECK=content

# MAX_JOBS is empty by default so build steps fall back to $(nproc); CI can
# override it through the max_jobs build arg.
ARG max_jobs
ENV MAX_JOBS=${max_jobs}

# Install sccache if USE_SCCACHE is enabled (for release builds).
# An sccache binary that is already on PATH is reused. Otherwise the release
# tarball is fetched from SCCACHE_DOWNLOAD_URL, falling back to the upstream
# GitHub release for the pinned version/arch below.
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
ARG SCCACHE_BUCKET_NAME
ARG SCCACHE_REGION_NAME
ARG SCCACHE_S3_NO_CREDENTIALS
# curl runs with -f so an HTTP error (e.g. 404) fails the step immediately
# instead of saving an error page that tar would later choke on; the URL is
# quoted so special characters in a custom URL are passed through intact.
RUN if [ "$USE_SCCACHE" = "1" ]; then \
        if command -v sccache >/dev/null 2>&1; then \
            echo "sccache already installed, skipping installation"; \
            sccache --version; \
        else \
            echo "Installing sccache..." \
            && SCCACHE_ARCH="x86_64" \
            && SCCACHE_VERSION="v0.8.1" \
            && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \
            && curl -fL --retry 3 --retry-delay 5 -o /tmp/sccache.tar.gz "${SCCACHE_DL_URL}" \
            && tar -xzf /tmp/sccache.tar.gz -C /tmp \
            && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \
            && chmod +x /usr/bin/sccache \
            && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \
            && sccache --version; \
        fi; \
    fi

# Export sccache settings into the stage environment.
# NOTE: ${USE_SCCACHE:+...} expands to the configured value whenever
# USE_SCCACHE is set to any non-empty value (so "0" counts as well), not only
# when it equals "1". Leave USE_SCCACHE unset/empty to keep these variables
# empty so S3 config does not leak into images that do not use sccache.
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}

# All derived stages start in the shared working directory.
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}


# -----------------------
# vLLM fetch stages
# REMOTE_VLLM selects where the vLLM source tree comes from:
#   0 -> fetch_vllm_0: copy the local build context into vllm/
#   1 -> fetch_vllm_1: clone VLLM_REPO and check out VLLM_BRANCH
# Both use ONBUILD, so the copy/clone runs in the stage derived from them.
FROM base AS fetch_vllm_0
ONBUILD COPY ./ vllm/
FROM base AS fetch_vllm_1
ARG VLLM_REPO="https://github.com/vllm-project/vllm.git"
ARG VLLM_BRANCH="main"
# Persist the ARGs as ENV so the ONBUILD RUN below can still read them when it
# executes in the derived stage.
ENV VLLM_REPO=${VLLM_REPO}
ENV VLLM_BRANCH=${VLLM_BRANCH}
# Clone into an explicit "vllm" directory so forks whose repository name is not
# "vllm" still work, and quote the variables so an empty or unusual value
# cannot break the shell test. When building from a fork, the official
# repository is added as the "upstream" remote and fetched.
ONBUILD RUN git clone "${VLLM_REPO}" vllm \
        && cd vllm \
        && git fetch -v --prune -- origin "${VLLM_BRANCH}" \
        && git checkout FETCH_HEAD \
        && if [ "${VLLM_REPO}" != "https://github.com/vllm-project/vllm.git" ] ; then \
               git remote add upstream "https://github.com/vllm-project/vllm.git" \
               && git fetch upstream ; fi
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm

# -----------------------
# Rust build stage
# Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages
# don't need the rust toolchain or protoc.
FROM fetch_vllm AS rust-build
ARG COMMON_WORKDIR

# protoc is used by tonic-build/prost-build.
RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
        ca-certificates curl unzip \
    && rm -rf /var/lib/apt/lists/*

COPY tools/install_protoc.sh /tmp/install_protoc.sh
RUN /tmp/install_protoc.sh && rm /tmp/install_protoc.sh

# Install rustup; the toolchain itself is pinned by rust-toolchain.toml.
# The installer is downloaded to a file and then executed rather than piped
# into sh: with a pipe, a failed curl hands sh an empty script that "succeeds",
# and the build only breaks later when cargo is missing.
RUN curl --proto '=https' --tlsv1.2 -sSf --retry 3 --retry-delay 5 \
        https://sh.rustup.rs -o /tmp/rustup-init.sh \
    && sh /tmp/rustup-init.sh -y --profile minimal --default-toolchain none \
    && rm -f /tmp/rustup-init.sh
ENV PATH="/root/.cargo/bin:${PATH}"

# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
ENV CARGO_NET_RETRY=10
ENV RUSTUP_MAX_RETRIES=10

# Build the release binary. Cargo's registry/git caches can be written by
# concurrent BuildKit jobs on shared workers, so lock those cache mounts while
# keeping the cache benefit. Copy the binary out so it persists into the image
# layer for later COPY --from=rust-build.
RUN --mount=type=cache,id=vllm-rocm-cargo-registry,target=/root/.cargo/registry,sharing=locked \
    --mount=type=cache,id=vllm-rocm-cargo-git,target=/root/.cargo/git,sharing=locked \
    --mount=type=cache,id=vllm-rocm-cargo-target,target=${COMMON_WORKDIR}/vllm/rust/target,sharing=locked \
    cd ${COMMON_WORKDIR}/vllm \
    && VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh \
    && test -x /tmp/vllm-rs

# -----------------------
# vLLM native build stages
#
# csrc-build intentionally copies only files that affect ROCm native extension
# compilation. That keeps unrelated CI/test/docs edits from invalidating the
# expensive HIP/C++ build layer.
# Output: the native extension wheel under ${COMMON_WORKDIR}/vllm/dist.
FROM base AS csrc-build
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}/vllm

# Install the ROCm Python requirements before any sources are copied, so this
# layer is rebuilt only when the requirement files change.
# common.txt is copied alongside rocm.txt (presumably because rocm.txt
# includes it - verify against the requirements files).
COPY requirements/rocm.txt requirements/rocm.txt
COPY requirements/common.txt requirements/common.txt
RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \
    uv pip install --system -r requirements/rocm.txt

# pyproject.toml is bind-mounted in the RUN step so metadata-only changes do
# not invalidate the expensive native build layer.
COPY setup.py CMakeLists.txt ./
COPY cmake cmake/
COPY csrc csrc/
COPY vllm/envs.py vllm/envs.py
COPY vllm/__init__.py vllm/__init__.py

ENV VLLM_TARGET_DEVICE=rocm
# Fixed placeholder version for this intermediate wheel (.git is not copied
# into this stage, so the version cannot be derived from the repository here).
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+rocm.csrc.build"

# Build the native extension wheel with ccache (stats printed before/after and
# zeroed in between so the "after" numbers describe this build only) and mold
# as the linker. MAX_JOBS falls back to $(nproc) when unset/empty. The step
# fails unless at least one wheel ends up in dist/.
RUN --mount=type=bind,source=pyproject.toml,target=${COMMON_WORKDIR}/vllm/pyproject.toml \
    --mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \
    export CCACHE_BASEDIR="$PWD" \
    && echo "=== ccache stats before ROCm native build ===" \
    && (ccache --show-stats || true) \
    && (ccache --zero-stats || true) \
    && EFFECTIVE_MAX_JOBS="${MAX_JOBS:-$(nproc)}" \
    && echo "Building ROCm native extension wheel with MAX_JOBS=${EFFECTIVE_MAX_JOBS}" \
    && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${EFFECTIVE_MAX_JOBS}" python3 setup.py bdist_wheel --dist-dir=dist \
    && test -d dist \
    && ls dist/*.whl >/dev/null \
    && echo "=== ccache stats after ROCm native build ===" \
    && (ccache --show-stats || true)

# Build the full vLLM ROCm wheel by reusing the native extension wheel from
# csrc-build. This stage still rebuilds for Python/package changes, but skips
# the expensive HIP/C++ compile when native inputs are unchanged.
FROM fetch_vllm AS build_vllm
ARG COMMON_WORKDIR
ENV VLLM_TARGET_DEVICE=rocm

COPY --from=csrc-build ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels

# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs

# Package the wheel from the precompiled extensions.
# The wheel path is assigned first and exported in a separate step: with
# `export VAR="$(cmd)"` the exit status of cmd is discarded, so a missing
# precompiled wheel would leave the variable empty without failing the build.
RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \
    cd vllm \
    && uv pip install --system -r requirements/rocm.txt \
    && export VLLM_USE_PRECOMPILED=1 \
    && VLLM_PRECOMPILED_WHEEL_LOCATION="$(ls /precompiled-wheels/*.whl)" \
    && export VLLM_PRECOMPILED_WHEEL_LOCATION \
    && export VLLM_DOCKER_BUILD_CONTEXT=1 \
    && echo "Packaging vLLM ROCm wheel using precompiled extensions from ${VLLM_PRECOMPILED_WHEEL_LOCATION}" \
    && python3 setup.py bdist_wheel --dist-dir=dist \
    && test -d dist \
    && ls dist/*.whl >/dev/null
# Artifact-only stage: collects the built wheel plus the source directories
# that later stages copy or bind-mount, on top of an empty (scratch) image.
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml
# The v1 package is exported separately as /vllm_v1.
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1

# RIXL/UCX build stage: builds UCX and RIXL from pinned commits, then packages
# RIXL as a wheel under /app/install.
FROM base AS build_rixl
ARG RIXL_REPO="https://github.com/ROCm/RIXL.git"
ARG RIXL_BRANCH="39be1de8"
ARG UCX_REPO="https://github.com/openucx/ucx.git"
ARG UCX_BRANCH="bfb51733"
ENV ROCM_PATH=/opt/rocm \
    UCX_HOME=/usr/local/ucx \
    RIXL_HOME=/usr/local/rixl \
    RIXL_BENCH_HOME=/usr/local/rixl_bench

# RIXL build system dependencies and RDMA support
RUN apt-get -y update \
    && apt-get -y install \
        autoconf \
        ibverbs-providers \
        ibverbs-utils \
        libaio-dev \
        libcpprest-dev \
        libgrpc++-dev \
        libgrpc-dev \
        libibverbs-dev \
        libibverbs1 \
        libprotobuf-dev \
        librdmacm-dev \
        librdmacm1 \
        libtool \
        pkg-config \
        protobuf-compiler-grpc \
        rdmacm-utils \
    && rm -rf /var/lib/apt/lists/*

# Python-side build tooling used by the RIXL build and wheel packaging.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system meson auditwheel patchelf tomlkit

# Build and install UCX (shared libraries only, ROCm and verbs enabled).
RUN --mount=type=cache,target=/root/.cache/ccache \
    cd /usr/local/src \
    && git clone ${UCX_REPO} \
    && cd ucx \
    && git checkout ${UCX_BRANCH} \
    && ./autogen.sh \
    && mkdir build \
    && cd build \
    && CC="ccache gcc" CXX="ccache g++" ../configure \
        --prefix=/usr/local/ucx \
        --enable-shared \
        --disable-static \
        --disable-doxygen-doc \
        --enable-optimizations \
        --enable-devel-headers \
        --with-rocm=${ROCM_PATH} \
        --with-verbs \
        --with-dm \
        --enable-mt \
    && make -j$(nproc) \
    && make install

# Make the freshly installed UCX tools and libraries visible to later steps.
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}

# Build and install RIXL against the UCX and ROCm installations above.
RUN --mount=type=cache,target=/root/.cache/ccache \
    git clone ${RIXL_REPO} /opt/rixl \
    && cd /opt/rixl \
    && git checkout ${RIXL_BRANCH} \
    && CC="ccache gcc" CXX="ccache g++" meson setup build \
        --prefix=${RIXL_HOME} \
        -Ducx_path=${UCX_HOME} \
        -Drocm_path=${ROCM_PATH} \
    && cd build \
    && ninja -j$(nproc) \
    && ninja install

# Generate RIXL wheel
# Exclude libcore and libpull from auditwheel: transitive dependencies
# that are not shipped in the wheel and vary across base images.
RUN cd /opt/rixl \
    && sed -i "s/--exclude 'libamdhip64\*'/--exclude 'libamdhip64*' --exclude 'libcore*' --exclude 'libpull*'/" \
        contrib/build-wheel.sh \
    && mkdir -p /app/install \
    && _ucx_install_dir=${UCX_HOME} ./contrib/build-wheel.sh \
        --output-dir /app/install \
        --rocm-dir ${ROCM_PATH} \
        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins

# ROCShmem build stage. It is kept separate from DeepEP so that a change to
# DEEPEP_BRANCH does not invalidate the slow ROCShmem build.
FROM base AS build_rocshmem
ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
ARG ROCSHMEM_BRANCH="f0acb0c6"
# DeepEP only supports gfx942 and gfx950; ROCShmem is built for the same set so
# it can be linked against DeepEP without arch mismatches.
ARG DEEPEP_ROCM_ARCH="gfx942;gfx950"
ENV ROCM_PATH=/opt/rocm \
    ROCSHMEM_DIR=/opt/rocshmem

# Blobless, sparse clone limited to projects/rocshmem, then build with the
# all_backends configuration script, installing to ROCSHMEM_DIR.
RUN --mount=type=cache,target=/root/.cache/ccache \
    git clone --no-checkout --filter=blob:none ${ROCSHMEM_REPO} && \
    cd rocm-systems && \
    git sparse-checkout set --cone projects/rocshmem && \
    git checkout ${ROCSHMEM_BRANCH} && \
    mkdir -p projects/rocshmem/build && \
    cd projects/rocshmem/build && \
    CC="ccache gcc" CXX="ccache g++" INSTALL_PREFIX=${ROCSHMEM_DIR} \
        bash ../scripts/build_configs/all_backends \
            -DROCM_PATH=${ROCM_PATH} \
            -DGPU_TARGETS="${DEEPEP_ROCM_ARCH}" \
            -DUSE_EXTERNAL_MPI=OFF

# DeepEP build stage - depends on ROCShmem, builds the HIP kernel wheel.
FROM build_rocshmem AS build_deepep
ARG DEEPEP_BRANCH="a9ea9774"
ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
ARG DEEPEP_NIC="cx7"
# Same argument (and default) that build_rocshmem uses for GPU_TARGETS, so a
# --build-arg DEEPEP_ROCM_ARCH override keeps ROCShmem and DeepEP on the same
# architecture list instead of letting them diverge.
ARG DEEPEP_ROCM_ARCH="gfx942;gfx950"

# Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR.
# DeepEP only supports gfx942 and gfx950, so PYTORCH_ROCM_ARCH is overridden
# for this step to avoid gfx90a from the default list.
# The wheel is written to /app/deep_install.
RUN --mount=type=cache,target=/root/.cache/ccache \
    export PYTORCH_ROCM_ARCH="${DEEPEP_ROCM_ARCH}" \
 && git clone ${DEEPEP_REPO} \
 && cd DeepEP \
 && git checkout ${DEEPEP_BRANCH} \
 && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install

# MoRI runtime dependencies live in Dockerfile.rocm so NIC backend changes do
# not force users to rebuild the long-lived Dockerfile.rocm_base image.
#
# The script below runs under `set -euo pipefail`. Commands are separated with
# `;` rather than chained with `&&` on purpose: under `set -e` a failure on the
# left-hand side of an `&&` list does not abort the shell, so a failed
# `apt-get update` or `pip install` would be skipped over silently and the
# image would be built without the expected packages.
FROM base AS mori_base
ARG NIC_BACKEND
ARG AINIC_VERSION
ARG UBUNTU_CODENAME
RUN /bin/bash -lc 'set -euo pipefail; \
 \
 install_ainic() { \
   apt-get update; \
   apt-get install -y --no-install-recommends ca-certificates curl gnupg apt-transport-https; \
   rm -rf /var/lib/apt/lists/*; \
   mkdir -p /etc/apt/keyrings; \
   curl -fsSL https://repo.radeon.com/rocm/rocm.gpg.key | gpg --dearmor > /etc/apt/keyrings/amdainic.gpg; \
   echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/amdainic.gpg] https://repo.radeon.com/amdainic/pensando/ubuntu/${AINIC_VERSION} ${UBUNTU_CODENAME} main" \
     > /etc/apt/sources.list.d/amdainic.list; \
   apt-get update; \
   apt-get install -y --no-install-recommends \
     libionic-dev \
     ionic-common \
   ; \
   rm -rf /var/lib/apt/lists/*; \
 }; \
 \
 # NOTE: requires FW 235.2.86.0 and kernel drivers on the host: \
 #   bnxt-en-dkms=1.10.3.235.2.86.0 bnxt-re-dkms=235.2.86.0 (from packages.broadcom.com PPA) \
 install_bnxt() { \
   install -m 0755 -d /etc/apt/keyrings; \
   curl -fsSL https://packages.broadcom.com/artifactory/api/security/keypair/PackagesKey/public \
     -o /etc/apt/keyrings/broadcom-nic.asc; \
   chmod a+r /etc/apt/keyrings/broadcom-nic.asc; \
   echo "deb [arch=amd64 signed-by=/etc/apt/keyrings/broadcom-nic.asc] https://packages.broadcom.com/artifactory/ethernet-nic-debian-public jammy main" \
     > /etc/apt/sources.list.d/broadcom-nic.list; \
   apt-get update; \
   apt-get install -y --no-install-recommends \
     bnxt-rocelib=235.2.86.0 \
   ; \
   cp -a /usr/local/lib/x86_64-linux-gnu/libbnxt_re* /usr/local/lib/; \
   ldconfig; \
   rm -rf /var/lib/apt/lists/*; \
 }; \
 \
 echo "[MORI] Install MoRI proxy deps"; \
 pip install --quiet --ignore-installed blinker; \
 pip install --quiet quart msgpack aiohttp pyzmq; \
 echo "[MORI] NIC_BACKEND=${NIC_BACKEND}"; \
 \
 # NIC backend deps — mori auto-detects NIC at runtime (MORI_DEVICE_NIC env var override). \
 # Only vendor packages are installed here for dlopen; no compile-time flags needed. \
 case "${NIC_BACKEND}" in \
   none)  ;; \
   all)   install_ainic; install_bnxt ;; \
   ainic) install_ainic ;; \
   bnxt)  install_bnxt ;; \
   *)     echo "ERROR: unknown NIC_BACKEND=${NIC_BACKEND}. Use one of: none, ainic, bnxt, all"; exit 2 ;; \
 esac'

# -----------------------
# vLLM wheel release build stage (for building distributable wheels)
# This stage pins dependencies to custom ROCm wheel versions and handles version detection
FROM fetch_vllm AS build_vllm_wheel_release

ARG COMMON_WORKDIR

# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs

# Create /install directory for custom wheels
RUN mkdir -p /install

# Copy custom ROCm wheels from docker/context if they exist
# COPY ensures Docker cache is invalidated when wheels change
# .keep file ensures directory always exists for COPY to work
COPY docker/context/base-wheels/ /tmp/base-wheels/
# The presence of wheels here is how we know whether this is a wheel release
# build. If no wheels are found, this is not a wheel release, so the step
# exits with an error and the stage is not built.
# The copy branch is a single && chain so that a failed cp fails the step
# instead of being masked by the exit status of the trailing ls.
RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \
        echo "Found custom wheels - copying to /install" \
        && cp /tmp/base-wheels/*.whl /install/ \
        && echo "Copied custom wheels:" \
        && ls -lh /install/; \
    else \
        echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \
        echo "Wheel releases require pre-built ROCm wheels."; \
        exit 1; \
    fi

# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds)
# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm
# The check runs for any GIT_REPO_CHECK value other than "0" (the default).
ARG GIT_REPO_CHECK=0
RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
        echo "Running repository checks..."; \
        cd vllm && bash tools/check_repo.sh; \
    fi

# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
# This ensures setuptools_scm sees clean repo state for version detection
# .git is bind-mounted from the build context for this step only; the detected
# version is saved to /tmp/vllm_version.txt for the wheel build step.
RUN --mount=type=bind,source=.git,target=vllm/.git \
    --mount=type=cache,target=/root/.cache/uv \
    cd vllm \
    && uv pip install --system setuptools_scm regex \
    && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
    && echo "Detected vLLM version: ${VLLM_VERSION}" \
    && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt

# Reject git-based dependencies (git+ URLs) in the requirements files.
# uv does not handle git+ URLs well (pip can, uv cannot), and packages are
# expected to be distributed on PyPI. Each file is checked in turn; the first
# file that contains a git+ entry prints the offending lines and fails the step.
RUN echo "Checking for git-based packages in requirements files..." \
    && for req_file in common.txt rocm.txt; do \
         echo "Checking ${req_file} for git-based packages:"; \
         if grep -q 'git+' "${COMMON_WORKDIR}/vllm/requirements/${req_file}"; then \
           echo "ERROR: Git-based packages found in ${req_file}:"; \
           grep 'git+' "${COMMON_WORKDIR}/vllm/requirements/${req_file}"; \
           echo "Please publish these packages to PyPI instead of using git dependencies."; \
           exit 1; \
         fi; \
         echo "  ✓ No git-based packages found in ${req_file}"; \
       done \
    && echo "All requirements files are clean - no git-based packages found"

# Pin vLLM dependencies to exact versions of custom ROCm wheels
# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi
COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py
RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
    && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt

# Install dependencies using custom wheels from /install
RUN --mount=type=cache,target=/root/.cache/uv \
    cd vllm \
    && echo "Building vLLM with custom wheels from /install" \
    && uv pip install --system --find-links /install -r requirements/rocm.txt

# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
# (setup.py auto-detects ccache/sccache in PATH)
# The version is read into the variable first and exported afterwards:
# `export VAR=$(cmd)` discards the exit status of cmd, so a missing or empty
# /tmp/vllm_version.txt would otherwise go unnoticed. The explicit test -n
# makes an empty version fail the step.
RUN --mount=type=bind,source=.git,target=vllm/.git \
    --mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \
    cd vllm \
    && export CCACHE_BASEDIR="$PWD" \
    && SETUPTOOLS_SCM_PRETEND_VERSION="$(cat /tmp/vllm_version.txt)" \
    && test -n "${SETUPTOOLS_SCM_PRETEND_VERSION}" \
    && export SETUPTOOLS_SCM_PRETEND_VERSION \
    && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist

# Artifact-only stage for release builds: same layout as export_vllm, but
# sourced from build_vllm_wheel_release.
FROM scratch AS export_vllm_wheel_release
ARG COMMON_WORKDIR
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl /
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1

# -----------------------
# CI base image (Tier 1) - stable, rarely changing CI dependencies.
# Per-PR test builds pull this as CI_BASE_IMAGE so the test stage only layers
# in the vLLM artifacts for the current commit.
FROM mori_base AS ci_base
ARG COMMON_WORKDIR

# Update rdma-core to support latest rocshmem.
# The rdma-core v62.0 source build runs only when DEEPEP_NIC is "cx7" or "io".
# NOTE(review): DEEPEP_NIC has no default in this stage and no global default
# before the first FROM, so unless --build-arg DEEPEP_NIC=... is passed it is
# empty here and this step is skipped, even though build_deepep defaults to
# "cx7". Confirm whether that is intended.
ARG DEEPEP_NIC
RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \
    git clone --branch v62.0 --depth 1 https://github.com/linux-rdma/rdma-core.git /tmp/rdma-core && \
    cd /tmp/rdma-core && \
    mkdir -p build && cd build && \
    cmake -GNinja -DCMAKE_INSTALL_PREFIX=/usr -DNO_MAN_PAGES=1 .. && \
    ninja && ninja install && ldconfig && rm -rf /tmp/rdma-core; \
fi

# Install RIXL + DeepEP wheels.
# The wheel directories are bind-mounted from their build stages, so the wheel
# files themselves are not copied into an image layer.
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
    --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
    uv pip install --system /rixl_install/*.whl /deep_install/*.whl

# Copy ROCShmem runtime libraries.
COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem

# RDMA userspace libraries plus FFmpeg dev libs needed by torchcodec.
RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
    librdmacm1 \
    libibverbs1 \
    ibverbs-providers \
    ibverbs-utils \
    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
    && rm -rf /var/lib/apt/lists/*

# Install torchcodec from source for ROCm/torch ABI compatibility.
# The install script is removed afterwards, and apt caches/lists are cleaned
# in the same layer.
COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/pip \
    --mount=type=cache,target=/root/.cache/torchcodec-wheels \
    bash /tmp/install_torchcodec.sh \
    && rm /tmp/install_torchcodec.sh \
    && apt-get clean && rm -rf /var/lib/apt/lists/*

# Pre-install shared ROCm runtime dependencies.
# Only the requirement files are copied (to a temporary directory that is
# removed after the install).
COPY requirements/common.txt requirements/rocm.txt /tmp/ci-base-requirements/
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r /tmp/ci-base-requirements/rocm.txt \
    && rm -rf /tmp/ci-base-requirements

# Make model downloads in tests faster and less brittle.
ENV HF_XET_HIGH_PERFORMANCE=1 \
    HF_HUB_DOWNLOAD_TIMEOUT=60

# Pre-install vLLM test dependencies.
COPY requirements/test/rocm.txt /tmp/rocm-test-reqs.txt
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system -r /tmp/rocm-test-reqs.txt

# fastsafetensors is rebuilt from source so that its C++ extension is compiled
# with USE_ROCM and can detect libamdhip64.so at runtime. The exact requirement
# is taken from the first matching line of the test requirements file; the step
# fails if no such line exists. The requirements file is removed afterwards.
RUN --mount=type=cache,target=/root/.cache/pip \
    FASTSAFETENSORS_REQ="$(grep -E '^fastsafetensors(==| @ )' /tmp/rocm-test-reqs.txt | head -1)" \
    && test -n "${FASTSAFETENSORS_REQ}" \
    && python3 -m pip install \
        --force-reinstall \
        --no-deps \
        --no-binary fastsafetensors \
        "${FASTSAFETENSORS_REQ}" \
    && rm /tmp/rocm-test-reqs.txt

# MIOpen settings that resolve performance regressions in the MIOpen 3D
# convolution kernel.
# See: https://github.com/pytorch/pytorch/issues/169857
ENV MIOPEN_DEBUG_CONV_DIRECT=0 \
    MIOPEN_DEBUG_CONV_GEMM=0

# Legacy IPC mode for HSA avoids GPU memory pinning issues with UCX rocm_ipc.
# See: https://github.com/ROCm/rocm-libraries/issues/6266
ENV HSA_ENABLE_IPC_MODE_LEGACY=1

# ROCm profiler limits workaround: write a kineto config file and point
# KINETO_CONFIG at it.
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"

# vllm_test_utils is installed in ci_base for ci_base + wheel parity; the
# temporary source copy is deleted once installed.
COPY tests/vllm_test_utils /tmp/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system /tmp/vllm_test_utils \
    && rm -rf /tmp/vllm_test_utils

# -----------------------
# Test vLLM image (Tier 2) - vLLM-only layer on top of ci_base.
# The base is taken from CI_BASE_IMAGE (default rocm/vllm-dev:ci_base), i.e. a
# pre-built image reference rather than the ci_base stage of this build.
FROM ${CI_BASE_IMAGE} AS test
ARG COMMON_WORKDIR

# Install the vLLM wheel (--no-deps: all deps already in ci_base).
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    --mount=type=cache,target=/root/.cache/uv \
    cd /install \
    && uv pip install --system --no-deps *.whl

# Store the vLLM wheel in the image for python-only install tests.
COPY --from=export_vllm /*.whl /opt/vllm-wheels/

# Place the full vLLM source tree in the test workspace.
WORKDIR /vllm-workspace
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace

# Copy in the v1 package (for python-only install test group).
# NOTE(review): PYTHON_VERSION is not declared in this file; presumably it is
# an ENV provided by CI_BASE_IMAGE - confirm, since an empty value would change
# the destination path.
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1

# Hide source under src/ so it won't shadow the installed package in tests.
RUN mkdir src && mv vllm src/vllm

# -----------------------
# Final vLLM image
FROM mori_base AS final

RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*

# Clean up sccache from release image (not needed at runtime)
# This removes the binary and wrappers that may have been installed during build
RUN rm -f /usr/bin/sccache || true \
    && rm -rf /opt/sccache-wrappers || true

# Unset sccache environment variables for the release image
# This prevents S3 bucket config from leaking into production images
# (The variables are set to empty values.)
ENV SCCACHE_BUCKET=
ENV SCCACHE_REGION=
ENV SCCACHE_ENDPOINT=
ENV SCCACHE_S3_NO_CREDENTIALS=
ENV SCCACHE_IDLE_TIMEOUT=

# Error related to odd state for numpy 1.20.3 where there is no METADATA etc, but an extra LICENSES_bundled.txt.
# Manually remove it so that later steps of numpy upgrade can continue
# Only applies when python3 resolves to the /opt/conda/envs/py_3.9 environment.
RUN case "$(which python3)" in \
        *"/opt/conda/envs/py_3.9"*) \
            rm -rf /opt/conda/envs/py_3.9/lib/python3.9/site-packages/numpy-1.20.3.dist-info/;; \
        *) ;; esac

# The requirement is quoted because [cli] is a shell glob pattern; unquoted, it
# would be rewritten if a matching file name existed in the working directory.
RUN --mount=type=cache,target=/root/.cache/uv \
    uv pip install --system --upgrade "huggingface-hub[cli]"

# Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
# Requirements are installed first, any existing vllm installation is removed,
# and then the freshly built wheel is installed from the bind-mounted
# export_vllm stage.
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
    --mount=type=cache,target=/root/.cache/uv \
    cd /install \
    && uv pip install --system -r requirements/rocm.txt \
    && pip uninstall -y vllm \
    && uv pip install --system *.whl

# Install RIXL wheel
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
    uv pip install --system /rixl_install/*.whl

# Build arguments needed by the remaining steps of the final image.
ARG COMMON_WORKDIR
ARG BASE_IMAGE
ARG NIC_BACKEND
ARG AINIC_VERSION

# Ship the benchmark scripts, examples and Dockerfile alongside the install.
COPY --from=export_vllm /benchmarks ${COMMON_WORKDIR}/vllm/benchmarks
COPY --from=export_vllm /examples ${COMMON_WORKDIR}/vllm/examples
COPY --from=export_vllm /docker ${COMMON_WORKDIR}/vllm/docker

# Runtime environment:
#   HSA_ENABLE_IPC_MODE_LEGACY  legacy IPC mode for HSA, avoids GPU memory
#                               pinning issues with UCX rocm_ipc
#                               (see https://github.com/ROCm/rocm-libraries/issues/6266)
#   TOKENIZERS_PARALLELISM      disabled
#   SAFETENSORS_FAST_GPU        can improve safe tensor loading and end-to-end time
#   HIP_FORCE_DEV_KERNARG       performance setting
ENV HSA_ENABLE_IPC_MODE_LEGACY=1 \
    TOKENIZERS_PARALLELISM=false \
    SAFETENSORS_FAST_GPU=1 \
    HIP_FORCE_DEV_KERNARG=1

# Workaround for ROCm profiler limits: write a kineto config file and point
# KINETO_CONFIG at it.
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"

# Append the build provenance (base image, NIC backend, AINIC version) to
# versions.txt.
RUN { \
        echo "VLLM_BASE_IMAGE=${BASE_IMAGE}"; \
        echo "MORI_NIC_BACKEND=${NIC_BACKEND}"; \
        echo "AINIC_VERSION=${AINIC_VERSION}"; \
    } >> ${COMMON_WORKDIR}/versions.txt

CMD ["/bin/bash"]

# Set entrypoint for vllm-openai official images.
# Exec-form ENTRYPOINT: container arguments are appended to `vllm serve`.
FROM final AS vllm-openai
ENTRYPOINT ["vllm", "serve"]