[ROCm][CI] Optimize ROCm Docker build: registry cache, DeepEP, and ci-bake script (#36949)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-06-03 01:43:07 -05:00
committed by GitHub
parent 71df063c49
commit 87954eb50e
10 changed files with 2746 additions and 158 deletions
+23
View File
@@ -0,0 +1,23 @@
name: vllm_rocm_ci
job_dirs:
- ".buildkite/hardware_tests"
run_all_patterns:
- "docker/Dockerfile.rocm"
- "docker/Dockerfile.rocm_base"
- "docker/ci-rocm.hcl"
- "docker/docker-bake-rocm.hcl"
- ".buildkite/hardware_tests/amd.yaml"
- ".buildkite/scripts/ci-bake-rocm.sh"
- ".buildkite/scripts/hardware_ci/run-amd-test.py"
- ".buildkite/scripts/hardware_ci/run-amd-test.sh"
- "CMakeLists.txt"
- "requirements/common.txt"
- "requirements/rocm.txt"
- "requirements/build/rocm.txt"
- "requirements/test/rocm.txt"
- "setup.py"
- "csrc/"
- "cmake/"
run_all_exclude_patterns:
- "csrc/cpu/"
- "cmake/cpu_extension.cmake"
+45 -14
View File
@@ -1,22 +1,43 @@
group: Hardware - AMD Build
steps:
- label: "AMD: :docker: build image"
key: image-build-amd
# Ensure ci_base is up-to-date before building the test image.
# Compares a content hash of ci_base-affecting files against the remote
# image label. If hashes match the build is skipped (< 30 s); if they
# differ ci_base is rebuilt and pushed automatically.
- label: "AMD: :docker: ensure ci_base"
key: ensure-ci-base-amd
depends_on: []
device: amd_cpu
no_plugin: true
commands:
- >
docker build
--build-arg max_jobs=16
--build-arg REMOTE_VLLM=1
--build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
--build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
--tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-f docker/Dockerfile.rocm
--target test
--no-cache
--progress plain .
- bash .buildkite/scripts/ci-bake-rocm.sh ci-base-rocm-ci-with-deps
env:
DOCKER_BUILDKIT: "1"
VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
REMOTE_VLLM: "1"
VLLM_BRANCH: "$BUILDKITE_COMMIT"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
- label: "AMD: :docker: build test image and artifacts"
key: image-build-amd
depends_on:
- ensure-ci-base-amd
device: amd_cpu
no_plugin: true
commands:
- |
if [[ "${ROCM_CI_ARTIFACT_ONLY:-0}" == "1" ]]; then
echo "ROCM_CI_ARTIFACT_ONLY=1; building ROCm wheel artifact only"
IMAGE_TAG="" bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-artifacts
else
bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-wheel
fi
- |
docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec '
if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi
@@ -37,6 +58,16 @@ steps:
PY
echo AMD image smoke OK
'
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
IMAGE_TAG: "rocm/vllm-ci:$BUILDKITE_COMMIT"
REMOTE_VLLM: "1"
VLLM_BRANCH: "$BUILDKITE_COMMIT"
retry:
automatic:
- exit_status: -1 # Agent was lost
limit: 1
- exit_status: -10 # Agent was lost
limit: 1
File diff suppressed because it is too large Load Diff
+121 -11
View File
@@ -52,6 +52,108 @@ cleanup_network() {
fi
}
#######################################
# Build (or reuse) a local ROCm test image from a Buildkite wheel artifact.
#
# Downloads the vllm-rocm-install archive, layers its wheel(s) and the test
# workspace on top of the CI base image, and tags the result with a key
# derived from the base image digest and the archive checksum, so identical
# inputs resolve to the same local image tag.
#
# Globals:
#   VLLM_CI_USE_ARTIFACTS  (read)    must be "1" for this path to be taken
#   VLLM_CI_ARTIFACT_GLOB  (read)    artifact path handed to buildkite-agent
#   VLLM_CI_BASE_IMAGE     (read)    base image, unless the artifact ships a
#                                    non-empty ci-base-image.txt
#   artifact_work_dir      (written) scratch directory; NOT removed here so
#                                    the caller's cleanup can delete it
#   image_name             (written) set to the artifact image on success
# Returns:
#   0 when image_name points at a usable artifact image, 1 otherwise.
#######################################
prepare_artifact_image() {
  if [[ "${VLLM_CI_USE_ARTIFACTS:-0}" != "1" ]]; then
    return 1
  fi

  if ! command -v buildkite-agent >/dev/null 2>&1; then
    echo "buildkite-agent not found; cannot download ROCm wheel artifact"
    return 1
  fi

  local artifact_glob="${VLLM_CI_ARTIFACT_GLOB:-artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz}"
  local archive=""
  local metadata_file=""
  local base_image="${VLLM_CI_BASE_IMAGE:-rocm/vllm-dev:ci_base}"
  local artifact_image=""
  local artifact_key=""
  local base_digest=""
  local wheel_dir=""
  local context_dir=""
  local workspace_dir=""

  # Bail out if no scratch directory could be created. With an empty
  # artifact_work_dir the paths below would resolve to /wheels and /context.
  artifact_work_dir=$(mktemp -d -t vllm-rocm-artifact.XXXXXX) || artifact_work_dir=""
  if [[ -z "${artifact_work_dir}" ]]; then
    echo "Failed to create a temporary directory for the ROCm wheel artifact"
    return 1
  fi
  wheel_dir="${artifact_work_dir}/wheels"
  context_dir="${artifact_work_dir}/context"
  workspace_dir="${context_dir}/workspace"
  mkdir -p "${wheel_dir}" "${context_dir}/wheels" "${workspace_dir}" || return 1

  echo "--- Downloading ROCm wheel artifact"
  if ! buildkite-agent artifact download "${artifact_glob}" "${artifact_work_dir}"; then
    echo "Failed to download ${artifact_glob}"
    return 1
  fi
  # The base-image metadata file is optional; a missing file is not an error.
  buildkite-agent artifact download \
    "artifacts/vllm-rocm-install/ci-base-image.txt" \
    "${artifact_work_dir}" >/dev/null 2>&1 || true

  archive=$(find "${artifact_work_dir}" -name "vllm-rocm-install.tar.gz" -type f | head -1)
  if [[ -z "${archive}" || ! -f "${archive}" ]]; then
    echo "ROCm wheel artifact archive was not found"
    return 1
  fi

  # Prefer the base image recorded next to the artifact, when present.
  metadata_file=$(find "${artifact_work_dir}" -name "ci-base-image.txt" -type f | head -1)
  if [[ -n "${metadata_file}" && -s "${metadata_file}" ]]; then
    base_image=$(tr -d '[:space:]' < "${metadata_file}")
  fi

  echo "--- Preparing local ROCm test image"
  echo "Base image: ${base_image}"
  docker pull "${base_image}" || return 1

  # Use the repo digest when available, else the image ID; fall back to the
  # plain image reference if inspect itself fails.
  base_digest=$(
    docker image inspect \
      --format='{{if .RepoDigests}}{{index .RepoDigests 0}}{{else}}{{.Id}}{{end}}' \
      "${base_image}" 2>/dev/null || printf '%s' "${base_image}"
  )
  artifact_key=$(
    {
      printf 'base-image:%s\n' "${base_digest}"
      sha256sum "${archive}"
    } | sha256sum | cut -c1-24
  )
  artifact_image="rocm/vllm-ci-artifact:${artifact_key}"

  if docker image inspect "${artifact_image}" >/dev/null 2>&1; then
    echo "Using existing local ROCm artifact image: ${artifact_image}"
    image_name="${artifact_image}"
    return 0
  fi

  tar -xzf "${archive}" -C "${wheel_dir}" || return 1
  # compgen -G succeeds only when the glob matches at least one path.
  if ! compgen -G "${wheel_dir}/*.whl" >/dev/null; then
    echo "ROCm wheel artifact did not contain a wheel"
    return 1
  fi
  if [[ ! -d "${wheel_dir}/tests" ]]; then
    echo "ROCm wheel artifact did not contain the test workspace"
    return 1
  fi

  cp "${wheel_dir}"/*.whl "${context_dir}/wheels/" || return 1
  # Copy everything except the wheels into the workspace. pipefail is scoped
  # to a subshell so a failure of the producing tar is detected without
  # changing shell options for the rest of the script.
  (
    set -o pipefail
    tar -C "${wheel_dir}" --exclude='*.whl' -cf - . \
      | tar -C "${workspace_dir}" -xf -
  ) || return 1

  cat > "${context_dir}/Dockerfile" <<'EOF'
ARG BASE_IMAGE
FROM ${BASE_IMAGE}
COPY wheels/ /tmp/vllm-wheels/
COPY workspace/ /vllm-workspace/
RUN python3 -m pip install --no-deps --force-reinstall /tmp/vllm-wheels/*.whl \
&& rm -rf /tmp/vllm-wheels
WORKDIR /vllm-workspace
EOF

  echo "--- Building local ROCm test image"
  # --pull=false: the base image was pulled explicitly above.
  docker build \
    --pull=false \
    --build-arg "BASE_IMAGE=${base_image}" \
    -t "${artifact_image}" \
    "${context_dir}" || return 1

  image_name="${artifact_image}"
  return 0
}
is_multi_node() {
local cmds="$1"
# Primary signal: NUM_NODES environment variable set by the pipeline
@@ -243,22 +345,30 @@ report_docker_usage
# --- Pull test image ---
echo "--- Pulling container"
image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
image_name="${VLLM_CI_FALLBACK_IMAGE:-rocm/vllm-ci:${BUILDKITE_COMMIT:-local}}"
artifact_work_dir=""
container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
docker pull "${image_name}"
# Best-effort cleanup for the test run.
# Globals read: container_name, image_name, artifact_work_dir,
#               VLLM_CI_REMOVE_TEST_IMAGE.
# Force-removes the test container if it still exists, removes the test image
# only when VLLM_CI_REMOVE_TEST_IMAGE is "1", and deletes the artifact scratch
# directory when one was recorded. Docker failures are swallowed with
# `|| true` so they do not fail the step.
remove_docker_container() {
# docker run uses --rm, so the container is normally already gone when the
# EXIT trap runs. Cleanup is best-effort and must not affect the test result.
docker rm -f "${container_name}" >/dev/null 2>&1 || true
# Only attempt (and log) a removal when the container is still present.
if docker container inspect "${container_name}" >/dev/null 2>&1; then
docker rm -f "${container_name}" || true
fi
if [[ "${VLLM_CI_REMOVE_TEST_IMAGE:-0}" == "1" ]]; then
docker image rm -f "${image_name}" || true
else
# Keep images by default so later jobs on the same AMD node can reuse layers.
echo "Keeping ROCm test image locally: ${image_name}"
fi
# artifact_work_dir is empty unless a scratch directory was recorded, so the
# guard keeps rm -rf from ever running on an empty path.
if [[ -n "${artifact_work_dir}" ]]; then
rm -rf "${artifact_work_dir}"
fi
}
trap remove_docker_container EXIT
# EXIT trap handler. Records the status the script is exiting with before
# running cleanup, then exits with that same status so the outcome of
# cleanup never replaces the test result.
on_exit() {
  local saved_status="$?"
  remove_docker_container
  exit "${saved_status}"
}
trap on_exit EXIT
if ! prepare_artifact_image; then
echo "Using full ROCm CI image: ${image_name}"
docker pull "${image_name}" || exit 1
fi
# --- Prepare commands ---
echo "--- Running container"
+7
View File
@@ -33,3 +33,10 @@ share/python-wheels/
*.egg
MANIFEST
rust/target/
# Not needed in Docker builds
docs/
.github/
.pre-commit-config.yaml
.clang-format
.gitattributes
format.sh
+8
View File
@@ -81,6 +81,14 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_SRCS ${SRCS})
set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_BYPRODUCTS ${HIP_SRCS})
# Chain hipify targets so they run sequentially. Parallel hipify
# invocations race on shutil.copytree, overwriting .hip files
# produced by another target back to .cu originals.
if (DEFINED _VLLM_LAST_HIPIFY_TARGET)
add_dependencies(hipify${NAME} ${_VLLM_LAST_HIPIFY_TARGET})
endif()
set(_VLLM_LAST_HIPIFY_TARGET "hipify${NAME}" PARENT_SCOPE)
# Swap out original extension sources with hipified sources.
list(APPEND HIP_SRCS ${CXX_SRCS})
set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
+199 -107
View File
@@ -2,6 +2,7 @@
ARG REMOTE_VLLM="0"
ARG COMMON_WORKDIR=/app
ARG BASE_IMAGE=rocm/vllm-dev:base
ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base
# NIC backend for MoRI RDMA support.
# By default (all), drivers and userspace libraries for all supported NIC types
# (ainic and bnxt) are installed; MoRI selects the appropriate one at runtime.
@@ -16,7 +17,8 @@ ARG NIC_BACKEND=all
ARG AINIC_VERSION=1.117.3-hydra
ARG UBUNTU_CODENAME=jammy
# Sccache configuration (only used in release pipeline)
# Sccache configuration. Release builds use this today; CI can opt in when a
# shared S3-compatible cache backend is available.
ARG USE_SCCACHE
ARG SCCACHE_DOWNLOAD_URL
ARG SCCACHE_ENDPOINT
@@ -29,12 +31,16 @@ FROM ${BASE_IMAGE} AS base
ARG ARG_PYTORCH_ROCM_ARCH
ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}
# Install some basic utilities
# Install build dependencies and utilities
RUN apt-get update -q -y && apt-get install -q -y \
sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
apt-transport-https ca-certificates wget curl \
libnuma-dev
RUN python3 -m pip install --upgrade pip
libnuma-dev ccache mold
RUN --mount=type=cache,target=/root/.cache/pip \
python3 -m pip install --upgrade pip
# Note: mold is installed but not set as the system default linker because
# some packages use JIT compilation at runtime with flags mold does not support.
# Build stages opt in via LDFLAGS="-fuse-ld=mold".
# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
ARG USE_SCCACHE
RUN if [ "$USE_SCCACHE" != "1" ]; then \
@@ -55,6 +61,12 @@ ENV UV_HTTP_TIMEOUT=500
ENV UV_INDEX_STRATEGY="unsafe-best-match"
# Use copy mode to avoid hardlink failures with Docker cache mounts
ENV UV_LINK_MODE=copy
# ccache directory - persisted across layer rebuilds via cache mounts.
ENV CCACHE_DIR=/root/.cache/ccache
ENV CCACHE_COMPILERCHECK=content
# Empty by default so build steps fall back to $(nproc); CI can override.
ARG max_jobs
ENV MAX_JOBS=${max_jobs}
# Install sccache if USE_SCCACHE is enabled (for release builds)
ARG USE_SCCACHE
@@ -86,6 +98,7 @@ RUN if [ "$USE_SCCACHE" = "1" ]; then \
ARG USE_SCCACHE
ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
ENV SCCACHE_ENDPOINT=${USE_SCCACHE:+${SCCACHE_ENDPOINT}}
ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}
@@ -114,8 +127,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
# Rust build stage
# Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages
# don't need the rust toolchain or protoc. Runs in parallel with the main wheel
# build for faster end-to-end builds.
# don't need the rust toolchain or protoc.
FROM fetch_vllm AS rust-build
ARG COMMON_WORKDIR
@@ -144,24 +156,74 @@ ENV RUSTUP_MAX_RETRIES=10
# layer for later COPY --from=rust-build.
RUN --mount=type=cache,id=vllm-rocm-cargo-registry,target=/root/.cargo/registry,sharing=locked \
--mount=type=cache,id=vllm-rocm-cargo-git,target=/root/.cargo/git,sharing=locked \
--mount=type=cache,id=vllm-rocm-cargo-target,target=${COMMON_WORKDIR}/vllm/rust/target,sharing=locked \
cd ${COMMON_WORKDIR}/vllm \
&& VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh \
&& test -x /tmp/vllm-rs
# -----------------------
# vLLM build stages
# vLLM native build stages
#
# csrc-build intentionally copies only files that affect ROCm native extension
# compilation. That keeps unrelated CI/test/docs edits from invalidating the
# expensive HIP/C++ build layer.
FROM base AS csrc-build
ARG COMMON_WORKDIR
WORKDIR ${COMMON_WORKDIR}/vllm
COPY requirements/rocm.txt requirements/rocm.txt
COPY requirements/common.txt requirements/common.txt
RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \
uv pip install --system -r requirements/rocm.txt
# pyproject.toml is bind-mounted in the RUN step so metadata-only changes do
# not invalidate the expensive native build layer.
COPY setup.py CMakeLists.txt ./
COPY cmake cmake/
COPY csrc csrc/
COPY vllm/envs.py vllm/envs.py
COPY vllm/__init__.py vllm/__init__.py
ENV VLLM_TARGET_DEVICE=rocm
ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+rocm.csrc.build"
RUN --mount=type=bind,source=pyproject.toml,target=${COMMON_WORKDIR}/vllm/pyproject.toml \
--mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \
export CCACHE_BASEDIR="$PWD" \
&& echo "=== ccache stats before ROCm native build ===" \
&& (ccache --show-stats || true) \
&& (ccache --zero-stats || true) \
&& EFFECTIVE_MAX_JOBS="${MAX_JOBS:-$(nproc)}" \
&& echo "Building ROCm native extension wheel with MAX_JOBS=${EFFECTIVE_MAX_JOBS}" \
&& LDFLAGS="-fuse-ld=mold" MAX_JOBS="${EFFECTIVE_MAX_JOBS}" python3 setup.py bdist_wheel --dist-dir=dist \
&& test -d dist \
&& ls dist/*.whl >/dev/null \
&& echo "=== ccache stats after ROCm native build ===" \
&& (ccache --show-stats || true)
# Build the full vLLM ROCm wheel by reusing the native extension wheel from
# csrc-build. This stage still rebuilds for Python/package changes, but skips
# the expensive HIP/C++ compile when native inputs are unchanged.
FROM fetch_vllm AS build_vllm
ARG COMMON_WORKDIR
ENV VLLM_TARGET_DEVICE=rocm
COPY --from=csrc-build ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
# Build vLLM (setup.py auto-detects sccache in PATH)
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
&& python3 setup.py clean --all \
&& python3 setup.py bdist_wheel --dist-dir=dist
RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \
cd vllm \
&& uv pip install --system -r requirements/rocm.txt \
&& export VLLM_USE_PRECOMPILED=1 \
&& export VLLM_PRECOMPILED_WHEEL_LOCATION="$(ls /precompiled-wheels/*.whl)" \
&& export VLLM_DOCKER_BUILD_CONTEXT=1 \
&& echo "Packaging vLLM ROCm wheel using precompiled extensions from ${VLLM_PRECOMPILED_WHEEL_LOCATION}" \
&& python3 setup.py bdist_wheel --dist-dir=dist \
&& test -d dist \
&& ls dist/*.whl >/dev/null
FROM scratch AS export_vllm
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
@@ -171,6 +233,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# RIXL/UCX build stages
@@ -201,14 +264,17 @@ RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
ibverbs-providers \
&& rm -rf /var/lib/apt/lists/*
RUN uv pip install --system meson auditwheel patchelf tomlkit
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system meson auditwheel patchelf tomlkit
RUN cd /usr/local/src && \
RUN --mount=type=cache,target=/root/.cache/ccache \
cd /usr/local/src && \
git clone ${UCX_REPO} && \
cd ucx && \
git checkout ${UCX_BRANCH} && \
./autogen.sh && \
mkdir build && cd build && \
CC="ccache gcc" CXX="ccache g++" \
../configure \
--prefix=/usr/local/ucx \
--enable-shared \
@@ -220,20 +286,22 @@ RUN cd /usr/local/src && \
--with-verbs \
--with-dm \
--enable-mt && \
make -j && \
make -j$(nproc) && \
make install
ENV PATH=/usr/local/ucx/bin:$PATH
ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}
RUN git clone ${RIXL_REPO} /opt/rixl && \
RUN --mount=type=cache,target=/root/.cache/ccache \
git clone ${RIXL_REPO} /opt/rixl && \
cd /opt/rixl && \
git checkout ${RIXL_BRANCH} && \
CC="ccache gcc" CXX="ccache g++" \
meson setup build --prefix=${RIXL_HOME} \
-Ducx_path=${UCX_HOME} \
-Drocm_path=${ROCM_PATH} && \
cd build && \
ninja && \
ninja -j$(nproc) && \
ninja install
# Generate RIXL wheel
@@ -250,30 +318,44 @@ RUN cd /opt/rixl && \
--ucx-plugins-dir ${UCX_HOME}/lib/ucx \
--nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins
# DeepEP build stage
FROM base AS build_deep
# ROCShmem build stage - split from DeepEP so changing DEEPEP_BRANCH does not
# invalidate the slow ROCShmem build.
FROM base AS build_rocshmem
ARG ROCSHMEM_BRANCH="f0acb0c6"
ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
ARG DEEPEP_BRANCH="a9ea9774"
ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
ARG DEEPEP_NIC="cx7"
# DeepEP only supports gfx942 and gfx950; build ROCShmem for the same set so
# it can be linked against DeepEP without arch mismatches.
ARG DEEPEP_ROCM_ARCH="gfx942;gfx950"
ENV ROCM_PATH=/opt/rocm
ENV ROCSHMEM_DIR=/opt/rocshmem
RUN git clone ${ROCSHMEM_REPO} \
RUN --mount=type=cache,target=/root/.cache/ccache \
git clone --no-checkout --filter=blob:none ${ROCSHMEM_REPO} \
&& cd rocm-systems \
&& git sparse-checkout set --cone projects/rocshmem \
&& git checkout ${ROCSHMEM_BRANCH} \
&& mkdir -p projects/rocshmem/build \
&& cd projects/rocshmem/build \
&& INSTALL_PREFIX=${ROCSHMEM_DIR} \
../scripts/build_configs/all_backends -DUSE_EXTERNAL_MPI=OFF
&& CC="ccache gcc" CXX="ccache g++" INSTALL_PREFIX=${ROCSHMEM_DIR} \
bash ../scripts/build_configs/all_backends \
-DROCM_PATH=${ROCM_PATH} \
-DGPU_TARGETS="${DEEPEP_ROCM_ARCH}" \
-DUSE_EXTERNAL_MPI=OFF
# Build DeepEP wheel.
# DeepEP looks for rocshmem at ROCSHMEM_DIR.
RUN git clone ${DEEPEP_REPO} \
# DeepEP build stage - depends on ROCShmem, builds the HIP kernel wheel.
FROM build_rocshmem AS build_deepep
ARG DEEPEP_BRANCH="a9ea9774"
ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
ARG DEEPEP_NIC="cx7"
# Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR.
# DeepEP only supports gfx942 and gfx950, so avoid gfx90a in the default list.
RUN --mount=type=cache,target=/root/.cache/ccache \
export PYTORCH_ROCM_ARCH="gfx942;gfx950" \
&& git clone ${DEEPEP_REPO} \
&& cd DeepEP \
&& git checkout ${DEEPEP_BRANCH} \
&& python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
&& LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
# MoRI runtime dependencies live in Dockerfile.rocm so NIC backend changes do
# not force users to rebuild the long-lived Dockerfile.rocm_base image.
@@ -372,8 +454,9 @@ RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
# This ensures setuptools_scm sees clean repo state for version detection
RUN --mount=type=bind,source=.git,target=vllm/.git \
--mount=type=cache,target=/root/.cache/uv \
cd vllm \
&& pip install setuptools_scm regex \
&& uv pip install --system setuptools_scm regex \
&& VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
&& echo "Detected vLLM version: ${VLLM_VERSION}" \
&& echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
@@ -409,18 +492,20 @@ RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
&& python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt
# Install dependencies using custom wheels from /install
RUN cd vllm \
RUN --mount=type=cache,target=/root/.cache/uv \
cd vllm \
&& echo "Building vLLM with custom wheels from /install" \
&& python3 -m pip install --find-links /install -r requirements/rocm.txt \
&& python3 setup.py clean --all
&& uv pip install --system --find-links /install -r requirements/rocm.txt
# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
# (setup.py auto-detects sccache in PATH)
# (setup.py auto-detects ccache/sccache in PATH)
RUN --mount=type=bind,source=.git,target=vllm/.git \
--mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \
cd vllm \
&& export CCACHE_BASEDIR="$PWD" \
&& export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
&& echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
&& python3 setup.py bdist_wheel --dist-dir=dist
&& MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist
FROM scratch AS export_vllm_wheel_release
ARG COMMON_WORKDIR
@@ -431,30 +516,17 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml
COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1
# -----------------------
# Test vLLM image
FROM mori_base AS test
# CI base image (Tier 1) - stable, rarely changing CI dependencies.
# Per-PR test builds pull this as CI_BASE_IMAGE so the test stage only layers
# in the vLLM artifacts for the current commit.
FROM mori_base AS ci_base
ARG COMMON_WORKDIR
RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
# Install vLLM using uv (inherited from base stage)
# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \
&& uv pip install --system -r requirements/rocm.txt \
&& uv pip install --system -r requirements/test/rocm.txt \
&& pip uninstall -y vllm \
&& uv pip install --system *.whl
# Persist the built wheel in the image so python_only_compile_rocm.sh can
# reinstall it after removing compilers. The bind-mounted /install contents
# above are not available once that RUN step completes.
COPY --from=export_vllm /*.whl /opt/vllm-wheels/
# Update rdma-core to support latest rocshmem
# Update rdma-core to support latest rocshmem.
ARG DEEPEP_NIC
RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \
git clone --branch v62.0 --depth 1 https://github.com/linux-rdma/rdma-core.git /tmp/rdma-core && \
@@ -464,79 +536,98 @@ RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \
ninja && ninja install && ldconfig && rm -rf /tmp/rdma-core; \
fi
# Install RIXL wheel
# Install RIXL + DeepEP wheels.
RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
uv pip install --system /rixl_install/*.whl
--mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
uv pip install --system /rixl_install/*.whl /deep_install/*.whl
# Install DeepEP wheel
RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \
uv pip install --system /deep_install/*.whl
COPY --from=build_deep /opt/rocshmem /opt/rocshmem
# Copy ROCShmem runtime libraries.
COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem
# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
RUN apt-get update -q -y && apt-get install -q -y \
# RDMA userspace libraries plus FFmpeg dev libs needed by torchcodec.
RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
librdmacm1 \
libibverbs1 \
ibverbs-providers \
ibverbs-utils \
pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
&& rm -rf /var/lib/apt/lists/*
WORKDIR /vllm-workspace
ARG COMMON_WORKDIR
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# install development dependencies (for testing)
RUN cd /vllm-workspace \
&& python3 -m pip install -e tests/vllm_test_utils \
&& python3 -m pip install pytest-shard
# enable fast downloads from hf (for testing)
ENV HF_XET_HIGH_PERFORMANCE=1
# increase timeout for hf downloads (for testing)
ENV HF_HUB_DOWNLOAD_TIMEOUT 60
# install audio decode package `torchcodec` from source (required due to
# ROCm and torch version mismatch) for tests with datasets package
# Install torchcodec from source for ROCm/torch ABI compatibility.
COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
RUN bash /tmp/install_torchcodec.sh \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/pip \
--mount=type=cache,target=/root/.cache/torchcodec-wheels \
bash /tmp/install_torchcodec.sh \
&& rm /tmp/install_torchcodec.sh \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*
&& apt-get clean && rm -rf /var/lib/apt/lists/*
# Copy in the v1 package (for python-only install test group)
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# Pre-install shared ROCm runtime dependencies.
COPY requirements/common.txt requirements/rocm.txt /tmp/ci-base-requirements/
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r /tmp/ci-base-requirements/rocm.txt \
&& rm -rf /tmp/ci-base-requirements
# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel
# Enable fast and less brittle model downloads in tests.
ENV HF_XET_HIGH_PERFORMANCE=1
ENV HF_HUB_DOWNLOAD_TIMEOUT=60
# Pre-install vLLM test dependencies.
COPY requirements/test/rocm.txt /tmp/rocm-test-reqs.txt
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system -r /tmp/rocm-test-reqs.txt
# Rebuild fastsafetensors from source so its C++ extension is compiled with
# USE_ROCM and can detect libamdhip64.so at runtime.
RUN --mount=type=cache,target=/root/.cache/pip \
FASTSAFETENSORS_REQ="$(grep -E '^fastsafetensors(==| @ )' /tmp/rocm-test-reqs.txt | head -1)" \
&& test -n "${FASTSAFETENSORS_REQ}" \
&& python3 -m pip install --force-reinstall --no-deps \
--no-binary fastsafetensors "${FASTSAFETENSORS_REQ}" \
&& rm /tmp/rocm-test-reqs.txt
# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel.
# See: https://github.com/pytorch/pytorch/issues/169857
ENV MIOPEN_DEBUG_CONV_DIRECT=0
ENV MIOPEN_DEBUG_CONV_GEMM=0
# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc
# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc.
# See: https://github.com/ROCm/rocm-libraries/issues/6266
ENV HSA_ENABLE_IPC_MODE_LEGACY=1
# Source code is used in the `python_only_compile.sh` test
# We hide it inside `src/` so that this source code
# will not be imported by other tests
RUN mkdir src && mv vllm src/vllm
# ROCm profiler limits workaround.
RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"
# This is a workaround to ensure pytest exits with the correct status code in CI tests.
RUN printf '%s\n' \
'import os' \
'' \
'_exit_code = 1' \
'' \
'def pytest_sessionfinish(session, exitstatus):' \
' global _exit_code' \
' _exit_code = int(exitstatus)' \
'' \
'def pytest_unconfigure(config):' \
' import sys' \
' sys.stdout.flush()' \
' sys.stderr.flush()' \
' os._exit(_exit_code)' \
> /vllm-workspace/conftest.py
# Install vllm_test_utils in ci_base for ci_base + wheel parity.
COPY tests/vllm_test_utils /tmp/vllm_test_utils
RUN --mount=type=cache,target=/root/.cache/uv \
uv pip install --system /tmp/vllm_test_utils \
&& rm -rf /tmp/vllm_test_utils
# -----------------------
# Test vLLM image (Tier 2) - vLLM-only layer on top of ci_base.
FROM ${CI_BASE_IMAGE} AS test
ARG COMMON_WORKDIR
# Install the vLLM wheel (--no-deps: all deps already in ci_base).
RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
--mount=type=cache,target=/root/.cache/uv \
cd /install \
&& uv pip install --system --no-deps *.whl
# Store the vLLM wheel in the image for python-only install tests.
COPY --from=export_vllm /*.whl /opt/vllm-wheels/
WORKDIR /vllm-workspace
COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
# Copy in the v1 package (for python-only install test group).
COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
# Hide source under src/ so it won't shadow the installed package in tests.
RUN mkdir src && mv vllm src/vllm
# -----------------------
# Final vLLM image
@@ -553,6 +644,7 @@ RUN rm -f /usr/bin/sccache || true \
# This prevents S3 bucket config from leaking into production images
ENV SCCACHE_BUCKET=
ENV SCCACHE_REGION=
ENV SCCACHE_ENDPOINT=
ENV SCCACHE_S3_NO_CREDENTIALS=
ENV SCCACHE_IDLE_TIMEOUT=
+376
View File
@@ -0,0 +1,376 @@
# ci-rocm.hcl - CI-specific configuration for vLLM ROCm Docker builds
#
# This file lives in the vLLM repo at docker/ci-rocm.hcl so ROCm Docker
# build mechanics can evolve with Dockerfile.rocm and docker-bake-rocm.hcl.
# Used with: docker buildx bake -f docker/docker-bake-rocm.hcl -f docker/ci-rocm.hcl test-rocm-ci
#
# Registry cache: Docker Hub (rocm/vllm-ci-cache) is used exclusively.
# AMD build agents already have Docker Hub credentials (they push the test
# image to rocm/vllm-ci), so no additional credential setup is required.
# ROCm CI uses Docker Hub for BuildKit layer cache by default. A separate
# compiler cache can be enabled with USE_SCCACHE=1 when AMD provides a shared
# S3-compatible cache endpoint.
# CI metadata
variable "BUILDKITE_COMMIT" {
default = ""
}
variable "BUILDKITE_BUILD_NUMBER" {
default = ""
}
variable "BUILDKITE_BUILD_ID" {
default = ""
}
variable "PARENT_COMMIT" {
default = ""
}
# Merge-base of HEAD with main - provides a more stable cache fallback than
# parent commit for long-lived PRs. Mirrors the VLLM_MERGE_BASE_COMMIT
# pattern used in the shared ci.hcl file. Auto-computed by ci-bake-rocm.sh
# when unset.
variable "VLLM_MERGE_BASE_COMMIT" {
default = ""
}
# Bridge to vLLM's COMMIT variable for OCI labels
variable "COMMIT" {
default = BUILDKITE_COMMIT
}
# Image tags (set by CI)
variable "IMAGE_TAG" {
default = ""
}
variable "IMAGE_TAG_LATEST" {
default = ""
}
# ROCm-specific GPU architecture targets
variable "PYTORCH_ROCM_ARCH" {
default = "gfx90a;gfx942;gfx950"
}
# Pre-built CI base image (Tier 1). Per-PR builds pull this instead of
# rebuilding RIXL/DeepEP/torchcodec from scratch. The ci_base stage in
# Dockerfile.rocm builds on mori_base, so CI_BASE_IMAGE only affects the test
# stage and is irrelevant when building --target ci_base itself.
variable "CI_BASE_IMAGE" {
default = "rocm/vllm-dev:ci_base"
}
# Leave CI_MAX_JOBS empty so the Dockerfile falls back to $(nproc) and uses
# the full builder parallelism. Operators can still override this per build.
variable "CI_MAX_JOBS" {
default = ""
}
# Upstream dependency commit pins -- extracted from Dockerfile.rocm by
# ci-bake-rocm.sh at build time. Empty defaults are safe: the cache
# functions produce no entries when the variable is empty.
variable "RIXL_BRANCH" {
default = ""
}
variable "UCX_BRANCH" {
default = ""
}
variable "ROCSHMEM_BRANCH" {
default = ""
}
variable "DEEPEP_BRANCH" {
default = ""
}
variable "RIXL_CACHE_KEY" {
default = ""
}
variable "ROCSHMEM_CACHE_KEY" {
default = ""
}
variable "DEEPEP_CACHE_KEY" {
default = ""
}
# Docker Hub registry cache for AMD builds.
#
# A separate repo (rocm/vllm-ci-cache) is used for BuildKit layer cache.
# Final-image cache exports use mode=min to reduce the volume of data pushed.
# Source-scoped csrc cache exports default to mode=max so fresh workers can
# recover more of the native build graph when ROCm extension inputs change.
# NOTE: mode=min still includes all layers referenced by the final image
# manifest, including inherited base layers (~7.25GB ROCm runtime).
# Docker Hub auto-creates the repo on first push.
#
# Final-image cache stays commit-scoped. Branch-to-branch reuse for the test
# image comes from importing the parent and merge-base commit cache refs.
#
# The source-scoped native cache is exported both per-commit and per-branch so
# ROCm extension rebuilds are shareable within the same commit reruns and across
# consecutive commits on the same branch without depending on a single global
# latest tag.
variable "DOCKERHUB_CACHE_REPO" {
default = "rocm/vllm-ci-cache"
}
variable "DOCKERHUB_CACHE_TO" {
default = ""
}
variable "ROCM_CACHE_BRANCH_TAG" {
default = ""
}
variable "ROCM_CACHE_UPSTREAM_BRANCH_TAG" {
default = ""
}
variable "ROCM_CSRC_CACHE_TO_MODE" {
default = "max"
}
variable "ROCM_FINAL_CACHE_TO_MODE" {
default = "min"
}
# Functions
# Registry cache sources for builds of the final image graph, in lookup order.
# Every entry is guarded by the variable that forms its tag: an unset variable
# yields "" and compact() removes it from the list.
function "get_cache_from_rocm" {
  params = []
  result = compact([
    # Final-image cache for this exact commit: fastest hit on re-runs.
    BUILDKITE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT}",
    # Final-image cache of the parent commit: useful for incremental changes.
    PARENT_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${PARENT_COMMIT}",
    # Final-image cache of the merge-base with main: stable fallback for
    # long-lived or rebased PRs, since it maps to a real main-branch commit
    # whose cache layers are likely warm.
    VLLM_MERGE_BASE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${VLLM_MERGE_BASE_COMMIT}",

    # Source-scoped native build cache, so builds whose Python/package layers
    # changed can still reuse compiled ROCm objects.
    BUILDKITE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}",
    PARENT_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}",
    VLLM_MERGE_BASE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}",
    ROCM_CACHE_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}",
    ROCM_CACHE_UPSTREAM_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}",

    # Branch-scoped full image cache: fallback when the parent-commit cache
    # has been evicted.
    ROCM_CACHE_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG}",
    ROCM_CACHE_UPSTREAM_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}",
  ])
}
# Registry cache destinations for the final image graph. Both refs are written
# with ROCM_FINAL_CACHE_TO_MODE; an entry is dropped by compact() when the
# variable that forms its tag is empty.
function "get_cache_to_rocm" {
  params = []
  result = compact([
    # Per-commit ref, consumed by exact re-runs of the same commit.
    BUILDKITE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT},mode=${ROCM_FINAL_CACHE_TO_MODE}",
    # Per-branch ref, so later commits on the same branch can reuse the full
    # image layers when the parent-commit cache is evicted. Unlike the old
    # rocm-latest tag (which caused duplicate exporter 400s), this is
    # scoped to a single branch.
    ROCM_CACHE_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${ROCM_FINAL_CACHE_TO_MODE}",
  ])
}
# Registry cache sources for the source-scoped native build stage only:
# the csrc-rocm-* refs for this commit, its parent, the merge-base, and the
# two branch tags. Empty variables yield "" and are removed by compact().
function "get_cache_from_rocm_csrc" {
  params = []
  result = compact([
    # Commit-keyed refs.
    BUILDKITE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}",
    PARENT_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}",
    VLLM_MERGE_BASE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}",
    # Branch-keyed refs.
    ROCM_CACHE_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}",
    ROCM_CACHE_UPSTREAM_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}",
  ])
}
# Registry cache destinations for the source-scoped native build stage. Both
# refs are written with ROCM_CSRC_CACHE_TO_MODE; compact() drops an entry when
# the variable that forms its tag is empty.
function "get_cache_to_rocm_csrc" {
  params = []
  result = compact([
    # Exact-commit native cache, for same-commit reruns.
    BUILDKITE_COMMIT == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT},mode=${ROCM_CSRC_CACHE_TO_MODE}",
    # Branch-scoped native cache, so later commits on the same branch can
    # reuse compiled ROCm objects even when the exact parent cache is absent.
    ROCM_CACHE_BRANCH_TAG == "" ? "" : "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${ROCM_CSRC_CACHE_TO_MODE}",
  ])
}
# Cache functions for the upstream dependency stages (RIXL/UCX, ROCShmem,
# DeepEP). Those stages are pinned to specific upstream commit hashes, so
# their cache tags are built from the pins rather than the Buildkite commit;
# the cache therefore persists across vLLM commits for as long as the upstream
# pins stay the same.
#
# Tag selection per dependency: an explicit *_CACHE_KEY wins; otherwise the tag
# is derived from the *_BRANCH pin(s); if both are empty the entry is "" and
# compact() removes it.
function "get_cache_from_rocm_deps" {
  params = []
  result = compact([
    # RIXL: the derived tag combines the RIXL and UCX pins.
    RIXL_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_CACHE_KEY}" : (RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH}" : ""),
    # ROCShmem: the derived tag uses the ROCShmem pin alone.
    ROCSHMEM_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY}" : (ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH}" : ""),
    # DeepEP: the derived tag combines the DeepEP and ROCShmem pins.
    DEEPEP_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_CACHE_KEY}" : (DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH}" : ""),
  ])
}
# Registry cache destination (mode=min) for the RIXL stage. Uses the tag built
# from RIXL_CACHE_KEY when set, else the tag built from the RIXL and UCX pins;
# yields an empty list when neither RIXL_CACHE_KEY nor RIXL_BRANCH is set.
function "get_cache_to_rocm_rixl" {
  params = []
  result = compact([
    RIXL_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_CACHE_KEY},mode=min" : (RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH},mode=min" : ""),
  ])
}
# Registry cache destination (mode=min) for the ROCShmem stage. Uses the tag
# built from ROCSHMEM_CACHE_KEY when set, else the tag built from the ROCShmem
# pin; yields an empty list when both are empty.
function "get_cache_to_rocm_rocshmem" {
  params = []
  result = compact([
    ROCSHMEM_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY},mode=min" : (ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH},mode=min" : ""),
  ])
}
# Registry cache destination (mode=min) for the DeepEP stage. Uses the tag
# built from DEEPEP_CACHE_KEY when set, else the tag built from the DeepEP and
# ROCShmem pins; yields an empty list when neither DEEPEP_CACHE_KEY nor
# DEEPEP_BRANCH is set.
function "get_cache_to_rocm_deepep" {
  params = []
  result = compact([
    DEEPEP_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_CACHE_KEY},mode=min" : (DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH},mode=min" : ""),
  ])
}
# CI targets
# Shared CI settings mixed into every *-ci target: the CI build args plus
# manifest annotations that record the Buildkite build number and build id.
# NOTE(review): targets list this after "_common-rocm" in `inherits`, which
# presumably makes max_jobs = CI_MAX_JOBS win over the base MAX_JOBS value;
# confirm against Bake's inheritance precedence rules.
target "_ci-rocm" {
  args = {
    max_jobs              = CI_MAX_JOBS
    CI_BASE_IMAGE         = CI_BASE_IMAGE
    ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
  }
  annotations = [
    "manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}",
    "manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}",
  ]
}
# Full per-commit test image. Builds the "test" stage, pushes it to the
# registry under IMAGE_TAG / IMAGE_TAG_LATEST (empty tags are dropped), and
# imports/exports the final-image registry cache.
target "test-rocm-ci" {
  inherits   = ["_common-rocm", "_ci-rocm", "_labels"]
  target     = "test"
  tags       = compact([IMAGE_TAG, IMAGE_TAG_LATEST])
  output     = ["type=registry"]
  cache-from = get_cache_from_rocm()
  cache-to   = get_cache_to_rocm()
}
# Cache-only build of the source-scoped ROCm native stage ("csrc-build").
# Nothing is emitted as an image; the point is to persist that stage in the
# registry cache, because the final test image consumes it only indirectly
# while packaging the wheel.
target "csrc-rocm-ci" {
  inherits   = ["_common-rocm", "_ci-rocm"]
  target     = "csrc-build"
  output     = ["type=cacheonly"]
  cache-from = get_cache_from_rocm_csrc()
  cache-to   = get_cache_to_rocm_csrc()
}
# CI variant of the wheel export: writes the "export_vllm" stage to
# ./wheel-export on local disk.
#
# It deliberately inherits the same CI args and cache refs as test-rocm-ci so
# the shared build_vllm/export_vllm stages resolve identically within one bake
# invocation. Otherwise export-wheel-rocm would use the plain local target
# args while test-rocm-ci uses the CI-only args, which can lead to separate
# cache lineages and inconsistent export_vllm results.
target "export-wheel-rocm" {
  inherits   = ["_common-rocm", "_ci-rocm"]
  target     = "export_vllm"
  output     = ["type=local,dest=./wheel-export"]
  cache-from = get_cache_from_rocm()
  cache-to   = get_cache_to_rocm()
}
# Artifact-only vLLM build: native-stage cache plus the exported wheel. GPU
# test jobs consume the artifact on top of ci_base, which avoids a per-commit
# multi-GB image push/pull.
group "test-rocm-ci-with-artifacts" {
  targets = [
    "csrc-rocm-ci",
    "export-wheel-rocm",
  ]
}

# Same as above plus the full pushed test image. Kept for fallback/debugging
# when a per-commit image is useful.
group "test-rocm-ci-with-wheel" {
  targets = [
    "csrc-rocm-ci",
    "test-rocm-ci",
    "export-wheel-rocm",
  ]
}
# Image tags for the ci_base build.
#   CI_BASE_IMAGE_TAG          primary tag; ci-bake-rocm.sh rewrites it for each
#                              build (non-nightly builds use a commit-scoped tag).
#   CI_BASE_IMAGE_TAG_CONTENT  content tag that non-nightly builds also publish
#                              for reuse.
#   CI_BASE_IMAGE_TAG_STABLE   optional; NIGHTLY=1 builds on the stable branch
#                              can set it to refresh rocm/vllm-dev:ci_base.
# Empty tags are skipped by the ci-base-rocm-ci target.
variable "CI_BASE_IMAGE_TAG" { default = "rocm/vllm-dev:ci_base" }
variable "CI_BASE_IMAGE_TAG_CONTENT" { default = "" }
variable "CI_BASE_IMAGE_TAG_STABLE" { default = "" }
# Cache-only targets for the upstream dependency stages. Each one persists its
# stage in the registry cache under a tag keyed by the upstream commit pin, so
# when ci_base has to be rebuilt (e.g. requirements change) these stages are
# cache hits as long as their pins are unchanged -- saving ~35min of
# compilation.
#
# RIXL stage ("build_rixl").
target "rixl-rocm-ci" {
  inherits   = ["_common-rocm", "_ci-rocm"]
  target     = "build_rixl"
  output     = ["type=cacheonly"]
  cache-from = get_cache_from_rocm_deps()
  cache-to   = get_cache_to_rocm_rixl()
}
# Cache-only build of the ROCShmem dependency stage ("build_rocshmem"); it
# imports all dependency caches and exports only the ROCShmem one.
target "rocshmem-rocm-ci" {
  inherits   = ["_common-rocm", "_ci-rocm"]
  target     = "build_rocshmem"
  output     = ["type=cacheonly"]
  cache-from = get_cache_from_rocm_deps()
  cache-to   = get_cache_to_rocm_rocshmem()
}
# Cache-only build of the DeepEP dependency stage ("build_deepep"); it imports
# all dependency caches and exports only the DeepEP one.
target "deepep-rocm-ci" {
  inherits   = ["_common-rocm", "_ci-rocm"]
  target     = "build_deepep"
  output     = ["type=cacheonly"]
  cache-from = get_cache_from_rocm_deps()
  cache-to   = get_cache_to_rocm_deepep()
}
# Builds and pushes only the ci_base stage (RIXL, DeepEP, torchcodec, etc.).
#
# The ensure-ci-base step invokes this when the content hash of the
# ci_base-affecting files drifts from the remote image label. Per-PR builds
# then pull the result as CI_BASE_IMAGE instead of rebuilding those slow
# layers on every commit.
#
# Caching: the image carries inline cache metadata rather than exporting a
# separate registry cache artifact, and cache is imported from each non-empty
# ci_base tag followed by the upstream dependency caches.
#
# NOTE(review): this target does not itself set the
# "vllm.ci_base.content_hash" label that the hash comparison reads -- presumably
# ci-bake-rocm.sh injects it through its generated override; confirm.
target "ci-base-rocm-ci" {
  inherits = ["_common-rocm", "_ci-rocm", "_labels"]
  target   = "ci_base"
  tags     = compact([CI_BASE_IMAGE_TAG, CI_BASE_IMAGE_TAG_CONTENT, CI_BASE_IMAGE_TAG_STABLE])
  output   = ["type=registry"]
  cache-to = ["type=inline"]
  cache-from = concat(
    compact([
      CI_BASE_IMAGE_TAG == "" ? "" : "type=registry,ref=${CI_BASE_IMAGE_TAG}",
      CI_BASE_IMAGE_TAG_CONTENT == "" ? "" : "type=registry,ref=${CI_BASE_IMAGE_TAG_CONTENT}",
      CI_BASE_IMAGE_TAG_STABLE == "" ? "" : "type=registry,ref=${CI_BASE_IMAGE_TAG_STABLE}",
    ]),
    # Upstream dependency caches, so the RIXL/ROCShmem/DeepEP stages are cache
    # hits even when ci_base itself needs rebuilding.
    get_cache_from_rocm_deps(),
  )
}
# ci_base build group: alongside the ci_base image it also runs the three
# cache-only dependency targets, so their stage caches are exported and future
# rebuilds can reuse them independently.
group "ci-base-rocm-ci-with-deps" {
  targets = [
    "rixl-rocm-ci",
    "rocshmem-rocm-ci",
    "deepep-rocm-ci",
    "ci-base-rocm-ci",
  ]
}
+143
View File
@@ -0,0 +1,143 @@
# docker-bake-rocm.hcl - vLLM ROCm Docker build configuration
#
# This file lives in the vLLM repo at docker/docker-bake-rocm.hcl
# Equivalent of docker-bake.hcl for ROCm builds.
#
# Usage:
# docker buildx bake -f docker/docker-bake-rocm.hcl # Build test (default)
# docker buildx bake -f docker/docker-bake-rocm.hcl final-rocm # Build final image
# docker buildx bake -f docker/docker-bake-rocm.hcl --print # Show resolved config
#
# CI usage (with the vLLM-owned CI overlay):
# docker buildx bake -f docker/docker-bake-rocm.hcl -f docker/ci-rocm.hcl test-rocm-ci
# Build parallelism passed to the Dockerfile as the max_jobs build arg.
# An empty string lets the Dockerfile fall back to $(nproc) via
# MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, i.e. all available cores
# on whatever machine runs the build.
# Override with --set '*.args.max_jobs=8' for local builds on small machines.
variable "MAX_JOBS" { default = "" }
# GPU architectures to build for; passed as the ARG_PYTORCH_ROCM_ARCH build arg.
variable "PYTORCH_ROCM_ARCH" { default = "gfx90a;gfx942;gfx950" }

# Commit recorded in the org.opencontainers.image.revision label/annotation.
variable "COMMIT" { default = "" }

# Content hash of the ci_base-affecting files. ci-bake-rocm.sh computes it and
# it is embedded as an image label so future builds can compare hashes without
# rebuilding.
variable "CI_BASE_CONTENT_HASH" { default = "" }

# Source selection for the vLLM checkout inside the image:
#   REMOTE_VLLM=0  use local source via the Docker build context
#                  (ONBUILD COPY ./ vllm/)
#   REMOTE_VLLM=1  clone from GitHub at VLLM_BRANCH (standalone builds without
#                  local source)
variable "REMOTE_VLLM" { default = "0" }
variable "VLLM_BRANCH" { default = "main" }
# CI_BASE_IMAGE: base that per-PR test builds start from; passed to the
# Dockerfile as the CI_BASE_IMAGE build arg.
#
# Defaults to the local "ci_base" stage so standalone/local builds work from
# this file alone. CI overrides it to the pre-built "rocm/vllm-dev:ci_base"
# image via environment variable (the CI overlay also carries that default).
# The previous default here was already the registry image, which contradicted
# this contract.
# NOTE(review): assumes Dockerfile.rocm consumes this arg in a FROM line where
# a stage name is valid -- confirm.
variable "CI_BASE_IMAGE" {
  default = "ci_base"
}
# Upstream dependency commit pins. They default to empty here; plain local bake
# builds use the Dockerfile ARG defaults instead. ci-bake-rocm.sh resolves
# those defaults (plus any env overrides) and writes a small HCL override
# before invoking CI targets.
variable "RIXL_BRANCH" { default = "" }
variable "UCX_BRANCH" { default = "" }
variable "ROCSHMEM_BRANCH" { default = "" }
variable "DEEPEP_BRANCH" { default = "" }
# Target built when bake is invoked without naming one: the local test image.
group "default" {
  targets = [
    "test-rocm",
  ]
}
# Base settings shared by every ROCm target: the Dockerfile, the build context
# (the directory bake is invoked from), and the build args fed from the
# variables in this file.
target "_common-rocm" {
  context    = "."
  dockerfile = "docker/Dockerfile.rocm"
  args = {
    REMOTE_VLLM           = REMOTE_VLLM
    VLLM_BRANCH           = VLLM_BRANCH
    CI_BASE_IMAGE         = CI_BASE_IMAGE
    ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
    max_jobs              = MAX_JOBS
  }
}
# OCI image metadata mixed into image-producing targets. The revision is taken
# from COMMIT and is recorded both as an image label and as a manifest
# annotation.
target "_labels" {
  annotations = [
    "manifest:org.opencontainers.image.revision=${COMMIT}",
  ]
  labels = {
    "org.opencontainers.image.title"       = "vLLM ROCm"
    "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs (ROCm)"
    "org.opencontainers.image.vendor"      = "vLLM"
    "org.opencontainers.image.source"      = "https://github.com/vllm-project/vllm"
    "org.opencontainers.image.licenses"    = "Apache-2.0"
    "org.opencontainers.image.revision"    = COMMIT
  }
}
# Local test image: builds the "test" stage and loads it into the local Docker
# image store as rocm/vllm:test.
target "test-rocm" {
  inherits = ["_common-rocm", "_labels"]
  target   = "test"
  output   = ["type=docker"]
  tags     = ["rocm/vllm:test"]
}
# CI base image target: builds only the ci_base stage (RIXL, DeepEP,
# torchcodec, requirements, etc.) and loads it into the local Docker image
# store as rocm/vllm-dev:ci_base. Used by the weekly scheduled build and by the
# auto-rebuild trigger when requirements change in a PR.
# The image is stamped with CI_BASE_CONTENT_HASH under the
# "vllm.ci_base.content_hash" label.
target "ci-base-rocm" {
  inherits = ["_common-rocm", "_labels"]
  target   = "ci_base"
  output   = ["type=docker"]
  tags     = ["rocm/vllm-dev:ci_base"]
  labels = {
    "vllm.ci_base.content_hash" = CI_BASE_CONTENT_HASH
  }
}
# Wheel export target: extracts the built vLLM wheel plus the test workspace
# from the "export_vllm" stage onto local disk. CI uploads the wheel as a
# Buildkite artifact so test jobs can assemble images locally from
# ci_base + wheel instead of pulling the full large image from Docker Hub.
#
# Usage:
#   docker buildx bake -f docker/docker-bake-rocm.hcl export-wheel-rocm
#   # Creates ./wheel-export/*.whl, ./wheel-export/requirements/, etc.
#
# After a full bake build, BuildKit cache makes this nearly instant.
target "export-wheel-rocm" {
  inherits = ["_common-rocm"]
  output   = ["type=local,dest=./wheel-export"]
  target   = "export_vllm"
}
# Final image: builds the "final" stage and loads it into the local Docker
# image store as rocm/vllm:latest.
target "final-rocm" {
  inherits = ["_common-rocm", "_labels"]
  target   = "final"
  output   = ["type=docker"]
  tags     = ["rocm/vllm:latest"]
}
+52 -3
View File
@@ -3,12 +3,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Script to install TorchCodec from source (required for ROCm compatibility)
# The PyPI wheel is built against upstream PyTorch and has ABI mismatches with
# ROCm's custom torch build, so we must compile from source.
set -e
TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
# Pin to a specific release for reproducibility; update as needed.
TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"
# Cache directory for pre-built wheels to avoid redundant recompilation.
TORCHCODEC_WHEEL_CACHE="${TORCHCODEC_WHEEL_CACHE:-/root/.cache/torchcodec-wheels}"
echo "=== TorchCodec Installation Script ==="
@@ -18,9 +22,26 @@ if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then
exit 0
fi
# Try to install from cached wheel first.
# The cache entry is keyed by the TorchCodec ref and the target GPU archs, so
# a different ref or arch list maps to a different file.
ARCH_TAG="${PYTORCH_ROCM_ARCH:-all}"
# Normalize arch tag (replace ; with _) for use in filename
ARCH_TAG="${ARCH_TAG//;/_}"
# NOTE(review): this name has only three dash-separated fields
# (torchcodec-<ref>-<archs>.whl), while pip expects wheel filenames of the form
# name-version-pythontag-abitag-platformtag.whl. pip likely rejects the file as
# "not a valid wheel filename", in which case the cache never hits and the
# script always falls through to the source build. Confirm; if so, cache the
# wheel under its original filename inside a <ref>-<archs> subdirectory and
# update the store step to match.
CACHED_WHEEL="${TORCHCODEC_WHEEL_CACHE}/torchcodec-${TORCHCODEC_BRANCH}-${ARCH_TAG}.whl"
if [ -f "$CACHED_WHEEL" ]; then
echo "Found cached wheel: $CACHED_WHEEL"
# A failed install inside an && list does not trip `set -e`, so a bad cache
# entry falls through to the source build instead of aborting the script.
pip install "$CACHED_WHEEL" && {
echo "Installed from cached wheel."
echo "=== TorchCodec installation complete ==="
exit 0
}
echo "Cached wheel installation failed, rebuilding from source..."
fi
echo "TorchCodec not found. Installing from source..."
# Install system dependencies (FFmpeg + pkg-config)
# Install system dependencies (FFmpeg + pkg-config) if not already present.
# The Docker test image pre-installs these, so this is a fallback for other envs.
install_system_deps() {
if command -v apt-get &> /dev/null; then
echo "Installing system dependencies..."
@@ -56,6 +77,12 @@ export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir(
export CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}"
echo "pybind11_DIR set to: $pybind11_DIR"
# Limit GPU architectures to only what this image targets.
# The default builds for all supported archs which is very slow.
if [ -n "$PYTORCH_ROCM_ARCH" ]; then
echo "Building for PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH"
fi
# Create temp directory for build
BUILD_DIR=$(mktemp -d -t torchcodec-XXXXXX)
echo "Building in temporary directory: $BUILD_DIR"
@@ -77,9 +104,31 @@ cd torchcodec
export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"
export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1
export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1
# Use ninja for faster builds and parallelize compilation
export CMAKE_GENERATOR=Ninja
export MAX_JOBS="${MAX_JOBS:-$(nproc)}"
# Use ccache if available to speed up recompilation
if command -v ccache &> /dev/null; then
export CMAKE_C_COMPILER_LAUNCHER=ccache
export CMAKE_CXX_COMPILER_LAUNCHER=ccache
fi
echo "Building TorchCodec..."
pip install . --no-build-isolation
# Build a wheel (rather than installing in place) so the artifact can be both
# installed now and stored for later runs.
echo "Building TorchCodec (MAX_JOBS=$MAX_JOBS)..."
pip wheel . --no-build-isolation --no-deps -w "$BUILD_DIR/dist"
# Install the built wheel
# Take the first torchcodec wheel in the output directory; ls errors are
# silenced so a missing wheel yields an empty string, handled just below.
BUILT_WHEEL=$(ls "$BUILD_DIR/dist"/torchcodec-*.whl 2>/dev/null | head -1)
if [ -z "$BUILT_WHEEL" ]; then
echo "Error: No wheel produced"
exit 1
fi
pip install "$BUILT_WHEEL"
# Cache the wheel for future runs
# NOTE(review): the copy renames the wheel to $CACHED_WHEEL, dropping the
# python/abi/platform tags from the filename. pip likely refuses to install a
# wheel with such a name, so this cache entry may never be usable -- confirm
# and keep the original filename if so.
# NOTE(review): under `set -e` a failing mkdir/cp here aborts the script even
# though TorchCodec was already installed successfully.
mkdir -p "$TORCHCODEC_WHEEL_CACHE"
cp "$BUILT_WHEEL" "$CACHED_WHEEL"
echo "Cached wheel to: $CACHED_WHEEL"
# Verify installation
echo "Verifying installation..."