[ROCm][CI] Optimize ROCm Docker build: registry cache, DeepEP, and ci-bake script (#36949)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
2026-06-06 00:16:14 +00:00 · 2026-06-03 01:43:07 -05:00
parent 71df063c49
commit 87954eb50e
10 changed files with 2746 additions and 158 deletions
@@ -0,0 +1,23 @@
+name: vllm_rocm_ci
+job_dirs:
+  - ".buildkite/hardware_tests"
+run_all_patterns:
+  - "docker/Dockerfile.rocm"
+  - "docker/Dockerfile.rocm_base"
+  - "docker/ci-rocm.hcl"
+  - "docker/docker-bake-rocm.hcl"
+  - ".buildkite/hardware_tests/amd.yaml"
+  - ".buildkite/scripts/ci-bake-rocm.sh"
+  - ".buildkite/scripts/hardware_ci/run-amd-test.py"
+  - ".buildkite/scripts/hardware_ci/run-amd-test.sh"
+  - "CMakeLists.txt"
+  - "requirements/common.txt"
+  - "requirements/rocm.txt"
+  - "requirements/build/rocm.txt"
+  - "requirements/test/rocm.txt"
+  - "setup.py"
+  - "csrc/"
+  - "cmake/"
+run_all_exclude_patterns:
+  - "csrc/cpu/"
+  - "cmake/cpu_extension.cmake"
@@ -1,22 +1,43 @@
 group: Hardware - AMD Build
 steps:
-  - label: "AMD: :docker: build image"
-    key: image-build-amd
+  # Ensure ci_base is up-to-date before building the test image.
+  # Compares a content hash of ci_base-affecting files against the remote
+  # image label. If hashes match the build is skipped (< 30 s); if they
+  # differ ci_base is rebuilt and pushed automatically.
+  - label: "AMD: :docker: ensure ci_base"
+    key: ensure-ci-base-amd
    depends_on: []
    device: amd_cpu
    no_plugin: true
    commands:
-    - >
-      docker build
-      --build-arg max_jobs=16
-      --build-arg REMOTE_VLLM=1
-      --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950'
-      --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT
-      --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}"
-      -f docker/Dockerfile.rocm
-      --target test
-      --no-cache
-      --progress plain .
+      - bash .buildkite/scripts/ci-bake-rocm.sh ci-base-rocm-ci-with-deps
+    env:
+      DOCKER_BUILDKIT: "1"
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
+      REMOTE_VLLM: "1"
+      VLLM_BRANCH: "$BUILDKITE_COMMIT"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
+
+  - label: "AMD: :docker: build test image and artifacts"
+    key: image-build-amd
+    depends_on:
+      - ensure-ci-base-amd
+    device: amd_cpu
+    no_plugin: true
+    commands:
+      - |
+        if [[ "${ROCM_CI_ARTIFACT_ONLY:-0}" == "1" ]]; then
+          echo "ROCM_CI_ARTIFACT_ONLY=1; building ROCm wheel artifact only"
+          IMAGE_TAG="" bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-artifacts
+        else
+          bash .buildkite/scripts/ci-bake-rocm.sh test-rocm-ci-with-wheel
+        fi
      - |
        docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec '
          if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi
@@ -37,6 +58,16 @@ steps:
        PY
          echo AMD image smoke OK
        '
-    - docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
    env:
      DOCKER_BUILDKIT: "1"
+      VLLM_BAKE_FILE: "docker/docker-bake-rocm.hcl"
+      PYTORCH_ROCM_ARCH: "gfx90a;gfx942;gfx950"
+      IMAGE_TAG: "rocm/vllm-ci:$BUILDKITE_COMMIT"
+      REMOTE_VLLM: "1"
+      VLLM_BRANCH: "$BUILDKITE_COMMIT"
+    retry:
+      automatic:
+        - exit_status: -1  # Agent was lost
+          limit: 1
+        - exit_status: -10  # Agent was lost
+          limit: 1
@@ -52,6 +52,108 @@ cleanup_network() {
  fi
 }

+prepare_artifact_image() {
+  if [[ "${VLLM_CI_USE_ARTIFACTS:-0}" != "1" ]]; then
+    return 1
+  fi
+  if ! command -v buildkite-agent >/dev/null 2>&1; then
+    echo "buildkite-agent not found; cannot download ROCm wheel artifact"
+    return 1
+  fi
+
+  local artifact_glob="${VLLM_CI_ARTIFACT_GLOB:-artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz}"
+  local archive=""
+  local metadata_file=""
+  local base_image="${VLLM_CI_BASE_IMAGE:-rocm/vllm-dev:ci_base}"
+  local artifact_image=""
+  local artifact_key=""
+  local base_digest=""
+  local wheel_dir=""
+  local context_dir=""
+  local workspace_dir=""
+
+  artifact_work_dir=$(mktemp -d -t vllm-rocm-artifact.XXXXXX)
+  wheel_dir="${artifact_work_dir}/wheels"
+  context_dir="${artifact_work_dir}/context"
+  workspace_dir="${context_dir}/workspace"
+  mkdir -p "${wheel_dir}" "${context_dir}/wheels" "${workspace_dir}"
+
+  echo "--- Downloading ROCm wheel artifact"
+  if ! buildkite-agent artifact download "${artifact_glob}" "${artifact_work_dir}"; then
+    echo "Failed to download ${artifact_glob}"
+    return 1
+  fi
+  buildkite-agent artifact download \
+    "artifacts/vllm-rocm-install/ci-base-image.txt" \
+    "${artifact_work_dir}" >/dev/null 2>&1 || true
+
+  archive=$(find "${artifact_work_dir}" -name "vllm-rocm-install.tar.gz" -type f | head -1)
+  if [[ -z "${archive}" || ! -f "${archive}" ]]; then
+    echo "ROCm wheel artifact archive was not found"
+    return 1
+  fi
+
+  metadata_file=$(find "${artifact_work_dir}" -name "ci-base-image.txt" -type f | head -1)
+  if [[ -n "${metadata_file}" && -s "${metadata_file}" ]]; then
+    base_image=$(tr -d '[:space:]' < "${metadata_file}")
+  fi
+
+  echo "--- Preparing local ROCm test image"
+  echo "Base image: ${base_image}"
+  docker pull "${base_image}" || return 1
+  base_digest=$(
+    docker image inspect \
+      --format='{{if .RepoDigests}}{{index .RepoDigests 0}}{{else}}{{.Id}}{{end}}' \
+      "${base_image}" 2>/dev/null || printf '%s' "${base_image}"
+  )
+
+  artifact_key=$(
+    {
+      printf 'base-image:%s\n' "${base_digest}"
+      sha256sum "${archive}"
+    } | sha256sum | cut -c1-24
+  )
+  artifact_image="rocm/vllm-ci-artifact:${artifact_key}"
+
+  if docker image inspect "${artifact_image}" >/dev/null 2>&1; then
+    echo "Using existing local ROCm artifact image: ${artifact_image}"
+    image_name="${artifact_image}"
+    return 0
+  fi
+
+  tar -xzf "${archive}" -C "${wheel_dir}" || return 1
+  if ! ls "${wheel_dir}"/*.whl >/dev/null 2>&1; then
+    echo "ROCm wheel artifact did not contain a wheel"
+    return 1
+  fi
+  if [[ ! -d "${wheel_dir}/tests" ]]; then
+    echo "ROCm wheel artifact did not contain the test workspace"
+    return 1
+  fi
+
+  cp "${wheel_dir}"/*.whl "${context_dir}/wheels/" || return 1
+  tar -C "${wheel_dir}" --exclude='*.whl' -cf - . \
+    | tar -C "${workspace_dir}" -xf - || return 1
+  cat > "${context_dir}/Dockerfile" <<'EOF'
+ARG BASE_IMAGE
+FROM ${BASE_IMAGE}
+COPY wheels/ /tmp/vllm-wheels/
+COPY workspace/ /vllm-workspace/
+RUN python3 -m pip install --no-deps --force-reinstall /tmp/vllm-wheels/*.whl \
+    && rm -rf /tmp/vllm-wheels
+WORKDIR /vllm-workspace
+EOF
+
+  echo "--- Building local ROCm test image"
+  docker build \
+    --pull=false \
+    --build-arg "BASE_IMAGE=${base_image}" \
+    -t "${artifact_image}" \
+    "${context_dir}" || return 1
+  image_name="${artifact_image}"
+  return 0
+}
+
 is_multi_node() {
  local cmds="$1"
  # Primary signal: NUM_NODES environment variable set by the pipeline
@@ -243,22 +345,30 @@ report_docker_usage

 # --- Pull test image ---
 echo "--- Pulling container"
-image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}"
+image_name="${VLLM_CI_FALLBACK_IMAGE:-rocm/vllm-ci:${BUILDKITE_COMMIT:-local}}"
+artifact_work_dir=""
 container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)"
-docker pull "${image_name}"

 remove_docker_container() {
-  # docker run uses --rm, so the container is normally already gone when the
-  # EXIT trap runs. Cleanup is best-effort and must not affect the test result.
-  docker rm -f "${container_name}" >/dev/null 2>&1 || true
+  if docker container inspect "${container_name}" >/dev/null 2>&1; then
+    docker rm -f "${container_name}" || true
+  fi
+  if [[ "${VLLM_CI_REMOVE_TEST_IMAGE:-0}" == "1" ]]; then
+    docker image rm -f "${image_name}" || true
+  else
+    # Keep images by default so later jobs on the same AMD node can reuse layers.
+    echo "Keeping ROCm test image locally: ${image_name}"
+  fi
+  if [[ -n "${artifact_work_dir}" ]]; then
+    rm -rf "${artifact_work_dir}"
+  fi
 }
+trap remove_docker_container EXIT

-on_exit() {
-  local exit_code=$?
-  remove_docker_container
-  exit "$exit_code"
-}
-trap on_exit EXIT
+if ! prepare_artifact_image; then
+  echo "Using full ROCm CI image: ${image_name}"
+  docker pull "${image_name}" || exit 1
+fi

 # --- Prepare commands ---
 echo "--- Running container"
@@ -33,3 +33,10 @@ share/python-wheels/
 *.egg
 MANIFEST
 rust/target/
+# Not needed in Docker builds
+docs/
+.github/
+.pre-commit-config.yaml
+.clang-format
+.gitattributes
+format.sh
@@ -81,6 +81,14 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS)
  set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_SRCS ${SRCS})
  set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_BYPRODUCTS ${HIP_SRCS})

+  # Chain hipify targets so they run sequentially. Parallel hipify
+  # invocations race on shutil.copytree, overwriting .hip files
+  # produced by another target back to .cu originals.
+  if (DEFINED _VLLM_LAST_HIPIFY_TARGET)
+    add_dependencies(hipify${NAME} ${_VLLM_LAST_HIPIFY_TARGET})
+  endif()
+  set(_VLLM_LAST_HIPIFY_TARGET "hipify${NAME}" PARENT_SCOPE)
+
  # Swap out original extension sources with hipified sources.
  list(APPEND HIP_SRCS ${CXX_SRCS})
  set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE)
@@ -2,6 +2,7 @@
 ARG REMOTE_VLLM="0"
 ARG COMMON_WORKDIR=/app
 ARG BASE_IMAGE=rocm/vllm-dev:base
+ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base
 # NIC backend for MoRI RDMA support.
 # By default (all), drivers and userspace libraries for all supported NIC types
 # (ainic and bnxt) are installed; MoRI selects the appropriate one at runtime.
@@ -16,7 +17,8 @@ ARG NIC_BACKEND=all
 ARG AINIC_VERSION=1.117.3-hydra
 ARG UBUNTU_CODENAME=jammy

-# Sccache configuration (only used in release pipeline)
+# Sccache configuration. Release builds use this today; CI can opt in when a
+# shared S3-compatible cache backend is available.
 ARG USE_SCCACHE
 ARG SCCACHE_DOWNLOAD_URL
 ARG SCCACHE_ENDPOINT
@@ -29,12 +31,16 @@ FROM ${BASE_IMAGE} AS base
 ARG ARG_PYTORCH_ROCM_ARCH
 ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}}

-# Install some basic utilities
+# Install build dependencies and utilities
 RUN apt-get update -q -y && apt-get install -q -y \
    sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \
    apt-transport-https ca-certificates wget curl \
-    libnuma-dev
-RUN python3 -m pip install --upgrade pip
+    libnuma-dev ccache mold
+RUN --mount=type=cache,target=/root/.cache/pip \
+    python3 -m pip install --upgrade pip
+# Note: mold is installed but not set as the system default linker because
+# some packages use JIT compilation at runtime with flags mold does not support.
+# Build stages opt in via LDFLAGS="-fuse-ld=mold".
 # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base)
 ARG USE_SCCACHE
 RUN if [ "$USE_SCCACHE" != "1" ]; then \
@@ -55,6 +61,12 @@ ENV UV_HTTP_TIMEOUT=500
 ENV UV_INDEX_STRATEGY="unsafe-best-match"
 # Use copy mode to avoid hardlink failures with Docker cache mounts
 ENV UV_LINK_MODE=copy
+# ccache directory - persisted across layer rebuilds via cache mounts.
+ENV CCACHE_DIR=/root/.cache/ccache
+ENV CCACHE_COMPILERCHECK=content
+# Empty by default so build steps fall back to $(nproc); CI can override.
+ARG max_jobs
+ENV MAX_JOBS=${max_jobs}

 # Install sccache if USE_SCCACHE is enabled (for release builds)
 ARG USE_SCCACHE
@@ -86,6 +98,7 @@ RUN if [ "$USE_SCCACHE" = "1" ]; then \
 ARG USE_SCCACHE
 ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}}
 ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}}
+ENV SCCACHE_ENDPOINT=${USE_SCCACHE:+${SCCACHE_ENDPOINT}}
 ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}}
 ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0}

@@ -114,8 +127,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
 # -----------------------
 # Rust build stage
 # Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages
-# don't need the rust toolchain or protoc. Runs in parallel with the main wheel
-# build for faster end-to-end builds.
+# don't need the rust toolchain or protoc.
 FROM fetch_vllm AS rust-build
 ARG COMMON_WORKDIR

@@ -144,24 +156,74 @@ ENV RUSTUP_MAX_RETRIES=10
 # layer for later COPY --from=rust-build.
 RUN --mount=type=cache,id=vllm-rocm-cargo-registry,target=/root/.cargo/registry,sharing=locked \
    --mount=type=cache,id=vllm-rocm-cargo-git,target=/root/.cargo/git,sharing=locked \
+    --mount=type=cache,id=vllm-rocm-cargo-target,target=${COMMON_WORKDIR}/vllm/rust/target,sharing=locked \
    cd ${COMMON_WORKDIR}/vllm \
    && VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh \
    && test -x /tmp/vllm-rs

 # -----------------------
-# vLLM build stages
+# vLLM native build stages
+#
+# csrc-build intentionally copies only files that affect ROCm native extension
+# compilation. That keeps unrelated CI/test/docs edits from invalidating the
+# expensive HIP/C++ build layer.
+FROM base AS csrc-build
+ARG COMMON_WORKDIR
+WORKDIR ${COMMON_WORKDIR}/vllm
+
+COPY requirements/rocm.txt requirements/rocm.txt
+COPY requirements/common.txt requirements/common.txt
+RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \
+    uv pip install --system -r requirements/rocm.txt
+
+# pyproject.toml is bind-mounted in the RUN step so metadata-only changes do
+# not invalidate the expensive native build layer.
+COPY setup.py CMakeLists.txt ./
+COPY cmake cmake/
+COPY csrc csrc/
+COPY vllm/envs.py vllm/envs.py
+COPY vllm/__init__.py vllm/__init__.py
+
+ENV VLLM_TARGET_DEVICE=rocm
+ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+rocm.csrc.build"
+
+RUN --mount=type=bind,source=pyproject.toml,target=${COMMON_WORKDIR}/vllm/pyproject.toml \
+    --mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \
+    export CCACHE_BASEDIR="$PWD" \
+    && echo "=== ccache stats before ROCm native build ===" \
+    && (ccache --show-stats || true) \
+    && (ccache --zero-stats || true) \
+    && EFFECTIVE_MAX_JOBS="${MAX_JOBS:-$(nproc)}" \
+    && echo "Building ROCm native extension wheel with MAX_JOBS=${EFFECTIVE_MAX_JOBS}" \
+    && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${EFFECTIVE_MAX_JOBS}" python3 setup.py bdist_wheel --dist-dir=dist \
+    && test -d dist \
+    && ls dist/*.whl >/dev/null \
+    && echo "=== ccache stats after ROCm native build ===" \
+    && (ccache --show-stats || true)
+
+# Build the full vLLM ROCm wheel by reusing the native extension wheel from
+# csrc-build. This stage still rebuilds for Python/package changes, but skips
+# the expensive HIP/C++ compile when native inputs are unchanged.
 FROM fetch_vllm AS build_vllm
 ARG COMMON_WORKDIR
+ENV VLLM_TARGET_DEVICE=rocm
+
+COPY --from=csrc-build ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels

 # Drop the pre-built rust frontend binary into the source tree. setup.py
 # detects it and ships it as-is, skipping the local cargo build.
 COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs

-# Build vLLM (setup.py auto-detects sccache in PATH)
-RUN cd vllm \
-    && python3 -m pip install -r requirements/rocm.txt \
-    && python3 setup.py clean --all  \
-    && python3 setup.py bdist_wheel --dist-dir=dist
+RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \
+    cd vllm \
+    && uv pip install --system -r requirements/rocm.txt \
+    && export VLLM_USE_PRECOMPILED=1 \
+    && export VLLM_PRECOMPILED_WHEEL_LOCATION="$(ls /precompiled-wheels/*.whl)" \
+    && export VLLM_DOCKER_BUILD_CONTEXT=1 \
+    && echo "Packaging vLLM ROCm wheel using precompiled extensions from ${VLLM_PRECOMPILED_WHEEL_LOCATION}" \
+    && python3 setup.py bdist_wheel --dist-dir=dist \
+    && test -d dist \
+    && ls dist/*.whl >/dev/null
 FROM scratch AS export_vllm
 ARG COMMON_WORKDIR
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl /
@@ -171,6 +233,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml
 COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1

 # RIXL/UCX build stages
@@ -201,14 +264,17 @@ RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \
    ibverbs-providers \
    && rm -rf /var/lib/apt/lists/*

-RUN uv pip install --system meson auditwheel patchelf tomlkit
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system meson auditwheel patchelf tomlkit

-RUN cd /usr/local/src && \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    cd /usr/local/src && \
    git clone ${UCX_REPO} &&  \
    cd ucx  && \
    git checkout ${UCX_BRANCH} && \
    ./autogen.sh && \
    mkdir build && cd build && \
+    CC="ccache gcc" CXX="ccache g++" \
    ../configure \
        --prefix=/usr/local/ucx \
        --enable-shared \
@@ -220,20 +286,22 @@ RUN cd /usr/local/src && \
        --with-verbs \
        --with-dm \
        --enable-mt && \
-    make -j && \
+    make -j$(nproc) && \
    make install

 ENV PATH=/usr/local/ucx/bin:$PATH
 ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH}

-RUN git clone ${RIXL_REPO} /opt/rixl && \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    git clone ${RIXL_REPO} /opt/rixl && \
    cd /opt/rixl && \
    git checkout ${RIXL_BRANCH} && \
+    CC="ccache gcc" CXX="ccache g++" \
    meson setup build --prefix=${RIXL_HOME} \
                     -Ducx_path=${UCX_HOME} \
                     -Drocm_path=${ROCM_PATH} && \
    cd build && \
-    ninja && \
+    ninja -j$(nproc) && \
    ninja install

 # Generate RIXL wheel
@@ -250,30 +318,44 @@ RUN cd /opt/rixl && \
        --ucx-plugins-dir ${UCX_HOME}/lib/ucx \
        --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins

-# DeepEP build stage
-FROM base AS build_deep
+# ROCShmem build stage - split from DeepEP so changing DEEPEP_BRANCH does not
+# invalidate the slow ROCShmem build.
+FROM base AS build_rocshmem
 ARG ROCSHMEM_BRANCH="f0acb0c6"
 ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git"
-ARG DEEPEP_BRANCH="a9ea9774"
-ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
-ARG DEEPEP_NIC="cx7"
+# DeepEP only supports gfx942 and gfx950; build ROCShmem for the same set so
+# it can be linked against DeepEP without arch mismatches.
 ARG DEEPEP_ROCM_ARCH="gfx942;gfx950"
+ENV ROCM_PATH=/opt/rocm
 ENV ROCSHMEM_DIR=/opt/rocshmem

-RUN git clone ${ROCSHMEM_REPO} \
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    git clone --no-checkout --filter=blob:none ${ROCSHMEM_REPO} \
 && cd rocm-systems \
+ && git sparse-checkout set --cone projects/rocshmem \
 && git checkout ${ROCSHMEM_BRANCH} \
 && mkdir -p projects/rocshmem/build \
 && cd projects/rocshmem/build \
- && INSTALL_PREFIX=${ROCSHMEM_DIR} \
-    ../scripts/build_configs/all_backends -DUSE_EXTERNAL_MPI=OFF
+ && CC="ccache gcc" CXX="ccache g++" INSTALL_PREFIX=${ROCSHMEM_DIR} \
+    bash ../scripts/build_configs/all_backends \
+      -DROCM_PATH=${ROCM_PATH} \
+      -DGPU_TARGETS="${DEEPEP_ROCM_ARCH}" \
+      -DUSE_EXTERNAL_MPI=OFF

-# Build DeepEP wheel.
-# DeepEP looks for rocshmem at ROCSHMEM_DIR.
-RUN git clone ${DEEPEP_REPO} \
+# DeepEP build stage - depends on ROCShmem, builds the HIP kernel wheel.
+FROM build_rocshmem AS build_deepep
+ARG DEEPEP_BRANCH="a9ea9774"
+ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git"
+ARG DEEPEP_NIC="cx7"
+
+# Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR.
+# DeepEP only supports gfx942 and gfx950, so avoid gfx90a in the default list.
+RUN --mount=type=cache,target=/root/.cache/ccache \
+    export PYTORCH_ROCM_ARCH="gfx942;gfx950" \
+ && git clone ${DEEPEP_REPO} \
 && cd DeepEP \
 && git checkout ${DEEPEP_BRANCH} \
- && python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install
+ && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install

 # MoRI runtime dependencies live in Dockerfile.rocm so NIC backend changes do
 # not force users to rebuild the long-lived Dockerfile.rocm_base image.
@@ -372,8 +454,9 @@ RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \
 # Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt)
 # This ensures setuptools_scm sees clean repo state for version detection
 RUN --mount=type=bind,source=.git,target=vllm/.git \
+    --mount=type=cache,target=/root/.cache/uv \
    cd vllm \
-    && pip install setuptools_scm regex \
+    && uv pip install --system setuptools_scm regex \
    && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \
    && echo "Detected vLLM version: ${VLLM_VERSION}" \
    && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt
@@ -409,18 +492,20 @@ RUN echo "Pinning vLLM dependencies to custom wheel versions..." \
    && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt

 # Install dependencies using custom wheels from /install
-RUN cd vllm \
+RUN --mount=type=cache,target=/root/.cache/uv \
+    cd vllm \
    && echo "Building vLLM with custom wheels from /install" \
-    && python3 -m pip install --find-links /install -r requirements/rocm.txt \
-    && python3 setup.py clean --all
+    && uv pip install --system --find-links /install -r requirements/rocm.txt

 # Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt
-# (setup.py auto-detects sccache in PATH)
+# (setup.py auto-detects ccache/sccache in PATH)
 RUN --mount=type=bind,source=.git,target=vllm/.git \
+    --mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \
    cd vllm \
+    && export CCACHE_BASEDIR="$PWD" \
    && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \
    && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \
-    && python3 setup.py bdist_wheel --dist-dir=dist
+    && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist

 FROM scratch AS export_vllm_wheel_release
 ARG COMMON_WORKDIR
@@ -431,30 +516,17 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests
 COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples
 COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/
 COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite
+COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml
 COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1

 # -----------------------
-# Test vLLM image
-FROM mori_base AS test
+# CI base image (Tier 1) - stable, rarely changing CI dependencies.
+# Per-PR test builds pull this as CI_BASE_IMAGE so the test stage only layers
+# in the vLLM artifacts for the current commit.
+FROM mori_base AS ci_base
+ARG COMMON_WORKDIR

-RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/*
-
-# Install vLLM using uv (inherited from base stage)
-# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version
-RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
-    --mount=type=cache,target=/root/.cache/uv \
-    cd /install \
-    && uv pip install --system -r requirements/rocm.txt \
-    && uv pip install --system -r requirements/test/rocm.txt \
-    && pip uninstall -y vllm \
-    && uv pip install --system *.whl
-
-# Persist the built wheel in the image so python_only_compile_rocm.sh can
-# reinstall it after removing compilers. The bind-mounted /install contents
-# above are not available once that RUN step completes.
-COPY --from=export_vllm /*.whl /opt/vllm-wheels/
-
-# Update rdma-core to support latest rocshmem
+# Update rdma-core to support latest rocshmem.
 ARG DEEPEP_NIC
 RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \
    git clone --branch v62.0 --depth 1 https://github.com/linux-rdma/rdma-core.git /tmp/rdma-core && \
@@ -464,79 +536,98 @@ RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \
    ninja && ninja install && ldconfig && rm -rf /tmp/rdma-core; \
 fi

-# Install RIXL wheel
+# Install RIXL + DeepEP wheels.
 RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \
-    uv pip install --system /rixl_install/*.whl
+    --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \
+    uv pip install --system /rixl_install/*.whl /deep_install/*.whl

-# Install DeepEP wheel
-RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \
-    uv pip install --system /deep_install/*.whl
-COPY --from=build_deep /opt/rocshmem /opt/rocshmem
+# Copy ROCShmem runtime libraries.
+COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem

-# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries)
-RUN apt-get update -q -y && apt-get install -q -y \
+# RDMA userspace libraries plus FFmpeg dev libs needed by torchcodec.
+RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
    librdmacm1 \
    libibverbs1 \
    ibverbs-providers \
    ibverbs-utils \
+    pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \
+    libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \
    && rm -rf /var/lib/apt/lists/*

-WORKDIR /vllm-workspace
-ARG COMMON_WORKDIR
-COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
-
-# install development dependencies (for testing)
-RUN cd /vllm-workspace \
-    && python3 -m pip install -e tests/vllm_test_utils \
-    && python3 -m pip install pytest-shard
-
-# enable fast downloads from hf (for testing)
-ENV HF_XET_HIGH_PERFORMANCE=1
-
-# increase timeout for hf downloads (for testing)
-ENV HF_HUB_DOWNLOAD_TIMEOUT 60
-
-# install audio decode package `torchcodec` from source (required due to 
-# ROCm and torch version mismatch) for tests with datasets package
+# Install torchcodec from source for ROCm/torch ABI compatibility.
 COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh
-RUN bash /tmp/install_torchcodec.sh \
+RUN --mount=type=cache,target=/root/.cache/uv \
+    --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=cache,target=/root/.cache/torchcodec-wheels \
+    bash /tmp/install_torchcodec.sh \
    && rm /tmp/install_torchcodec.sh \
-    && apt-get clean \
-    && rm -rf /var/lib/apt/lists/*
+    && apt-get clean && rm -rf /var/lib/apt/lists/*

-# Copy in the v1 package (for python-only install test group)
-COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
+# Pre-install shared ROCm runtime dependencies.
+COPY requirements/common.txt requirements/rocm.txt /tmp/ci-base-requirements/
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /tmp/ci-base-requirements/rocm.txt \
+    && rm -rf /tmp/ci-base-requirements

-# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel
+# Enable fast and less brittle model downloads in tests.
+ENV HF_XET_HIGH_PERFORMANCE=1
+ENV HF_HUB_DOWNLOAD_TIMEOUT=60
+
+# Pre-install vLLM test dependencies.
+COPY requirements/test/rocm.txt /tmp/rocm-test-reqs.txt
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system -r /tmp/rocm-test-reqs.txt
+
+# Rebuild fastsafetensors from source so its C++ extension is compiled with
+# USE_ROCM and can detect libamdhip64.so at runtime.
+RUN --mount=type=cache,target=/root/.cache/pip \
+    FASTSAFETENSORS_REQ="$(grep -E '^fastsafetensors(==| @ )' /tmp/rocm-test-reqs.txt | head -1)" \
+    && test -n "${FASTSAFETENSORS_REQ}" \
+    && python3 -m pip install --force-reinstall --no-deps \
+        --no-binary fastsafetensors "${FASTSAFETENSORS_REQ}" \
+    && rm /tmp/rocm-test-reqs.txt
+
+# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel.
 # See: https://github.com/pytorch/pytorch/issues/169857
 ENV MIOPEN_DEBUG_CONV_DIRECT=0
 ENV MIOPEN_DEBUG_CONV_GEMM=0

-# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc
+# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc.
 # See: https://github.com/ROCm/rocm-libraries/issues/6266
 ENV HSA_ENABLE_IPC_MODE_LEGACY=1

-# Source code is used in the `python_only_compile.sh` test
-# We hide it inside `src/` so that this source code
-# will not be imported by other tests
-RUN mkdir src && mv vllm src/vllm
+# ROCm profiler limits workaround.
+RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf
+ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf"

-# This is a workaround to ensure pytest exits with the correct status code in CI tests.
-RUN printf '%s\n' \
-    'import os' \
-    '' \
-    '_exit_code = 1' \
-    '' \
-    'def pytest_sessionfinish(session, exitstatus):' \
-    '    global _exit_code' \
-    '    _exit_code = int(exitstatus)' \
-    '' \
-    'def pytest_unconfigure(config):' \
-    '    import sys' \
-    '    sys.stdout.flush()' \
-    '    sys.stderr.flush()' \
-    '    os._exit(_exit_code)' \
-    > /vllm-workspace/conftest.py
+# Install vllm_test_utils in ci_base for ci_base + wheel parity.
+COPY tests/vllm_test_utils /tmp/vllm_test_utils
+RUN --mount=type=cache,target=/root/.cache/uv \
+    uv pip install --system /tmp/vllm_test_utils \
+    && rm -rf /tmp/vllm_test_utils
+
+# -----------------------
+# Test vLLM image (Tier 2) - vLLM-only layer on top of ci_base.
+FROM ${CI_BASE_IMAGE} AS test
+ARG COMMON_WORKDIR
+
+# Install the vLLM wheel (--no-deps: all deps already in ci_base).
+RUN --mount=type=bind,from=export_vllm,src=/,target=/install \
+    --mount=type=cache,target=/root/.cache/uv \
+    cd /install \
+    && uv pip install --system --no-deps *.whl
+
+# Store the vLLM wheel in the image for python-only install tests.
+COPY --from=export_vllm /*.whl /opt/vllm-wheels/
+
+WORKDIR /vllm-workspace
+COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace
+
+# Copy in the v1 package (for python-only install test group).
+COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1
+
+# Hide source under src/ so it won't shadow the installed package in tests.
+RUN mkdir src && mv vllm src/vllm

 # -----------------------
 # Final vLLM image
@@ -553,6 +644,7 @@ RUN rm -f /usr/bin/sccache || true \
 # This prevents S3 bucket config from leaking into production images
 ENV SCCACHE_BUCKET=
 ENV SCCACHE_REGION=
+ENV SCCACHE_ENDPOINT=
 ENV SCCACHE_S3_NO_CREDENTIALS=
 ENV SCCACHE_IDLE_TIMEOUT=

@@ -0,0 +1,376 @@
+# ci-rocm.hcl - CI-specific configuration for vLLM ROCm Docker builds
+#
+# This file lives in the vLLM repo at docker/ci-rocm.hcl so ROCm Docker
+# build mechanics can evolve with Dockerfile.rocm and docker-bake-rocm.hcl.
+# Used with: docker buildx bake -f docker/docker-bake-rocm.hcl -f docker/ci-rocm.hcl test-rocm-ci
+#
+# Registry cache: BuildKit layer cache is stored exclusively on Docker Hub
+# (rocm/vllm-ci-cache). AMD build agents already have Docker Hub credentials
+# (they push the test image to rocm/vllm-ci), so no additional credential
+# setup is required. Independently of the layer cache, a compiler cache can
+# be enabled with USE_SCCACHE=1 when AMD provides a shared S3-compatible
+# cache endpoint.
+
+# CI metadata
+
+variable "BUILDKITE_COMMIT" {
+  default = ""
+}
+
+variable "BUILDKITE_BUILD_NUMBER" {
+  default = ""
+}
+
+variable "BUILDKITE_BUILD_ID" {
+  default = ""
+}
+
+variable "PARENT_COMMIT" {
+  default = ""
+}
+
+# Merge-base of HEAD with main - provides a more stable cache fallback than
+# parent commit for long-lived PRs. Mirrors the VLLM_MERGE_BASE_COMMIT
+# pattern used in the shared ci.hcl file. Auto-computed by ci-bake-rocm.sh
+# when unset.
+variable "VLLM_MERGE_BASE_COMMIT" {
+  default = ""
+}
+
+# Bridge to vLLM's COMMIT variable for OCI labels
+variable "COMMIT" {
+  default = BUILDKITE_COMMIT
+}
+
+# Image tags (set by CI)
+
+variable "IMAGE_TAG" {
+  default = ""
+}
+
+variable "IMAGE_TAG_LATEST" {
+  default = ""
+}
+
+# ROCm-specific GPU architecture targets
+
+variable "PYTORCH_ROCM_ARCH" {
+  default = "gfx90a;gfx942;gfx950"
+}
+
+# Pre-built CI base image (Tier 1). Per-PR builds pull this instead of
+# rebuilding RIXL/DeepEP/torchcodec from scratch. The ci_base stage in
+# Dockerfile.rocm inherits from base, so CI_BASE_IMAGE only affects the test
+# stage and is irrelevant when building --target ci_base itself.
+variable "CI_BASE_IMAGE" {
+  default = "rocm/vllm-dev:ci_base"
+}
+
+# Leave CI_MAX_JOBS empty so the Dockerfile falls back to $(nproc) and uses
+# the full builder parallelism. Operators can still override this per build.
+variable "CI_MAX_JOBS" {
+  default = ""
+}
+
+# Upstream dependency commit pins -- extracted from Dockerfile.rocm by
+# ci-bake-rocm.sh at build time. Empty defaults are safe: the cache
+# functions produce no entries when the variable is empty.
+variable "RIXL_BRANCH" {
+  default = ""
+}
+
+variable "UCX_BRANCH" {
+  default = ""
+}
+
+variable "ROCSHMEM_BRANCH" {
+  default = ""
+}
+
+variable "DEEPEP_BRANCH" {
+  default = ""
+}
+
+variable "RIXL_CACHE_KEY" {
+  default = ""
+}
+
+variable "ROCSHMEM_CACHE_KEY" {
+  default = ""
+}
+
+variable "DEEPEP_CACHE_KEY" {
+  default = ""
+}
+
+# Docker Hub registry cache for AMD builds.
+#
+# A separate repo (rocm/vllm-ci-cache) is used for BuildKit layer cache.
+# Final-image cache exports use mode=min to reduce the volume of data pushed.
+# Source-scoped csrc cache exports default to mode=max so fresh workers can
+# recover more of the native build graph when ROCm extension inputs change.
+# NOTE: mode=min still includes all layers referenced by the final image
+# manifest, including inherited base layers (~7.25GB ROCm runtime).
+# Docker Hub auto-creates the repo on first push.
+#
+# Final-image cache is exported per-commit and per-branch (rocm-branch-<tag>).
+# Cross-branch reuse for the test image comes from importing the parent commit,
+# merge-base commit, and upstream-branch cache refs.
+#
+# The source-scoped native cache is exported both per-commit and per-branch so
+# ROCm extension rebuilds are shareable within the same commit reruns and across
+# consecutive commits on the same branch without depending on a single global
+# latest tag.
+
+variable "DOCKERHUB_CACHE_REPO" {
+  default = "rocm/vllm-ci-cache"
+}
+
+variable "DOCKERHUB_CACHE_TO" {
+  default = ""
+}
+
+variable "ROCM_CACHE_BRANCH_TAG" {
+  default = ""
+}
+
+variable "ROCM_CACHE_UPSTREAM_BRANCH_TAG" {
+  default = ""
+}
+
+variable "ROCM_CSRC_CACHE_TO_MODE" {
+  default = "max"
+}
+
+variable "ROCM_FINAL_CACHE_TO_MODE" {
+  default = "min"
+}
+
+# Functions
+
+# Returns the list of registry cache refs to import for the final test image
+# and the wheel export. Takes no parameters; every entry is derived from the
+# variables declared above. An entry whose variable is empty evaluates to ""
+# and is dropped by compact(), so the result may be an empty list.
+# The list covers three ref families: final-image refs (rocm-<commit>),
+# source-scoped native build refs (csrc-rocm-*), and branch-scoped
+# final-image refs (rocm-branch-*).
+function "get_cache_from_rocm" {
+  params = []
+  result = compact([
+    # Exact commit hit - fastest cache on re-runs of the same commit
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT}" : "",
+    # Parent commit - useful cache for incremental changes
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${PARENT_COMMIT}" : "",
+    # Merge-base with main - stable fallback for long-lived or rebased PRs;
+    # maps to a real main-branch commit whose cache layers are likely warm
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    # Import the source-scoped native build cache as well so builds whose
+    # Python/package layers changed can still reuse compiled ROCm objects.
+    # These five entries are the same refs get_cache_from_rocm_csrc() returns.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "",
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "",
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "",
+    ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "",
+    # Branch-scoped full image cache - fallback when parent-commit cache is evicted
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "",
+    ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "",
+  ])
+}
+
+# Returns the registry cache refs to export for the final image: one ref
+# scoped to the commit and one scoped to the branch, both using the export
+# mode in ROCM_FINAL_CACHE_TO_MODE. Entries whose variable is empty are
+# dropped by compact(), so nothing is exported when neither is set.
+function "get_cache_to_rocm" {
+  params = []
+  result = compact([
+    # Commit-scoped cache for exact re-runs.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT},mode=${ROCM_FINAL_CACHE_TO_MODE}" : "",
+    # Branch-scoped cache so later commits on the same branch can reuse the full
+    # image layers when the parent-commit cache is evicted. Unlike the old
+    # rocm-latest tag (which caused duplicate exporter 400s), this is per-branch.
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${ROCM_FINAL_CACHE_TO_MODE}" : "",
+  ])
+}
+
+# Returns the registry cache refs to import for the source-scoped native build
+# (csrc-rocm-*). Entries whose variable is empty are dropped by compact().
+function "get_cache_from_rocm_csrc" {
+  params = []
+  result = compact([
+    # Commit-keyed refs: this commit, its parent, and the merge-base with main.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "",
+    PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "",
+    VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "",
+    # Branch-keyed refs: the current branch tag and the upstream branch tag.
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "",
+    ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "",
+  ])
+}
+
+# Returns the registry cache refs to export for the source-scoped native
+# build: one per commit and one per branch, both using the export mode in
+# ROCM_CSRC_CACHE_TO_MODE. Entries whose variable is empty are dropped by
+# compact().
+function "get_cache_to_rocm_csrc" {
+  params = []
+  result = compact([
+    # Export the exact-commit native cache for same-commit reruns.
+    BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT},mode=${ROCM_CSRC_CACHE_TO_MODE}" : "",
+    # Export the branch-scoped native cache so later commits on the same branch
+    # can reuse compiled ROCm objects even when the exact parent cache is absent.
+    ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${ROCM_CSRC_CACHE_TO_MODE}" : "",
+  ])
+}
+
+# Cache functions for upstream dependency stages (RIXL/UCX, ROCShmem, DeepEP).
+# These stages are pinned to specific upstream commit hashes, so cache keys use
+# those hashes rather than the Buildkite commit. This means the cache persists
+# across all vLLM commits as long as the upstream dependency pins don't change.
+
+# Returns the registry cache refs to import for the three dependency stages.
+# Each entry prefers the explicit *_CACHE_KEY variable and otherwise falls back
+# to a key built from the *_BRANCH pins; when both are empty the entry is ""
+# and compact() drops it.
+# NOTE(review): the fallback keys embed the *_BRANCH values directly in the
+# tag, so they presumably must be tag-safe (e.g. commit hashes) - confirm.
+function "get_cache_from_rocm_deps" {
+  params = []
+  result = compact([
+    # RIXL: fallback key combines the RIXL and UCX pins.
+    RIXL_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_CACHE_KEY}" : (RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH}" : ""),
+    # ROCShmem: fallback key is the ROCShmem pin alone.
+    ROCSHMEM_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY}" : (ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH}" : ""),
+    # DeepEP: fallback key combines the DeepEP and ROCShmem pins.
+    DEEPEP_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_CACHE_KEY}" : (DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH}" : ""),
+  ])
+}
+
+# Returns the cache export ref (mode=min) for the RIXL dependency cache. Uses
+# RIXL_CACHE_KEY when set, otherwise a key built from RIXL_BRANCH and
+# UCX_BRANCH; returns an empty list when neither is set.
+function "get_cache_to_rocm_rixl" {
+  params = []
+  result = compact([
+    RIXL_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_CACHE_KEY},mode=min" : (RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH},mode=min" : ""),
+  ])
+}
+
+# Returns the cache export ref (mode=min) for the ROCShmem dependency cache.
+# Uses ROCSHMEM_CACHE_KEY when set, otherwise ROCSHMEM_BRANCH; returns an empty
+# list when neither is set.
+function "get_cache_to_rocm_rocshmem" {
+  params = []
+  result = compact([
+    ROCSHMEM_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY},mode=min" : (ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH},mode=min" : ""),
+  ])
+}
+
+# Returns the cache export ref (mode=min) for the DeepEP dependency cache. Uses
+# DEEPEP_CACHE_KEY when set, otherwise a key built from DEEPEP_BRANCH and
+# ROCSHMEM_BRANCH; returns an empty list when neither is set.
+function "get_cache_to_rocm_deepep" {
+  params = []
+  result = compact([
+    DEEPEP_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_CACHE_KEY},mode=min" : (DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH},mode=min" : ""),
+  ])
+}
+
+# CI targets
+
+# Abstract CI mixin inherited by every CI target in this file. It attaches the
+# Buildkite build number and build id as manifest annotations and supplies the
+# CI values for three build args.
+# NOTE(review): CI targets list this after "_common-rocm" in `inherits`, so
+# these args are expected to take precedence over the same keys there - confirm.
+target "_ci-rocm" {
+  annotations = [
+    "manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}",
+    "manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}",
+  ]
+  args = {
+    ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH
+    CI_BASE_IMAGE         = CI_BASE_IMAGE
+    # CI_MAX_JOBS defaults to "", which is passed through as an empty max_jobs.
+    max_jobs              = CI_MAX_JOBS
+  }
+}
+
+# Builds the `test` stage and pushes it to the registry under IMAGE_TAG and
+# IMAGE_TAG_LATEST (empty tags are dropped by compact()). Imports and exports
+# the final-image registry cache refs.
+target "test-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm", "_labels"]
+  target     = "test"
+  cache-from = get_cache_from_rocm()
+  cache-to   = get_cache_to_rocm()
+  tags = compact([
+    IMAGE_TAG,
+    IMAGE_TAG_LATEST,
+  ])
+  output = ["type=registry"]
+}
+
+# Cache-only target for the source-scoped ROCm native build stage.
+# This persists the csrc-build stage in the registry cache even though the
+# final test image only consumes it indirectly while packaging the wheel.
+target "csrc-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "csrc-build"
+  cache-from = get_cache_from_rocm_csrc()
+  cache-to   = get_cache_to_rocm_csrc()
+  # No image or files are produced; only the cache-to export above is written.
+  output     = ["type=cacheonly"]
+}
+
+# Keep wheel export on the same CI graph as the test image build so the
+# shared build_vllm/export_vllm stages resolve identically within one bake
+# invocation. Without this, export-wheel-rocm uses the plain local target
+# args while test-rocm-ci uses CI-only args, which can lead to separate
+# cache lineages and inconsistent export_vllm results.
+# Shares its name with the export-wheel-rocm target in docker-bake-rocm.hcl;
+# this definition supplies the CI args and cache settings for it.
+# NOTE(review): cache-to reuses get_cache_to_rocm(), the same refs that
+# test-rocm-ci exports. In the test-rocm-ci-with-wheel group both targets run
+# in one bake invocation and write to identical refs - confirm this does not
+# reintroduce the duplicate-exporter failures noted for the old rocm-latest tag.
+target "export-wheel-rocm" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "export_vllm"
+  cache-from = get_cache_from_rocm()
+  cache-to   = get_cache_to_rocm()
+  # Writes the stage's filesystem to ./wheel-export on the build host.
+  output     = ["type=local,dest=./wheel-export"]
+}
+
+# Artifact-only vLLM build. GPU test jobs consume this artifact on top of
+# ci_base, avoiding a per-commit multi-GB image push/pull.
+group "test-rocm-ci-with-artifacts" {
+  targets = ["csrc-rocm-ci", "export-wheel-rocm"]
+}
+
+# Full test image + wheel export. Kept for fallback/debugging when a pushed
+# per-commit image is useful.
+group "test-rocm-ci-with-wheel" {
+  targets = ["csrc-rocm-ci", "test-rocm-ci", "export-wheel-rocm"]
+}
+
+# Image tags for the ci_base build. ci-bake-rocm.sh rewrites CI_BASE_IMAGE_TAG
+# to the primary tag for this build. Non-nightly builds use a commit-scoped tag
+# and also publish a content tag for reuse. NIGHTLY=1 builds on the stable branch
+# can additionally set CI_BASE_IMAGE_TAG_STABLE to refresh rocm/vllm-dev:ci_base.
+variable "CI_BASE_IMAGE_TAG" {
+  default = "rocm/vllm-dev:ci_base"
+}
+
+variable "CI_BASE_IMAGE_TAG_CONTENT" {
+  default = ""
+}
+
+variable "CI_BASE_IMAGE_TAG_STABLE" {
+  default = ""
+}
+
+# Cache-only targets for upstream dependency stages. These persist each stage
+# in the registry cache keyed by its upstream commit hash. When ci_base rebuilds
+# (e.g., requirements change), these stages are cache hits if their upstream
+# pins haven't changed -- saving ~35min of compilation.
+# Cache-only build of the build_rixl stage: imports all dependency cache refs
+# and exports only the RIXL ref.
+target "rixl-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "build_rixl"
+  cache-from = get_cache_from_rocm_deps()
+  cache-to   = get_cache_to_rocm_rixl()
+  output     = ["type=cacheonly"]
+}
+
+# Cache-only build of the build_rocshmem stage: imports all dependency cache
+# refs and exports only the ROCShmem ref.
+target "rocshmem-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "build_rocshmem"
+  cache-from = get_cache_from_rocm_deps()
+  cache-to   = get_cache_to_rocm_rocshmem()
+  output     = ["type=cacheonly"]
+}
+
+# Cache-only build of the build_deepep stage: imports all dependency cache
+# refs and exports only the DeepEP ref.
+target "deepep-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm"]
+  target     = "build_deepep"
+  cache-from = get_cache_from_rocm_deps()
+  cache-to   = get_cache_to_rocm_deepep()
+  output     = ["type=cacheonly"]
+}
+
+# Builds only the ci_base stage (RIXL, DeepEP, torchcodec, etc.)
+# Invoked by the ensure-ci-base step when the content hash of ci_base-affecting
+# files drifts from the remote image label. Per-PR builds then pull the result
+# as CI_BASE_IMAGE instead of rebuilding those slow layers on every commit.
+# Uses inline cache metadata on the ci_base image itself instead of exporting a
+# separate registry cache artifact.
+target "ci-base-rocm-ci" {
+  inherits   = ["_common-rocm", "_ci-rocm", "_labels"]
+  target     = "ci_base"
+  # Stamp the pushed image with the content hash of ci_base-affecting files,
+  # using the same label key as the local ci-base-rocm target. The
+  # ensure-ci-base step compares against this label on the remote image, so
+  # an image pushed without it can never be recognized as up to date.
+  # CI_BASE_CONTENT_HASH is declared in docker-bake-rocm.hcl. These labels are
+  # added on top of the ones inherited from "_labels".
+  labels = {
+    "vllm.ci_base.content_hash" = CI_BASE_CONTENT_HASH
+  }
+  cache-from = concat(
+    # Inline cache embedded in previously pushed ci_base images. Unset tags
+    # evaluate to "" and are dropped by compact().
+    compact([
+      CI_BASE_IMAGE_TAG != "" ? "type=registry,ref=${CI_BASE_IMAGE_TAG}" : "",
+      CI_BASE_IMAGE_TAG_CONTENT != "" ? "type=registry,ref=${CI_BASE_IMAGE_TAG_CONTENT}" : "",
+      CI_BASE_IMAGE_TAG_STABLE != "" ? "type=registry,ref=${CI_BASE_IMAGE_TAG_STABLE}" : "",
+    ]),
+    # Import upstream dependency caches so RIXL/ROCShmem/DeepEP stages
+    # are cache hits even when ci_base itself needs rebuilding.
+    get_cache_from_rocm_deps(),
+  )
+  # Cache metadata travels inside the pushed image instead of a separate ref.
+  cache-to = ["type=inline"]
+  # The image is pushed under every non-empty tag.
+  tags     = compact([CI_BASE_IMAGE_TAG, CI_BASE_IMAGE_TAG_CONTENT, CI_BASE_IMAGE_TAG_STABLE])
+  output   = ["type=registry"]
+}
+
+# Group for ci_base builds -- exports dependency stage caches alongside the
+# ci_base image so future rebuilds can reuse them independently.
+group "ci-base-rocm-ci-with-deps" {
+  targets = ["rixl-rocm-ci", "rocshmem-rocm-ci", "deepep-rocm-ci", "ci-base-rocm-ci"]
+}
@@ -0,0 +1,143 @@
+# docker-bake-rocm.hcl - vLLM ROCm Docker build configuration
+#
+# This file lives in the vLLM repo at docker/docker-bake-rocm.hcl
+# Equivalent of docker-bake.hcl for ROCm builds.
+#
+# Usage:
+#   docker buildx bake -f docker/docker-bake-rocm.hcl              # Build test (default)
+#   docker buildx bake -f docker/docker-bake-rocm.hcl final-rocm   # Build final image
+#   docker buildx bake -f docker/docker-bake-rocm.hcl --print      # Show resolved config
+#
+# CI usage (with the vLLM-owned CI overlay):
+#   docker buildx bake -f docker/docker-bake-rocm.hcl -f docker/ci-rocm.hcl test-rocm-ci
+
+variable "MAX_JOBS" {
+  # Empty string lets the Dockerfile fall back to $(nproc) via
+  # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all
+  # available cores on whatever machine the build runs on.
+  # Override with --set '*.args.max_jobs=8' for local builds on small machines.
+  default = ""
+}
+
+variable "PYTORCH_ROCM_ARCH" {
+  default = "gfx90a;gfx942;gfx950"
+}
+
+variable "COMMIT" {
+  default = ""
+}
+
+# Content hash of ci_base-affecting files. Computed by ci-bake-rocm.sh and
+# embedded as a label so future builds can compare without rebuilding.
+variable "CI_BASE_CONTENT_HASH" {
+  default = ""
+}
+
+# REMOTE_VLLM=0: use local source via Docker build context (ONBUILD COPY ./ vllm/)
+# REMOTE_VLLM=1: clone from GitHub at VLLM_BRANCH (standalone builds without local source)
+variable "REMOTE_VLLM" {
+  default = "0"
+}
+
+variable "VLLM_BRANCH" {
+  default = "main"
+}
+
+# CI_BASE_IMAGE: pre-built ci_base image for per-PR test builds.
+# Defaults to the published "rocm/vllm-dev:ci_base" image. Override it (for
+# example via the CI_BASE_IMAGE environment variable) to use a different base.
+variable "CI_BASE_IMAGE" {
+  default = "rocm/vllm-dev:ci_base"
+}
+
+# Upstream dependency commit pins. Plain local bake builds use the Dockerfile
+# ARG defaults. ci-bake-rocm.sh resolves those defaults (plus any env
+# overrides) and writes a small HCL override before invoking CI targets.
+variable "RIXL_BRANCH" {
+  default = ""
+}
+
+variable "UCX_BRANCH" {
+  default = ""
+}
+
+variable "ROCSHMEM_BRANCH" {
+  default = ""
+}
+
+variable "DEEPEP_BRANCH" {
+  default = ""
+}
+
+group "default" {
+  targets = ["test-rocm"]
+}
+
+# Abstract base inherited by every ROCm target: selects the Dockerfile, uses
+# the current directory as build context, and forwards the variables above as
+# build args.
+target "_common-rocm" {
+  dockerfile = "docker/Dockerfile.rocm"
+  context    = "."
+  args = {
+    # MAX_JOBS defaults to "", which is passed through as an empty max_jobs.
+    max_jobs                        = MAX_JOBS
+    ARG_PYTORCH_ROCM_ARCH           = PYTORCH_ROCM_ARCH
+    REMOTE_VLLM                     = REMOTE_VLLM
+    VLLM_BRANCH                     = VLLM_BRANCH
+    CI_BASE_IMAGE                   = CI_BASE_IMAGE
+  }
+}
+
+# Abstract mixin that adds OCI image labels. The revision is taken from COMMIT
+# and is written both as an image label and as a manifest annotation; it is
+# empty when COMMIT is unset.
+target "_labels" {
+  labels = {
+    "org.opencontainers.image.source"      = "https://github.com/vllm-project/vllm"
+    "org.opencontainers.image.vendor"      = "vLLM"
+    "org.opencontainers.image.title"       = "vLLM ROCm"
+    "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs (ROCm)"
+    "org.opencontainers.image.licenses"    = "Apache-2.0"
+    "org.opencontainers.image.revision"    = COMMIT
+  }
+  annotations = [
+    "manifest:org.opencontainers.image.revision=${COMMIT}",
+  ]
+}
+
+# Default local target: builds the `test` stage, tags it rocm/vllm:test, and
+# loads it into the local Docker image store (nothing is pushed).
+# The test stage starts FROM ${CI_BASE_IMAGE}, so with the default value this
+# build is based on rocm/vllm-dev:ci_base.
+target "test-rocm" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "test"
+  tags     = ["rocm/vllm:test"]
+  output   = ["type=docker"]
+}
+
+# CI base image target - builds only the ci_base stage (RIXL, DeepEP,
+# torchcodec, requirements, etc.). Used by the weekly scheduled build and
+# the auto-rebuild trigger when requirements change in a PR.
+target "ci-base-rocm" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "ci_base"
+  # Records the content hash of ci_base-affecting files on the image; the
+  # value is empty unless CI_BASE_CONTENT_HASH is provided.
+  labels   = {
+    "vllm.ci_base.content_hash" = CI_BASE_CONTENT_HASH
+  }
+  tags     = ["rocm/vllm-dev:ci_base"]
+  # Loads the image into the local Docker image store; this target does not push.
+  output   = ["type=docker"]
+}
+
+# Wheel export target - extracts the built vLLM wheel + test workspace
+# to local disk. Used by CI to upload the wheel as a Buildkite artifact
+# so test jobs can assemble images locally from ci_base + wheel instead
+# of pulling the full large image from Docker Hub.
+#
+# Usage:
+#   docker buildx bake -f docker/docker-bake-rocm.hcl export-wheel-rocm
+#   # Creates ./wheel-export/*.whl, ./wheel-export/requirements/, etc.
+#
+# After a full bake build, BuildKit cache makes this nearly instant.
+target "export-wheel-rocm" {
+  inherits = ["_common-rocm"]
+  target   = "export_vllm"
+  # Writes the export_vllm stage's filesystem to ./wheel-export instead of
+  # producing an image.
+  output   = ["type=local,dest=./wheel-export"]
+}
+
+# Builds the `final` stage, tags it rocm/vllm:latest, and loads it into the
+# local Docker image store (nothing is pushed).
+target "final-rocm" {
+  inherits = ["_common-rocm", "_labels"]
+  target   = "final"
+  tags     = ["rocm/vllm:latest"]
+  output   = ["type=docker"]
+}
@@ -3,12 +3,16 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 # Script to install TorchCodec from source (required for ROCm compatibility)
+# The PyPI wheel is built against upstream PyTorch and has ABI mismatches with
+# ROCm's custom torch build, so we must compile from source.

 set -e

 TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}"
 # Pin to a specific release for reproducibility; update as needed.
 TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}"
+# Cache directory for pre-built wheels to avoid redundant recompilation.
+TORCHCODEC_WHEEL_CACHE="${TORCHCODEC_WHEEL_CACHE:-/root/.cache/torchcodec-wheels}"

 echo "=== TorchCodec Installation Script ==="

@@ -18,9 +22,26 @@ if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then
    exit 0
 fi

+# Try to install from a cached wheel first.
+ARCH_TAG="${PYTORCH_ROCM_ARCH:-all}"
+# Normalize arch tag (replace ; with _) for use in the cache key
+ARCH_TAG="${ARCH_TAG//;/_}"
+# pip only accepts wheels whose filename follows the standard
+# name-version-pytag-abitag-platform.whl scheme, so a wheel stored under a
+# custom name such as torchcodec-<branch>-<arch>.whl is rejected and the cache
+# never hits. The cache entry is therefore a directory keyed by branch + arch
+# that holds the wheel under the filename the build produced.
+CACHED_WHEEL="${TORCHCODEC_WHEEL_CACHE}/torchcodec-${TORCHCODEC_BRANCH}-${ARCH_TAG}"
+# Create the entry now so that copying a freshly built wheel to
+# "$CACHED_WHEEL" drops it inside the directory with its name intact.
+# Best-effort: failing to create it must not abort the script at this point.
+mkdir -p "$CACHED_WHEEL" 2>/dev/null || true
+
+CACHED_WHEEL_FILE=""
+for candidate in "$CACHED_WHEEL"/torchcodec-*.whl; do
+    # An unmatched glob stays literal, so confirm the candidate really exists.
+    if [ -f "$candidate" ]; then
+        CACHED_WHEEL_FILE="$candidate"
+        break
+    fi
+done
+
+if [ -n "$CACHED_WHEEL_FILE" ]; then
+    echo "Found cached wheel: $CACHED_WHEEL_FILE"
+    # Tested in an `if` so a failed install falls through to the source build
+    # instead of tripping `set -e`.
+    if pip install "$CACHED_WHEEL_FILE"; then
+        echo "Installed from cached wheel."
+        echo "=== TorchCodec installation complete ==="
+        exit 0
+    fi
+    echo "Cached wheel installation failed, rebuilding from source..."
+fi
+
 echo "TorchCodec not found. Installing from source..."

-# Install system dependencies (FFmpeg + pkg-config)
+# Install system dependencies (FFmpeg + pkg-config) if not already present.
+# The Docker test image pre-installs these, so this is a fallback for other envs.
 install_system_deps() {
    if command -v apt-get &> /dev/null; then
        echo "Installing system dependencies..."
@@ -56,6 +77,12 @@ export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir(
 export CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}"
 echo "pybind11_DIR set to: $pybind11_DIR"

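+# When PYTORCH_ROCM_ARCH is set, the build is expected to target only those GPU
+# architectures (building for every supported arch is very slow). The variable
+# is inherited from the environment; this block only logs its value.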
+if [ -n "$PYTORCH_ROCM_ARCH" ]; then
+    echo "Building for PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH"
+fi
+
 # Create temp directory for build
 BUILD_DIR=$(mktemp -d -t torchcodec-XXXXXX)
 echo "Building in temporary directory: $BUILD_DIR"
@@ -77,9 +104,31 @@ cd torchcodec
 export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"
 export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1
 export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1
+# Use ninja for faster builds and parallelize compilation
+export CMAKE_GENERATOR=Ninja
+export MAX_JOBS="${MAX_JOBS:-$(nproc)}"
+# MAX_JOBS is not a variable CMake reads. `cmake --build` takes its default
+# job count from CMAKE_BUILD_PARALLEL_LEVEL, so mirror the limit there unless
+# the caller already set it explicitly.
+export CMAKE_BUILD_PARALLEL_LEVEL="${CMAKE_BUILD_PARALLEL_LEVEL:-$MAX_JOBS}"
+# Use ccache if available to speed up recompilation
+if command -v ccache &> /dev/null; then
+    export CMAKE_C_COMPILER_LAUNCHER=ccache
+    export CMAKE_CXX_COMPILER_LAUNCHER=ccache
+fi

-echo "Building TorchCodec..."
-pip install . --no-build-isolation
+echo "Building TorchCodec (MAX_JOBS=$MAX_JOBS)..."
+# Build a wheel instead of installing directly so the artifact can be cached.
+pip wheel . --no-build-isolation --no-deps -w "$BUILD_DIR/dist"
+
+# Locate the built wheel with a glob rather than parsing `ls` output.
+BUILT_WHEEL=""
+for candidate in "$BUILD_DIR/dist"/torchcodec-*.whl; do
+    # An unmatched glob stays literal, so confirm the candidate really exists.
+    if [ -f "$candidate" ]; then
+        BUILT_WHEEL="$candidate"
+        break
+    fi
+done
+if [ -z "$BUILT_WHEEL" ]; then
+    echo "Error: No wheel produced" >&2
+    exit 1
+fi
+
+pip install "$BUILT_WHEEL"
+
+# Cache the wheel for future runs. Caching is only an optimization, so a
+# failure here is reported but does not abort an otherwise successful install.
+if mkdir -p "$TORCHCODEC_WHEEL_CACHE" && cp "$BUILT_WHEEL" "$CACHED_WHEEL"; then
+    echo "Cached wheel to: $CACHED_WHEEL"
+else
+    echo "Warning: could not cache wheel at $CACHED_WHEEL" >&2
+fi

 # Verify installation
 echo "Verifying installation..."