[ROCm][CI] Stabilize Cargo cache and pre-test image checks (#43815)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-05-27 22:24:44 -05:00
committed by GitHub
parent 413ac5c070
commit 33e94fc3ad
3 changed files with 45 additions and 28 deletions
+20
View File
@@ -17,6 +17,26 @@ steps:
--target test
--no-cache
--progress plain .
- |
docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec '
if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi
if [ ! -d /vllm-workspace/tests ]; then echo Missing directory: /vllm-workspace/tests >&2; exit 1; fi
if [ ! -d /vllm-workspace/src/vllm ]; then echo Missing directory: /vllm-workspace/src/vllm >&2; exit 1; fi
if [ ! -x /vllm-workspace/src/vllm/vllm-rs ]; then echo Missing executable: /vllm-workspace/src/vllm/vllm-rs >&2; exit 1; fi
command -v python3
command -v uv
command -v pytest
if ! command -v amd-smi >/dev/null 2>&1 && ! command -v rocminfo >/dev/null 2>&1; then
echo No ROCm CLI found in image >&2
exit 1
fi
python3 - <<PY
import torch, vllm
print(torch.__version__)
print(vllm.__version__)
PY
echo AMD image smoke OK
'
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
env:
DOCKER_BUILDKIT: "1"
+15 -23
View File
@@ -35,25 +35,9 @@ export PYTHONPATH=".."
# Helper Functions
###############################################################################
cleanup_docker() {
# Get Docker's root directory
docker_root=$(docker info -f '{{.DockerRootDir}}')
if [ -z "$docker_root" ]; then
echo "Failed to determine Docker root directory."
exit 1
fi
echo "Docker root directory: $docker_root"
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
threshold=70
if [ "$disk_usage" -gt "$threshold" ]; then
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
docker image prune -f
docker volume prune -f && docker system prune --force --filter "until=72h" --all
echo "Docker images and volumes cleanup completed."
else
echo "Disk usage is below $threshold%. No cleanup needed."
fi
report_docker_usage() {
echo "--- Docker usage"
docker system df || true
}
cleanup_network() {
@@ -254,8 +238,8 @@ re_quote_pytest_markers() {
echo "--- ROCm info"
rocminfo
# --- Docker housekeeping ---
cleanup_docker
# --- Docker status ---
report_docker_usage
# --- Pull test image ---
echo "--- Pulling container"
@@ -264,9 +248,17 @@ container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | hea
docker pull "${image_name}"
remove_docker_container() {
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
# docker run uses --rm, so the container is normally already gone when the
# EXIT trap runs. Cleanup is best-effort and must not affect the test result.
docker rm -f "${container_name}" >/dev/null 2>&1 || true
}
trap remove_docker_container EXIT
on_exit() {
local exit_code=$?
remove_docker_container
exit "$exit_code"
}
trap on_exit EXIT
# --- Prepare commands ---
echo "--- Running container"
+10 -5
View File
@@ -135,13 +135,18 @@ ENV PATH="/root/.cargo/bin:${PATH}"
# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
ENV CARGO_NET_RETRY=10
ENV RUSTUP_MAX_RETRIES=10
# Build the release binary. Cache cargo registry/git, and copy the binary out
# so it persists into the image layer for later COPY --from=rust-build.
RUN --mount=type=cache,target=/root/.cargo/registry \
--mount=type=cache,target=/root/.cargo/git \
# Build the release binary. Cargo's registry/git caches can be written by
# concurrent BuildKit jobs on shared workers, so lock those cache mounts while
# keeping the cache benefit. Copy the binary out so it persists into the image
# layer for later COPY --from=rust-build.
RUN --mount=type=cache,id=vllm-rocm-cargo-registry,target=/root/.cargo/registry,sharing=locked \
--mount=type=cache,id=vllm-rocm-cargo-git,target=/root/.cargo/git,sharing=locked \
cd ${COMMON_WORKDIR}/vllm \
&& VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh
&& VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh \
&& test -x /tmp/vllm-rs
# -----------------------
# vLLM build stages