diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 0c514647dc2..1351eba92f2 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -17,6 +17,26 @@ steps: --target test --no-cache --progress plain . + - | + docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec ' + if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi + if [ ! -d /vllm-workspace/tests ]; then echo Missing directory: /vllm-workspace/tests >&2; exit 1; fi + if [ ! -d /vllm-workspace/src/vllm ]; then echo Missing directory: /vllm-workspace/src/vllm >&2; exit 1; fi + if [ ! -x /vllm-workspace/src/vllm/vllm-rs ]; then echo Missing executable: /vllm-workspace/src/vllm/vllm-rs >&2; exit 1; fi + command -v python3 + command -v uv + command -v pytest + if ! command -v amd-smi >/dev/null 2>&1 && ! command -v rocminfo >/dev/null 2>&1; then + echo No ROCm CLI found in image >&2 + exit 1 + fi + python3 - </dev/null 2>&1 || true } -trap remove_docker_container EXIT + +on_exit() { + local exit_code=$? + remove_docker_container + exit "$exit_code" +} +trap on_exit EXIT # --- Prepare commands --- echo "--- Running container" diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index b2342200a68..61d73cd1527 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -135,13 +135,18 @@ ENV PATH="/root/.cargo/bin:${PATH}" # Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit # (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise). ENV CARGO_BUILD_JOBS=4 +ENV CARGO_NET_RETRY=10 +ENV RUSTUP_MAX_RETRIES=10 -# Build the release binary. Cache cargo registry/git, and copy the binary out -# so it persists into the image layer for later COPY --from=rust-build. -RUN --mount=type=cache,target=/root/.cargo/registry \ - --mount=type=cache,target=/root/.cargo/git \ +# Build the release binary. Cargo's registry/git caches can be written by +# concurrent BuildKit jobs on shared workers, so lock those cache mounts while +# keeping the cache benefit. Copy the binary out so it persists into the image +# layer for later COPY --from=rust-build. +RUN --mount=type=cache,id=vllm-rocm-cargo-registry,target=/root/.cargo/registry,sharing=locked \ + --mount=type=cache,id=vllm-rocm-cargo-git,target=/root/.cargo/git,sharing=locked \ cd ${COMMON_WORKDIR}/vllm \ - && VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh + && VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh \ + && test -x /tmp/vllm-rs # ----------------------- # vLLM build stages