mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[ROCm][CI] Stabilize Cargo cache and pre-test image checks (#43815)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -17,6 +17,26 @@ steps:
|
||||
--target test
|
||||
--no-cache
|
||||
--progress plain .
|
||||
- |
|
||||
docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec '
|
||||
if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi
|
||||
if [ ! -d /vllm-workspace/tests ]; then echo Missing directory: /vllm-workspace/tests >&2; exit 1; fi
|
||||
if [ ! -d /vllm-workspace/src/vllm ]; then echo Missing directory: /vllm-workspace/src/vllm >&2; exit 1; fi
|
||||
if [ ! -x /vllm-workspace/src/vllm/vllm-rs ]; then echo Missing executable: /vllm-workspace/src/vllm/vllm-rs >&2; exit 1; fi
|
||||
command -v python3
|
||||
command -v uv
|
||||
command -v pytest
|
||||
if ! command -v amd-smi >/dev/null 2>&1 && ! command -v rocminfo >/dev/null 2>&1; then
|
||||
echo No ROCm CLI found in image >&2
|
||||
exit 1
|
||||
fi
|
||||
python3 - <<PY
|
||||
import torch, vllm
|
||||
print(torch.__version__)
|
||||
print(vllm.__version__)
|
||||
PY
|
||||
echo AMD image smoke OK
|
||||
'
|
||||
- docker push "rocm/vllm-ci:${BUILDKITE_COMMIT}"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
|
||||
@@ -35,25 +35,9 @@ export PYTHONPATH=".."
|
||||
# Helper Functions
|
||||
###############################################################################
|
||||
|
||||
cleanup_docker() {
|
||||
# Get Docker's root directory
|
||||
docker_root=$(docker info -f '{{.DockerRootDir}}')
|
||||
if [ -z "$docker_root" ]; then
|
||||
echo "Failed to determine Docker root directory."
|
||||
exit 1
|
||||
fi
|
||||
echo "Docker root directory: $docker_root"
|
||||
|
||||
disk_usage=$(df "$docker_root" | tail -1 | awk '{print $5}' | sed 's/%//')
|
||||
threshold=70
|
||||
if [ "$disk_usage" -gt "$threshold" ]; then
|
||||
echo "Disk usage is above $threshold%. Cleaning up Docker images and volumes..."
|
||||
docker image prune -f
|
||||
docker volume prune -f && docker system prune --force --filter "until=72h" --all
|
||||
echo "Docker images and volumes cleanup completed."
|
||||
else
|
||||
echo "Disk usage is below $threshold%. No cleanup needed."
|
||||
fi
|
||||
report_docker_usage() {
|
||||
echo "--- Docker usage"
|
||||
docker system df || true
|
||||
}
|
||||
|
||||
cleanup_network() {
|
||||
@@ -254,8 +238,8 @@ re_quote_pytest_markers() {
|
||||
echo "--- ROCm info"
|
||||
rocminfo
|
||||
|
||||
# --- Docker housekeeping ---
|
||||
cleanup_docker
|
||||
# --- Docker status ---
|
||||
report_docker_usage
|
||||
|
||||
# --- Pull test image ---
|
||||
echo "--- Pulling container"
|
||||
@@ -264,9 +248,17 @@ container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | hea
|
||||
docker pull "${image_name}"
|
||||
|
||||
remove_docker_container() {
|
||||
docker rm -f "${container_name}" || docker image rm -f "${image_name}" || true
|
||||
# docker run uses --rm, so the container is normally already gone when the
|
||||
# EXIT trap runs. Cleanup is best-effort and must not affect the test result.
|
||||
docker rm -f "${container_name}" >/dev/null 2>&1 || true
|
||||
}
|
||||
trap remove_docker_container EXIT
|
||||
|
||||
on_exit() {
|
||||
local exit_code=$?
|
||||
remove_docker_container
|
||||
exit "$exit_code"
|
||||
}
|
||||
trap on_exit EXIT
|
||||
|
||||
# --- Prepare commands ---
|
||||
echo "--- Running container"
|
||||
|
||||
Reference in New Issue
Block a user