diff --git a/.buildkite/ci_config_rocm.yaml b/.buildkite/ci_config_rocm.yaml new file mode 100644 index 00000000000..23f32340071 --- /dev/null +++ b/.buildkite/ci_config_rocm.yaml @@ -0,0 +1,23 @@ +name: vllm_rocm_ci +job_dirs: + - ".buildkite/hardware_tests" +run_all_patterns: + - "docker/Dockerfile.rocm" + - "docker/Dockerfile.rocm_base" + - "docker/ci-rocm.hcl" + - "docker/docker-bake-rocm.hcl" + - ".buildkite/hardware_tests/amd.yaml" + - ".buildkite/scripts/ci-bake-rocm.sh" + - ".buildkite/scripts/hardware_ci/run-amd-test.py" + - ".buildkite/scripts/hardware_ci/run-amd-test.sh" + - "CMakeLists.txt" + - "requirements/common.txt" + - "requirements/rocm.txt" + - "requirements/build/rocm.txt" + - "requirements/test/rocm.txt" + - "setup.py" + - "csrc/" + - "cmake/" +run_all_exclude_patterns: + - "csrc/cpu/" + - "cmake/cpu_extension.cmake" diff --git a/.buildkite/hardware_tests/amd.yaml b/.buildkite/hardware_tests/amd.yaml index 1351eba92f2..c2510f38aab 100644 --- a/.buildkite/hardware_tests/amd.yaml +++ b/.buildkite/hardware_tests/amd.yaml @@ -1,42 +1,73 @@ -group: Hardware - AMD Build +group: Hardware - AMD Build steps: - - label: "AMD: :docker: build image" - key: image-build-amd + # Ensure ci_base is up-to-date before building the test image. + # Compares a content hash of ci_base-affecting files against the remote + # image label. If hashes match the build is skipped (< 30 s); if they + # differ ci_base is rebuilt and pushed automatically. + - label: "AMD: :docker: ensure ci_base" + key: ensure-ci-base-amd depends_on: [] device: amd_cpu no_plugin: true commands: - - > - docker build - --build-arg max_jobs=16 - --build-arg REMOTE_VLLM=1 - --build-arg ARG_PYTORCH_ROCM_ARCH='gfx90a;gfx942;gfx950' - --build-arg VLLM_BRANCH=$BUILDKITE_COMMIT - --tag "rocm/vllm-ci:${BUILDKITE_COMMIT}" - -f docker/Dockerfile.rocm - --target test - --no-cache - --progress plain . 
- - | - docker run --rm --network=none --entrypoint /bin/bash "rocm/vllm-ci:${BUILDKITE_COMMIT}" -ec ' - if [ ! -d /vllm-workspace ]; then echo Missing directory: /vllm-workspace >&2; exit 1; fi - if [ ! -d /vllm-workspace/tests ]; then echo Missing directory: /vllm-workspace/tests >&2; exit 1; fi - if [ ! -d /vllm-workspace/src/vllm ]; then echo Missing directory: /vllm-workspace/src/vllm >&2; exit 1; fi - if [ ! -x /vllm-workspace/src/vllm/vllm-rs ]; then echo Missing executable: /vllm-workspace/src/vllm/vllm-rs >&2; exit 1; fi - command -v python3 - command -v uv - command -v pytest - if ! command -v amd-smi >/dev/null 2>&1 && ! command -v rocminfo >/dev/null 2>&1; then - echo No ROCm CLI found in image >&2 - exit 1 - fi - python3 - <&2; exit 1; fi + if [ ! -d /vllm-workspace/tests ]; then echo Missing directory: /vllm-workspace/tests >&2; exit 1; fi + if [ ! -d /vllm-workspace/src/vllm ]; then echo Missing directory: /vllm-workspace/src/vllm >&2; exit 1; fi + if [ ! -x /vllm-workspace/src/vllm/vllm-rs ]; then echo Missing executable: /vllm-workspace/src/vllm/vllm-rs >&2; exit 1; fi + command -v python3 + command -v uv + command -v pytest + if ! command -v amd-smi >/dev/null 2>&1 && ! 
command -v rocminfo >/dev/null 2>&1; then + echo No ROCm CLI found in image >&2 + exit 1 + fi + python3 - </dev/null 2>&1; then + timeout "${timeout_secs}s" git fetch "$@" 2>/dev/null + else + git fetch "$@" 2>/dev/null + fi +} + +hash_string_short() { + printf '%s' "$1" | sha256sum | cut -c1-16 +} + +compute_content_hash() { + local path + local file + + for path in "$@"; do + if [[ -d "${path}" ]]; then + while IFS= read -r -d '' file; do + printf 'file:%s\n' "${file}" + sha256sum "${file}" + done < <(find "${path}" -type f -print0 | sort -z) + elif [[ -f "${path}" ]]; then + printf 'file:%s\n' "${path}" + sha256sum "${path}" + else + printf 'missing:%s\n' "${path}" + fi + done | sha256sum | cut -d' ' -f1 +} + +compose_dependency_cache_key() { + local prefix="$1" + local material="$2" + local cleaned_prefix="" + + cleaned_prefix=$(clean_docker_tag "${prefix}" | cut -c1-96) + printf '%s-%s\n' "${cleaned_prefix}" "$(hash_string_short "${material}")" +} + +hash_dockerfile_stages() { + local dockerfile="$1" + local stages="$2" + + awk -v wanted_stages="${stages}" ' + BEGIN { + split(wanted_stages, stage_list, /[[:space:]]+/) + for (idx in stage_list) { + if (stage_list[idx] != "") { + wanted[stage_list[idx]] = 1 + } + } + emit = 1 + } + $1 == "FROM" { + stage = "" + for (idx = 1; idx <= NF; idx++) { + if (tolower($idx) == "as" && idx < NF) { + stage = $(idx + 1) + } + } + emit = (stage in wanted) + } + emit { + print + } + ' "${dockerfile}" +} + +discover_dockerfile_stage_args() { + local dockerfile="$1" + local stages="$2" + + [[ -f "${dockerfile}" ]] || return 0 + + awk -v wanted_stages="${stages}" ' + function add_arg(name) { + if (name != "" && !(name in seen)) { + seen[name] = 1 + args[++arg_count] = name + } + } + BEGIN { + split(wanted_stages, stage_list, /[[:space:]]+/) + for (idx in stage_list) { + if (stage_list[idx] != "") { + wanted[stage_list[idx]] = 1 + } + } + emit = 1 + } + { + line = $0 + if ($1 == "FROM") { + stage = "" + for (idx = 1; idx <= NF; 
idx++) { + if (tolower($idx) == "as" && idx < NF) { + stage = $(idx + 1) + } + } + emit = (stage in wanted) + } + if (emit) { + lines[++line_count] = line + } + } + END { + for (idx = 1; idx <= line_count; idx++) { + line = lines[idx] + arg_name = line + sub(/^[[:space:]]*ARG[[:space:]]+/, "", arg_name) + if (arg_name != line) { + sub(/[=[:space:]].*/, "", arg_name) + if (arg_name ~ /^[A-Za-z_][A-Za-z0-9_]*$/) { + add_arg(arg_name) + } + } + } + + for (idx = 1; idx <= line_count; idx++) { + line = lines[idx] + for (arg_idx = 1; arg_idx <= arg_count; arg_idx++) { + name = args[arg_idx] + if (line ~ "\\$\\{" name "([}:][^}]*)?\\}" \ + || line ~ "\\$" name "([^A-Za-z0-9_]|$)") { + used[name] = 1 + } + } + } + + for (arg_idx = 1; arg_idx <= arg_count; arg_idx++) { + name = args[arg_idx] + if (used[name]) { + print name + } + } + } + ' "${dockerfile}" +} + +get_content_arg_names() { + local dockerfile="$1" + local stages="$2" + local explicit_args="${3:-}" + + if [[ -n "${explicit_args}" ]]; then + tr ' ' '\n' <<< "${explicit_args}" + else + discover_dockerfile_stage_args "${dockerfile}" "${stages}" + fi | awk 'NF && !seen[$0]++' +} + +compute_ci_base_content_hash() { + local -a content_paths=() + local -a content_args=() + local dockerfile="${CI_BASE_DOCKERFILE:-}" + local stages="${CI_BASE_DOCKERFILE_STAGES:-}" + + read -r -a content_paths <<< "${CI_BASE_CONTENT_FILES}" + mapfile -t content_args < <( + get_content_arg_names "${dockerfile}" "${stages}" "${CI_BASE_CONTENT_ARGS:-}" + ) + + { + printf 'content-files-hash:%s\n' "$(compute_content_hash "${content_paths[@]}")" + if [[ -n "${dockerfile}" ]]; then + printf 'dockerfile:%s\n' "${dockerfile}" + printf 'resolved-build-args:\n' + hash_dockerfile_arg_values "${dockerfile}" "${content_args[@]}" + if [[ -n "${stages}" ]]; then + printf 'dockerfile-stages:%s\n' "${stages}" + if [[ -f "${dockerfile}" ]]; then + hash_dockerfile_stages "${dockerfile}" "${stages}" + else + printf 'missing:%s\n' "${dockerfile}" + fi + fi + 
fi + } | sha256sum | cut -d' ' -f1 +} + +extract_dockerfile_arg_default() { + local dockerfile="$1" + local arg_name="$2" + + sed -n -E "s/^[[:space:]]*ARG[[:space:]]+${arg_name}=\"?([^\"[:space:]]+)\"?.*/\\1/p" \ + "${dockerfile}" | head -1 +} + +resolve_image_digest() { + local image_ref="$1" + + docker buildx imagetools inspect "${image_ref}" 2>/dev/null \ + | sed -n -E 's/^Digest:[[:space:]]+//p' \ + | head -1 +} + +resolve_dockerfile_arg_value() { + local dockerfile="$1" + local arg_name="$2" + local env_name="${arg_name}" + local value="" + + case "${arg_name}" in + ARG_PYTORCH_ROCM_ARCH) + env_name="PYTORCH_ROCM_ARCH" + ;; + esac + + value="${!env_name:-}" + if [[ -z "${value}" && "${env_name}" != "${arg_name}" ]]; then + value="${!arg_name:-}" + fi + if [[ -z "${value}" && -f "${dockerfile}" ]]; then + value=$(extract_dockerfile_arg_default "${dockerfile}" "${arg_name}") + fi + + printf '%s\n' "${value}" +} + +hash_dockerfile_arg_values() { + local dockerfile="$1" + local arg_name="" + local arg_value="" + local digest="" + shift || true + + for arg_name in "$@"; do + [[ -n "${arg_name}" ]] || continue + arg_value=$(resolve_dockerfile_arg_value "${dockerfile}" "${arg_name}") + printf 'arg:%s=%s\n' "${arg_name}" "${arg_value:-}" + if [[ "${arg_name}" == "BASE_IMAGE" && -n "${arg_value}" ]]; then + digest=$(resolve_image_digest "${arg_value}") + printf 'arg:%s.digest=%s\n' "${arg_name}" "${digest:-unknown}" + fi + done +} + +is_ci_base_target() { + [[ "${TARGET}" == *"ci-base-rocm"* ]] +} + +is_commit_image_target() { + [[ -n "${IMAGE_TAG:-}" && -n "${BUILDKITE_COMMIT:-}" ]] || return 1 + is_ci_base_target && return 1 + return 0 +} + +image_tag_is_commit_scoped() { + [[ -n "${IMAGE_TAG:-}" && -n "${BUILDKITE_COMMIT:-}" ]] || return 1 + [[ "${IMAGE_TAG}" == *"${BUILDKITE_COMMIT}"* ]] +} + +should_upload_wheel_artifacts() { + [[ "${UPLOAD_ROCM_WHEEL_ARTIFACTS:-0}" == "1" ]] && return 0 + [[ "${TARGET}" == *"with-wheel"* \ + || "${TARGET}" == *"export-wheel"* \ 
+ || "${TARGET}" == *"artifact"* ]] +} + +get_remote_image_label() { + local image_ref="$1" + local label_key="$2" + + docker buildx imagetools inspect "${image_ref}" --raw 2>/dev/null \ + | python3 -c ' +import json +import subprocess +import sys +import urllib.parse +import urllib.request + +image_ref = sys.argv[1] +label_key = sys.argv[2] + + +def docker_hub_repo(image_name): + image_name = image_name.split("@", 1)[0] + last_component = image_name.rsplit("/", 1)[-1] + if ":" in last_component: + image_name = image_name.rsplit(":", 1)[0] + + parts = image_name.split("/") + if len(parts) > 1 and ( + "." in parts[0] or ":" in parts[0] or parts[0] == "localhost" + ): + registry = parts[0] + if registry not in { + "docker.io", + "index.docker.io", + "registry-1.docker.io", + }: + return None + image_name = "/".join(parts[1:]) + elif len(parts) == 1: + image_name = f"library/{image_name}" + + return image_name + + +try: + data = json.load(sys.stdin) + if data.get("manifests"): + manifest = next( + ( + entry + for entry in data["manifests"] + if entry.get("platform", {}).get("os") != "unknown" + and entry.get("platform", {}).get("architecture") != "unknown" + ), + data["manifests"][0], + ) + digest = manifest["digest"] + result = subprocess.run( + [ + "docker", + "buildx", + "imagetools", + "inspect", + image_ref + "@" + digest, + "--raw", + ], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0 or not result.stdout: + raise RuntimeError("digest inspect failed") + data = json.loads(result.stdout) + + annotations = data.get("annotations", {}) + if label_key in annotations: + print(annotations[label_key]) + raise SystemExit(0) + + config_digest = data.get("config", {}).get("digest") + if not config_digest: + print("") + raise SystemExit(0) + + image_name = docker_hub_repo(image_ref) + if not image_name: + print("") + raise SystemExit(0) + + token_url = ( + "https://auth.docker.io/token?" 
+ + urllib.parse.urlencode( + { + "service": "registry.docker.io", + "scope": f"repository:{image_name}:pull", + } + ) + ) + with urllib.request.urlopen(token_url, timeout=30) as response: + token = json.load(response)["token"] + + request = urllib.request.Request( + f"https://registry-1.docker.io/v2/{image_name}/blobs/{config_digest}", + headers={"Authorization": f"Bearer {token}"}, + ) + with urllib.request.urlopen(request, timeout=30) as response: + config_blob = json.load(response) + + labels = config_blob.get("config", {}).get("Labels", {}) + print(labels.get(label_key, "")) +except Exception: + print("") +' "${image_ref}" "${label_key}" 2>/dev/null || echo "" +} + +get_remote_image_label_with_retry() { + local image_ref="$1" + local label_key="$2" + local attempts="${3:-6}" + local delay_secs="${4:-5}" + local label_value="" + local attempt + + for ((attempt = 1; attempt <= attempts; attempt++)); do + label_value=$(get_remote_image_label "${image_ref}" "${label_key}") + if [[ -n "${label_value}" ]]; then + printf '%s\n' "${label_value}" + return 0 + fi + if [[ ${attempt} -lt ${attempts} ]]; then + sleep "${delay_secs}" + fi + done + + return 0 +} + +remote_image_exists() { + local image_ref="$1" + docker manifest inspect "${image_ref}" >/dev/null 2>&1 +} + +use_existing_builder() { + echo "Using existing builder: ${BUILDER_NAME}" + docker buildx use "${BUILDER_NAME}" + docker buildx inspect --bootstrap +} + +buildx_driver() { + local builder="${1:-}" + + if [[ -n "${builder}" ]]; then + docker buildx inspect "${builder}" 2>/dev/null + else + docker buildx inspect 2>/dev/null + fi | awk -F': *' '$1 == "Driver" { print $2; exit }' +} + +builder_supports_registry_cache() { + local driver="$1" + + [[ -n "${driver}" && "${driver}" != "docker" ]] +} + +create_and_bootstrap_builder() { + local driver="$1" + local endpoint="${2:-}" + + echo "Creating builder '${BUILDER_NAME}' with ${driver} driver" + if [[ -n "${endpoint}" ]]; then + docker buildx create \ + --name 
"${BUILDER_NAME}" \ + --driver "${driver}" \ + --use \ + "${endpoint}" + else + docker buildx create --name "${BUILDER_NAME}" --driver "${driver}" --use + fi + docker buildx inspect --bootstrap +} + +init_config() { + TARGET="${1:-test-ci}" + BAKE_TARGETS=("${TARGET}") + DEPENDENCY_CACHE_TARGETS=() + CI_HCL_SOURCE="${CI_HCL_SOURCE:-${CI_HCL_FILE:-${DEFAULT_CI_HCL_SOURCE}}}" + VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake-rocm.hcl}" + BUILDER_NAME="${BUILDER_NAME:-vllm-builder}" + BUILDKIT_SOCKET="${BUILDKIT_SOCKET:-/run/buildkit/buildkitd.sock}" + PYTORCH_ROCM_ARCH="${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950}" + CI_BASE_CONTENT_FILES="${CI_BASE_CONTENT_FILES:-${DEFAULT_CI_BASE_CONTENT_FILES}}" + CI_BASE_DOCKERFILE="${CI_BASE_DOCKERFILE:-${DEFAULT_CI_BASE_DOCKERFILE}}" + CI_BASE_DOCKERFILE_STAGES="${CI_BASE_DOCKERFILE_STAGES:-${DEFAULT_CI_BASE_DOCKERFILE_STAGES}}" + CI_BASE_IMAGE_TAG="${CI_BASE_IMAGE_TAG:-rocm/vllm-dev:ci_base}" + export PYTORCH_ROCM_ARCH + + SCRIPT_TMP_DIR=$(mktemp -d -t ci-bake-rocm.XXXXXX) + CI_HCL_PATH="${SCRIPT_TMP_DIR}/ci.hcl" + CI_BASE_LABEL_OVERRIDE_PATH="${SCRIPT_TMP_DIR}/ci-base-label-override.hcl" + CSRC_CACHE_OVERRIDE_PATH="${SCRIPT_TMP_DIR}/rocm-csrc-cache-override.hcl" + ROCM_ARG_OVERRIDE_PATH="${SCRIPT_TMP_DIR}/rocm-arg-override.hcl" + BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" +} + +print_header() { + echo "--- :docker: Setting up Docker buildx bake" + echo "Target: ${TARGET}" + echo "CI HCL source: ${CI_HCL_SOURCE}" + echo "vLLM bake file: ${VLLM_BAKE_FILE}" + if is_ci_base_target; then + echo "Build mode: ci_base" + elif is_commit_image_target; then + echo "Build mode: commit image" + else + echo "Build mode: generic" + fi + if [[ "${USE_SCCACHE:-0}" == "1" ]]; then + echo "Compiler cache: sccache enabled" + fi +} + +validate_inputs() { + if [[ ! 
-f "${VLLM_BAKE_FILE}" ]]; then + echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}" + echo "Make sure you're running from the vLLM repository root" + exit 1 + fi + + if [[ -n "${CI_HCL_SOURCE:-}" ]] && is_url_like "${CI_HCL_SOURCE}"; then + echo "Error: remote CI HCL sources are not supported: ${CI_HCL_SOURCE}" + echo "Use the vLLM-owned docker/ci-rocm.hcl or set CI_HCL_SOURCE to a local file." + exit 1 + fi + + if [[ -n "${CI_HCL_SOURCE:-}" && ! -f "${CI_HCL_SOURCE}" ]]; then + echo "Error: CI HCL file not found at ${CI_HCL_SOURCE}" + echo "Set CI_HCL_SOURCE to a local file if you need an override." + exit 1 + fi +} + +load_ci_hcl() { + echo "--- :page_facing_up: Loading ci.hcl" + cp "${CI_HCL_SOURCE}" "${CI_HCL_PATH}" + echo "Copied ${CI_HCL_SOURCE} to ${CI_HCL_PATH}" +} + +compute_ci_base_hash_if_needed() { + if [[ -z "${CI_BASE_CONTENT_FILES:-}" ]]; then + return 0 + fi + + CI_BASE_CONTENT_HASH=$(compute_ci_base_content_hash) + export CI_BASE_CONTENT_HASH + echo "ci_base content hash: ${CI_BASE_CONTENT_HASH:0:16}..." 
+} + +should_push_stable_ci_base_tag() { + if [[ "${CI_BASE_PUSH_STABLE_TAG:-}" == "1" ]]; then + return 0 + fi + if [[ "${CI_BASE_PUSH_STABLE_TAG:-}" == "0" ]]; then + return 1 + fi + + [[ "${NIGHTLY:-0}" == "1" && "${BUILDKITE_BRANCH:-}" == "${CI_BASE_STABLE_BRANCH:-main}" ]] +} + +ci_base_tag_with_suffix() { + local base_tag="$1" + local suffix="$2" + + printf '%s-%s\n' "${base_tag}" "$(clean_docker_tag "${suffix}")" +} + +configure_ci_base_image_refs() { + local stable_tag="${CI_BASE_IMAGE_TAG:-rocm/vllm-dev:ci_base}" + local content_tag="" + local commit_tag="" + local primary_tag="" + + if [[ -z "${CI_BASE_CONTENT_HASH:-}" ]]; then + CI_BASE_IMAGE="${CI_BASE_IMAGE:-${stable_tag}}" + export CI_BASE_IMAGE + return 0 + fi + + content_tag=$(ci_base_tag_with_suffix "${stable_tag}" "${CI_BASE_CONTENT_HASH}") + if [[ -n "${BUILDKITE_COMMIT:-}" ]]; then + commit_tag=$(ci_base_tag_with_suffix "${stable_tag}" "${BUILDKITE_COMMIT}") + CI_BASE_IMAGE_TAG_COMMIT="${commit_tag}" + export CI_BASE_IMAGE_TAG_COMMIT + fi + + if should_push_stable_ci_base_tag; then + primary_tag="${content_tag}" + CI_BASE_IMAGE_TAG_STABLE="${stable_tag}" + else + primary_tag="${commit_tag:-${content_tag}}" + CI_BASE_IMAGE_TAG_STABLE="" + fi + CI_BASE_IMAGE_TAG="${primary_tag}" + if [[ "${primary_tag}" == "${content_tag}" ]]; then + CI_BASE_IMAGE_TAG_CONTENT="" + else + CI_BASE_IMAGE_TAG_CONTENT="${content_tag}" + fi + export CI_BASE_IMAGE_TAG CI_BASE_IMAGE_TAG_CONTENT CI_BASE_IMAGE_TAG_STABLE + + if is_ci_base_target; then + IMAGE_TAG="${primary_tag}" + export IMAGE_TAG + + echo "ci_base primary image tag: ${CI_BASE_IMAGE_TAG}" + if [[ -n "${CI_BASE_IMAGE_TAG_COMMIT:-}" ]]; then + echo "ci_base commit image tag: ${CI_BASE_IMAGE_TAG_COMMIT}" + fi + echo "ci_base content image tag: ${content_tag}" + if [[ -n "${CI_BASE_IMAGE_TAG_STABLE}" ]]; then + echo "ci_base stable alias will also be pushed: ${CI_BASE_IMAGE_TAG_STABLE}" + else + echo "ci_base stable alias will not be pushed for this build" + 
echo "Set NIGHTLY=1 on ${CI_BASE_STABLE_BRANCH:-main} to refresh ${stable_tag}" + fi + return 0 + fi + + if [[ -z "${CI_BASE_IMAGE:-}" || "${CI_BASE_IMAGE}" == "${stable_tag}" ]]; then + CI_BASE_IMAGE="${primary_tag}" + export CI_BASE_IMAGE + echo "Using ci_base image: ${CI_BASE_IMAGE}" + else + echo "Using provided CI_BASE_IMAGE override: ${CI_BASE_IMAGE}" + fi +} + +ci_base_candidate_refs() { + printf '%s\n' \ + "${IMAGE_TAG:-}" \ + "${CI_BASE_IMAGE_TAG:-}" \ + "${CI_BASE_IMAGE_TAG_COMMIT:-}" \ + "${CI_BASE_IMAGE_TAG_CONTENT:-}" \ + "${CI_BASE_IMAGE_TAG_STABLE:-}" \ + | awk 'NF && !seen[$0]++' +} + +find_matching_ci_base_ref() { + local candidate="" + local candidate_hash="" + + while IFS= read -r candidate; do + [[ -n "${candidate}" ]] || continue + remote_image_exists "${candidate}" || continue + candidate_hash=$(get_remote_image_label "${candidate}" "vllm.ci_base.content_hash") + if [[ "${candidate_hash}" == "${CI_BASE_CONTENT_HASH}" ]]; then + printf '%s\n' "${candidate}" + return 0 + fi + done < <(ci_base_candidate_refs) + + return 1 +} + +refresh_ci_base_tags_from_ref() { + local source_ref="$1" + local tag="" + local tag_hash="" + + while IFS= read -r tag; do + [[ -n "${tag}" ]] || continue + [[ "${tag}" != "${source_ref}" ]] || continue + tag_hash=$(get_remote_image_label "${tag}" "vllm.ci_base.content_hash") + if [[ "${tag_hash}" == "${CI_BASE_CONTENT_HASH}" ]]; then + echo "ci_base tag is already current: ${tag}" + continue + fi + echo "Updating ci_base tag ${tag} -> ${source_ref}" + docker buildx imagetools create -t "${tag}" "${source_ref}" + done < <(ci_base_candidate_refs) +} + +maybe_skip_existing_image() { + local remote_hash="" + local remote_revision="" + local matching_ref="" + + if [[ -z "${IMAGE_TAG:-}" ]]; then + return 0 + fi + + if [[ "${FORCE_BUILD:-0}" == "1" ]]; then + echo "FORCE_BUILD=1 set; skipping existing-image check" + return 0 + fi + + echo "--- :mag: Checking image tag" + echo "Image tag: ${IMAGE_TAG}" + + if ! 
remote_image_exists "${IMAGE_TAG}"; then + if is_ci_base_target && [[ -n "${CI_BASE_CONTENT_HASH:-}" ]]; then + matching_ref=$(find_matching_ci_base_ref || true) + if [[ -n "${matching_ref}" ]]; then + echo "Found existing ci_base image with matching content hash: ${matching_ref}" + if ! refresh_ci_base_tags_from_ref "${matching_ref}"; then + echo "ci_base tag refresh failed; rebuilding to push expected tags" + return 0 + fi + echo "Content hashes match -- ci_base is current" + echo "Skipping build" + exit 0 + fi + fi + echo "Image not found, proceeding with build" + return 0 + fi + + IMAGE_EXISTED_BEFORE_BUILD=1 + + if is_ci_base_target; then + if [[ -z "${CI_BASE_CONTENT_HASH:-}" ]]; then + echo "ci_base image already exists and no content hash was configured" + echo "Skipping build" + exit 0 + fi + + remote_hash=$(get_remote_image_label "${IMAGE_TAG}" "vllm.ci_base.content_hash") + if [[ -n "${remote_hash}" ]]; then + echo "Remote ci_base content hash: ${remote_hash:0:16}..." + if [[ "${remote_hash}" == "${CI_BASE_CONTENT_HASH}" ]]; then + if ! 
refresh_ci_base_tags_from_ref "${IMAGE_TAG}"; then + echo "ci_base tag refresh failed; rebuilding to push expected tags" + return 0 + fi + echo "Content hashes match -- ci_base is current" + echo "Skipping build" + exit 0 + fi + + echo "Content hashes differ -- ci_base is stale, rebuilding" + return 0 + fi + + echo "Remote ci_base has no content-hash label; rebuilding to add one" + return 0 + fi + + if is_commit_image_target; then + remote_revision=$(get_remote_image_label "${IMAGE_TAG}" "org.opencontainers.image.revision") + if [[ -n "${remote_revision}" && "${remote_revision}" != "${BUILDKITE_COMMIT}" ]]; then + echo "Existing image revision does not match ${BUILDKITE_COMMIT}" + echo " found revision: ${remote_revision}" + echo "Rebuilding image" + return 0 + fi + + if should_upload_wheel_artifacts; then + echo "Commit image already exists: ${IMAGE_TAG}" + echo "Continuing build because this target uploads per-build ROCm artifacts" + return 0 + fi + + echo "Commit image already exists: ${IMAGE_TAG}" + echo "Skipping build" + exit 0 + fi + + echo "Image already exists: ${IMAGE_TAG}" + echo "Skipping build" + exit 0 +} + +setup_builder() { + echo "--- :buildkite: Setting up buildx builder" + + local setup_mode="${ROCM_SETUP_BUILDX_BUILDER:-auto}" + local current_driver="" + local named_driver="" + + if [[ "${setup_mode}" == "0" || "${setup_mode}" == "false" ]]; then + echo "Using current Docker buildx builder" + echo "ROCM_SETUP_BUILDX_BUILDER=${setup_mode}; cache exporters may fail if the driver is docker" + docker buildx inspect --bootstrap + echo "Active builder:" + docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls + return 0 + fi + + current_driver=$(buildx_driver || true) + if [[ "${setup_mode}" != "1" ]] && builder_supports_registry_cache "${current_driver}"; then + echo "Using current Docker buildx builder with ${current_driver} driver" + docker buildx inspect --bootstrap + echo "Active builder:" + docker buildx ls | grep -E '^\*|^NAME' || docker 
buildx ls + return 0 + fi + + if [[ "${setup_mode}" != "1" ]]; then + echo "Current buildx driver '${current_driver:-unknown}' cannot export registry caches" + echo "Creating or using a cache-capable builder: ${BUILDER_NAME}" + fi + + if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then + named_driver=$(buildx_driver "${BUILDER_NAME}" || true) + if ! builder_supports_registry_cache "${named_driver}"; then + echo "Builder '${BUILDER_NAME}' uses ${named_driver:-unknown} driver; using ${BUILDER_NAME}-cache instead" + BUILDER_NAME="${BUILDER_NAME}-cache" + fi + fi + + if [[ -S "${BUILDKIT_SOCKET}" ]]; then + echo "Found local buildkitd socket at ${BUILDKIT_SOCKET}" + echo "Using remote driver to connect to buildkitd" + + if docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then + use_existing_builder + else + create_and_bootstrap_builder remote "unix://${BUILDKIT_SOCKET}" + fi + elif docker buildx inspect "${BUILDER_NAME}" >/dev/null 2>&1; then + use_existing_builder + else + echo "No local buildkitd found, using docker-container driver" + create_and_bootstrap_builder docker-container + fi + + echo "Active builder:" + docker buildx ls | grep -E '^\*|^NAME' || docker buildx ls +} + +prepare_git_cache_metadata() { + local cache_branch_name="" + local cache_base_branch="${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-main}" + local target_repo_slug="" + local target_repo_url="" + local merge_base_ref="" + + if [[ -z "${PARENT_COMMIT:-}" || -z "${VLLM_MERGE_BASE_COMMIT:-}" ]] \ + && git rev-parse --is-shallow-repository 2>/dev/null | grep -q "true"; then + echo "Shallow clone detected - deepening for cache key computation" + git_fetch_for_cache --deepen=1 origin || true + fi + + if [[ -z "${PARENT_COMMIT:-}" ]]; then + PARENT_COMMIT=$(git rev-parse HEAD~1 2>/dev/null || echo "") + if [[ -n "${PARENT_COMMIT}" ]]; then + export PARENT_COMMIT + echo "Computed parent commit for cache fallback: ${PARENT_COMMIT}" + else + echo "Could not determine parent commit" + fi + 
else + echo "Using provided PARENT_COMMIT: ${PARENT_COMMIT}" + fi + + if [[ -z "${ROCM_CACHE_BRANCH_TAG:-}" ]]; then + cache_branch_name=$(select_cache_branch_name) + if [[ -z "${cache_branch_name}" && "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]]; then + cache_branch_name="pr-${BUILDKITE_PULL_REQUEST}" + echo "Using pull request number for ROCm branch cache tag: ${cache_branch_name}" + fi + fi + + if [[ -z "${ROCM_CACHE_BRANCH_TAG:-}" && -n "${cache_branch_name}" ]]; then + ROCM_CACHE_BRANCH_TAG=$( + compose_cache_branch_tag "$(get_buildkite_repo_slug)" "${cache_branch_name}" + ) + export ROCM_CACHE_BRANCH_TAG + echo "Computed ROCm branch cache tag: ${ROCM_CACHE_BRANCH_TAG} (from ${cache_branch_name})" + elif [[ -n "${ROCM_CACHE_BRANCH_TAG:-}" ]]; then + echo "Using provided ROCM_CACHE_BRANCH_TAG: ${ROCM_CACHE_BRANCH_TAG}" + elif [[ -n "${BUILDKITE_BRANCH:-}" ]]; then + echo "Skipping ROCm branch cache tag: no usable branch name found" + echo " BUILDKITE_BRANCH=${BUILDKITE_BRANCH}" + fi + + if [[ -z "${ROCM_CACHE_UPSTREAM_BRANCH_TAG:-}" \ + && -n "${BUILDKITE_PULL_REQUEST_BASE_BRANCH:-}" \ + && "${BUILDKITE_PULL_REQUEST:-false}" != "false" ]]; then + target_repo_slug=$(get_buildkite_target_repo_slug) + ROCM_CACHE_UPSTREAM_BRANCH_TAG=$( + compose_cache_branch_tag "${target_repo_slug}" "${BUILDKITE_PULL_REQUEST_BASE_BRANCH}" + ) + export ROCM_CACHE_UPSTREAM_BRANCH_TAG + echo "Computed ROCm upstream branch cache tag: ${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" + elif [[ -n "${ROCM_CACHE_UPSTREAM_BRANCH_TAG:-}" ]]; then + echo "Using provided ROCM_CACHE_UPSTREAM_BRANCH_TAG: ${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" + fi + + if [[ -z "${VLLM_MERGE_BASE_COMMIT:-}" ]]; then + target_repo_url=$(get_buildkite_target_repo_url) + merge_base_ref="refs/remotes/vllm-cache-upstream/${cache_base_branch}" + git_fetch_for_cache --no-tags --depth=200 "${target_repo_url}" \ + "+refs/heads/${cache_base_branch}:${merge_base_ref}" 2>/dev/null || true + VLLM_MERGE_BASE_COMMIT=$(git merge-base HEAD 
"${merge_base_ref}" 2>/dev/null || echo "") + if [[ -z "${VLLM_MERGE_BASE_COMMIT}" ]]; then + git_fetch_for_cache --no-tags --deepen=1000 "${target_repo_url}" \ + "+refs/heads/${cache_base_branch}:${merge_base_ref}" 2>/dev/null || true + VLLM_MERGE_BASE_COMMIT=$(git merge-base HEAD "${merge_base_ref}" 2>/dev/null || echo "") + fi + if [[ -n "${VLLM_MERGE_BASE_COMMIT}" ]]; then + export VLLM_MERGE_BASE_COMMIT + echo "Computed merge base commit for cache fallback: ${VLLM_MERGE_BASE_COMMIT}" + else + echo "Could not determine merge base with ${cache_base_branch}" + fi + else + echo "Using provided VLLM_MERGE_BASE_COMMIT: ${VLLM_MERGE_BASE_COMMIT}" + fi +} + +write_ci_base_label_override() { + local target_name="" + local -a ci_base_targets=() + + BAKE_FILES=(-f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}") + + if [[ -z "${CI_BASE_CONTENT_HASH:-}" ]]; then + return 0 + fi + + mapfile -t ci_base_targets < <( + { + printf '%s\n' "ci-base-rocm" + sed -n -E 's/^target "(ci-base-rocm[^"]+)".*/\1/p' "${CI_HCL_PATH}" 2>/dev/null || true + } | awk '!seen[$0]++' + ) + + if [[ ${#ci_base_targets[@]} -eq 0 ]]; then + return 0 + fi + + : > "${CI_BASE_LABEL_OVERRIDE_PATH}" + for target_name in "${ci_base_targets[@]}"; do + cat >> "${CI_BASE_LABEL_OVERRIDE_PATH}" < "${ROCM_ARG_OVERRIDE_PATH}" + + BAKE_FILES+=(-f "${ROCM_ARG_OVERRIDE_PATH}") + echo "Appended resolved ROCm Docker ARG override" +} + +write_hcl_string_list_attr() { + local indent="$1" + local attr="$2" + shift 2 + + printf '%s%s = [\n' "${indent}" "${attr}" + write_hcl_string_list_entries "${indent} " "$@" + printf '%s]\n' "${indent}" +} + +validate_cache_export_mode() { + local mode="$1" + local env_name="$2" + + case "${mode}" in + min|max) + ;; + *) + echo "Error: ${env_name} must be one of: min, max" + exit 1 + ;; + esac +} + +write_rocm_cache_override() { + local cache_repo="${DOCKERHUB_CACHE_REPO:-rocm/vllm-ci-cache}" + local csrc_cache_to_mode="${ROCM_CSRC_CACHE_TO_MODE:-max}" + local 
rocm_cache_to_mode="${ROCM_FINAL_CACHE_TO_MODE:-min}" + local -a content_cache_from=() + local -a csrc_cache_to=() + local -a rocm_cache_to=() + local -a export_wheel_cache_to=() + + if ! uses_rocm_csrc_cache; then + return 0 + fi + + validate_cache_export_mode "${csrc_cache_to_mode}" "ROCM_CSRC_CACHE_TO_MODE" + validate_cache_export_mode "${rocm_cache_to_mode}" "ROCM_FINAL_CACHE_TO_MODE" + echo "ROCm csrc cache export mode: ${csrc_cache_to_mode}" + echo "ROCm final image cache export mode: ${rocm_cache_to_mode}" + + if [[ -n "${ROCM_CSRC_CONTENT_CACHE_REF:-}" ]]; then + content_cache_from+=("type=registry,ref=${ROCM_CSRC_CONTENT_CACHE_REF}") + csrc_cache_to+=( + "type=registry,ref=${ROCM_CSRC_CONTENT_CACHE_REF},mode=${csrc_cache_to_mode},ignore-error=true" + ) + fi + + # Docker Hub cache exports are best-effort. A cache-only target failure can + # otherwise cancel the sibling image target before its manifest is pushed. + if [[ -n "${BUILDKITE_COMMIT:-}" ]]; then + csrc_cache_to+=( + "type=registry,ref=${cache_repo}:csrc-rocm-${BUILDKITE_COMMIT},mode=${csrc_cache_to_mode},ignore-error=true" + ) + rocm_cache_to+=( + "type=registry,ref=${cache_repo}:rocm-${BUILDKITE_COMMIT},mode=${rocm_cache_to_mode},ignore-error=true" + ) + fi + + if [[ -n "${ROCM_CACHE_BRANCH_TAG:-}" ]]; then + csrc_cache_to+=( + "type=registry,ref=${cache_repo}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${csrc_cache_to_mode},ignore-error=true" + ) + rocm_cache_to+=( + "type=registry,ref=${cache_repo}:rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${rocm_cache_to_mode},ignore-error=true" + ) + fi + + if [[ "${TARGET}" == "test-rocm-ci-with-wheel" ]]; then + export_wheel_cache_to=() + else + export_wheel_cache_to=("${rocm_cache_to[@]}") + fi + + { + cat < "${CSRC_CACHE_OVERRIDE_PATH}" + + BAKE_FILES+=(-f "${CSRC_CACHE_OVERRIDE_PATH}") + echo "Appended ROCm cache override with non-fatal registry exports" +} + +extract_dependency_pins() { + local bake_dir="" + local dockerfile_rocm="" + local var="" + 
local val="" + + bake_dir=$(dirname "${VLLM_BAKE_FILE}") + dockerfile_rocm="${bake_dir}/Dockerfile.rocm" + if [[ ! -f "${dockerfile_rocm}" ]]; then + return 0 + fi + + for var in RIXL_BRANCH UCX_BRANCH ROCSHMEM_BRANCH DEEPEP_BRANCH; do + if [[ -n "${!var:-}" ]]; then + echo "Using provided ${var}: ${!var}" + continue + fi + + val=$( + sed -n -E "s/^[[:space:]]*ARG[[:space:]]+${var}=\"?([^\"[:space:]]+)\"?.*/\\1/p" \ + "${dockerfile_rocm}" | head -1 + ) + if [[ -n "${val}" ]]; then + export "${var}=${val}" + echo "Extracted ${var}=${val} from Dockerfile.rocm" + fi + done +} + +compute_dependency_cache_keys() { + local bake_dir="" + local dockerfile_rocm="" + local rixl_branch="" + local ucx_branch="" + local rocshmem_branch="" + local deepep_branch="" + local rixl_material="" + local rocshmem_material="" + local deepep_material="" + + bake_dir=$(dirname "${VLLM_BAKE_FILE}") + dockerfile_rocm="${bake_dir}/Dockerfile.rocm" + rixl_branch=$(resolve_dockerfile_arg_value "${dockerfile_rocm}" "RIXL_BRANCH") + ucx_branch=$(resolve_dockerfile_arg_value "${dockerfile_rocm}" "UCX_BRANCH") + rocshmem_branch=$(resolve_dockerfile_arg_value "${dockerfile_rocm}" "ROCSHMEM_BRANCH") + deepep_branch=$(resolve_dockerfile_arg_value "${dockerfile_rocm}" "DEEPEP_BRANCH") + + if [[ -n "${rixl_branch}" && -n "${ucx_branch}" ]]; then + rixl_material=$(compose_stage_cache_material "${dockerfile_rocm}" "base build_rixl") + RIXL_CACHE_KEY=$( + compose_dependency_cache_key \ + "${rixl_branch}-ucx-${ucx_branch}" \ + "${rixl_material}" + ) + export RIXL_CACHE_KEY + echo "RIXL dependency cache key: ${RIXL_CACHE_KEY}" + fi + + if [[ -n "${rocshmem_branch}" ]]; then + rocshmem_material=$(compose_stage_cache_material "${dockerfile_rocm}" "base build_rocshmem") + ROCSHMEM_CACHE_KEY=$( + compose_dependency_cache_key \ + "${rocshmem_branch}" \ + "${rocshmem_material}" + ) + export ROCSHMEM_CACHE_KEY + echo "ROCShmem dependency cache key: ${ROCSHMEM_CACHE_KEY}" + fi + + if [[ -n "${deepep_branch}" && -n 
"${rocshmem_branch}" ]]; then + deepep_material=$(compose_stage_cache_material "${dockerfile_rocm}" "base build_rocshmem build_deepep") + DEEPEP_CACHE_KEY=$( + compose_dependency_cache_key \ + "${deepep_branch}-rocshmem-${rocshmem_branch}" \ + "${deepep_material}" + ) + export DEEPEP_CACHE_KEY + echo "DeepEP dependency cache key: ${DEEPEP_CACHE_KEY}" + fi +} + +compose_stage_cache_material() { + local dockerfile="$1" + local stages="$2" + local -a content_args=() + + mapfile -t content_args < <(get_content_arg_names "${dockerfile}" "${stages}" "") + { + printf 'dockerfile:%s\n' "${dockerfile}" + printf 'dockerfile-stages:%s\n' "${stages}" + hash_dockerfile_stages "${dockerfile}" "${stages}" + printf 'resolved-build-args:\n' + hash_dockerfile_arg_values "${dockerfile}" "${content_args[@]}" + } +} + +dependency_cache_ref_exists() { + local cache_ref="$1" + docker buildx imagetools inspect "${cache_ref}" >/dev/null 2>&1 +} + +dependency_cache_ref_for_target() { + local target="$1" + local cache_repo="${DOCKERHUB_CACHE_REPO:-rocm/vllm-ci-cache}" + + case "${target}" in + rixl-rocm-ci) + if [[ -n "${RIXL_CACHE_KEY:-}" ]]; then + printf '%s\n' "${cache_repo}:rixl-rocm-${RIXL_CACHE_KEY}" + elif [[ -n "${RIXL_BRANCH:-}" ]]; then + printf '%s\n' "${cache_repo}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH:-}" + fi + ;; + rocshmem-rocm-ci) + if [[ -n "${ROCSHMEM_CACHE_KEY:-}" ]]; then + printf '%s\n' "${cache_repo}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY}" + elif [[ -n "${ROCSHMEM_BRANCH:-}" ]]; then + printf '%s\n' "${cache_repo}:rocshmem-rocm-${ROCSHMEM_BRANCH}" + fi + ;; + deepep-rocm-ci) + if [[ -n "${DEEPEP_CACHE_KEY:-}" ]]; then + printf '%s\n' "${cache_repo}:deepep-rocm-${DEEPEP_CACHE_KEY}" + elif [[ -n "${DEEPEP_BRANCH:-}" ]]; then + printf '%s\n' "${cache_repo}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH:-}" + fi + ;; + esac +} + +add_dependency_cache_target() { + local target="$1" + + if printf '%s\n' "${DEPENDENCY_CACHE_TARGETS[@]}" | grep -qx "${target}"; then 
+ return 0 + fi + DEPENDENCY_CACHE_TARGETS+=("${target}") +} + +resolve_ci_base_dependency_targets() { + local mode="${ROCM_DEP_CACHE_EXPORT_MODE:-missing}" + local rixl_ref="" + local rocshmem_ref="" + local deepep_ref="" + + [[ "${TARGET}" == "ci-base-rocm-ci-with-deps" ]] || return 0 + + case "${mode}" in + always) + echo "ROCM_DEP_CACHE_EXPORT_MODE=always; exporting all dependency caches serially" + for target in rixl-rocm-ci rocshmem-rocm-ci deepep-rocm-ci; do + if [[ -n "$(dependency_cache_ref_for_target "${target}")" ]]; then + add_dependency_cache_target "${target}" + fi + done + ;; + never) + BAKE_TARGETS=("ci-base-rocm-ci") + DEPENDENCY_CACHE_TARGETS=() + echo "ROCM_DEP_CACHE_EXPORT_MODE=never; building ci_base without dependency cache exports" + return 0 + ;; + missing|"") + ;; + *) + echo "Error: ROCM_DEP_CACHE_EXPORT_MODE must be one of: missing, always, never" + exit 1 + ;; + esac + + if [[ "${mode}" != "always" && -n "${RIXL_CACHE_KEY:-}" ]]; then + rixl_ref=$(dependency_cache_ref_for_target "rixl-rocm-ci") + if dependency_cache_ref_exists "${rixl_ref}"; then + echo "RIXL dependency cache exists: ${rixl_ref}" + else + echo "RIXL dependency cache missing; will seed: ${rixl_ref}" + add_dependency_cache_target "rixl-rocm-ci" + fi + fi + + if [[ "${mode}" != "always" && -n "${ROCSHMEM_CACHE_KEY:-}" ]]; then + rocshmem_ref=$(dependency_cache_ref_for_target "rocshmem-rocm-ci") + if dependency_cache_ref_exists "${rocshmem_ref}"; then + echo "ROCShmem dependency cache exists: ${rocshmem_ref}" + else + echo "ROCShmem dependency cache missing; will seed: ${rocshmem_ref}" + add_dependency_cache_target "rocshmem-rocm-ci" + fi + fi + + if [[ "${mode}" != "always" && -n "${DEEPEP_CACHE_KEY:-}" ]]; then + deepep_ref=$(dependency_cache_ref_for_target "deepep-rocm-ci") + if dependency_cache_ref_exists "${deepep_ref}"; then + echo "DeepEP dependency cache exists: ${deepep_ref}" + else + echo "DeepEP dependency cache missing; will seed: ${deepep_ref}" + 
add_dependency_cache_target "deepep-rocm-ci" + fi + fi + + # DeepEP inherits from ROCShmem. If ROCShmem is being seeded, seed DeepEP too + # so the pair stays consistent for future ci_base rebuilds. + if printf '%s\n' "${DEPENDENCY_CACHE_TARGETS[@]}" | grep -qx "rocshmem-rocm-ci" \ + && ! printf '%s\n' "${DEPENDENCY_CACHE_TARGETS[@]}" | grep -qx "deepep-rocm-ci" \ + && [[ -n "${DEEPEP_BRANCH:-}" ]]; then + echo "ROCShmem cache is missing; also seeding DeepEP cache" + add_dependency_cache_target "deepep-rocm-ci" + fi + + BAKE_TARGETS=("ci-base-rocm-ci") + if [[ ${#DEPENDENCY_CACHE_TARGETS[@]} -eq 0 ]]; then + echo "All dependency caches exist; building ci_base without dependency cache exports" + else + echo "Resolved dependency cache seed targets: ${DEPENDENCY_CACHE_TARGETS[*]}" + echo "Resolved ci_base bake targets: ${BAKE_TARGETS[*]}" + fi +} + +bake_config_targets() { + printf '%s\n' "${DEPENDENCY_CACHE_TARGETS[@]}" "${BAKE_TARGETS[@]}" \ + | awk 'NF && !seen[$0]++' +} + +print_bake_config() { + local -a print_targets=() + + echo "--- :page_facing_up: Resolved bake configuration" + mapfile -t print_targets < <(bake_config_targets) + docker buildx bake "${BAKE_FILES[@]}" --print "${print_targets[@]}" | tee "${BAKE_CONFIG_FILE}" + + if command -v buildkite-agent >/dev/null 2>&1 && [[ -n "${BUILDKITE_BUILD_NUMBER:-}" ]]; then + buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" || true + echo "Uploaded ${BAKE_CONFIG_FILE} as Buildkite artifact" + else + echo "Saved bake config to ${BAKE_CONFIG_FILE} (not in Buildkite, skipping upload)" + fi +} + +confirm_remote_image_push() { + local image_ref="$1" + local remote_hash="" + local remote_revision="" + + if ! 
remote_image_exists "${image_ref}"; then + return 1 + fi + + if is_ci_base_target; then + if [[ -z "${CI_BASE_CONTENT_HASH:-}" ]]; then + return 0 + fi + + remote_hash=$(get_remote_image_label_with_retry "${image_ref}" "vllm.ci_base.content_hash") + if [[ -n "${remote_hash}" && "${remote_hash}" == "${CI_BASE_CONTENT_HASH}" ]]; then + return 0 + fi + + echo "Remote image exists but does not have the expected ci_base content hash." + echo " expected: ${CI_BASE_CONTENT_HASH:0:16}..." + echo " found: ${remote_hash:0:16}..." + return 1 + fi + + if is_commit_image_target; then + remote_revision=$(get_remote_image_label_with_retry "${image_ref}" "org.opencontainers.image.revision") + if [[ -n "${remote_revision}" && "${remote_revision}" == "${BUILDKITE_COMMIT}" ]]; then + return 0 + fi + + if [[ -z "${remote_revision}" \ + && ${IMAGE_EXISTED_BEFORE_BUILD} -eq 0 \ + && image_tag_is_commit_scoped ]]; then + echo "Remote image exists under a commit-scoped tag; accepting push despite missing revision label." + return 0 + fi + + echo "Remote image exists but revision label does not match ${BUILDKITE_COMMIT}." 
+ echo " found revision: ${remote_revision:-}" + return 1 + fi + + return 0 +} + +verify_dependency_cache_ref() { + local cache_ref="$1" + local attempts="${ROCM_DEP_CACHE_VERIFY_ATTEMPTS:-6}" + local delay_secs="${ROCM_DEP_CACHE_VERIFY_DELAY:-5}" + local attempt + + for ((attempt = 1; attempt <= attempts; attempt++)); do + if dependency_cache_ref_exists "${cache_ref}"; then + echo "Dependency cache confirmed: ${cache_ref}" + return 0 + fi + if [[ ${attempt} -lt ${attempts} ]]; then + echo "Dependency cache not visible yet (${attempt}/${attempts}): ${cache_ref}" + sleep "${delay_secs}" + fi + done + + echo "ERROR: dependency cache was not confirmed after upload: ${cache_ref}" + return 1 +} + +seed_dependency_caches_if_needed() { + local target="" + local cache_ref="" + + if [[ "${TARGET}" != "ci-base-rocm-ci-with-deps" ]]; then + return 0 + fi + if [[ ${#DEPENDENCY_CACHE_TARGETS[@]} -eq 0 ]]; then + return 0 + fi + + echo "--- :docker: Seeding ROCm dependency caches" + echo "Dependency cache uploads are required for this build." + echo "Seeding serially to avoid concurrent Docker Hub cache exporters." + + for target in "${DEPENDENCY_CACHE_TARGETS[@]}"; do + cache_ref=$(dependency_cache_ref_for_target "${target}") + if [[ -z "${cache_ref}" ]]; then + echo "ERROR: could not resolve dependency cache ref for ${target}" + return 1 + fi + + echo "--- :docker: Seeding ${target}" + echo "Expected cache ref: ${cache_ref}" + docker buildx bake "${BAKE_FILES[@]}" --progress plain "${target}" + verify_dependency_cache_ref "${cache_ref}" + done +} + +annotate_cache_export_warning() { + local build_rc="$1" + + if ! command -v buildkite-agent >/dev/null 2>&1; then + return 0 + fi + + buildkite-agent annotate \ + --style warning \ + --context "cache-export-warning" \ + "### :warning: Docker cache export failed (non-fatal) + +Image was pushed successfully: \`${IMAGE_TAG}\` + +The BuildKit build returned exit code ${build_rc}, but the expected image +is present in the registry. 
Treating this as a registry cache export failure +so tests can continue with the pushed image." 2>/dev/null || true +} + +run_bake() { + local build_rc=0 + + echo "--- :docker: Building ${TARGET}" + docker buildx bake "${BAKE_FILES[@]}" --progress plain "${BAKE_TARGETS[@]}" || build_rc=$? + + if [[ ${build_rc} -eq 0 ]]; then + echo "--- :white_check_mark: Build complete" + return 0 + fi + + echo "" + echo "WARNING: docker buildx bake exited with code ${build_rc}" + + if [[ -n "${IMAGE_TAG:-}" ]]; then + echo "Checking if image was pushed successfully..." + if confirm_remote_image_push "${IMAGE_TAG}"; then + echo "" + echo "WARNING: Build reported failure (rc=${build_rc}) but the" + echo " image was pushed successfully: ${IMAGE_TAG}" + echo "" + echo " Treating this as a non-fatal registry cache export failure." + echo " The image is usable, but registry cache may be cold on the next build." + echo "" + annotate_cache_export_warning "${build_rc}" + echo "--- :white_check_mark: Build complete" + return 0 + fi + + echo "" + echo "ERROR: Build failed and image was NOT confirmed: ${IMAGE_TAG}" + echo " This is a real build failure, not a cache export warning." + echo "" + fi + + return "${build_rc}" +} + +upload_wheel_artifacts_if_present() { + local wheel_dir="./wheel-export" + local artifact_dir="artifacts/vllm-rocm-install" + local archive_name="vllm-rocm-install.tar.gz" + local whl="" + local whl_name="" + + if ! should_upload_wheel_artifacts; then + return 0 + fi + + if [[ ! -d "${wheel_dir}" ]] || ! ls "${wheel_dir}"/*.whl >/dev/null 2>&1; then + echo "No ROCm wheel artifacts found in ${wheel_dir}" + return 0 + fi + + echo "--- :package: Uploading ROCm vLLM install artifact" + mkdir -p "${artifact_dir}" + + tar -C "${wheel_dir}" -czf "${artifact_dir}/${archive_name}" . 
+ echo "Created ${archive_name}: $(du -sh "${artifact_dir}/${archive_name}" | cut -f1)" + printf '%s\n' "${CI_BASE_IMAGE:-}" > "${artifact_dir}/ci-base-image.txt" + printf '%s\n' "${IMAGE_TAG:-}" > "${artifact_dir}/fallback-image.txt" + + for whl in "${wheel_dir}"/*.whl; do + [[ -f "${whl}" ]] || continue + whl_name=$(basename "${whl}") + cp "${whl}" "${artifact_dir}/${whl_name}" + echo "Copied ${whl_name}: $(du -sh "${artifact_dir}/${whl_name}" | cut -f1)" + done + + if command -v buildkite-agent >/dev/null 2>&1; then + buildkite-agent artifact upload "${artifact_dir}/*" + echo "ROCm vLLM install artifacts uploaded to ${artifact_dir}/" + else + echo "Not in Buildkite, skipping artifact upload" + fi + + rm -rf "${wheel_dir}" +} + +main() { + init_config "$@" + print_header + validate_inputs + load_ci_hcl + compute_ci_base_hash_if_needed + configure_ci_base_image_refs + maybe_skip_existing_image + setup_builder + prepare_git_cache_metadata + write_ci_base_label_override + extract_dependency_pins + write_rocm_build_arg_override + compute_dependency_cache_keys + compute_rocm_csrc_content_hash_if_needed + write_rocm_cache_override + resolve_ci_base_dependency_targets + print_bake_config + if [[ "${BAKE_PRINT_ONLY:-0}" == "1" ]]; then + echo "BAKE_PRINT_ONLY=1 set; skipping build" + return 0 + fi + seed_dependency_caches_if_needed + run_bake + upload_wheel_artifacts_if_present +} + +main "$@" diff --git a/.buildkite/scripts/hardware_ci/run-amd-test.sh b/.buildkite/scripts/hardware_ci/run-amd-test.sh index 94bbc15fcff..953074c3882 100755 --- a/.buildkite/scripts/hardware_ci/run-amd-test.sh +++ b/.buildkite/scripts/hardware_ci/run-amd-test.sh @@ -52,6 +52,108 @@ cleanup_network() { fi } +prepare_artifact_image() { + if [[ "${VLLM_CI_USE_ARTIFACTS:-0}" != "1" ]]; then + return 1 + fi + if ! 
command -v buildkite-agent >/dev/null 2>&1; then + echo "buildkite-agent not found; cannot download ROCm wheel artifact" + return 1 + fi + + local artifact_glob="${VLLM_CI_ARTIFACT_GLOB:-artifacts/vllm-rocm-install/vllm-rocm-install.tar.gz}" + local archive="" + local metadata_file="" + local base_image="${VLLM_CI_BASE_IMAGE:-rocm/vllm-dev:ci_base}" + local artifact_image="" + local artifact_key="" + local base_digest="" + local wheel_dir="" + local context_dir="" + local workspace_dir="" + + artifact_work_dir=$(mktemp -d -t vllm-rocm-artifact.XXXXXX) + wheel_dir="${artifact_work_dir}/wheels" + context_dir="${artifact_work_dir}/context" + workspace_dir="${context_dir}/workspace" + mkdir -p "${wheel_dir}" "${context_dir}/wheels" "${workspace_dir}" + + echo "--- Downloading ROCm wheel artifact" + if ! buildkite-agent artifact download "${artifact_glob}" "${artifact_work_dir}"; then + echo "Failed to download ${artifact_glob}" + return 1 + fi + buildkite-agent artifact download \ + "artifacts/vllm-rocm-install/ci-base-image.txt" \ + "${artifact_work_dir}" >/dev/null 2>&1 || true + + archive=$(find "${artifact_work_dir}" -name "vllm-rocm-install.tar.gz" -type f | head -1) + if [[ -z "${archive}" || ! 
-f "${archive}" ]]; then + echo "ROCm wheel artifact archive was not found" + return 1 + fi + + metadata_file=$(find "${artifact_work_dir}" -name "ci-base-image.txt" -type f | head -1) + if [[ -n "${metadata_file}" && -s "${metadata_file}" ]]; then + base_image=$(tr -d '[:space:]' < "${metadata_file}") + fi + + echo "--- Preparing local ROCm test image" + echo "Base image: ${base_image}" + docker pull "${base_image}" || return 1 + base_digest=$( + docker image inspect \ + --format='{{if .RepoDigests}}{{index .RepoDigests 0}}{{else}}{{.Id}}{{end}}' \ + "${base_image}" 2>/dev/null || printf '%s' "${base_image}" + ) + + artifact_key=$( + { + printf 'base-image:%s\n' "${base_digest}" + sha256sum "${archive}" + } | sha256sum | cut -c1-24 + ) + artifact_image="rocm/vllm-ci-artifact:${artifact_key}" + + if docker image inspect "${artifact_image}" >/dev/null 2>&1; then + echo "Using existing local ROCm artifact image: ${artifact_image}" + image_name="${artifact_image}" + return 0 + fi + + tar -xzf "${archive}" -C "${wheel_dir}" || return 1 + if ! ls "${wheel_dir}"/*.whl >/dev/null 2>&1; then + echo "ROCm wheel artifact did not contain a wheel" + return 1 + fi + if [[ ! -d "${wheel_dir}/tests" ]]; then + echo "ROCm wheel artifact did not contain the test workspace" + return 1 + fi + + cp "${wheel_dir}"/*.whl "${context_dir}/wheels/" || return 1 + tar -C "${wheel_dir}" --exclude='*.whl' -cf - . 
\ + | tar -C "${workspace_dir}" -xf - || return 1 + cat > "${context_dir}/Dockerfile" <<'EOF' +ARG BASE_IMAGE +FROM ${BASE_IMAGE} +COPY wheels/ /tmp/vllm-wheels/ +COPY workspace/ /vllm-workspace/ +RUN python3 -m pip install --no-deps --force-reinstall /tmp/vllm-wheels/*.whl \ + && rm -rf /tmp/vllm-wheels +WORKDIR /vllm-workspace +EOF + + echo "--- Building local ROCm test image" + docker build \ + --pull=false \ + --build-arg "BASE_IMAGE=${base_image}" \ + -t "${artifact_image}" \ + "${context_dir}" || return 1 + image_name="${artifact_image}" + return 0 +} + is_multi_node() { local cmds="$1" # Primary signal: NUM_NODES environment variable set by the pipeline @@ -243,22 +345,30 @@ report_docker_usage # --- Pull test image --- echo "--- Pulling container" -image_name="rocm/vllm-ci:${BUILDKITE_COMMIT}" +image_name="${VLLM_CI_FALLBACK_IMAGE:-rocm/vllm-ci:${BUILDKITE_COMMIT:-local}}" +artifact_work_dir="" container_name="rocm_${BUILDKITE_COMMIT}_$(tr -dc A-Za-z0-9 < /dev/urandom | head -c 10; echo)" -docker pull "${image_name}" remove_docker_container() { - # docker run uses --rm, so the container is normally already gone when the - # EXIT trap runs. Cleanup is best-effort and must not affect the test result. - docker rm -f "${container_name}" >/dev/null 2>&1 || true + if docker container inspect "${container_name}" >/dev/null 2>&1; then + docker rm -f "${container_name}" || true + fi + if [[ "${VLLM_CI_REMOVE_TEST_IMAGE:-0}" == "1" ]]; then + docker image rm -f "${image_name}" || true + else + # Keep images by default so later jobs on the same AMD node can reuse layers. + echo "Keeping ROCm test image locally: ${image_name}" + fi + if [[ -n "${artifact_work_dir}" ]]; then + rm -rf "${artifact_work_dir}" + fi } +trap remove_docker_container EXIT -on_exit() { - local exit_code=$? - remove_docker_container - exit "$exit_code" -} -trap on_exit EXIT +if ! 
prepare_artifact_image; then + echo "Using full ROCm CI image: ${image_name}" + docker pull "${image_name}" || exit 1 +fi # --- Prepare commands --- echo "--- Running container" diff --git a/.dockerignore b/.dockerignore index 66447272e95..fb010600db9 100644 --- a/.dockerignore +++ b/.dockerignore @@ -33,3 +33,10 @@ share/python-wheels/ *.egg MANIFEST rust/target/ +# Not needed in Docker builds +docs/ +.github/ +.pre-commit-config.yaml +.clang-format +.gitattributes +format.sh diff --git a/cmake/utils.cmake b/cmake/utils.cmake index f10ba93f7c6..dd2034c1c5e 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -81,6 +81,14 @@ function (hipify_sources_target OUT_SRCS NAME ORIG_SRCS) set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_SRCS ${SRCS}) set_property(GLOBAL APPEND PROPERTY VLLM_HIPIFY_ALL_BYPRODUCTS ${HIP_SRCS}) + # Chain hipify targets so they run sequentially. Parallel hipify + # invocations race on shutil.copytree, overwriting .hip files + # produced by another target back to .cu originals. + if (DEFINED _VLLM_LAST_HIPIFY_TARGET) + add_dependencies(hipify${NAME} ${_VLLM_LAST_HIPIFY_TARGET}) + endif() + set(_VLLM_LAST_HIPIFY_TARGET "hipify${NAME}" PARENT_SCOPE) + # Swap out original extension sources with hipified sources. list(APPEND HIP_SRCS ${CXX_SRCS}) set(${OUT_SRCS} ${HIP_SRCS} PARENT_SCOPE) diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 61d73cd1527..1e39306e39f 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -2,6 +2,7 @@ ARG REMOTE_VLLM="0" ARG COMMON_WORKDIR=/app ARG BASE_IMAGE=rocm/vllm-dev:base +ARG CI_BASE_IMAGE=rocm/vllm-dev:ci_base # NIC backend for MoRI RDMA support. # By default (all), drivers and userspace libraries for all supported NIC types # (ainic and bnxt) are installed; MoRI selects the appropriate one at runtime. 
@@ -16,7 +17,8 @@ ARG NIC_BACKEND=all ARG AINIC_VERSION=1.117.3-hydra ARG UBUNTU_CODENAME=jammy -# Sccache configuration (only used in release pipeline) +# Sccache configuration. Release builds use this today; CI can opt in when a +# shared S3-compatible cache backend is available. ARG USE_SCCACHE ARG SCCACHE_DOWNLOAD_URL ARG SCCACHE_ENDPOINT @@ -29,12 +31,16 @@ FROM ${BASE_IMAGE} AS base ARG ARG_PYTORCH_ROCM_ARCH ENV PYTORCH_ROCM_ARCH=${ARG_PYTORCH_ROCM_ARCH:-${PYTORCH_ROCM_ARCH}} -# Install some basic utilities +# Install build dependencies and utilities RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ apt-transport-https ca-certificates wget curl \ - libnuma-dev -RUN python3 -m pip install --upgrade pip + libnuma-dev ccache mold +RUN --mount=type=cache,target=/root/.cache/pip \ + python3 -m pip install --upgrade pip +# Note: mold is installed but not set as the system default linker because +# some packages use JIT compilation at runtime with flags mold does not support. +# Build stages opt in via LDFLAGS="-fuse-ld=mold". # Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) ARG USE_SCCACHE RUN if [ "$USE_SCCACHE" != "1" ]; then \ @@ -55,6 +61,12 @@ ENV UV_HTTP_TIMEOUT=500 ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker cache mounts ENV UV_LINK_MODE=copy +# ccache directory - persisted across layer rebuilds via cache mounts. +ENV CCACHE_DIR=/root/.cache/ccache +ENV CCACHE_COMPILERCHECK=content +# Empty by default so build steps fall back to $(nproc); CI can override. 
+ARG max_jobs +ENV MAX_JOBS=${max_jobs} # Install sccache if USE_SCCACHE is enabled (for release builds) ARG USE_SCCACHE @@ -86,6 +98,7 @@ RUN if [ "$USE_SCCACHE" = "1" ]; then \ ARG USE_SCCACHE ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}} ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}} +ENV SCCACHE_ENDPOINT=${USE_SCCACHE:+${SCCACHE_ENDPOINT}} ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}} ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0} @@ -114,8 +127,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # Rust build stage # Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages -# don't need the rust toolchain or protoc. Runs in parallel with the main wheel -# build for faster end-to-end builds. +# don't need the rust toolchain or protoc. FROM fetch_vllm AS rust-build ARG COMMON_WORKDIR @@ -144,24 +156,74 @@ ENV RUSTUP_MAX_RETRIES=10 # layer for later COPY --from=rust-build. RUN --mount=type=cache,id=vllm-rocm-cargo-registry,target=/root/.cargo/registry,sharing=locked \ --mount=type=cache,id=vllm-rocm-cargo-git,target=/root/.cargo/git,sharing=locked \ + --mount=type=cache,id=vllm-rocm-cargo-target,target=${COMMON_WORKDIR}/vllm/rust/target,sharing=locked \ cd ${COMMON_WORKDIR}/vllm \ && VLLM_RS_TARGET_PATH=/tmp/vllm-rs bash build_rust.sh \ && test -x /tmp/vllm-rs # ----------------------- -# vLLM build stages +# vLLM native build stages +# +# csrc-build intentionally copies only files that affect ROCm native extension +# compilation. That keeps unrelated CI/test/docs edits from invalidating the +# expensive HIP/C++ build layer. 
+FROM base AS csrc-build +ARG COMMON_WORKDIR +WORKDIR ${COMMON_WORKDIR}/vllm + +COPY requirements/rocm.txt requirements/rocm.txt +COPY requirements/common.txt requirements/common.txt +RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \ + uv pip install --system -r requirements/rocm.txt + +# pyproject.toml is bind-mounted in the RUN step so metadata-only changes do +# not invalidate the expensive native build layer. +COPY setup.py CMakeLists.txt ./ +COPY cmake cmake/ +COPY csrc csrc/ +COPY vllm/envs.py vllm/envs.py +COPY vllm/__init__.py vllm/__init__.py + +ENV VLLM_TARGET_DEVICE=rocm +ENV SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0+rocm.csrc.build" + +RUN --mount=type=bind,source=pyproject.toml,target=${COMMON_WORKDIR}/vllm/pyproject.toml \ + --mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \ + export CCACHE_BASEDIR="$PWD" \ + && echo "=== ccache stats before ROCm native build ===" \ + && (ccache --show-stats || true) \ + && (ccache --zero-stats || true) \ + && EFFECTIVE_MAX_JOBS="${MAX_JOBS:-$(nproc)}" \ + && echo "Building ROCm native extension wheel with MAX_JOBS=${EFFECTIVE_MAX_JOBS}" \ + && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${EFFECTIVE_MAX_JOBS}" python3 setup.py bdist_wheel --dist-dir=dist \ + && test -d dist \ + && ls dist/*.whl >/dev/null \ + && echo "=== ccache stats after ROCm native build ===" \ + && (ccache --show-stats || true) + +# Build the full vLLM ROCm wheel by reusing the native extension wheel from +# csrc-build. This stage still rebuilds for Python/package changes, but skips +# the expensive HIP/C++ compile when native inputs are unchanged. FROM fetch_vllm AS build_vllm ARG COMMON_WORKDIR +ENV VLLM_TARGET_DEVICE=rocm + +COPY --from=csrc-build ${COMMON_WORKDIR}/vllm/dist /precompiled-wheels # Drop the pre-built rust frontend binary into the source tree. setup.py # detects it and ships it as-is, skipping the local cargo build. 
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs -# Build vLLM (setup.py auto-detects sccache in PATH) -RUN cd vllm \ - && python3 -m pip install -r requirements/rocm.txt \ - && python3 setup.py clean --all \ - && python3 setup.py bdist_wheel --dist-dir=dist +RUN --mount=type=cache,id=vllm-rocm-uv,target=/root/.cache/uv \ + cd vllm \ + && uv pip install --system -r requirements/rocm.txt \ + && export VLLM_USE_PRECOMPILED=1 \ + && export VLLM_PRECOMPILED_WHEEL_LOCATION="$(ls /precompiled-wheels/*.whl)" \ + && export VLLM_DOCKER_BUILD_CONTEXT=1 \ + && echo "Packaging vLLM ROCm wheel using precompiled extensions from ${VLLM_PRECOMPILED_WHEEL_LOCATION}" \ + && python3 setup.py bdist_wheel --dist-dir=dist \ + && test -d dist \ + && ls dist/*.whl >/dev/null FROM scratch AS export_vllm ARG COMMON_WORKDIR COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/dist/*.whl / @@ -171,6 +233,7 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/tests /tests COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/examples /examples COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 # RIXL/UCX build stages @@ -201,14 +264,17 @@ RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \ ibverbs-providers \ && rm -rf /var/lib/apt/lists/* -RUN uv pip install --system meson auditwheel patchelf tomlkit +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system meson auditwheel patchelf tomlkit -RUN cd /usr/local/src && \ +RUN --mount=type=cache,target=/root/.cache/ccache \ + cd /usr/local/src && \ git clone ${UCX_REPO} && \ cd ucx && \ git checkout ${UCX_BRANCH} && \ ./autogen.sh && \ mkdir build && cd build && \ + CC="ccache gcc" CXX="ccache g++" \ ../configure \ --prefix=/usr/local/ucx \ --enable-shared \ @@ -220,20 
+286,22 @@ RUN cd /usr/local/src && \ --with-verbs \ --with-dm \ --enable-mt && \ - make -j && \ + make -j$(nproc) && \ make install ENV PATH=/usr/local/ucx/bin:$PATH ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH} -RUN git clone ${RIXL_REPO} /opt/rixl && \ +RUN --mount=type=cache,target=/root/.cache/ccache \ + git clone ${RIXL_REPO} /opt/rixl && \ cd /opt/rixl && \ git checkout ${RIXL_BRANCH} && \ + CC="ccache gcc" CXX="ccache g++" \ meson setup build --prefix=${RIXL_HOME} \ -Ducx_path=${UCX_HOME} \ -Drocm_path=${ROCM_PATH} && \ cd build && \ - ninja && \ + ninja -j$(nproc) && \ ninja install # Generate RIXL wheel @@ -250,30 +318,44 @@ RUN cd /opt/rixl && \ --ucx-plugins-dir ${UCX_HOME}/lib/ucx \ --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins -# DeepEP build stage -FROM base AS build_deep +# ROCShmem build stage - split from DeepEP so changing DEEPEP_BRANCH does not +# invalidate the slow ROCShmem build. +FROM base AS build_rocshmem ARG ROCSHMEM_BRANCH="f0acb0c6" ARG ROCSHMEM_REPO="https://github.com/ROCm/rocm-systems.git" -ARG DEEPEP_BRANCH="a9ea9774" -ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git" -ARG DEEPEP_NIC="cx7" +# DeepEP only supports gfx942 and gfx950; build ROCShmem for the same set so +# it can be linked against DeepEP without arch mismatches. 
ARG DEEPEP_ROCM_ARCH="gfx942;gfx950" +ENV ROCM_PATH=/opt/rocm ENV ROCSHMEM_DIR=/opt/rocshmem -RUN git clone ${ROCSHMEM_REPO} \ +RUN --mount=type=cache,target=/root/.cache/ccache \ + git clone --no-checkout --filter=blob:none ${ROCSHMEM_REPO} \ && cd rocm-systems \ + && git sparse-checkout set --cone projects/rocshmem \ && git checkout ${ROCSHMEM_BRANCH} \ && mkdir -p projects/rocshmem/build \ && cd projects/rocshmem/build \ - && INSTALL_PREFIX=${ROCSHMEM_DIR} \ - ../scripts/build_configs/all_backends -DUSE_EXTERNAL_MPI=OFF + && CC="ccache gcc" CXX="ccache g++" INSTALL_PREFIX=${ROCSHMEM_DIR} \ + bash ../scripts/build_configs/all_backends \ + -DROCM_PATH=${ROCM_PATH} \ + -DGPU_TARGETS="${DEEPEP_ROCM_ARCH}" \ + -DUSE_EXTERNAL_MPI=OFF -# Build DeepEP wheel. -# DeepEP looks for rocshmem at ROCSHMEM_DIR. -RUN git clone ${DEEPEP_REPO} \ +# DeepEP build stage - depends on ROCShmem, builds the HIP kernel wheel. +FROM build_rocshmem AS build_deepep +ARG DEEPEP_BRANCH="a9ea9774" +ARG DEEPEP_REPO="https://github.com/ROCm/DeepEP.git" +ARG DEEPEP_NIC="cx7" + +# Build DeepEP wheel. DeepEP looks for rocshmem at ROCSHMEM_DIR. +# DeepEP only supports gfx942 and gfx950, so avoid gfx90a in the default list. +RUN --mount=type=cache,target=/root/.cache/ccache \ + export PYTORCH_ROCM_ARCH="gfx942;gfx950" \ + && git clone ${DEEPEP_REPO} \ && cd DeepEP \ && git checkout ${DEEPEP_BRANCH} \ - && python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install + && LDFLAGS="-fuse-ld=mold" MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py --variant rocm --rocm-explicit-ctx --nic ${DEEPEP_NIC} bdist_wheel --dist-dir=/app/deep_install # MoRI runtime dependencies live in Dockerfile.rocm so NIC backend changes do # not force users to rebuild the long-lived Dockerfile.rocm_base image. 
@@ -372,8 +454,9 @@ RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \ # Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt) # This ensures setuptools_scm sees clean repo state for version detection RUN --mount=type=bind,source=.git,target=vllm/.git \ + --mount=type=cache,target=/root/.cache/uv \ cd vllm \ - && pip install setuptools_scm regex \ + && uv pip install --system setuptools_scm regex \ && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \ && echo "Detected vLLM version: ${VLLM_VERSION}" \ && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt @@ -409,18 +492,20 @@ RUN echo "Pinning vLLM dependencies to custom wheel versions..." \ && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt # Install dependencies using custom wheels from /install -RUN cd vllm \ +RUN --mount=type=cache,target=/root/.cache/uv \ + cd vllm \ && echo "Building vLLM with custom wheels from /install" \ - && python3 -m pip install --find-links /install -r requirements/rocm.txt \ - && python3 setup.py clean --all + && uv pip install --system --find-links /install -r requirements/rocm.txt # Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt -# (setup.py auto-detects sccache in PATH) +# (setup.py auto-detects ccache/sccache in PATH) RUN --mount=type=bind,source=.git,target=vllm/.git \ + --mount=type=cache,id=vllm-rocm-ccache,target=/root/.cache/ccache \ cd vllm \ + && export CCACHE_BASEDIR="$PWD" \ && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \ && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \ - && python3 setup.py bdist_wheel --dist-dir=dist + && MAX_JOBS="${MAX_JOBS:-$(nproc)}" python3 setup.py bdist_wheel --dist-dir=dist FROM scratch AS export_vllm_wheel_release ARG COMMON_WORKDIR @@ -431,112 +516,118 @@ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests 
/tests COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/pyproject.toml /pyproject.toml COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 # ----------------------- -# Test vLLM image -FROM mori_base AS test +# CI base image (Tier 1) - stable, rarely changing CI dependencies. +# Per-PR test builds pull this as CI_BASE_IMAGE so the test stage only layers +# in the vLLM artifacts for the current commit. +FROM mori_base AS ci_base +ARG COMMON_WORKDIR -RUN python3 -m pip install --upgrade pip && rm -rf /var/lib/apt/lists/* - -# Install vLLM using uv (inherited from base stage) -# Note: No -U flag to avoid upgrading PyTorch ROCm to CUDA version -RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ - --mount=type=cache,target=/root/.cache/uv \ - cd /install \ - && uv pip install --system -r requirements/rocm.txt \ - && uv pip install --system -r requirements/test/rocm.txt \ - && pip uninstall -y vllm \ - && uv pip install --system *.whl - -# Persist the built wheel in the image so python_only_compile_rocm.sh can -# reinstall it after removing compilers. The bind-mounted /install contents -# above are not available once that RUN step completes. -COPY --from=export_vllm /*.whl /opt/vllm-wheels/ - -# Update rdma-core to support latest rocshmem +# Update rdma-core to support latest rocshmem. ARG DEEPEP_NIC RUN if [ "${DEEPEP_NIC}" = "cx7" ] || [ "${DEEPEP_NIC}" = "io" ]; then \ git clone --branch v62.0 --depth 1 https://github.com/linux-rdma/rdma-core.git /tmp/rdma-core && \ cd /tmp/rdma-core && \ mkdir -p build && cd build && \ cmake -GNinja -DCMAKE_INSTALL_PREFIX=/usr -DNO_MAN_PAGES=1 .. 
&& \ - ninja && ninja install && ldconfig && rm -rf /tmp/rdma-core; \ + ninja && ninja install && ldconfig && rm -rf /tmp/rdma-core; \ fi -# Install RIXL wheel +# Install RIXL + DeepEP wheels. RUN --mount=type=bind,from=build_rixl,src=/app/install,target=/rixl_install \ - uv pip install --system /rixl_install/*.whl + --mount=type=bind,from=build_deepep,src=/app/deep_install,target=/deep_install \ + uv pip install --system /rixl_install/*.whl /deep_install/*.whl -# Install DeepEP wheel -RUN --mount=type=bind,from=build_deep,src=/app/deep_install,target=/deep_install \ - uv pip install --system /deep_install/*.whl -COPY --from=build_deep /opt/rocshmem /opt/rocshmem +# Copy ROCShmem runtime libraries. +COPY --from=build_rocshmem /opt/rocshmem /opt/rocshmem -# RIXL/MoRIIO runtime dependencies (RDMA userspace libraries) -RUN apt-get update -q -y && apt-get install -q -y \ +# RDMA userspace libraries plus FFmpeg dev libs needed by torchcodec. +RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \ librdmacm1 \ libibverbs1 \ ibverbs-providers \ ibverbs-utils \ + pkg-config ffmpeg libavcodec-dev libavformat-dev libavutil-dev \ + libswscale-dev libavdevice-dev libavfilter-dev libswresample-dev \ && rm -rf /var/lib/apt/lists/* -WORKDIR /vllm-workspace -ARG COMMON_WORKDIR -COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace - -# install development dependencies (for testing) -RUN cd /vllm-workspace \ - && python3 -m pip install -e tests/vllm_test_utils \ - && python3 -m pip install pytest-shard - -# enable fast downloads from hf (for testing) -ENV HF_XET_HIGH_PERFORMANCE=1 - -# increase timeout for hf downloads (for testing) -ENV HF_HUB_DOWNLOAD_TIMEOUT 60 - -# install audio decode package `torchcodec` from source (required due to -# ROCm and torch version mismatch) for tests with datasets package +# Install torchcodec from source for ROCm/torch ABI compatibility. 
COPY tools/install_torchcodec_rocm.sh /tmp/install_torchcodec.sh -RUN bash /tmp/install_torchcodec.sh \ +RUN --mount=type=cache,target=/root/.cache/uv \ + --mount=type=cache,target=/root/.cache/pip \ + --mount=type=cache,target=/root/.cache/torchcodec-wheels \ + bash /tmp/install_torchcodec.sh \ && rm /tmp/install_torchcodec.sh \ - && apt-get clean \ - && rm -rf /var/lib/apt/lists/* + && apt-get clean && rm -rf /var/lib/apt/lists/* -# Copy in the v1 package (for python-only install test group) -COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 +# Pre-install shared ROCm runtime dependencies. +COPY requirements/common.txt requirements/rocm.txt /tmp/ci-base-requirements/ +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r /tmp/ci-base-requirements/rocm.txt \ + && rm -rf /tmp/ci-base-requirements -# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel +# Enable fast and less brittle model downloads in tests. +ENV HF_XET_HIGH_PERFORMANCE=1 +ENV HF_HUB_DOWNLOAD_TIMEOUT=60 + +# Pre-install vLLM test dependencies. +COPY requirements/test/rocm.txt /tmp/rocm-test-reqs.txt +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system -r /tmp/rocm-test-reqs.txt + +# Rebuild fastsafetensors from source so its C++ extension is compiled with +# USE_ROCM and can detect libamdhip64.so at runtime. +RUN --mount=type=cache,target=/root/.cache/pip \ + FASTSAFETENSORS_REQ="$(grep -E '^fastsafetensors(==| @ )' /tmp/rocm-test-reqs.txt | head -1)" \ + && test -n "${FASTSAFETENSORS_REQ}" \ + && python3 -m pip install --force-reinstall --no-deps \ + --no-binary fastsafetensors "${FASTSAFETENSORS_REQ}" \ + && rm /tmp/rocm-test-reqs.txt + +# Set MIOPEN ENVS to resolve performance regressions in MIOpen 3D convolution kernel. 
# See: https://github.com/pytorch/pytorch/issues/169857 ENV MIOPEN_DEBUG_CONV_DIRECT=0 ENV MIOPEN_DEBUG_CONV_GEMM=0 -# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc +# Use legacy IPC mode for HSA to avoid GPU memory pinning issues with UCX rocm_ipc. # See: https://github.com/ROCm/rocm-libraries/issues/6266 ENV HSA_ENABLE_IPC_MODE_LEGACY=1 -# Source code is used in the `python_only_compile.sh` test -# We hide it inside `src/` so that this source code -# will not be imported by other tests -RUN mkdir src && mv vllm src/vllm +# ROCm profiler limits workaround. +RUN echo "ROCTRACER_MAX_EVENTS=10000000" > ${COMMON_WORKDIR}/libkineto.conf +ENV KINETO_CONFIG="${COMMON_WORKDIR}/libkineto.conf" -# This is a workaround to ensure pytest exits with the correct status code in CI tests. -RUN printf '%s\n' \ - 'import os' \ - '' \ - '_exit_code = 1' \ - '' \ - 'def pytest_sessionfinish(session, exitstatus):' \ - ' global _exit_code' \ - ' _exit_code = int(exitstatus)' \ - '' \ - 'def pytest_unconfigure(config):' \ - ' import sys' \ - ' sys.stdout.flush()' \ - ' sys.stderr.flush()' \ - ' os._exit(_exit_code)' \ - > /vllm-workspace/conftest.py +# Install vllm_test_utils in ci_base for ci_base + wheel parity. +COPY tests/vllm_test_utils /tmp/vllm_test_utils +RUN --mount=type=cache,target=/root/.cache/uv \ + uv pip install --system /tmp/vllm_test_utils \ + && rm -rf /tmp/vllm_test_utils + +# ----------------------- +# Test vLLM image (Tier 2) - vLLM-only layer on top of ci_base. +FROM ${CI_BASE_IMAGE} AS test +ARG COMMON_WORKDIR + +# Install the vLLM wheel (--no-deps: all deps already in ci_base). +RUN --mount=type=bind,from=export_vllm,src=/,target=/install \ + --mount=type=cache,target=/root/.cache/uv \ + cd /install \ + && uv pip install --system --no-deps *.whl + +# Store the vLLM wheel in the image for python-only install tests. 
+COPY --from=export_vllm /*.whl /opt/vllm-wheels/ + +WORKDIR /vllm-workspace +COPY --from=build_vllm ${COMMON_WORKDIR}/vllm /vllm-workspace + +# Copy in the v1 package (for python-only install test group). +COPY --from=export_vllm /vllm_v1 /usr/local/lib/python${PYTHON_VERSION}/dist-packages/vllm/v1 + +# Hide source under src/ so it won't shadow the installed package in tests. +RUN mkdir src && mv vllm src/vllm # ----------------------- # Final vLLM image @@ -553,6 +644,7 @@ RUN rm -f /usr/bin/sccache || true \ # This prevents S3 bucket config from leaking into production images ENV SCCACHE_BUCKET= ENV SCCACHE_REGION= +ENV SCCACHE_ENDPOINT= ENV SCCACHE_S3_NO_CREDENTIALS= ENV SCCACHE_IDLE_TIMEOUT= diff --git a/docker/ci-rocm.hcl b/docker/ci-rocm.hcl new file mode 100644 index 00000000000..138adcffcad --- /dev/null +++ b/docker/ci-rocm.hcl @@ -0,0 +1,376 @@ +# ci-rocm.hcl - CI-specific configuration for vLLM ROCm Docker builds +# +# This file lives in the vLLM repo at docker/ci-rocm.hcl so ROCm Docker +# build mechanics can evolve with Dockerfile.rocm and docker-bake-rocm.hcl. +# Used with: docker buildx bake -f docker/docker-bake-rocm.hcl -f docker/ci-rocm.hcl test-rocm-ci +# +# Registry cache: Docker Hub (rocm/vllm-ci-cache) is used exclusively. +# AMD build agents already have Docker Hub credentials (they push the test +# image to rocm/vllm-ci), so no additional credential setup is required. +# ROCm CI uses Docker Hub for BuildKit layer cache by default. A separate +# compiler cache can be enabled with USE_SCCACHE=1 when AMD provides a shared +# S3-compatible cache endpoint. + +# CI metadata + +variable "BUILDKITE_COMMIT" { + default = "" +} + +variable "BUILDKITE_BUILD_NUMBER" { + default = "" +} + +variable "BUILDKITE_BUILD_ID" { + default = "" +} + +variable "PARENT_COMMIT" { + default = "" +} + +# Merge-base of HEAD with main - provides a more stable cache fallback than +# parent commit for long-lived PRs. 
Mirrors the VLLM_MERGE_BASE_COMMIT +# pattern used in the shared ci.hcl file. Auto-computed by ci-bake-rocm.sh +# when unset. +variable "VLLM_MERGE_BASE_COMMIT" { + default = "" +} + +# Bridge to vLLM's COMMIT variable for OCI labels +variable "COMMIT" { + default = BUILDKITE_COMMIT +} + +# Image tags (set by CI) + +variable "IMAGE_TAG" { + default = "" +} + +variable "IMAGE_TAG_LATEST" { + default = "" +} + +# ROCm-specific GPU architecture targets + +variable "PYTORCH_ROCM_ARCH" { + default = "gfx90a;gfx942;gfx950" +} + +# Pre-built CI base image (Tier 1). Per-PR builds pull this instead of +# rebuilding RIXL/DeepEP/torchcodec from scratch. The ci_base stage in +# Dockerfile.rocm inherits from base, so CI_BASE_IMAGE only affects the test +# stage and is irrelevant when building --target ci_base itself. +variable "CI_BASE_IMAGE" { + default = "rocm/vllm-dev:ci_base" +} + +# Leave CI_MAX_JOBS empty so the Dockerfile falls back to $(nproc) and uses +# the full builder parallelism. Operators can still override this per build. +variable "CI_MAX_JOBS" { + default = "" +} + +# Upstream dependency commit pins -- extracted from Dockerfile.rocm by +# ci-bake-rocm.sh at build time. Empty defaults are safe: the cache +# functions produce no entries when the variable is empty. +variable "RIXL_BRANCH" { + default = "" +} + +variable "UCX_BRANCH" { + default = "" +} + +variable "ROCSHMEM_BRANCH" { + default = "" +} + +variable "DEEPEP_BRANCH" { + default = "" +} + +variable "RIXL_CACHE_KEY" { + default = "" +} + +variable "ROCSHMEM_CACHE_KEY" { + default = "" +} + +variable "DEEPEP_CACHE_KEY" { + default = "" +} + +# Docker Hub registry cache for AMD builds. +# +# A separate repo (rocm/vllm-ci-cache) is used for BuildKit layer cache. +# Final-image cache exports use mode=min to reduce the volume of data pushed. +# Source-scoped csrc cache exports default to mode=max so fresh workers can +# recover more of the native build graph when ROCm extension inputs change. 
+# NOTE: mode=min still includes all layers referenced by the final image +# manifest, including inherited base layers (~7.25GB ROCm runtime). +# Docker Hub auto-creates the repo on first push. +# +# Final-image cache is exported per commit and, when ROCM_CACHE_BRANCH_TAG is +# set, per branch; imports also try the parent and merge-base commit refs. +# +# The source-scoped native cache is exported both per-commit and per-branch so +# ROCm extension rebuilds are shareable within the same commit reruns and across +# consecutive commits on the same branch without depending on a single global +# latest tag. + +variable "DOCKERHUB_CACHE_REPO" { + default = "rocm/vllm-ci-cache" +} + +variable "DOCKERHUB_CACHE_TO" { + default = "" +} + +variable "ROCM_CACHE_BRANCH_TAG" { + default = "" +} + +variable "ROCM_CACHE_UPSTREAM_BRANCH_TAG" { + default = "" +} + +variable "ROCM_CSRC_CACHE_TO_MODE" { + default = "max" +} + +variable "ROCM_FINAL_CACHE_TO_MODE" { + default = "min" +} + +# Functions + +function "get_cache_from_rocm" { + params = [] + result = compact([ + # Exact commit hit - fastest cache on re-runs of the same commit + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT}" : "", + # Parent commit - useful cache for incremental changes + PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${PARENT_COMMIT}" : "", + # Merge-base with main - stable fallback for long-lived or rebased PRs; + # maps to a real main-branch commit whose cache layers are likely warm + VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${VLLM_MERGE_BASE_COMMIT}" : "", + # Import the source-scoped native build cache as well so builds whose + # Python/package layers changed can still reuse compiled ROCm objects. + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "", + PARENT_COMMIT != "" ?
"type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "", + VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "", + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "", + ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "", + # Branch-scoped full image cache - fallback when parent-commit cache is evicted + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "", + ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "", + ]) +} + +function "get_cache_to_rocm" { + params = [] + result = compact([ + # Commit-scoped cache for exact re-runs. + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-${BUILDKITE_COMMIT},mode=${ROCM_FINAL_CACHE_TO_MODE}" : "", + # Branch-scoped cache so later commits on the same branch can reuse the full + # image layers when the parent-commit cache is evicted. Unlike the old + # rocm-latest tag (which caused duplicate exporter 400s), this is per-branch. + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${ROCM_FINAL_CACHE_TO_MODE}" : "", + ]) +} + +function "get_cache_from_rocm_csrc" { + params = [] + result = compact([ + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT}" : "", + PARENT_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${PARENT_COMMIT}" : "", + VLLM_MERGE_BASE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${VLLM_MERGE_BASE_COMMIT}" : "", + ROCM_CACHE_BRANCH_TAG != "" ? 
"type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG}" : "", + ROCM_CACHE_UPSTREAM_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_UPSTREAM_BRANCH_TAG}" : "", + ]) +} + +function "get_cache_to_rocm_csrc" { + params = [] + result = compact([ + # Export the exact-commit native cache for same-commit reruns. + BUILDKITE_COMMIT != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-${BUILDKITE_COMMIT},mode=${ROCM_CSRC_CACHE_TO_MODE}" : "", + # Export the branch-scoped native cache so later commits on the same branch + # can reuse compiled ROCm objects even when the exact parent cache is absent. + ROCM_CACHE_BRANCH_TAG != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:csrc-rocm-branch-${ROCM_CACHE_BRANCH_TAG},mode=${ROCM_CSRC_CACHE_TO_MODE}" : "", + ]) +} + +# Cache functions for upstream dependency stages (RIXL/UCX, ROCShmem, DeepEP). +# These stages are pinned to specific upstream commit hashes, so cache keys use +# those hashes rather than the Buildkite commit. This means the cache persists +# across all vLLM commits as long as the upstream dependency pins don't change. + +function "get_cache_from_rocm_deps" { + params = [] + result = compact([ + RIXL_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_CACHE_KEY}" : (RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH}" : ""), + ROCSHMEM_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY}" : (ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH}" : ""), + DEEPEP_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_CACHE_KEY}" : (DEEPEP_BRANCH != "" ? 
"type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH}" : ""), + ]) +} + +function "get_cache_to_rocm_rixl" { + params = [] + result = compact([ + RIXL_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_CACHE_KEY},mode=min" : (RIXL_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rixl-rocm-${RIXL_BRANCH}-ucx-${UCX_BRANCH},mode=min" : ""), + ]) +} + +function "get_cache_to_rocm_rocshmem" { + params = [] + result = compact([ + ROCSHMEM_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_CACHE_KEY},mode=min" : (ROCSHMEM_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:rocshmem-rocm-${ROCSHMEM_BRANCH},mode=min" : ""), + ]) +} + +function "get_cache_to_rocm_deepep" { + params = [] + result = compact([ + DEEPEP_CACHE_KEY != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_CACHE_KEY},mode=min" : (DEEPEP_BRANCH != "" ? "type=registry,ref=${DOCKERHUB_CACHE_REPO}:deepep-rocm-${DEEPEP_BRANCH}-rocshmem-${ROCSHMEM_BRANCH},mode=min" : ""), + ]) +} + +# CI targets + +target "_ci-rocm" { + annotations = [ + "manifest:vllm.buildkite.build_number=${BUILDKITE_BUILD_NUMBER}", + "manifest:vllm.buildkite.build_id=${BUILDKITE_BUILD_ID}", + ] + args = { + ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH + CI_BASE_IMAGE = CI_BASE_IMAGE + max_jobs = CI_MAX_JOBS + } +} + +target "test-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm", "_labels"] + target = "test" + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + tags = compact([ + IMAGE_TAG, + IMAGE_TAG_LATEST, + ]) + output = ["type=registry"] +} + +# Cache-only target for the source-scoped ROCm native build stage. +# This persists the csrc-build stage in the registry cache even though the +# final test image only consumes it indirectly while packaging the wheel. 
+target "csrc-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "csrc-build" + cache-from = get_cache_from_rocm_csrc() + cache-to = get_cache_to_rocm_csrc() + output = ["type=cacheonly"] +} + +# Keep wheel export on the same CI graph as the test image build so the +# shared build_vllm/export_vllm stages resolve identically within one bake +# invocation. Without this, export-wheel-rocm uses the plain local target +# args while test-rocm-ci uses CI-only args, which can lead to separate +# cache lineages and inconsistent export_vllm results. +target "export-wheel-rocm" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "export_vllm" + cache-from = get_cache_from_rocm() + cache-to = get_cache_to_rocm() + output = ["type=local,dest=./wheel-export"] +} + +# Artifact-only vLLM build. GPU test jobs consume this artifact on top of +# ci_base, avoiding a per-commit multi-GB image push/pull. +group "test-rocm-ci-with-artifacts" { + targets = ["csrc-rocm-ci", "export-wheel-rocm"] +} + +# Full test image + wheel export. Kept for fallback/debugging when a pushed +# per-commit image is useful. +group "test-rocm-ci-with-wheel" { + targets = ["csrc-rocm-ci", "test-rocm-ci", "export-wheel-rocm"] +} + +# Image tags for the ci_base build. ci-bake-rocm.sh rewrites CI_BASE_IMAGE_TAG +# to the primary tag for this build. Non-nightly builds use a commit-scoped tag +# and also publish a content tag for reuse. NIGHTLY=1 builds on the stable branch +# can additionally set CI_BASE_IMAGE_TAG_STABLE to refresh rocm/vllm-dev:ci_base. +variable "CI_BASE_IMAGE_TAG" { + default = "rocm/vllm-dev:ci_base" +} + +variable "CI_BASE_IMAGE_TAG_CONTENT" { + default = "" +} + +variable "CI_BASE_IMAGE_TAG_STABLE" { + default = "" +} + +# Cache-only targets for upstream dependency stages. These persist each stage +# in the registry cache keyed by its upstream commit hash. 
When ci_base rebuilds +# (e.g., requirements change), these stages are cache hits if their upstream +# pins haven't changed -- saving ~35min of compilation. +target "rixl-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "build_rixl" + cache-from = get_cache_from_rocm_deps() + cache-to = get_cache_to_rocm_rixl() + output = ["type=cacheonly"] +} + +target "rocshmem-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "build_rocshmem" + cache-from = get_cache_from_rocm_deps() + cache-to = get_cache_to_rocm_rocshmem() + output = ["type=cacheonly"] +} + +target "deepep-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm"] + target = "build_deepep" + cache-from = get_cache_from_rocm_deps() + cache-to = get_cache_to_rocm_deepep() + output = ["type=cacheonly"] +} + +# Builds only the ci_base stage (RIXL, DeepEP, torchcodec, etc.) +# Invoked by the ensure-ci-base step when the content hash of ci_base-affecting +# files drifts from the remote image label. Per-PR builds then pull the result +# as CI_BASE_IMAGE instead of rebuilding those slow layers on every commit. +# Uses inline cache metadata on the ci_base image itself instead of exporting a +# separate registry cache artifact. +target "ci-base-rocm-ci" { + inherits = ["_common-rocm", "_ci-rocm", "_labels"] + target = "ci_base" + cache-from = concat( + compact([ + CI_BASE_IMAGE_TAG != "" ? "type=registry,ref=${CI_BASE_IMAGE_TAG}" : "", + CI_BASE_IMAGE_TAG_CONTENT != "" ? "type=registry,ref=${CI_BASE_IMAGE_TAG_CONTENT}" : "", + CI_BASE_IMAGE_TAG_STABLE != "" ? "type=registry,ref=${CI_BASE_IMAGE_TAG_STABLE}" : "", + ]), + # Import upstream dependency caches so RIXL/ROCShmem/DeepEP stages + # are cache hits even when ci_base itself needs rebuilding. 
+ get_cache_from_rocm_deps(), + ) + cache-to = ["type=inline"] + tags = compact([CI_BASE_IMAGE_TAG, CI_BASE_IMAGE_TAG_CONTENT, CI_BASE_IMAGE_TAG_STABLE]) + output = ["type=registry"] +} + +# Group for ci_base builds -- exports dependency stage caches alongside the +# ci_base image so future rebuilds can reuse them independently. +group "ci-base-rocm-ci-with-deps" { + targets = ["rixl-rocm-ci", "rocshmem-rocm-ci", "deepep-rocm-ci", "ci-base-rocm-ci"] +} diff --git a/docker/docker-bake-rocm.hcl b/docker/docker-bake-rocm.hcl new file mode 100644 index 00000000000..6b51781834b --- /dev/null +++ b/docker/docker-bake-rocm.hcl @@ -0,0 +1,143 @@ +# docker-bake-rocm.hcl - vLLM ROCm Docker build configuration +# +# This file lives in the vLLM repo at docker/docker-bake-rocm.hcl +# Equivalent of docker-bake.hcl for ROCm builds. +# +# Usage: +# docker buildx bake -f docker/docker-bake-rocm.hcl # Build test (default) +# docker buildx bake -f docker/docker-bake-rocm.hcl final-rocm # Build final image +# docker buildx bake -f docker/docker-bake-rocm.hcl --print # Show resolved config +# +# CI usage (with the vLLM-owned CI overlay): +# docker buildx bake -f docker/docker-bake-rocm.hcl -f docker/ci-rocm.hcl test-rocm-ci + +variable "MAX_JOBS" { + # Empty string lets the Dockerfile fall back to $(nproc) via + # MAX_JOBS="${MAX_JOBS:-$(nproc)}" in each RUN step, which uses all + # available cores on whatever machine the build runs on. + # Override with --set '*.args.max_jobs=8' for local builds on small machines. + default = "" +} + +variable "PYTORCH_ROCM_ARCH" { + default = "gfx90a;gfx942;gfx950" +} + +variable "COMMIT" { + default = "" +} + +# Content hash of ci_base-affecting files. Computed by ci-bake-rocm.sh and +# embedded as a label so future builds can compare without rebuilding. 
+variable "CI_BASE_CONTENT_HASH" { + default = "" +} + +# REMOTE_VLLM=0: use local source via Docker build context (ONBUILD COPY ./ vllm/) +# REMOTE_VLLM=1: clone from GitHub at VLLM_BRANCH (standalone builds without local source) +variable "REMOTE_VLLM" { + default = "0" +} + +variable "VLLM_BRANCH" { + default = "main" +} + +# CI_BASE_IMAGE: pre-built ci_base image for per-PR test builds. +# Defaults to the published "rocm/vllm-dev:ci_base" image, so local builds +# use it too; override CI_BASE_IMAGE to base the test stage on another image. +variable "CI_BASE_IMAGE" { + default = "rocm/vllm-dev:ci_base" +} + +# Upstream dependency commit pins. Plain local bake builds use the Dockerfile +# ARG defaults. ci-bake-rocm.sh resolves those defaults (plus any env +# overrides) and writes a small HCL override before invoking CI targets. +variable "RIXL_BRANCH" { + default = "" +} + +variable "UCX_BRANCH" { + default = "" +} + +variable "ROCSHMEM_BRANCH" { + default = "" +} + +variable "DEEPEP_BRANCH" { + default = "" +} + +group "default" { + targets = ["test-rocm"] +} + +target "_common-rocm" { + dockerfile = "docker/Dockerfile.rocm" + context = "."
+ args = { + max_jobs = MAX_JOBS + ARG_PYTORCH_ROCM_ARCH = PYTORCH_ROCM_ARCH + REMOTE_VLLM = REMOTE_VLLM + VLLM_BRANCH = VLLM_BRANCH + CI_BASE_IMAGE = CI_BASE_IMAGE + } +} + +target "_labels" { + labels = { + "org.opencontainers.image.source" = "https://github.com/vllm-project/vllm" + "org.opencontainers.image.vendor" = "vLLM" + "org.opencontainers.image.title" = "vLLM ROCm" + "org.opencontainers.image.description" = "vLLM: A high-throughput and memory-efficient inference and serving engine for LLMs (ROCm)" + "org.opencontainers.image.licenses" = "Apache-2.0" + "org.opencontainers.image.revision" = COMMIT + } + annotations = [ + "manifest:org.opencontainers.image.revision=${COMMIT}", + ] +} + +target "test-rocm" { + inherits = ["_common-rocm", "_labels"] + target = "test" + tags = ["rocm/vllm:test"] + output = ["type=docker"] +} + +# CI base image target - builds only the ci_base stage (RIXL, DeepEP, +# torchcodec, requirements, etc.). Used by the weekly scheduled build and +# the auto-rebuild trigger when requirements change in a PR. +target "ci-base-rocm" { + inherits = ["_common-rocm", "_labels"] + target = "ci_base" + labels = { + "vllm.ci_base.content_hash" = CI_BASE_CONTENT_HASH + } + tags = ["rocm/vllm-dev:ci_base"] + output = ["type=docker"] +} + +# Wheel export target - extracts the built vLLM wheel + test workspace +# to local disk. Used by CI to upload the wheel as a Buildkite artifact +# so test jobs can assemble images locally from ci_base + wheel instead +# of pulling the full large image from Docker Hub. +# +# Usage: +# docker buildx bake -f docker/docker-bake-rocm.hcl export-wheel-rocm +# # Creates ./wheel-export/*.whl, ./wheel-export/requirements/, etc. +# +# After a full bake build, BuildKit cache makes this nearly instant. 
+target "export-wheel-rocm" { + inherits = ["_common-rocm"] + target = "export_vllm" + output = ["type=local,dest=./wheel-export"] +} + +target "final-rocm" { + inherits = ["_common-rocm", "_labels"] + target = "final" + tags = ["rocm/vllm:latest"] + output = ["type=docker"] +} diff --git a/tools/install_torchcodec_rocm.sh b/tools/install_torchcodec_rocm.sh index 6cb3b39fd66..210d7b24145 100755 --- a/tools/install_torchcodec_rocm.sh +++ b/tools/install_torchcodec_rocm.sh @@ -3,12 +3,16 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project # Script to install TorchCodec from source (required for ROCm compatibility) +# The PyPI wheel is built against upstream PyTorch and has ABI mismatches with +# ROCm's custom torch build, so we must compile from source. set -e TORCHCODEC_REPO="${TORCHCODEC_REPO:-https://github.com/pytorch/torchcodec.git}" # Pin to a specific release for reproducibility; update as needed. TORCHCODEC_BRANCH="${TORCHCODEC_BRANCH:-v0.10.0}" +# Cache directory for pre-built wheels to avoid redundant recompilation. +TORCHCODEC_WHEEL_CACHE="${TORCHCODEC_WHEEL_CACHE:-/root/.cache/torchcodec-wheels}" echo "=== TorchCodec Installation Script ===" @@ -18,9 +22,26 @@ if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then exit 0 fi +# Try a cached wheel first. Wheels keep their built filename (pip rejects non-standard wheel names). +ARCH_TAG="${PYTORCH_ROCM_ARCH:-all}" +# Normalize arch tag (replace ; with _) for use in the cache directory name +ARCH_TAG="${ARCH_TAG//;/_}" +CACHED_WHEEL_DIR="${TORCHCODEC_WHEEL_CACHE}/${TORCHCODEC_BRANCH}-${ARCH_TAG}" +CACHED_WHEEL=$(ls "$CACHED_WHEEL_DIR"/torchcodec-*.whl 2>/dev/null | head -1) +if [ -n "$CACHED_WHEEL" ]; then + echo "Found cached wheel: $CACHED_WHEEL" + pip install "$CACHED_WHEEL" && { + echo "Installed from cached wheel." + echo "=== TorchCodec installation complete ===" + exit 0 + } + echo "Cached wheel installation failed, rebuilding from source..." +fi + echo "TorchCodec not found. Installing from source..."
-# Install system dependencies (FFmpeg + pkg-config) +# Install system dependencies (FFmpeg + pkg-config) if not already present. +# The Docker test image pre-installs these, so this is a fallback for other envs. install_system_deps() { if command -v apt-get &> /dev/null; then echo "Installing system dependencies..." @@ -56,6 +77,12 @@ export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir( export CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}" echo "pybind11_DIR set to: $pybind11_DIR" +# Report the GPU architectures this build was asked to target. +# NOTE(review): this only logs PYTORCH_ROCM_ARCH; confirm the build honors it. +if [ -n "$PYTORCH_ROCM_ARCH" ]; then + echo "Building for PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH" +fi + # Create temp directory for build BUILD_DIR=$(mktemp -d -t torchcodec-XXXXXX) echo "Building in temporary directory: $BUILD_DIR" @@ -77,9 +104,31 @@ cd torchcodec export TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build" export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1 export I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1 +# Use ninja for faster builds and parallelize compilation +export CMAKE_GENERATOR=Ninja +export MAX_JOBS="${MAX_JOBS:-$(nproc)}" +# Use ccache if available to speed up recompilation +if command -v ccache &> /dev/null; then + export CMAKE_C_COMPILER_LAUNCHER=ccache + export CMAKE_CXX_COMPILER_LAUNCHER=ccache +fi -echo "Building TorchCodec..." -pip install . --no-build-isolation +echo "Building TorchCodec (MAX_JOBS=$MAX_JOBS)..." +pip wheel . --no-build-isolation --no-deps -w "$BUILD_DIR/dist" + +# Install the built wheel +BUILT_WHEEL=$(ls "$BUILD_DIR/dist"/torchcodec-*.whl 2>/dev/null | head -1) +if [ -z "$BUILT_WHEEL" ]; then + echo "Error: No wheel produced" + exit 1 +fi + +pip install "$BUILT_WHEEL" + +# Cache the wheel for future runs, replacing any stale wheel stored under this key +mkdir -p "$CACHED_WHEEL_DIR" && rm -f "$CACHED_WHEEL_DIR"/torchcodec-*.whl +cp "$BUILT_WHEEL" "$CACHED_WHEEL_DIR/" +echo "Cached wheel to: $CACHED_WHEEL_DIR/$(basename "$BUILT_WHEEL")" # Verify installation echo "Verifying installation..."
@@ -88,4 +137,4 @@ if python3 -c "from torchcodec.decoders import VideoDecoder; print('TorchCodec i else echo "Error: TorchCodec installation failed verification" exit 1 -fi \ No newline at end of file +fi