[ROCm][CI] Optimize ROCm Docker build: registry cache, DeepEP, and ci-bake script (#36949)

Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
Andreas Karatzas
2026-06-03 01:43:07 -05:00
committed by GitHub
parent 71df063c49
commit 87954eb50e
10 changed files with 2746 additions and 158 deletions
+53 -4
View File
@@ -3,12 +3,16 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Script to install TorchCodec from source (required for ROCm compatibility)
# The PyPI wheel is built against upstream PyTorch and has ABI mismatches with
# ROCm's custom torch build, so we must compile from source.
# Abort on the first command that fails outside of a condition.
set -o errexit

# Source repository for TorchCodec; keeps any value already set in the
# environment and falls back to the upstream GitHub repository otherwise.
: "${TORCHCODEC_REPO:=https://github.com/pytorch/torchcodec.git}"
# Git ref to build. Defaults to a pinned release for reproducibility; bump it
# (or override it from the environment) when a newer TorchCodec is needed.
: "${TORCHCODEC_BRANCH:=v0.10.0}"
# Directory holding pre-built wheels so later runs can skip recompilation.
: "${TORCHCODEC_WHEEL_CACHE:=/root/.cache/torchcodec-wheels}"

echo "=== TorchCodec Installation Script ==="
@@ -18,9 +22,26 @@ if python3 -c "from torchcodec.decoders import VideoDecoder" 2>/dev/null; then
exit 0
fi
# Try to install from a cached wheel first. The cache is keyed by the
# TorchCodec ref and the target GPU architecture(s).
ARCH_TAG="${PYTORCH_ROCM_ARCH:-all}"
# Normalize arch tag (replace ; with _) so it is safe inside a path component.
ARCH_TAG="${ARCH_TAG//;/_}"
# CACHED_WHEEL names a per-(ref, arch) cache *directory*, not a renamed wheel
# file. pip only accepts wheels whose filename follows
# {name}-{version}-{python tag}-{abi tag}-{platform tag}.whl, so a file called
# torchcodec-<ref>-<arch>.whl is rejected as an invalid wheel filename and the
# cache could never be hit. Storing the wheel under the filename the build
# produced avoids that: copying a wheel to "$CACHED_WHEEL" places it inside
# this directory with its name intact.
CACHED_WHEEL="${TORCHCODEC_WHEEL_CACHE}/torchcodec-${TORCHCODEC_BRANCH}-${ARCH_TAG}"
# Create the directory now so that a later copy lands inside it, and so that an
# unwritable cache location is reported before the (slow) build, not after it.
mkdir -p "$CACHED_WHEEL"

cached_wheel_seen=0
for cached_wheel_file in "$CACHED_WHEEL"/torchcodec-*.whl; do
    # An unmatched glob is left as a literal pattern; skip anything that is
    # not an actual file.
    [ -f "$cached_wheel_file" ] || continue
    cached_wheel_seen=1
    echo "Found cached wheel: $cached_wheel_file"
    # Used as an `if` condition so a failing install does not trip errexit.
    if pip install "$cached_wheel_file"; then
        echo "Installed from cached wheel."
        echo "=== TorchCodec installation complete ==="
        exit 0
    fi
done
if [ "$cached_wheel_seen" -eq 1 ]; then
    echo "Cached wheel installation failed, rebuilding from source..."
fi
echo "TorchCodec not found. Installing from source..."
# Install system dependencies (FFmpeg + pkg-config)
# Install system dependencies (FFmpeg + pkg-config) if not already present.
# The Docker test image pre-installs these, so this is a fallback for other envs.
install_system_deps() {
if command -v apt-get &> /dev/null; then
echo "Installing system dependencies..."
@@ -56,6 +77,12 @@ export pybind11_DIR=$(python3 -c "import pybind11; print(pybind11.get_cmake_dir(
# Put pybind11's CMake directory at the front of CMake's package search path.
CMAKE_PREFIX_PATH="${pybind11_DIR}:${CMAKE_PREFIX_PATH}"
export CMAKE_PREFIX_PATH
echo "pybind11_DIR set to: $pybind11_DIR"

# Log the GPU architectures requested for this image. This block only reports
# the value. NOTE(review): PYTORCH_ROCM_ARCH is presumably what restricts the
# build to those targets (building for every supported arch is very slow) —
# confirm against the TorchCodec/PyTorch build.
[ -z "$PYTORCH_ROCM_ARCH" ] || echo "Building for PYTORCH_ROCM_ARCH=$PYTORCH_ROCM_ARCH"

# Scratch directory that the build runs in.
BUILD_DIR="$(mktemp -d -t torchcodec-XXXXXX)"
echo "Building in temporary directory: $BUILD_DIR"
@@ -77,9 +104,31 @@ cd torchcodec
# Keep CMake's build tree in ./build under the current directory.
TORCHCODEC_CMAKE_BUILD_DIR="${PWD}/build"
export TORCHCODEC_CMAKE_BUILD_DIR
# Build switches read by TorchCodec's build: do not promote compiler warnings
# to errors, and set the licensing acknowledgement flag.
# NOTE(review): exact semantics of I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION are
# defined by TorchCodec's build scripts — confirm there.
export TORCHCODEC_DISABLE_COMPILE_WARNING_AS_ERROR=1 \
    I_CONFIRM_THIS_IS_NOT_A_LICENSE_VIOLATION=1

# Generate Ninja build files, and default the job count to the number of
# available processing units unless the caller already chose MAX_JOBS.
CMAKE_GENERATOR=Ninja
export CMAKE_GENERATOR
: "${MAX_JOBS:=$(nproc)}"
export MAX_JOBS

# Route C and C++ compiles through ccache when it is installed, so repeated
# builds can reuse cached object files.
if command -v ccache > /dev/null 2>&1; then
    for launcher_var in CMAKE_C_COMPILER_LAUNCHER CMAKE_CXX_COMPILER_LAUNCHER; do
        export "${launcher_var}=ccache"
    done
    unset launcher_var
fi
echo "Building TorchCodec..."
pip install . --no-build-isolation
echo "Building TorchCodec (MAX_JOBS=$MAX_JOBS)..."
pip wheel . --no-build-isolation --no-deps -w "$BUILD_DIR/dist"
# Install the built wheel.
# Locate it with a glob rather than by parsing `ls` output: the glob copes with
# any filename and needs no extra processes. Globs expand in sorted order, so
# the first match is taken, as before.
BUILT_WHEEL=""
for wheel_candidate in "$BUILD_DIR/dist"/torchcodec-*.whl; do
    # An unmatched glob is left as a literal pattern; only accept real files.
    if [ -f "$wheel_candidate" ]; then
        BUILT_WHEEL="$wheel_candidate"
        break
    fi
done
if [ -z "$BUILT_WHEEL" ]; then
    echo "Error: No wheel produced"
    exit 1
fi
pip install "$BUILT_WHEEL"

# Cache the wheel for future runs
mkdir -p "$TORCHCODEC_WHEEL_CACHE"
cp "$BUILT_WHEEL" "$CACHED_WHEEL"
echo "Cached wheel to: $CACHED_WHEEL"
# Verify installation
echo "Verifying installation..."
@@ -88,4 +137,4 @@ if python3 -c "from torchcodec.decoders import VideoDecoder; print('TorchCodec i
else
echo "Error: TorchCodec installation failed verification"
exit 1
fi
fi