mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Frontend][RFC] Rust front-end integration (#40848)
Signed-off-by: Nick Hill <nickhill123@gmail.com> Signed-off-by: Bugen Zhao <i@bugenzhao.com> Co-authored-by: Bugen Zhao <i@bugenzhao.com>
This commit is contained in:
@@ -223,6 +223,13 @@ echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
|
||||
|
||||
check_and_skip_if_image_exists
|
||||
|
||||
# The rust frontend lives in a git submodule under rust/. Buildkite's default
|
||||
# checkout does not recurse submodules, and the Dockerfile only sees what's in
|
||||
# the build context, so initialize the submodule here before invoking bake.
|
||||
echo "--- :git: Initializing git submodules"
|
||||
git submodule sync --recursive
|
||||
git submodule update --init --recursive
|
||||
|
||||
echo "--- :docker: Setting up Docker buildx bake"
|
||||
echo "Target: ${TARGET}"
|
||||
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
|
||||
|
||||
@@ -21,6 +21,12 @@ else
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# The rust frontend lives in a git submodule under rust/. Buildkite's default
|
||||
# checkout does not recurse submodules, and the Dockerfile only sees what's in
|
||||
# the build context, so initialize the submodule here before building.
|
||||
git submodule sync --recursive
|
||||
git submodule update --init --recursive
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
|
||||
@@ -21,6 +21,12 @@ else
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# The rust frontend lives in a git submodule under rust/. Buildkite's default
|
||||
# checkout does not recurse submodules, and the Dockerfile only sees what's in
|
||||
# the build context, so initialize the submodule here before building.
|
||||
git submodule sync --recursive
|
||||
git submodule update --init --recursive
|
||||
|
||||
# build
|
||||
docker build --file docker/Dockerfile.cpu \
|
||||
--build-arg max_jobs=16 \
|
||||
|
||||
@@ -11,7 +11,7 @@ REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
|
||||
|
||||
# skip build if image already exists
|
||||
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
|
||||
|
||||
@@ -11,8 +11,8 @@ REPO=$2
|
||||
BUILDKITE_COMMIT=$3
|
||||
|
||||
# authenticate with AWS ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
|
||||
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
|
||||
|
||||
# skip build if image already exists
|
||||
if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
|
||||
|
||||
@@ -0,0 +1,107 @@
|
||||
group: Rust Frontend
|
||||
depends_on:
|
||||
- image-build
|
||||
steps:
|
||||
- label: Rust Frontend OpenAI Coverage
|
||||
timeout_in_minutes: 90
|
||||
device: h200_18gb
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- rust/
|
||||
- vllm/benchmarks/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/entrypoints/serve/
|
||||
- vllm/v1/sample/
|
||||
- tests/utils.py
|
||||
- tests/benchmarks/test_serve_cli.py
|
||||
- tests/entrypoints/openai/chat_completion/test_chat_completion.py
|
||||
# - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
|
||||
# - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
|
||||
# - tests/entrypoints/openai/completion/test_prompt_validation.py
|
||||
- tests/entrypoints/openai/completion/test_shutdown.py
|
||||
# - tests/entrypoints/openai/test_return_token_ids.py
|
||||
# - tests/entrypoints/openai/test_uds.py
|
||||
- tests/v1/sample/test_logprobs_e2e.py
|
||||
commands:
|
||||
- export VLLM_USE_RUST_FRONTEND=1
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
|
||||
- pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
|
||||
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
|
||||
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
|
||||
# - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
|
||||
- pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly"
|
||||
# - pytest -v -s entrypoints/openai/test_return_token_ids.py
|
||||
# - pytest -v -s entrypoints/openai/test_uds.py
|
||||
- pytest -v -s v1/sample/test_logprobs_e2e.py -k "test_prompt_logprobs_e2e_server"
|
||||
|
||||
- label: Rust Frontend Serve/Admin Coverage
|
||||
timeout_in_minutes: 60
|
||||
device: h200_18gb
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- rust/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/entrypoints/serve/
|
||||
- vllm/v1/engine/
|
||||
- tests/utils.py
|
||||
# - tests/entrypoints/rpc/test_collective_rpc.py
|
||||
- tests/entrypoints/serve/disagg/test_serving_tokens.py
|
||||
- tests/entrypoints/serve/instrumentator/test_basic.py
|
||||
- tests/entrypoints/serve/instrumentator/test_metrics.py
|
||||
# - tests/entrypoints/serve/instrumentator/test_sleep.py
|
||||
commands:
|
||||
- export VLLM_USE_RUST_FRONTEND=1
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
# - pytest -v -s entrypoints/rpc/test_collective_rpc.py
|
||||
- pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load"
|
||||
- pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow"
|
||||
- pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist"
|
||||
# - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py
|
||||
|
||||
- label: Rust Frontend Core Correctness
|
||||
timeout_in_minutes: 30
|
||||
device: h200_18gb
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- rust/
|
||||
- vllm/entrypoints/openai/
|
||||
- tests/utils.py
|
||||
- tests/entrypoints/openai/correctness/test_lmeval.py
|
||||
commands:
|
||||
- export VLLM_USE_RUST_FRONTEND=1
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
|
||||
|
||||
- label: Rust Frontend Tool Use
|
||||
timeout_in_minutes: 60
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- rust/
|
||||
- vllm/entrypoints/openai/
|
||||
- vllm/tool_parsers/
|
||||
- tests/utils.py
|
||||
- tests/tool_use/
|
||||
commands:
|
||||
- export VLLM_USE_RUST_FRONTEND=1
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- pytest -v -s tool_use --ignore=tool_use/mistral --models llama3.2 -k "not test_response_format_with_tool_choice_required and not test_parallel_tool_calls_false and not test_tool_call_and_choice"
|
||||
|
||||
- label: Rust Frontend Distributed
|
||||
timeout_in_minutes: 30
|
||||
num_devices: 4
|
||||
working_dir: "/vllm-workspace/tests"
|
||||
source_file_dependencies:
|
||||
- rust/
|
||||
- vllm/distributed/
|
||||
- vllm/engine/
|
||||
- vllm/executor/
|
||||
- vllm/v1/engine/
|
||||
- vllm/v1/worker/
|
||||
- tests/utils.py
|
||||
- tests/v1/distributed/test_internal_lb_dp.py
|
||||
commands:
|
||||
- export VLLM_USE_RUST_FRONTEND=1
|
||||
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
|
||||
- export NCCL_CUMEM_HOST_ENABLE=0
|
||||
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py -k "not 4 and not server_info"
|
||||
@@ -2,6 +2,7 @@
|
||||
/build
|
||||
dist
|
||||
vllm/*.so
|
||||
vllm/vllm-rs
|
||||
|
||||
# Byte-compiled / optimized / DLL files
|
||||
__pycache__/
|
||||
@@ -31,3 +32,4 @@ share/python-wheels/
|
||||
.installed.cfg
|
||||
*.egg
|
||||
MANIFEST
|
||||
rust/target/
|
||||
|
||||
@@ -26,6 +26,9 @@ __pycache__/
|
||||
# C extensions
|
||||
*.so
|
||||
|
||||
# Rust binaries
|
||||
vllm/vllm-rs
|
||||
|
||||
# Distribution / packaging
|
||||
.Python
|
||||
build/
|
||||
|
||||
@@ -0,0 +1,3 @@
|
||||
[submodule "rust"]
|
||||
path = rust
|
||||
url = https://github.com/Inferact/vllm-frontend-rs.git
|
||||
Executable
+43
@@ -0,0 +1,43 @@
|
||||
#!/bin/bash
|
||||
# Build the vllm-rs Rust frontend binary and install it into the vllm package.
|
||||
# Usage: ./build_rust.sh [--debug]
|
||||
#
|
||||
# By default builds in release mode. Pass --debug for faster compile times
|
||||
# during development.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"
|
||||
RUST_DIR="$REPO_ROOT/rust"
|
||||
TARGET_PATH="$REPO_ROOT/vllm/vllm-rs"
|
||||
|
||||
# Read the required toolchain from rust-toolchain.toml.
|
||||
TOOLCHAIN=$(grep '^channel' "$RUST_DIR/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/')
|
||||
|
||||
# Ensure rustup and the required toolchain are available.
|
||||
if ! command -v rustup &>/dev/null; then
|
||||
echo "rustup not found, installing..."
|
||||
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none
|
||||
source "$HOME/.cargo/env"
|
||||
fi
|
||||
|
||||
if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then
|
||||
echo "Installing Rust toolchain: $TOOLCHAIN"
|
||||
rustup toolchain install "$TOOLCHAIN"
|
||||
fi
|
||||
|
||||
if [[ "${1:-}" == "--debug" ]]; then
|
||||
PROFILE_ARGS=()
|
||||
PROFILE_DIR="debug"
|
||||
else
|
||||
PROFILE_ARGS=(--release)
|
||||
PROFILE_DIR="release"
|
||||
fi
|
||||
|
||||
cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \
|
||||
--manifest-path "$RUST_DIR/Cargo.toml" \
|
||||
--bin vllm-rs \
|
||||
--features native-tls-vendored
|
||||
|
||||
cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH"
|
||||
echo "Installed vllm-rs to $TARGET_PATH"
|
||||
@@ -231,6 +231,63 @@ ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
|
||||
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
|
||||
#################### BUILD BASE IMAGE ####################
|
||||
|
||||
#################### RUST BUILD IMAGE ####################
|
||||
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the main wheel
|
||||
# build stage doesn't need the rust toolchain, protoc, or the rust source.
|
||||
# This stage runs in parallel with csrc-build/extensions-build.
|
||||
FROM ${BUILD_BASE_IMAGE} AS rust-build
|
||||
ARG BUILD_OS
|
||||
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
# Install protoc (used by tonic-build/prost-build) and a basic C toolchain
|
||||
# (some rust crates compile C in their build.rs scripts).
|
||||
RUN if [ "${BUILD_OS}" = "manylinux" ]; then \
|
||||
dnf install -y --setopt=install_weak_deps=False \
|
||||
ca-certificates curl git gcc gcc-c++ make \
|
||||
protobuf-compiler protobuf-devel \
|
||||
&& dnf clean all && rm -rf /var/cache/dnf; \
|
||||
else \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ca-certificates curl git build-essential \
|
||||
protobuf-compiler libprotobuf-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*; \
|
||||
fi
|
||||
|
||||
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
|
||||
sh -s -- -y --profile minimal --default-toolchain none
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Copy only the rust workspace — the binary is the sole artifact we need.
|
||||
COPY rust rust
|
||||
|
||||
# Fail loudly if the rust submodule was not initialized on the host before
|
||||
# `docker build`. Without this check, cargo would emit a confusing error.
|
||||
RUN if [ ! -f rust/Cargo.toml ]; then \
|
||||
echo "ERROR: rust/ submodule is not initialized."; \
|
||||
echo "Run 'git submodule update --init --recursive' on the host before building."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
|
||||
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
|
||||
ENV CARGO_BUILD_JOBS=4
|
||||
|
||||
# Build the release binary. Cache cargo registry/git and target/, but copy the
|
||||
# binary out of the target/ cache mount so it persists into the image layer
|
||||
# for later COPY --from=rust-build.
|
||||
RUN --mount=type=cache,target=/root/.cargo/registry \
|
||||
--mount=type=cache,target=/root/.cargo/git \
|
||||
--mount=type=cache,target=/workspace/rust/target \
|
||||
cd rust \
|
||||
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
|
||||
&& cp target/release/vllm-rs /workspace/vllm-rs
|
||||
#################### RUST BUILD IMAGE ####################
|
||||
|
||||
#################### CSRC BUILD IMAGE ####################
|
||||
FROM base AS csrc-build
|
||||
ARG TARGETPLATFORM
|
||||
@@ -435,6 +492,10 @@ WORKDIR /workspace
|
||||
COPY --from=csrc-build /workspace/dist /precompiled-wheels
|
||||
COPY . .
|
||||
|
||||
# Drop the pre-built rust frontend binary into the source tree. setup.py
|
||||
# detects it and ships it as-is, skipping the local cargo build.
|
||||
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
|
||||
@@ -455,6 +516,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=bind,source=.git,target=.git \
|
||||
if [ "${vllm_target_device}" = "cuda" ]; then \
|
||||
export VLLM_USE_PRECOMPILED=1; \
|
||||
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
|
||||
fi && \
|
||||
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
|
||||
|
||||
@@ -36,6 +36,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
|
||||
# Compiler and linker environment
|
||||
ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
|
||||
ENV CCACHE_DIR=/root/.cache/ccache
|
||||
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
|
||||
@@ -80,6 +81,51 @@ FROM base-${TARGETARCH} AS base
|
||||
|
||||
RUN echo 'ulimit -c 0' >> ~/.bashrc
|
||||
|
||||
######################### RUST BUILD IMAGE #########################
|
||||
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the wheel build
|
||||
# stage doesn't need the rust toolchain or protoc. This stage runs in parallel
|
||||
# with the main vllm-build stage.
|
||||
FROM ubuntu:22.04 AS rust-build
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ca-certificates curl git build-essential \
|
||||
protobuf-compiler libprotobuf-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
|
||||
sh -s -- -y --profile minimal --default-toolchain none
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
# Copy only the rust workspace — the binary is the sole artifact we need.
|
||||
COPY rust rust
|
||||
|
||||
# Fail loudly if the rust submodule was not initialized on the host before
|
||||
# `docker build`. Without this check, cargo would emit a confusing error.
|
||||
RUN if [ ! -f rust/Cargo.toml ]; then \
|
||||
echo "ERROR: rust/ submodule is not initialized."; \
|
||||
echo "Run 'git submodule update --init --recursive' on the host before building."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
|
||||
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
|
||||
ENV CARGO_BUILD_JOBS=4
|
||||
|
||||
# Build the release binary. Cache cargo registry/git and target/, but copy the
|
||||
# binary out of the target/ cache mount so it persists into the image layer
|
||||
# for later COPY --from=rust-build.
|
||||
RUN --mount=type=cache,target=/root/.cargo/registry \
|
||||
--mount=type=cache,target=/root/.cargo/git \
|
||||
--mount=type=cache,target=/workspace/rust/target \
|
||||
cd rust \
|
||||
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
|
||||
&& cp target/release/vllm-rs /workspace/vllm-rs
|
||||
|
||||
######################### BUILD IMAGE #########################
|
||||
FROM base AS vllm-build
|
||||
|
||||
@@ -114,6 +160,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
COPY . .
|
||||
|
||||
# Drop the pre-built rust frontend binary into the source tree. setup.py
|
||||
# detects it and ships it as-is, skipping the local cargo build.
|
||||
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
|
||||
|
||||
RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
@@ -94,6 +94,47 @@ RUN cat torch_build_versions.txt
|
||||
|
||||
#################### BASE BUILD IMAGE ####################
|
||||
|
||||
#################### RUST BUILD IMAGE ####################
|
||||
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the wheel build
|
||||
# stage doesn't need the rust toolchain or protoc.
|
||||
FROM ubuntu:22.04 AS rust-build
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ca-certificates curl git build-essential \
|
||||
protobuf-compiler libprotobuf-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
|
||||
sh -s -- -y --profile minimal --default-toolchain none
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY rust rust
|
||||
|
||||
# Fail loudly if the rust submodule was not initialized on the host before
|
||||
# `docker build`.
|
||||
RUN if [ ! -f rust/Cargo.toml ]; then \
|
||||
echo "ERROR: rust/ submodule is not initialized."; \
|
||||
echo "Run 'git submodule update --init --recursive' on the host before building."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
|
||||
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
|
||||
ENV CARGO_BUILD_JOBS=4
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cargo/registry \
|
||||
--mount=type=cache,target=/root/.cargo/git \
|
||||
--mount=type=cache,target=/workspace/rust/target \
|
||||
cd rust \
|
||||
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
|
||||
&& cp target/release/vllm-rs /workspace/vllm-rs
|
||||
#################### RUST BUILD IMAGE ####################
|
||||
|
||||
#################### WHEEL BUILD IMAGE ####################
|
||||
FROM base AS build
|
||||
ARG TARGETPLATFORM
|
||||
@@ -104,6 +145,10 @@ ENV UV_HTTP_TIMEOUT=500
|
||||
|
||||
COPY . .
|
||||
|
||||
# Drop the pre-built rust frontend binary into the source tree. setup.py
|
||||
# detects it and ships it as-is, skipping the local cargo build.
|
||||
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
|
||||
|
||||
RUN python3 use_existing_torch.py
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
|
||||
@@ -106,14 +106,61 @@ ONBUILD RUN git clone ${VLLM_REPO} \
|
||||
&& cd vllm \
|
||||
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
|
||||
&& git checkout FETCH_HEAD \
|
||||
&& git submodule update --init --recursive \
|
||||
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
|
||||
git remote add upstream "https://github.com/vllm-project/vllm.git" \
|
||||
&& git fetch upstream ; fi
|
||||
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
|
||||
|
||||
# -----------------------
|
||||
# Rust build stage
|
||||
# Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages
|
||||
# don't need the rust toolchain or protoc. Runs in parallel with the main wheel
|
||||
# build for faster end-to-end builds.
|
||||
FROM fetch_vllm AS rust-build
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
# Fail loudly if the rust submodule was not initialized on the host before
|
||||
# `docker build`. The rust frontend source is brought in via the fetch_vllm
|
||||
# stage, so an uninitialized submodule would otherwise produce a confusing
|
||||
# cargo failure.
|
||||
RUN if [ ! -f ${COMMON_WORKDIR}/vllm/rust/Cargo.toml ]; then \
|
||||
echo "ERROR: rust/ submodule is not initialized."; \
|
||||
echo "Run 'git submodule update --init --recursive' on the host before building."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# protoc is used by tonic-build/prost-build.
|
||||
RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
|
||||
protobuf-compiler libprotobuf-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
|
||||
sh -s -- -y --profile minimal --default-toolchain none
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
|
||||
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
|
||||
ENV CARGO_BUILD_JOBS=4
|
||||
|
||||
# Build the release binary. Cache cargo registry/git, and copy the binary out
|
||||
# so it persists into the image layer for later COPY --from=rust-build.
|
||||
RUN --mount=type=cache,target=/root/.cargo/registry \
|
||||
--mount=type=cache,target=/root/.cargo/git \
|
||||
cd ${COMMON_WORKDIR}/vllm/rust \
|
||||
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
|
||||
&& cp target/release/vllm-rs /tmp/vllm-rs
|
||||
|
||||
# -----------------------
|
||||
# vLLM build stages
|
||||
FROM fetch_vllm AS build_vllm
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
# Drop the pre-built rust frontend binary into the source tree. setup.py
|
||||
# detects it and ships it as-is, skipping the local cargo build.
|
||||
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
|
||||
|
||||
# Build vLLM (setup.py auto-detects sccache in PATH)
|
||||
RUN cd vllm \
|
||||
&& python3 -m pip install -r requirements/rocm.txt \
|
||||
@@ -293,6 +340,10 @@ FROM fetch_vllm AS build_vllm_wheel_release
|
||||
|
||||
ARG COMMON_WORKDIR
|
||||
|
||||
# Drop the pre-built rust frontend binary into the source tree. setup.py
|
||||
# detects it and ships it as-is, skipping the local cargo build.
|
||||
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
|
||||
|
||||
# Create /install directory for custom wheels
|
||||
RUN mkdir -p /install
|
||||
|
||||
|
||||
@@ -1,3 +1,43 @@
|
||||
######################### RUST BUILD IMAGE #########################
|
||||
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the main image
|
||||
# doesn't need the rust toolchain or protoc. Runs in parallel with vllm-base.
|
||||
FROM ubuntu:22.04 AS rust-build
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
|
||||
RUN apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends \
|
||||
ca-certificates curl git build-essential \
|
||||
protobuf-compiler libprotobuf-dev \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
|
||||
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
|
||||
sh -s -- -y --profile minimal --default-toolchain none
|
||||
ENV PATH="/root/.cargo/bin:${PATH}"
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY rust rust
|
||||
|
||||
# Fail loudly if the rust submodule was not initialized on the host before
|
||||
# `docker build`.
|
||||
RUN if [ ! -f rust/Cargo.toml ]; then \
|
||||
echo "ERROR: rust/ submodule is not initialized."; \
|
||||
echo "Run 'git submodule update --init --recursive' on the host before building."; \
|
||||
exit 1; \
|
||||
fi
|
||||
|
||||
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
|
||||
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
|
||||
ENV CARGO_BUILD_JOBS=4
|
||||
|
||||
RUN --mount=type=cache,target=/root/.cargo/registry \
|
||||
--mount=type=cache,target=/root/.cargo/git \
|
||||
--mount=type=cache,target=/workspace/rust/target \
|
||||
cd rust \
|
||||
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
|
||||
&& cp target/release/vllm-rs /workspace/vllm-rs
|
||||
|
||||
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
|
||||
|
||||
WORKDIR /workspace/
|
||||
@@ -99,6 +139,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
|
||||
|
||||
COPY . .
|
||||
|
||||
# Drop the pre-built rust frontend binary into the source tree. setup.py
|
||||
# detects it and ships it as-is, skipping the local cargo build.
|
||||
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
RUN --mount=type=bind,source=.git,target=.git \
|
||||
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
|
||||
|
||||
@@ -43,6 +43,13 @@ If you are only developing vLLM's Python code, install vLLM using:
|
||||
VLLM_USE_PRECOMPILED=1 uv pip install -e .
|
||||
```
|
||||
|
||||
To rebuild only the Rust frontend binary:
|
||||
|
||||
```bash
|
||||
./build_rust.sh # release build
|
||||
./build_rust.sh --debug # faster build for development
|
||||
```
|
||||
|
||||
If you are developing vLLM's Python and CUDA/C++ code, install Pytorch first:
|
||||
|
||||
```bash
|
||||
|
||||
@@ -136,10 +136,10 @@ When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
|
||||
3. **Selects compatible wheel** based on:
|
||||
- Package name (`vllm`)
|
||||
- Platform tag (architecture match)
|
||||
4. **Downloads and extracts** precompiled binaries from the wheel:
|
||||
- C++ extension modules (`.so` files)
|
||||
- Flash Attention Python modules
|
||||
- Triton kernel Python files
|
||||
4. **Downloads and extracts** precompiled artifacts from the wheel:
|
||||
- Native extension modules (`.so` files)
|
||||
- The `vllm-rs` Rust frontend binary
|
||||
- Flash Attention Python modules and Triton/FlashMLA Python files
|
||||
5. **Patches package_data** to include extracted files in the installation
|
||||
|
||||
!!! note "What is the base commit?"
|
||||
|
||||
@@ -101,12 +101,22 @@ This command will do the following:
|
||||
1. Look for the current branch in your vLLM clone.
|
||||
1. Identify the corresponding base commit in the main branch.
|
||||
1. Download the pre-built wheel of the base commit.
|
||||
1. Use its compiled libraries in the installation.
|
||||
1. Use its compiled libraries and `vllm-rs` binary in the installation.
|
||||
|
||||
!!! note
|
||||
1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
|
||||
2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.
|
||||
|
||||
!!! tip "Rebuilding the Rust frontend"
|
||||
If you need to recompile the `vllm-rs` Rust frontend binary, you can rebuild and install it without re-running the full pip install:
|
||||
|
||||
```bash
|
||||
./build_rust.sh # release build
|
||||
./build_rust.sh --debug # faster build for development
|
||||
```
|
||||
|
||||
This will install the required Rust toolchain if needed, build the binary, and place it in `vllm/vllm-rs`.
|
||||
|
||||
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the `main` branch was just merged and its precompiled wheel is not available yet. You can wait around an hour and retry, or set `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly` to automatically select the most recent already-built commit on `main`.
|
||||
|
||||
```bash
|
||||
|
||||
@@ -6,6 +6,7 @@ requires = [
|
||||
"packaging>=24.2",
|
||||
"setuptools>=77.0.3,<81.0.0",
|
||||
"setuptools-scm>=8.0",
|
||||
"setuptools-rust>=1.9.0",
|
||||
"torch == 2.11.0",
|
||||
"wheel",
|
||||
"jinja2",
|
||||
|
||||
@@ -4,6 +4,7 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools==77.0.3 # this version can reuse CMake build dir
|
||||
setuptools-scm>=8
|
||||
setuptools-rust>=1.9.0
|
||||
torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64"
|
||||
torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
|
||||
wheel
|
||||
|
||||
@@ -4,6 +4,7 @@ ninja
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<81.0.0
|
||||
setuptools-scm>=8
|
||||
setuptools-rust>=1.9.0
|
||||
torch==2.11.0
|
||||
wheel
|
||||
jinja2>=3.1.6
|
||||
|
||||
@@ -15,6 +15,7 @@ tensorizer==2.10.1
|
||||
packaging>=24.2
|
||||
setuptools>=77.0.3,<80.0.0
|
||||
setuptools-scm>=8
|
||||
setuptools-rust>=1.9.0
|
||||
runai-model-streamer[s3,gcs,azure]==0.15.7
|
||||
conch-triton-kernels==1.2.1
|
||||
timm>=1.0.17
|
||||
|
||||
Submodule
+1
Submodule rust added at ad6771ac09
@@ -0,0 +1,2 @@
|
||||
[toolchain]
|
||||
channel = "1.95"
|
||||
@@ -18,6 +18,8 @@ import torch
|
||||
from packaging.version import Version, parse
|
||||
from setuptools import Extension, setup
|
||||
from setuptools.command.build_ext import build_ext
|
||||
from setuptools_rust import Binding, RustExtension
|
||||
from setuptools_rust.build import build_rust
|
||||
from setuptools_scm import get_version
|
||||
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
|
||||
|
||||
@@ -33,11 +35,24 @@ def load_module_from_path(module_name, path):
|
||||
ROOT_DIR = Path(__file__).parent
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PRECOMPILED_RUST_FRONTEND_PATH = ROOT_DIR / "vllm" / "vllm-rs"
|
||||
|
||||
# cannot import envs directly because it depends on vllm,
|
||||
# which is not installed yet
|
||||
envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))
|
||||
|
||||
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
|
||||
USE_PRECOMPILED_EXTENSIONS = envs.VLLM_USE_PRECOMPILED
|
||||
# VLLM_USE_PRECOMPILED implies precompiled rust frontend too.
|
||||
USE_PRECOMPILED_RUST_FRONTEND = (
|
||||
envs.VLLM_USE_PRECOMPILED or envs.VLLM_USE_PRECOMPILED_RUST
|
||||
)
|
||||
|
||||
|
||||
def should_require_rust_frontend() -> bool:
|
||||
value = os.getenv("VLLM_REQUIRE_RUST_FRONTEND", "")
|
||||
return value.lower() not in ("", "0", "false", "no")
|
||||
|
||||
|
||||
if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
|
||||
logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
|
||||
@@ -405,6 +420,24 @@ class precompiled_build_ext(build_ext):
|
||||
return
|
||||
|
||||
|
||||
class precompiled_build_rust(build_rust):
|
||||
"""Skips local Rust builds when the precompiled wheel already ships vllm-rs."""
|
||||
|
||||
def run(self) -> None:
|
||||
if PRECOMPILED_RUST_FRONTEND_PATH.exists():
|
||||
logger.info(
|
||||
"Skipping local Rust build: using precompiled %s",
|
||||
PRECOMPILED_RUST_FRONTEND_PATH,
|
||||
)
|
||||
return
|
||||
|
||||
logger.warning(
|
||||
"Precompiled wheel did not provide %s; falling back to local Rust build.",
|
||||
PRECOMPILED_RUST_FRONTEND_PATH,
|
||||
)
|
||||
super().run()
|
||||
|
||||
|
||||
class precompiled_wheel_utils:
|
||||
"""Extracts libraries and other files from an existing wheel."""
|
||||
|
||||
@@ -653,7 +686,11 @@ class precompiled_wheel_utils:
|
||||
|
||||
@staticmethod
|
||||
def extract_precompiled_and_patch_package(
|
||||
wheel_url_or_path: str, download_filename: str | None
|
||||
wheel_url_or_path: str,
|
||||
download_filename: str | None,
|
||||
*,
|
||||
extract_extensions: bool,
|
||||
extract_rust_frontend: bool,
|
||||
) -> dict:
|
||||
import tempfile
|
||||
import zipfile
|
||||
@@ -676,20 +713,26 @@ class precompiled_wheel_utils:
|
||||
package_data_patch = {}
|
||||
|
||||
with zipfile.ZipFile(wheel_path) as wheel:
|
||||
files_to_copy = [
|
||||
"vllm/_C.abi3.so",
|
||||
"vllm/_C_stable_libtorch.abi3.so",
|
||||
"vllm/_moe_C.abi3.so",
|
||||
"vllm/_flashmla_C.abi3.so",
|
||||
"vllm/_flashmla_extension_C.abi3.so",
|
||||
"vllm/_sparse_flashmla_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
"vllm/spinloop.abi3.so",
|
||||
# ROCm-specific libraries
|
||||
"vllm/_rocm_C.abi3.so",
|
||||
]
|
||||
exact_members = set()
|
||||
if extract_extensions:
|
||||
exact_members.update(
|
||||
{
|
||||
"vllm/_C.abi3.so",
|
||||
"vllm/_C_stable_libtorch.abi3.so",
|
||||
"vllm/_moe_C.abi3.so",
|
||||
"vllm/_flashmla_C.abi3.so",
|
||||
"vllm/_flashmla_extension_C.abi3.so",
|
||||
"vllm/_sparse_flashmla_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
|
||||
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
|
||||
"vllm/cumem_allocator.abi3.so",
|
||||
"vllm/spinloop.abi3.so",
|
||||
# ROCm-specific libraries
|
||||
"vllm/_rocm_C.abi3.so",
|
||||
}
|
||||
)
|
||||
if extract_rust_frontend:
|
||||
exact_members.add("vllm/vllm-rs")
|
||||
|
||||
flash_attn_regex = re.compile(
|
||||
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
|
||||
@@ -708,27 +751,25 @@ class precompiled_wheel_utils:
|
||||
)
|
||||
# DeepGEMM: extract all files (.py, .so, .cuh, .h, .hpp, etc.)
|
||||
deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*")
|
||||
file_members = list(
|
||||
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
|
||||
)
|
||||
file_members += list(
|
||||
filter(
|
||||
lambda x: flash_attn_regex.match(x.filename)
|
||||
and x.filename not in flash_attn_files_to_skip,
|
||||
wheel.filelist,
|
||||
)
|
||||
)
|
||||
file_members += list(
|
||||
filter(
|
||||
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
|
||||
)
|
||||
)
|
||||
file_members += list(
|
||||
filter(lambda x: flashmla_regex.match(x.filename), wheel.filelist)
|
||||
)
|
||||
file_members += list(
|
||||
filter(lambda x: deep_gemm_regex.match(x.filename), wheel.filelist)
|
||||
)
|
||||
file_members = []
|
||||
for member in wheel.filelist:
|
||||
if member.filename in exact_members:
|
||||
file_members.append(member)
|
||||
continue
|
||||
|
||||
if not extract_extensions:
|
||||
continue
|
||||
|
||||
if (
|
||||
(
|
||||
flash_attn_regex.match(member.filename)
|
||||
and member.filename not in flash_attn_files_to_skip
|
||||
)
|
||||
or triton_kernels_regex.match(member.filename)
|
||||
or flashmla_regex.match(member.filename)
|
||||
or deep_gemm_regex.match(member.filename)
|
||||
):
|
||||
file_members.append(member)
|
||||
|
||||
for file in file_members:
|
||||
print(f"[extract] {file.filename}")
|
||||
@@ -739,6 +780,9 @@ class precompiled_wheel_utils:
|
||||
open(target_path, "wb") as dst,
|
||||
):
|
||||
shutil.copyfileobj(src, dst)
|
||||
mode = file.external_attr >> 16
|
||||
if mode:
|
||||
os.chmod(target_path, mode)
|
||||
|
||||
pkg = os.path.dirname(file.filename).replace("/", ".")
|
||||
package_data_patch.setdefault(pkg, []).append(
|
||||
@@ -911,7 +955,7 @@ def get_vllm_version() -> str:
|
||||
if envs.VLLM_TARGET_DEVICE == "empty":
|
||||
version += f"{sep}empty"
|
||||
elif _is_cuda():
|
||||
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
|
||||
if USE_PRECOMPILED_EXTENSIONS and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
|
||||
version += f"{sep}precompiled"
|
||||
else:
|
||||
cuda_version = str(get_nvcc_cuda_version())
|
||||
@@ -1006,7 +1050,7 @@ if _is_hip():
|
||||
|
||||
if _is_cuda():
|
||||
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
|
||||
if envs.VLLM_USE_PRECOMPILED or (
|
||||
if USE_PRECOMPILED_EXTENSIONS or (
|
||||
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
|
||||
):
|
||||
# FA3 requires CUDA 12.3 or later
|
||||
@@ -1016,7 +1060,7 @@ if _is_cuda():
|
||||
ext_modules.append(
|
||||
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True)
|
||||
)
|
||||
if envs.VLLM_USE_PRECOMPILED or (
|
||||
if USE_PRECOMPILED_EXTENSIONS or (
|
||||
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9")
|
||||
):
|
||||
# FlashMLA requires CUDA 12.9 or later
|
||||
@@ -1065,15 +1109,25 @@ package_data = {
|
||||
}
|
||||
|
||||
|
||||
# If using precompiled, extract and patch package_data (in advance of setup)
|
||||
if envs.VLLM_USE_PRECOMPILED:
|
||||
# If using precompiled artifacts, extract and patch package_data in advance.
|
||||
if USE_PRECOMPILED_RUST_FRONTEND:
|
||||
wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
|
||||
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
|
||||
wheel_url, download_filename
|
||||
wheel_url,
|
||||
download_filename,
|
||||
extract_extensions=USE_PRECOMPILED_EXTENSIONS,
|
||||
extract_rust_frontend=True,
|
||||
)
|
||||
for pkg, files in patch.items():
|
||||
package_data.setdefault(pkg, []).extend(files)
|
||||
|
||||
# If the rust frontend binary is already present in the source tree (e.g.,
|
||||
# pre-built in a separate Docker build stage), ship it as-is.
|
||||
if PRECOMPILED_RUST_FRONTEND_PATH.exists():
|
||||
vllm_files = package_data.setdefault("vllm", [])
|
||||
if "vllm-rs" not in vllm_files:
|
||||
vllm_files.append("vllm-rs")
|
||||
|
||||
if _no_device():
|
||||
ext_modules = []
|
||||
|
||||
@@ -1082,14 +1136,32 @@ if not ext_modules:
|
||||
else:
|
||||
cmdclass = {
|
||||
"build_ext": precompiled_build_ext
|
||||
if envs.VLLM_USE_PRECOMPILED
|
||||
if USE_PRECOMPILED_EXTENSIONS
|
||||
else cmake_build_ext,
|
||||
}
|
||||
if USE_PRECOMPILED_RUST_FRONTEND or PRECOMPILED_RUST_FRONTEND_PATH.exists():
|
||||
cmdclass["build_rust"] = precompiled_build_rust
|
||||
|
||||
# Rust frontend binary, built via setuptools-rust and installed into the
|
||||
# package directory alongside the Python modules.
|
||||
# TODO: we may use `RustBin` to directly install it into `bin` directory, but this
|
||||
# requires extra work on using precompiled binaries.
|
||||
rust_extensions = [
|
||||
RustExtension(
|
||||
target="vllm.vllm-rs",
|
||||
path="rust/src/cmd/Cargo.toml",
|
||||
args=["--bin", "vllm-rs"],
|
||||
features=["native-tls-vendored"],
|
||||
binding=Binding.Exec,
|
||||
optional=not should_require_rust_frontend(),
|
||||
),
|
||||
]
|
||||
|
||||
setup(
|
||||
# static metadata should rather go in pyproject.toml
|
||||
version=get_vllm_version(),
|
||||
ext_modules=ext_modules,
|
||||
rust_extensions=rust_extensions,
|
||||
install_requires=get_requirements(),
|
||||
extras_require={
|
||||
# AMD Zen CPU optimizations via zentorch
|
||||
|
||||
@@ -299,22 +299,16 @@ async def test_abort_timeout_exits_quickly(wait_for_engine_idle: float):
|
||||
start_time = time.time()
|
||||
proc.send_signal(signal.SIGTERM)
|
||||
|
||||
# abort timeout (0) should stop the server promptly. On ROCm, process
|
||||
# exit can spend extra time in HIP/RCCL/native extension teardown after
|
||||
# the server and engine have already shut down.
|
||||
max_exit_time = 4.0 if _IS_ROCM else 2.1
|
||||
|
||||
# abort timeout (0) should stop the server promptly.
|
||||
try:
|
||||
proc.wait(timeout=max_exit_time)
|
||||
proc.wait(timeout=4.0)
|
||||
except subprocess.TimeoutExpired:
|
||||
proc.kill()
|
||||
proc.wait(timeout=5)
|
||||
pytest.fail("Process did not exit after SIGTERM with abort timeout")
|
||||
|
||||
exit_time = time.time() - start_time
|
||||
assert exit_time < max_exit_time, (
|
||||
f"Default shutdown took too long: {exit_time:.1f}s"
|
||||
)
|
||||
assert exit_time < 4.1, f"Default shutdown took too long: {exit_time:.1f}s"
|
||||
assert proc.returncode in (0, -15, None), f"Unexpected: {proc.returncode}"
|
||||
|
||||
await _assert_children_cleaned_up(child_pids)
|
||||
|
||||
@@ -103,6 +103,19 @@ def test_is_envs_cache_enabled() -> None:
|
||||
assert not envs._is_envs_cache_enabled()
|
||||
|
||||
|
||||
def test_precompiled_install_flags_are_orthogonal() -> None:
|
||||
with patch.dict(
|
||||
os.environ,
|
||||
{
|
||||
"VLLM_PRECOMPILED_WHEEL_LOCATION": "/tmp/vllm.whl",
|
||||
"VLLM_USE_PRECOMPILED_RUST": "1",
|
||||
},
|
||||
clear=False,
|
||||
):
|
||||
assert environment_variables["VLLM_USE_PRECOMPILED"]() is False
|
||||
assert environment_variables["VLLM_USE_PRECOMPILED_RUST"]() is True
|
||||
|
||||
|
||||
class TestEnvWithChoices:
|
||||
"""Test cases for env_with_choices function."""
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import argparse
|
||||
import signal
|
||||
|
||||
import uvloop
|
||||
|
||||
@@ -110,6 +111,14 @@ def cmd_init() -> list[CLISubcommand]:
|
||||
|
||||
async def run_launch_fastapi(args: argparse.Namespace) -> None:
|
||||
"""Run the online serving layer with FastAPI (no GPU inference)."""
|
||||
|
||||
# Interrupt initialization if SIGTERM arrives before uvicorn installs
|
||||
# its own signal handlers. Once uvicorn is running it replaces this.
|
||||
def _interrupt_init(*_) -> None:
|
||||
raise KeyboardInterrupt("terminated")
|
||||
|
||||
signal.signal(signal.SIGTERM, _interrupt_init)
|
||||
|
||||
# 1. Socket binding
|
||||
listen_address, sock = setup_server(args)
|
||||
|
||||
|
||||
@@ -21,7 +21,11 @@ from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
|
||||
from vllm.v1.executor import Executor
|
||||
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
|
||||
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
|
||||
from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
|
||||
from vllm.v1.utils import (
|
||||
APIServerProcessManager,
|
||||
RustFrontendProcessManager,
|
||||
wait_for_completion_or_failure,
|
||||
)
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
@@ -81,11 +85,12 @@ class ServeSubcommand(CLISubcommand):
|
||||
)
|
||||
|
||||
# Default api_server_count if not explicitly set.
|
||||
# - Rust frontend: Use 1 (not applicable as it's multithreaded)
|
||||
# - External LB: Leave as 1 (external LB handles distribution)
|
||||
# - Hybrid LB: Use local DP size (internal LB for local ranks only)
|
||||
# - Internal LB: Use full DP size
|
||||
if args.api_server_count is None:
|
||||
if is_external_lb:
|
||||
if is_external_lb or envs.VLLM_RUST_FRONTEND_PATH:
|
||||
args.api_server_count = 1
|
||||
elif is_hybrid_lb:
|
||||
args.api_server_count = args.data_parallel_size_local or 1
|
||||
@@ -102,6 +107,11 @@ class ServeSubcommand(CLISubcommand):
|
||||
"Defaulting api_server_count to data_parallel_size (%d).",
|
||||
args.api_server_count,
|
||||
)
|
||||
elif envs.VLLM_RUST_FRONTEND_PATH and args.api_server_count > 1:
|
||||
logger.warning(
|
||||
"Ignoring --api-server-count=%d when using rust front-end process"
|
||||
)
|
||||
args.api_server_count = 1
|
||||
|
||||
# Elastic EP currently only supports running with at most one API server.
|
||||
if getattr(args, "enable_elastic_ep", False) and args.api_server_count > 1:
|
||||
@@ -114,7 +124,7 @@ class ServeSubcommand(CLISubcommand):
|
||||
|
||||
if args.api_server_count < 1:
|
||||
run_headless(args)
|
||||
elif args.api_server_count > 1:
|
||||
elif args.api_server_count > 1 or envs.VLLM_RUST_FRONTEND_PATH:
|
||||
run_multi_api_server(args)
|
||||
else:
|
||||
# Single API server (this process).
|
||||
@@ -230,9 +240,15 @@ def run_headless(args: argparse.Namespace):
|
||||
|
||||
def run_multi_api_server(args: argparse.Namespace):
|
||||
assert not args.headless
|
||||
rust_frontend_path = envs.VLLM_RUST_FRONTEND_PATH
|
||||
num_api_servers: int = args.api_server_count
|
||||
assert num_api_servers > 0
|
||||
|
||||
if rust_frontend_path and num_api_servers > 1:
|
||||
raise ValueError(
|
||||
"VLLM_RUST_FRONTEND_PATH does not support api_server_count > 1"
|
||||
)
|
||||
|
||||
if num_api_servers > 1:
|
||||
setup_multiprocess_prometheus()
|
||||
|
||||
@@ -270,7 +286,9 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
dp_rank = parallel_config.data_parallel_rank
|
||||
assert parallel_config.local_engines_only or dp_rank == 0
|
||||
|
||||
api_server_manager: APIServerProcessManager | None = None
|
||||
api_server_manager: APIServerProcessManager | RustFrontendProcessManager | None = (
|
||||
None
|
||||
)
|
||||
|
||||
from vllm.v1.engine.utils import get_engine_zmq_addresses
|
||||
|
||||
@@ -279,23 +297,34 @@ def run_multi_api_server(args: argparse.Namespace):
|
||||
with launch_core_engines(
|
||||
vllm_config, executor_class, log_stats, addresses, num_api_servers
|
||||
) as (local_engine_manager, coordinator, addresses, tensor_queue):
|
||||
# Construct common args for the APIServerProcessManager up-front.
|
||||
stats_update_address = None
|
||||
if coordinator:
|
||||
stats_update_address = coordinator.get_stats_publish_address()
|
||||
|
||||
# Start API servers.
|
||||
api_server_manager = APIServerProcessManager(
|
||||
listen_address=listen_address,
|
||||
sock=sock,
|
||||
args=args,
|
||||
num_servers=num_api_servers,
|
||||
input_addresses=addresses.inputs,
|
||||
output_addresses=addresses.outputs,
|
||||
stats_update_address=stats_update_address,
|
||||
tensor_queue=tensor_queue,
|
||||
stats_update_address = (
|
||||
coordinator.get_stats_publish_address() if coordinator else None
|
||||
)
|
||||
|
||||
if rust_frontend_path:
|
||||
# Start rust front-end process.
|
||||
api_server_manager = RustFrontendProcessManager(
|
||||
binary_path=rust_frontend_path,
|
||||
sock=sock,
|
||||
args=args,
|
||||
input_address=addresses.inputs[0],
|
||||
output_address=addresses.outputs[0],
|
||||
engine_count=parallel_config.data_parallel_size,
|
||||
stats_update_address=stats_update_address,
|
||||
)
|
||||
else:
|
||||
# Start API server(s).
|
||||
api_server_manager = APIServerProcessManager(
|
||||
listen_address=listen_address,
|
||||
sock=sock,
|
||||
args=args,
|
||||
num_servers=num_api_servers,
|
||||
input_addresses=addresses.inputs,
|
||||
output_addresses=addresses.outputs,
|
||||
stats_update_address=stats_update_address,
|
||||
tensor_queue=tensor_queue,
|
||||
)
|
||||
|
||||
# Wait for API servers.
|
||||
try:
|
||||
wait_for_completion_or_failure(
|
||||
|
||||
@@ -533,8 +533,7 @@ def validate_api_server_args(args):
|
||||
|
||||
@instrument(span_name="API server setup")
|
||||
def setup_server(args):
|
||||
"""Validate API server args, set up signal handler, create socket
|
||||
ready to serve."""
|
||||
"""Validate API server args and create the server socket."""
|
||||
|
||||
log_version_and_model(logger, VLLM_VERSION, args.model)
|
||||
log_non_default_args(args)
|
||||
@@ -560,12 +559,6 @@ def setup_server(args):
|
||||
# many concurrent requests active
|
||||
set_ulimit()
|
||||
|
||||
def signal_handler(*_) -> None:
|
||||
# Interrupt server on sigterm while initializing
|
||||
raise KeyboardInterrupt("terminated")
|
||||
|
||||
signal.signal(signal.SIGTERM, signal_handler)
|
||||
|
||||
if args.uds:
|
||||
listen_address = f"unix:{args.uds}"
|
||||
else:
|
||||
@@ -675,6 +668,13 @@ async def run_server(args, **uvicorn_kwargs) -> None:
|
||||
# Add process-specific prefix to stdout and stderr.
|
||||
decorate_logs("APIServer")
|
||||
|
||||
# Interrupt initialization if SIGTERM arrives before uvicorn installs its
|
||||
# own signal handlers. Once uvicorn is running it replaces this.
|
||||
def _interrupt_init(*_) -> None:
|
||||
raise KeyboardInterrupt("terminated")
|
||||
|
||||
signal.signal(signal.SIGTERM, _interrupt_init)
|
||||
|
||||
listen_address, sock = setup_server(args)
|
||||
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ from argparse import Namespace
|
||||
from http import HTTPStatus
|
||||
from logging import Logger
|
||||
from string import Template
|
||||
from typing import Any
|
||||
|
||||
import regex as re
|
||||
from fastapi import Request
|
||||
@@ -210,7 +211,7 @@ def get_max_tokens(
|
||||
)
|
||||
|
||||
|
||||
def log_non_default_args(args: Namespace | EngineArgs):
|
||||
def get_non_default_args(args: Namespace | EngineArgs) -> dict[str, Any]:
|
||||
from vllm.entrypoints.openai.cli_args import make_arg_parser
|
||||
|
||||
non_default_args = {}
|
||||
@@ -237,6 +238,43 @@ def log_non_default_args(args: Namespace | EngineArgs):
|
||||
"Unsupported argument type. Must be Namespace or EngineArgs instance."
|
||||
)
|
||||
|
||||
return non_default_args
|
||||
|
||||
|
||||
def _jsonify_arg_value(value: Any) -> Any:
|
||||
if value is None or isinstance(value, bool | int | float | str):
|
||||
return value
|
||||
if dataclasses.is_dataclass(value) and not isinstance(value, type):
|
||||
return {
|
||||
key: _jsonify_arg_value(val)
|
||||
for key, val in dataclasses.asdict(value).items()
|
||||
}
|
||||
if isinstance(value, dict):
|
||||
return {str(key): _jsonify_arg_value(val) for key, val in value.items()}
|
||||
if isinstance(value, tuple | list):
|
||||
return [_jsonify_arg_value(item) for item in value]
|
||||
if (model_dump := getattr(value, "model_dump", None)) is not None:
|
||||
return _jsonify_arg_value(model_dump(mode="json"))
|
||||
if (to_dict := getattr(value, "dict", None)) is not None:
|
||||
return _jsonify_arg_value(to_dict())
|
||||
return repr(value)
|
||||
|
||||
|
||||
def jsonify_non_default_args(
|
||||
args: Namespace | EngineArgs,
|
||||
*,
|
||||
exclude: set[str] | None = None,
|
||||
) -> dict[str, Any]:
|
||||
non_default_args = get_non_default_args(args)
|
||||
if exclude is not None:
|
||||
for key in exclude:
|
||||
non_default_args.pop(key, None)
|
||||
|
||||
return {key: _jsonify_arg_value(value) for key, value in non_default_args.items()}
|
||||
|
||||
|
||||
def log_non_default_args(args: Namespace | EngineArgs):
|
||||
non_default_args = get_non_default_args(args)
|
||||
logger.info("non-default args: %s", non_default_args)
|
||||
|
||||
|
||||
|
||||
+51
-1
@@ -86,6 +86,7 @@ if TYPE_CHECKING:
|
||||
MAX_JOBS: str | None = None
|
||||
NVCC_THREADS: str | None = None
|
||||
VLLM_USE_PRECOMPILED: bool = False
|
||||
VLLM_USE_PRECOMPILED_RUST: bool = False
|
||||
VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
|
||||
VLLM_DOCKER_BUILD_CONTEXT: bool = False
|
||||
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
|
||||
@@ -134,6 +135,8 @@ if TYPE_CHECKING:
|
||||
Q_SCALE_CONSTANT: int = 200
|
||||
K_SCALE_CONSTANT: int = 200
|
||||
V_SCALE_CONSTANT: int = 100
|
||||
VLLM_USE_RUST_FRONTEND: bool = False
|
||||
VLLM_RUST_FRONTEND_PATH: str | None = "auto"
|
||||
VLLM_SERVER_DEV_MODE: bool = False
|
||||
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
|
||||
VLLM_MLA_DISABLE: bool = False
|
||||
@@ -514,6 +517,40 @@ def get_env_or_set_default(
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _resolve_rust_frontend_path() -> str | None:
|
||||
"""Resolve the Rust frontend binary path.
|
||||
|
||||
Returns None if VLLM_USE_RUST_FRONTEND is not enabled.
|
||||
When enabled, resolves VLLM_RUST_FRONTEND_PATH ("auto" by default)
|
||||
to the actual binary path.
|
||||
"""
|
||||
use_rust = bool(int(os.environ.get("VLLM_USE_RUST_FRONTEND", "0")))
|
||||
raw = os.environ.get("VLLM_RUST_FRONTEND_PATH", "auto")
|
||||
|
||||
if not use_rust:
|
||||
if os.environ.get("VLLM_RUST_FRONTEND_PATH") is not None:
|
||||
logger.warning(
|
||||
"VLLM_RUST_FRONTEND_PATH is set but VLLM_USE_RUST_FRONTEND "
|
||||
"is not enabled. The Rust frontend will not be used. "
|
||||
"Set VLLM_USE_RUST_FRONTEND=1 to enable it."
|
||||
)
|
||||
return None
|
||||
|
||||
if raw.lower() in ("auto", "1", "true"):
|
||||
pkg_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
candidate = os.path.join(pkg_dir, "vllm-rs")
|
||||
if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
|
||||
return candidate
|
||||
|
||||
raise FileNotFoundError(
|
||||
"VLLM_RUST_FRONTEND_PATH=auto but the vllm-rs binary was "
|
||||
f"not found at {candidate}. "
|
||||
"Build with setuptools-rust or set the path explicitly."
|
||||
)
|
||||
return raw
|
||||
|
||||
|
||||
environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# ================== Installation Time Env Vars ==================
|
||||
# Target device of vLLM, supporting [cuda (by default),
|
||||
@@ -551,11 +588,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# By default this is 1.
|
||||
# If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
|
||||
"NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None),
|
||||
# If set, vllm will use precompiled binaries (*.so)
|
||||
# If set, vllm will use precompiled native binaries (*.so and vllm-rs).
|
||||
"VLLM_USE_PRECOMPILED": lambda: (
|
||||
os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
|
||||
or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION"))
|
||||
),
|
||||
# If set, vllm will use the precompiled Rust frontend binary (vllm-rs).
|
||||
"VLLM_USE_PRECOMPILED_RUST": lambda: (
|
||||
os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower() in ("1", "true")
|
||||
),
|
||||
# If set, skip adding +precompiled suffix to version string
|
||||
"VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
|
||||
int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
|
||||
@@ -1154,6 +1195,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
# If set to "0", disable LayerName opaque type for layer_name
|
||||
# parameters in custom ops. Defaults to enabled on torch >= 2.11.
|
||||
"VLLM_USE_LAYERNAME": lambda: bool(int(os.getenv("VLLM_USE_LAYERNAME", "1"))),
|
||||
# If set, use the Rust frontend binary instead of the Python API server
|
||||
# process(es).
|
||||
"VLLM_USE_RUST_FRONTEND": lambda: bool(
|
||||
int(os.getenv("VLLM_USE_RUST_FRONTEND", "0"))
|
||||
),
|
||||
# Path to the Rust frontend binary. Defaults to "auto" which discovers
|
||||
# the binary installed with the vllm package. Only used when
|
||||
# VLLM_USE_RUST_FRONTEND=1.
|
||||
"VLLM_RUST_FRONTEND_PATH": lambda: _resolve_rust_frontend_path(),
|
||||
# If set, vllm will run in development mode, which will enable
|
||||
# some additional endpoints for developing and debugging,
|
||||
# e.g. `/reset_prefix_cache`
|
||||
|
||||
@@ -75,6 +75,7 @@ class EngineCoreReadyResponse:
|
||||
max_model_len: int
|
||||
num_gpu_blocks: int
|
||||
dp_stats_address: str | None
|
||||
dtype: str | None = None
|
||||
|
||||
|
||||
class EngineCoreRequest(
|
||||
|
||||
@@ -1437,6 +1437,7 @@ class EngineCoreProc(EngineCore):
|
||||
max_model_len=self.vllm_config.model_config.max_model_len,
|
||||
num_gpu_blocks=self.vllm_config.cache_config.num_gpu_blocks or 0,
|
||||
dp_stats_address=self.frontend_stats_publish_address,
|
||||
dtype=str(self.vllm_config.model_config.dtype).removeprefix("torch."),
|
||||
)
|
||||
ready_payload = msgspec.msgpack.encode(ready_response)
|
||||
for input_socket in input_sockets:
|
||||
|
||||
+143
-2
@@ -2,6 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import argparse
|
||||
import contextlib
|
||||
import json
|
||||
import multiprocessing
|
||||
import threading
|
||||
import time
|
||||
@@ -233,6 +234,146 @@ class APIServerProcessManager:
|
||||
shutdown(self.processes, timeout=timeout)
|
||||
|
||||
|
||||
class RustFrontendProcessManager:
|
||||
"""Manages a single Rust frontend subprocess.
|
||||
|
||||
Launches the Rust vllm-rs binary in 'frontend' mode, passing the
|
||||
listening socket fd and ZMQ transport addresses. Provides the same
|
||||
interface as APIServerProcessManager for process monitoring.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
binary_path: str,
|
||||
sock: Any,
|
||||
args: argparse.Namespace,
|
||||
input_address: str,
|
||||
output_address: str,
|
||||
engine_count: int,
|
||||
stats_update_address: str | None = None,
|
||||
):
|
||||
import os
|
||||
import subprocess
|
||||
|
||||
fd = sock.fileno()
|
||||
os.set_inheritable(fd, True)
|
||||
|
||||
cmd = [
|
||||
binary_path,
|
||||
"frontend",
|
||||
"--listen-fd",
|
||||
str(fd),
|
||||
"--input-address",
|
||||
input_address,
|
||||
"--output-address",
|
||||
output_address,
|
||||
"--engine-count",
|
||||
str(engine_count),
|
||||
]
|
||||
if stats_update_address is not None:
|
||||
cmd.extend(["--coordinator-address", stats_update_address])
|
||||
from vllm.entrypoints.utils import jsonify_non_default_args
|
||||
|
||||
args_json = json.dumps(
|
||||
jsonify_non_default_args(args, exclude={"api_server_count"}),
|
||||
sort_keys=True,
|
||||
)
|
||||
cmd.extend(["--args-json", args_json])
|
||||
|
||||
logger.info("Launching Rust frontend: %s", " ".join(cmd))
|
||||
self._proc = subprocess.Popen(cmd, pass_fds=(fd,))
|
||||
|
||||
# Create a process wrapper with a sentinel fd for monitoring
|
||||
self.processes: list[_SubprocessWrapper] = [
|
||||
_SubprocessWrapper(self._proc, "RustFrontend")
|
||||
]
|
||||
|
||||
self._finalizer = weakref.finalize(self, _shutdown_subprocesses, self.processes)
|
||||
|
||||
def shutdown(self, timeout: float | None = None) -> None:
|
||||
if self._finalizer.detach() is not None:
|
||||
_shutdown_subprocesses(self.processes, timeout=timeout)
|
||||
|
||||
|
||||
class _SubprocessWrapper:
|
||||
"""Wraps subprocess.Popen to provide the BaseProcess-like interface
|
||||
needed by wait_for_completion_or_failure."""
|
||||
|
||||
def __init__(self, proc, name: str):
|
||||
self._proc = proc
|
||||
self.name = name
|
||||
self.pid = proc.pid
|
||||
self._sentinel_conn: connection.Connection | None = None
|
||||
self._sentinel_send: connection.Connection | None = None
|
||||
|
||||
# Use a Pipe-based sentinel so subprocess monitoring works uniformly
|
||||
# across platforms with multiprocessing.connection.wait().
|
||||
recv, send = connection.Pipe(duplex=False)
|
||||
self._sentinel_conn = recv
|
||||
self._sentinel_send = send
|
||||
|
||||
def monitor_subprocess() -> None:
|
||||
try:
|
||||
proc.wait()
|
||||
finally:
|
||||
with contextlib.suppress(Exception):
|
||||
send.close()
|
||||
|
||||
threading.Thread(
|
||||
target=monitor_subprocess, daemon=True, name=f"{name}Monitor"
|
||||
).start()
|
||||
|
||||
@property
|
||||
def sentinel(self):
|
||||
return self._sentinel_conn
|
||||
|
||||
@property
|
||||
def exitcode(self) -> int | None:
|
||||
return self._proc.returncode if self._proc.poll() is not None else None
|
||||
|
||||
def is_alive(self) -> bool:
|
||||
return self._proc.poll() is None
|
||||
|
||||
def terminate(self):
|
||||
self._proc.terminate()
|
||||
|
||||
def join(self, timeout=None):
|
||||
with contextlib.suppress(Exception):
|
||||
self._proc.wait(timeout=timeout)
|
||||
|
||||
def __del__(self):
|
||||
with contextlib.suppress(Exception):
|
||||
if self._sentinel_conn is not None:
|
||||
self._sentinel_conn.close()
|
||||
if self._sentinel_send is not None:
|
||||
self._sentinel_send.close()
|
||||
|
||||
|
||||
def _shutdown_subprocesses(
|
||||
procs: list[_SubprocessWrapper], timeout: float | None = None
|
||||
) -> None:
|
||||
"""Shutdown subprocess wrappers (mirrors the shutdown() function)."""
|
||||
if timeout is None:
|
||||
timeout = 0.0
|
||||
timeout = max(timeout, 5.0)
|
||||
|
||||
for proc in procs:
|
||||
if proc.is_alive():
|
||||
proc.terminate()
|
||||
|
||||
deadline = time.monotonic() + timeout
|
||||
for proc in procs:
|
||||
remaining = deadline - time.monotonic()
|
||||
if remaining <= 0:
|
||||
break
|
||||
if proc.is_alive():
|
||||
proc.join(remaining)
|
||||
|
||||
for proc in procs:
|
||||
if proc.is_alive() and (pid := proc.pid) is not None:
|
||||
kill_process_tree(pid)
|
||||
|
||||
|
||||
def run_api_server_worker_proc(
|
||||
listen_address, sock, args, client_config=None, **uvicorn_kwargs
|
||||
) -> None:
|
||||
@@ -253,7 +394,7 @@ def run_api_server_worker_proc(
|
||||
|
||||
|
||||
def wait_for_completion_or_failure(
|
||||
api_server_manager: APIServerProcessManager,
|
||||
api_server_manager: "APIServerProcessManager | RustFrontendProcessManager",
|
||||
engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"]
|
||||
| None = None,
|
||||
coordinator: "DPCoordinator | None" = None,
|
||||
@@ -274,7 +415,7 @@ def wait_for_completion_or_failure(
|
||||
logger.info("Waiting for API servers to complete ...")
|
||||
# Create a mapping of sentinels to their corresponding processes
|
||||
# for efficient lookup
|
||||
sentinel_to_proc: dict[Any, BaseProcess] = {
|
||||
sentinel_to_proc: dict[Any, BaseProcess | _SubprocessWrapper | None] = {
|
||||
proc.sentinel: proc for proc in api_server_manager.processes
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user