[Frontend][RFC] Rust front-end integration (#40848)

Signed-off-by: Nick Hill <nickhill123@gmail.com>
Signed-off-by: Bugen Zhao <i@bugenzhao.com>
Co-authored-by: Bugen Zhao <i@bugenzhao.com>
This commit is contained in:
Nick Hill
2026-05-20 21:24:48 -07:00
committed by GitHub
parent d97ba29fdc
commit f2ace1d57d
35 changed files with 893 additions and 91 deletions
+7
View File
@@ -223,6 +223,13 @@ echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"
check_and_skip_if_image_exists
# The rust frontend lives in a git submodule under rust/. Buildkite's default
# checkout does not recurse submodules, and the Dockerfile only sees what's in
# the build context, so initialize the submodule here before invoking bake.
echo "--- :git: Initializing git submodules"
git submodule sync --recursive
git submodule update --init --recursive
echo "--- :docker: Setting up Docker buildx bake"
echo "Target: ${TARGET}"
echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
@@ -21,6 +21,12 @@ else
exit 0
fi
# The rust frontend lives in a git submodule under rust/. Buildkite's default
# checkout does not recurse submodules, and the Dockerfile only sees what's in
# the build context, so initialize the submodule here before building.
git submodule sync --recursive
git submodule update --init --recursive
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
@@ -21,6 +21,12 @@ else
exit 0
fi
# The rust frontend lives in a git submodule under rust/. Buildkite's default
# checkout does not recurse submodules, and the Dockerfile only sees what's in
# the build context, so initialize the submodule here before building.
git submodule sync --recursive
git submodule update --init --recursive
# build
docker build --file docker/Dockerfile.cpu \
--build-arg max_jobs=16 \
+1 -1
View File
@@ -11,7 +11,7 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
# skip build if image already exists
if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
+2 -2
View File
@@ -11,8 +11,8 @@ REPO=$2
BUILDKITE_COMMIT=$3
# authenticate with AWS ECR
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true
# skip build if image already exists
if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
+107
View File
@@ -0,0 +1,107 @@
group: Rust Frontend
depends_on:
- image-build
steps:
- label: Rust Frontend OpenAI Coverage
timeout_in_minutes: 90
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/benchmarks/
- vllm/entrypoints/openai/
- vllm/entrypoints/serve/
- vllm/v1/sample/
- tests/utils.py
- tests/benchmarks/test_serve_cli.py
- tests/entrypoints/openai/chat_completion/test_chat_completion.py
# - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
# - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
# - tests/entrypoints/openai/completion/test_prompt_validation.py
- tests/entrypoints/openai/completion/test_shutdown.py
# - tests/entrypoints/openai/test_return_token_ids.py
# - tests/entrypoints/openai/test_uds.py
- tests/v1/sample/test_logprobs_e2e.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
- pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
# - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
# - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
- pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly"
# - pytest -v -s entrypoints/openai/test_return_token_ids.py
# - pytest -v -s entrypoints/openai/test_uds.py
- pytest -v -s v1/sample/test_logprobs_e2e.py -k "test_prompt_logprobs_e2e_server"
- label: Rust Frontend Serve/Admin Coverage
timeout_in_minutes: 60
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/entrypoints/openai/
- vllm/entrypoints/serve/
- vllm/v1/engine/
- tests/utils.py
# - tests/entrypoints/rpc/test_collective_rpc.py
- tests/entrypoints/serve/disagg/test_serving_tokens.py
- tests/entrypoints/serve/instrumentator/test_basic.py
- tests/entrypoints/serve/instrumentator/test_metrics.py
# - tests/entrypoints/serve/instrumentator/test_sleep.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
# - pytest -v -s entrypoints/rpc/test_collective_rpc.py
- pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load"
- pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow"
- pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist"
# - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py
- label: Rust Frontend Core Correctness
timeout_in_minutes: 30
device: h200_18gb
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/entrypoints/openai/
- tests/utils.py
- tests/entrypoints/openai/correctness/test_lmeval.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
- label: Rust Frontend Tool Use
timeout_in_minutes: 60
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/entrypoints/openai/
- vllm/tool_parsers/
- tests/utils.py
- tests/tool_use/
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s tool_use --ignore=tool_use/mistral --models llama3.2 -k "not test_response_format_with_tool_choice_required and not test_parallel_tool_calls_false and not test_tool_call_and_choice"
- label: Rust Frontend Distributed
timeout_in_minutes: 30
num_devices: 4
working_dir: "/vllm-workspace/tests"
source_file_dependencies:
- rust/
- vllm/distributed/
- vllm/engine/
- vllm/executor/
- vllm/v1/engine/
- vllm/v1/worker/
- tests/utils.py
- tests/v1/distributed/test_internal_lb_dp.py
commands:
- export VLLM_USE_RUST_FRONTEND=1
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- export NCCL_CUMEM_HOST_ENABLE=0
- TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py -k "not 4 and not server_info"
+2
View File
@@ -2,6 +2,7 @@
/build
dist
vllm/*.so
vllm/vllm-rs
# Byte-compiled / optimized / DLL files
__pycache__/
@@ -31,3 +32,4 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
rust/target/
+3
View File
@@ -26,6 +26,9 @@ __pycache__/
# C extensions
*.so
# Rust binaries
vllm/vllm-rs
# Distribution / packaging
.Python
build/
+3
View File
@@ -0,0 +1,3 @@
[submodule "rust"]
path = rust
url = https://github.com/Inferact/vllm-frontend-rs.git
Executable
+43
View File
@@ -0,0 +1,43 @@
#!/bin/bash
# Build the vllm-rs Rust frontend binary and install it into the vllm package.
# Usage: ./build_rust.sh [--debug]
#
# By default builds in release mode. Pass --debug for faster compile times
# during development.
set -euo pipefail
REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"
RUST_DIR="$REPO_ROOT/rust"
TARGET_PATH="$REPO_ROOT/vllm/vllm-rs"
# Read the required toolchain from rust-toolchain.toml.
TOOLCHAIN=$(grep '^channel' "$RUST_DIR/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/')
# Ensure rustup and the required toolchain are available.
if ! command -v rustup &>/dev/null; then
echo "rustup not found, installing..."
curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none
source "$HOME/.cargo/env"
fi
if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then
echo "Installing Rust toolchain: $TOOLCHAIN"
rustup toolchain install "$TOOLCHAIN"
fi
if [[ "${1:-}" == "--debug" ]]; then
PROFILE_ARGS=()
PROFILE_DIR="debug"
else
PROFILE_ARGS=(--release)
PROFILE_DIR="release"
fi
cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \
--manifest-path "$RUST_DIR/Cargo.toml" \
--bin vllm-rs \
--features native-tls-vendored
cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH"
echo "Installed vllm-rs to $TARGET_PATH"
+62
View File
@@ -231,6 +231,63 @@ ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
#################### BUILD BASE IMAGE ####################
#################### RUST BUILD IMAGE ####################
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the main wheel
# build stage doesn't need the rust toolchain, protoc, or the rust source.
# This stage runs in parallel with csrc-build/extensions-build.
FROM ${BUILD_BASE_IMAGE} AS rust-build
ARG BUILD_OS
ENV DEBIAN_FRONTEND=noninteractive
# Install protoc (used by tonic-build/prost-build) and a basic C toolchain
# (some rust crates compile C in their build.rs scripts).
RUN if [ "${BUILD_OS}" = "manylinux" ]; then \
dnf install -y --setopt=install_weak_deps=False \
ca-certificates curl git gcc gcc-c++ make \
protobuf-compiler protobuf-devel \
&& dnf clean all && rm -rf /var/cache/dnf; \
else \
apt-get update -y \
&& apt-get install -y --no-install-recommends \
ca-certificates curl git build-essential \
protobuf-compiler libprotobuf-dev \
&& rm -rf /var/lib/apt/lists/*; \
fi
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /workspace
# Copy only the rust workspace — the binary is the sole artifact we need.
COPY rust rust
# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`. Without this check, cargo would emit a confusing error.
RUN if [ ! -f rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
# Build the release binary. Cache cargo registry/git and target/, but copy the
# binary out of the target/ cache mount so it persists into the image layer
# for later COPY --from=rust-build.
RUN --mount=type=cache,target=/root/.cargo/registry \
--mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/workspace/rust/target \
cd rust \
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
&& cp target/release/vllm-rs /workspace/vllm-rs
#################### RUST BUILD IMAGE ####################
#################### CSRC BUILD IMAGE ####################
FROM base AS csrc-build
ARG TARGETPLATFORM
@@ -435,6 +492,10 @@ WORKDIR /workspace
COPY --from=csrc-build /workspace/dist /precompiled-wheels
COPY . .
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
@@ -455,6 +516,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=bind,source=.git,target=.git \
if [ "${vllm_target_device}" = "cuda" ]; then \
export VLLM_USE_PRECOMPILED=1; \
export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
fi && \
python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
+50
View File
@@ -36,6 +36,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
# Compiler and linker environment
ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
ENV CCACHE_DIR=/root/.cache/ccache
ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
@@ -80,6 +81,51 @@ FROM base-${TARGETARCH} AS base
RUN echo 'ulimit -c 0' >> ~/.bashrc
######################### RUST BUILD IMAGE #########################
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the wheel build
# stage doesn't need the rust toolchain or protoc. This stage runs in parallel
# with the main vllm-build stage.
FROM ubuntu:22.04 AS rust-build
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
ca-certificates curl git build-essential \
protobuf-compiler libprotobuf-dev \
&& rm -rf /var/lib/apt/lists/*
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /workspace
# Copy only the rust workspace — the binary is the sole artifact we need.
COPY rust rust
# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`. Without this check, cargo would emit a confusing error.
RUN if [ ! -f rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
# Build the release binary. Cache cargo registry/git and target/, but copy the
# binary out of the target/ cache mount so it persists into the image layer
# for later COPY --from=rust-build.
RUN --mount=type=cache,target=/root/.cargo/registry \
--mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/workspace/rust/target \
cd rust \
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
&& cp target/release/vllm-rs /workspace/vllm-rs
######################### BUILD IMAGE #########################
FROM base AS vllm-build
@@ -114,6 +160,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \
COPY . .
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi
RUN --mount=type=cache,target=/root/.cache/uv \
+45
View File
@@ -94,6 +94,47 @@ RUN cat torch_build_versions.txt
#################### BASE BUILD IMAGE ####################
#################### RUST BUILD IMAGE ####################
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the wheel build
# stage doesn't need the rust toolchain or protoc.
FROM ubuntu:22.04 AS rust-build
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
ca-certificates curl git build-essential \
protobuf-compiler libprotobuf-dev \
&& rm -rf /var/lib/apt/lists/*
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /workspace
COPY rust rust
# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`.
RUN if [ ! -f rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
RUN --mount=type=cache,target=/root/.cargo/registry \
--mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/workspace/rust/target \
cd rust \
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
&& cp target/release/vllm-rs /workspace/vllm-rs
#################### RUST BUILD IMAGE ####################
#################### WHEEL BUILD IMAGE ####################
FROM base AS build
ARG TARGETPLATFORM
@@ -104,6 +145,10 @@ ENV UV_HTTP_TIMEOUT=500
COPY . .
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
RUN python3 use_existing_torch.py
RUN --mount=type=cache,target=/root/.cache/uv \
+51
View File
@@ -106,14 +106,61 @@ ONBUILD RUN git clone ${VLLM_REPO} \
&& cd vllm \
&& git fetch -v --prune -- origin ${VLLM_BRANCH} \
&& git checkout FETCH_HEAD \
&& git submodule update --init --recursive \
&& if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
git remote add upstream "https://github.com/vllm-project/vllm.git" \
&& git fetch upstream ; fi
FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm
# -----------------------
# Rust build stage
# Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages
# don't need the rust toolchain or protoc. Runs in parallel with the main wheel
# build for faster end-to-end builds.
FROM fetch_vllm AS rust-build
ARG COMMON_WORKDIR
# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`. The rust frontend source is brought in via the fetch_vllm
# stage, so an uninitialized submodule would otherwise produce a confusing
# cargo failure.
RUN if [ ! -f ${COMMON_WORKDIR}/vllm/rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi
# protoc is used by tonic-build/prost-build.
RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
protobuf-compiler libprotobuf-dev \
&& rm -rf /var/lib/apt/lists/*
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
# Build the release binary. Cache cargo registry/git, and copy the binary out
# so it persists into the image layer for later COPY --from=rust-build.
RUN --mount=type=cache,target=/root/.cargo/registry \
--mount=type=cache,target=/root/.cargo/git \
cd ${COMMON_WORKDIR}/vllm/rust \
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
&& cp target/release/vllm-rs /tmp/vllm-rs
# -----------------------
# vLLM build stages
FROM fetch_vllm AS build_vllm
ARG COMMON_WORKDIR
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
# Build vLLM (setup.py auto-detects sccache in PATH)
RUN cd vllm \
&& python3 -m pip install -r requirements/rocm.txt \
@@ -293,6 +340,10 @@ FROM fetch_vllm AS build_vllm_wheel_release
ARG COMMON_WORKDIR
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
# Create /install directory for custom wheels
RUN mkdir -p /install
+45
View File
@@ -1,3 +1,43 @@
######################### RUST BUILD IMAGE #########################
# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the main image
# doesn't need the rust toolchain or protoc. Runs in parallel with vllm-base.
FROM ubuntu:22.04 AS rust-build
ENV DEBIAN_FRONTEND=noninteractive
RUN apt-get update -y \
&& apt-get install -y --no-install-recommends \
ca-certificates curl git build-essential \
protobuf-compiler libprotobuf-dev \
&& rm -rf /var/lib/apt/lists/*
# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
sh -s -- -y --profile minimal --default-toolchain none
ENV PATH="/root/.cargo/bin:${PATH}"
WORKDIR /workspace
COPY rust rust
# Fail loudly if the rust submodule was not initialized on the host before
# `docker build`.
RUN if [ ! -f rust/Cargo.toml ]; then \
echo "ERROR: rust/ submodule is not initialized."; \
echo "Run 'git submodule update --init --recursive' on the host before building."; \
exit 1; \
fi
# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
ENV CARGO_BUILD_JOBS=4
RUN --mount=type=cache,target=/root/.cargo/registry \
--mount=type=cache,target=/root/.cargo/git \
--mount=type=cache,target=/workspace/rust/target \
cd rust \
&& cargo build --release --bin vllm-rs --features native-tls-vendored \
&& cp target/release/vllm-rs /workspace/vllm-rs
FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base
WORKDIR /workspace/
@@ -99,6 +139,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"
COPY . .
# Drop the pre-built rust frontend binary into the source tree. setup.py
# detects it and ships it as-is, skipping the local cargo build.
COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
ARG GIT_REPO_CHECK=0
RUN --mount=type=bind,source=.git,target=.git \
if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
+7
View File
@@ -43,6 +43,13 @@ If you are only developing vLLM's Python code, install vLLM using:
VLLM_USE_PRECOMPILED=1 uv pip install -e .
```
To rebuild only the Rust frontend binary:
```bash
./build_rust.sh # release build
./build_rust.sh --debug # faster build for development
```
If you are developing vLLM's Python and CUDA/C++ code, install Pytorch first:
```bash
+4 -4
View File
@@ -136,10 +136,10 @@ When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
3. **Selects compatible wheel** based on:
- Package name (`vllm`)
- Platform tag (architecture match)
4. **Downloads and extracts** precompiled binaries from the wheel:
- C++ extension modules (`.so` files)
- Flash Attention Python modules
- Triton kernel Python files
4. **Downloads and extracts** precompiled artifacts from the wheel:
- Native extension modules (`.so` files)
- The `vllm-rs` Rust frontend binary
- Flash Attention Python modules and Triton/FlashMLA Python files
5. **Patches package_data** to include extracted files in the installation
!!! note "What is the base commit?"
@@ -101,12 +101,22 @@ This command will do the following:
1. Look for the current branch in your vLLM clone.
1. Identify the corresponding base commit in the main branch.
1. Download the pre-built wheel of the base commit.
1. Use its compiled libraries in the installation.
1. Use its compiled libraries and `vllm-rs` binary in the installation.
!!! note
1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.
!!! tip "Rebuilding the Rust frontend"
If you need to recompile the `vllm-rs` Rust frontend binary, you can rebuild and install it without re-running the full pip install:
```bash
./build_rust.sh # release build
./build_rust.sh --debug # faster build for development
```
This will install the required Rust toolchain if needed, build the binary, and place it in `vllm/vllm-rs`.
In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the `main` branch was just merged and its precompiled wheel is not available yet. You can wait around an hour and retry, or set `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly` to automatically select the most recent already-built commit on `main`.
```bash
+1
View File
@@ -6,6 +6,7 @@ requires = [
"packaging>=24.2",
"setuptools>=77.0.3,<81.0.0",
"setuptools-scm>=8.0",
"setuptools-rust>=1.9.0",
"torch == 2.11.0",
"wheel",
"jinja2",
+1
View File
@@ -4,6 +4,7 @@ ninja
packaging>=24.2
setuptools==77.0.3 # this version can reuse CMake build dir
setuptools-scm>=8
setuptools-rust>=1.9.0
torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64"
torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le" or platform_machine == "riscv64"
wheel
+1
View File
@@ -4,6 +4,7 @@ ninja
packaging>=24.2
setuptools>=77.0.3,<81.0.0
setuptools-scm>=8
setuptools-rust>=1.9.0
torch==2.11.0
wheel
jinja2>=3.1.6
+1
View File
@@ -15,6 +15,7 @@ tensorizer==2.10.1
packaging>=24.2
setuptools>=77.0.3,<80.0.0
setuptools-scm>=8
setuptools-rust>=1.9.0
runai-model-streamer[s3,gcs,azure]==0.15.7
conch-triton-kernels==1.2.1
timm>=1.0.17
Submodule
+1
Submodule rust added at ad6771ac09
+2
View File
@@ -0,0 +1,2 @@
[toolchain]
channel = "1.95"
+115 -43
View File
@@ -18,6 +18,8 @@ import torch
from packaging.version import Version, parse
from setuptools import Extension, setup
from setuptools.command.build_ext import build_ext
from setuptools_rust import Binding, RustExtension
from setuptools_rust.build import build_rust
from setuptools_scm import get_version
from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME
@@ -33,11 +35,24 @@ def load_module_from_path(module_name, path):
ROOT_DIR = Path(__file__).parent
logger = logging.getLogger(__name__)
PRECOMPILED_RUST_FRONTEND_PATH = ROOT_DIR / "vllm" / "vllm-rs"
# cannot import envs directly because it depends on vllm,
# which is not installed yet
envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))
VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
USE_PRECOMPILED_EXTENSIONS = envs.VLLM_USE_PRECOMPILED
# VLLM_USE_PRECOMPILED implies precompiled rust frontend too.
USE_PRECOMPILED_RUST_FRONTEND = (
envs.VLLM_USE_PRECOMPILED or envs.VLLM_USE_PRECOMPILED_RUST
)
def should_require_rust_frontend() -> bool:
value = os.getenv("VLLM_REQUIRE_RUST_FRONTEND", "")
return value.lower() not in ("", "0", "false", "no")
if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
@@ -405,6 +420,24 @@ class precompiled_build_ext(build_ext):
return
class precompiled_build_rust(build_rust):
"""Skips local Rust builds when the precompiled wheel already ships vllm-rs."""
def run(self) -> None:
if PRECOMPILED_RUST_FRONTEND_PATH.exists():
logger.info(
"Skipping local Rust build: using precompiled %s",
PRECOMPILED_RUST_FRONTEND_PATH,
)
return
logger.warning(
"Precompiled wheel did not provide %s; falling back to local Rust build.",
PRECOMPILED_RUST_FRONTEND_PATH,
)
super().run()
class precompiled_wheel_utils:
"""Extracts libraries and other files from an existing wheel."""
@@ -653,7 +686,11 @@ class precompiled_wheel_utils:
@staticmethod
def extract_precompiled_and_patch_package(
wheel_url_or_path: str, download_filename: str | None
wheel_url_or_path: str,
download_filename: str | None,
*,
extract_extensions: bool,
extract_rust_frontend: bool,
) -> dict:
import tempfile
import zipfile
@@ -676,20 +713,26 @@ class precompiled_wheel_utils:
package_data_patch = {}
with zipfile.ZipFile(wheel_path) as wheel:
files_to_copy = [
"vllm/_C.abi3.so",
"vllm/_C_stable_libtorch.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/_flashmla_extension_C.abi3.so",
"vllm/_sparse_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
"vllm/spinloop.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
]
exact_members = set()
if extract_extensions:
exact_members.update(
{
"vllm/_C.abi3.so",
"vllm/_C_stable_libtorch.abi3.so",
"vllm/_moe_C.abi3.so",
"vllm/_flashmla_C.abi3.so",
"vllm/_flashmla_extension_C.abi3.so",
"vllm/_sparse_flashmla_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
"vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
"vllm/cumem_allocator.abi3.so",
"vllm/spinloop.abi3.so",
# ROCm-specific libraries
"vllm/_rocm_C.abi3.so",
}
)
if extract_rust_frontend:
exact_members.add("vllm/vllm-rs")
flash_attn_regex = re.compile(
r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
@@ -708,27 +751,25 @@ class precompiled_wheel_utils:
)
# DeepGEMM: extract all files (.py, .so, .cuh, .h, .hpp, etc.)
deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*")
file_members = list(
filter(lambda x: x.filename in files_to_copy, wheel.filelist)
)
file_members += list(
filter(
lambda x: flash_attn_regex.match(x.filename)
and x.filename not in flash_attn_files_to_skip,
wheel.filelist,
)
)
file_members += list(
filter(
lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
)
)
file_members += list(
filter(lambda x: flashmla_regex.match(x.filename), wheel.filelist)
)
file_members += list(
filter(lambda x: deep_gemm_regex.match(x.filename), wheel.filelist)
)
file_members = []
for member in wheel.filelist:
if member.filename in exact_members:
file_members.append(member)
continue
if not extract_extensions:
continue
if (
(
flash_attn_regex.match(member.filename)
and member.filename not in flash_attn_files_to_skip
)
or triton_kernels_regex.match(member.filename)
or flashmla_regex.match(member.filename)
or deep_gemm_regex.match(member.filename)
):
file_members.append(member)
for file in file_members:
print(f"[extract] {file.filename}")
@@ -739,6 +780,9 @@ class precompiled_wheel_utils:
open(target_path, "wb") as dst,
):
shutil.copyfileobj(src, dst)
mode = file.external_attr >> 16
if mode:
os.chmod(target_path, mode)
pkg = os.path.dirname(file.filename).replace("/", ".")
package_data_patch.setdefault(pkg, []).append(
@@ -911,7 +955,7 @@ def get_vllm_version() -> str:
if envs.VLLM_TARGET_DEVICE == "empty":
version += f"{sep}empty"
elif _is_cuda():
if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
if USE_PRECOMPILED_EXTENSIONS and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
version += f"{sep}precompiled"
else:
cuda_version = str(get_nvcc_cuda_version())
@@ -1006,7 +1050,7 @@ if _is_hip():
if _is_cuda():
ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
if envs.VLLM_USE_PRECOMPILED or (
if USE_PRECOMPILED_EXTENSIONS or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
):
# FA3 requires CUDA 12.3 or later
@@ -1016,7 +1060,7 @@ if _is_cuda():
ext_modules.append(
CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True)
)
if envs.VLLM_USE_PRECOMPILED or (
if USE_PRECOMPILED_EXTENSIONS or (
CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9")
):
# FlashMLA requires CUDA 12.9 or later
@@ -1065,15 +1109,25 @@ package_data = {
}
# If using precompiled, extract and patch package_data (in advance of setup)
if envs.VLLM_USE_PRECOMPILED:
# If using precompiled artifacts, extract and patch package_data in advance.
if USE_PRECOMPILED_RUST_FRONTEND:
wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
wheel_url, download_filename
wheel_url,
download_filename,
extract_extensions=USE_PRECOMPILED_EXTENSIONS,
extract_rust_frontend=True,
)
for pkg, files in patch.items():
package_data.setdefault(pkg, []).extend(files)
# If the rust frontend binary is already present in the source tree (e.g.,
# pre-built in a separate Docker build stage), ship it as-is.
if PRECOMPILED_RUST_FRONTEND_PATH.exists():
vllm_files = package_data.setdefault("vllm", [])
if "vllm-rs" not in vllm_files:
vllm_files.append("vllm-rs")
if _no_device():
ext_modules = []
@@ -1082,14 +1136,32 @@ if not ext_modules:
else:
cmdclass = {
"build_ext": precompiled_build_ext
if envs.VLLM_USE_PRECOMPILED
if USE_PRECOMPILED_EXTENSIONS
else cmake_build_ext,
}
if USE_PRECOMPILED_RUST_FRONTEND or PRECOMPILED_RUST_FRONTEND_PATH.exists():
cmdclass["build_rust"] = precompiled_build_rust
# Rust frontend binary, built via setuptools-rust and installed into the
# package directory alongside the Python modules.
# TODO: we may use `RustBin` to directly install it into `bin` directory, but this
# requires extra work on using precompiled binaries.
rust_extensions = [
RustExtension(
target="vllm.vllm-rs",
path="rust/src/cmd/Cargo.toml",
args=["--bin", "vllm-rs"],
features=["native-tls-vendored"],
binding=Binding.Exec,
optional=not should_require_rust_frontend(),
),
]
setup(
# static metadata should rather go in pyproject.toml
version=get_vllm_version(),
ext_modules=ext_modules,
rust_extensions=rust_extensions,
install_requires=get_requirements(),
extras_require={
# AMD Zen CPU optimizations via zentorch
@@ -299,22 +299,16 @@ async def test_abort_timeout_exits_quickly(wait_for_engine_idle: float):
start_time = time.time()
proc.send_signal(signal.SIGTERM)
# abort timeout (0) should stop the server promptly. On ROCm, process
# exit can spend extra time in HIP/RCCL/native extension teardown after
# the server and engine have already shut down.
max_exit_time = 4.0 if _IS_ROCM else 2.1
# abort timeout (0) should stop the server promptly.
try:
proc.wait(timeout=max_exit_time)
proc.wait(timeout=4.0)
except subprocess.TimeoutExpired:
proc.kill()
proc.wait(timeout=5)
pytest.fail("Process did not exit after SIGTERM with abort timeout")
exit_time = time.time() - start_time
assert exit_time < max_exit_time, (
f"Default shutdown took too long: {exit_time:.1f}s"
)
assert exit_time < 4.1, f"Default shutdown took too long: {exit_time:.1f}s"
assert proc.returncode in (0, -15, None), f"Unexpected: {proc.returncode}"
await _assert_children_cleaned_up(child_pids)
+13
View File
@@ -103,6 +103,19 @@ def test_is_envs_cache_enabled() -> None:
assert not envs._is_envs_cache_enabled()
def test_precompiled_install_flags_are_orthogonal() -> None:
with patch.dict(
os.environ,
{
"VLLM_PRECOMPILED_WHEEL_LOCATION": "/tmp/vllm.whl",
"VLLM_USE_PRECOMPILED_RUST": "1",
},
clear=False,
):
assert environment_variables["VLLM_USE_PRECOMPILED"]() is False
assert environment_variables["VLLM_USE_PRECOMPILED_RUST"]() is True
class TestEnvWithChoices:
"""Test cases for env_with_choices function."""
+9
View File
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import signal
import uvloop
@@ -110,6 +111,14 @@ def cmd_init() -> list[CLISubcommand]:
async def run_launch_fastapi(args: argparse.Namespace) -> None:
"""Run the online serving layer with FastAPI (no GPU inference)."""
# Interrupt initialization if SIGTERM arrives before uvicorn installs
# its own signal handlers. Once uvicorn is running it replaces this.
def _interrupt_init(*_) -> None:
raise KeyboardInterrupt("terminated")
signal.signal(signal.SIGTERM, _interrupt_init)
# 1. Socket binding
listen_address, sock = setup_server(args)
+48 -19
View File
@@ -21,7 +21,11 @@ from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
from vllm.v1.executor import Executor
from vllm.v1.executor.multiproc_executor import MultiprocExecutor
from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
from vllm.v1.utils import (
APIServerProcessManager,
RustFrontendProcessManager,
wait_for_completion_or_failure,
)
logger = init_logger(__name__)
@@ -81,11 +85,12 @@ class ServeSubcommand(CLISubcommand):
)
# Default api_server_count if not explicitly set.
# - Rust frontend: Use 1 (not applicable as it's multithreaded)
# - External LB: Leave as 1 (external LB handles distribution)
# - Hybrid LB: Use local DP size (internal LB for local ranks only)
# - Internal LB: Use full DP size
if args.api_server_count is None:
if is_external_lb:
if is_external_lb or envs.VLLM_RUST_FRONTEND_PATH:
args.api_server_count = 1
elif is_hybrid_lb:
args.api_server_count = args.data_parallel_size_local or 1
@@ -102,6 +107,11 @@ class ServeSubcommand(CLISubcommand):
"Defaulting api_server_count to data_parallel_size (%d).",
args.api_server_count,
)
elif envs.VLLM_RUST_FRONTEND_PATH and args.api_server_count > 1:
logger.warning(
"Ignoring --api-server-count=%d when using rust front-end process"
)
args.api_server_count = 1
# Elastic EP currently only supports running with at most one API server.
if getattr(args, "enable_elastic_ep", False) and args.api_server_count > 1:
@@ -114,7 +124,7 @@ class ServeSubcommand(CLISubcommand):
if args.api_server_count < 1:
run_headless(args)
elif args.api_server_count > 1:
elif args.api_server_count > 1 or envs.VLLM_RUST_FRONTEND_PATH:
run_multi_api_server(args)
else:
# Single API server (this process).
@@ -230,9 +240,15 @@ def run_headless(args: argparse.Namespace):
def run_multi_api_server(args: argparse.Namespace):
assert not args.headless
rust_frontend_path = envs.VLLM_RUST_FRONTEND_PATH
num_api_servers: int = args.api_server_count
assert num_api_servers > 0
if rust_frontend_path and num_api_servers > 1:
raise ValueError(
"VLLM_RUST_FRONTEND_PATH does not support api_server_count > 1"
)
if num_api_servers > 1:
setup_multiprocess_prometheus()
@@ -270,7 +286,9 @@ def run_multi_api_server(args: argparse.Namespace):
dp_rank = parallel_config.data_parallel_rank
assert parallel_config.local_engines_only or dp_rank == 0
api_server_manager: APIServerProcessManager | None = None
api_server_manager: APIServerProcessManager | RustFrontendProcessManager | None = (
None
)
from vllm.v1.engine.utils import get_engine_zmq_addresses
@@ -279,23 +297,34 @@ def run_multi_api_server(args: argparse.Namespace):
with launch_core_engines(
vllm_config, executor_class, log_stats, addresses, num_api_servers
) as (local_engine_manager, coordinator, addresses, tensor_queue):
# Construct common args for the APIServerProcessManager up-front.
stats_update_address = None
if coordinator:
stats_update_address = coordinator.get_stats_publish_address()
# Start API servers.
api_server_manager = APIServerProcessManager(
listen_address=listen_address,
sock=sock,
args=args,
num_servers=num_api_servers,
input_addresses=addresses.inputs,
output_addresses=addresses.outputs,
stats_update_address=stats_update_address,
tensor_queue=tensor_queue,
stats_update_address = (
coordinator.get_stats_publish_address() if coordinator else None
)
if rust_frontend_path:
# Start rust front-end process.
api_server_manager = RustFrontendProcessManager(
binary_path=rust_frontend_path,
sock=sock,
args=args,
input_address=addresses.inputs[0],
output_address=addresses.outputs[0],
engine_count=parallel_config.data_parallel_size,
stats_update_address=stats_update_address,
)
else:
# Start API server(s).
api_server_manager = APIServerProcessManager(
listen_address=listen_address,
sock=sock,
args=args,
num_servers=num_api_servers,
input_addresses=addresses.inputs,
output_addresses=addresses.outputs,
stats_update_address=stats_update_address,
tensor_queue=tensor_queue,
)
# Wait for API servers.
try:
wait_for_completion_or_failure(
+8 -8
View File
@@ -533,8 +533,7 @@ def validate_api_server_args(args):
@instrument(span_name="API server setup")
def setup_server(args):
"""Validate API server args, set up signal handler, create socket
ready to serve."""
"""Validate API server args and create the server socket."""
log_version_and_model(logger, VLLM_VERSION, args.model)
log_non_default_args(args)
@@ -560,12 +559,6 @@ def setup_server(args):
# many concurrent requests active
set_ulimit()
def signal_handler(*_) -> None:
# Interrupt server on sigterm while initializing
raise KeyboardInterrupt("terminated")
signal.signal(signal.SIGTERM, signal_handler)
if args.uds:
listen_address = f"unix:{args.uds}"
else:
@@ -675,6 +668,13 @@ async def run_server(args, **uvicorn_kwargs) -> None:
# Add process-specific prefix to stdout and stderr.
decorate_logs("APIServer")
# Interrupt initialization if SIGTERM arrives before uvicorn installs its
# own signal handlers. Once uvicorn is running it replaces this.
def _interrupt_init(*_) -> None:
raise KeyboardInterrupt("terminated")
signal.signal(signal.SIGTERM, _interrupt_init)
listen_address, sock = setup_server(args)
await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)
+39 -1
View File
@@ -9,6 +9,7 @@ from argparse import Namespace
from http import HTTPStatus
from logging import Logger
from string import Template
from typing import Any
import regex as re
from fastapi import Request
@@ -210,7 +211,7 @@ def get_max_tokens(
)
def log_non_default_args(args: Namespace | EngineArgs):
def get_non_default_args(args: Namespace | EngineArgs) -> dict[str, Any]:
from vllm.entrypoints.openai.cli_args import make_arg_parser
non_default_args = {}
@@ -237,6 +238,43 @@ def log_non_default_args(args: Namespace | EngineArgs):
"Unsupported argument type. Must be Namespace or EngineArgs instance."
)
return non_default_args
def _jsonify_arg_value(value: Any) -> Any:
if value is None or isinstance(value, bool | int | float | str):
return value
if dataclasses.is_dataclass(value) and not isinstance(value, type):
return {
key: _jsonify_arg_value(val)
for key, val in dataclasses.asdict(value).items()
}
if isinstance(value, dict):
return {str(key): _jsonify_arg_value(val) for key, val in value.items()}
if isinstance(value, tuple | list):
return [_jsonify_arg_value(item) for item in value]
if (model_dump := getattr(value, "model_dump", None)) is not None:
return _jsonify_arg_value(model_dump(mode="json"))
if (to_dict := getattr(value, "dict", None)) is not None:
return _jsonify_arg_value(to_dict())
return repr(value)
def jsonify_non_default_args(
args: Namespace | EngineArgs,
*,
exclude: set[str] | None = None,
) -> dict[str, Any]:
non_default_args = get_non_default_args(args)
if exclude is not None:
for key in exclude:
non_default_args.pop(key, None)
return {key: _jsonify_arg_value(value) for key, value in non_default_args.items()}
def log_non_default_args(args: Namespace | EngineArgs):
non_default_args = get_non_default_args(args)
logger.info("non-default args: %s", non_default_args)
+51 -1
View File
@@ -86,6 +86,7 @@ if TYPE_CHECKING:
MAX_JOBS: str | None = None
NVCC_THREADS: str | None = None
VLLM_USE_PRECOMPILED: bool = False
VLLM_USE_PRECOMPILED_RUST: bool = False
VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
VLLM_DOCKER_BUILD_CONTEXT: bool = False
VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -134,6 +135,8 @@ if TYPE_CHECKING:
Q_SCALE_CONSTANT: int = 200
K_SCALE_CONSTANT: int = 200
V_SCALE_CONSTANT: int = 100
VLLM_USE_RUST_FRONTEND: bool = False
VLLM_RUST_FRONTEND_PATH: str | None = "auto"
VLLM_SERVER_DEV_MODE: bool = False
VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
VLLM_MLA_DISABLE: bool = False
@@ -514,6 +517,40 @@ def get_env_or_set_default(
logger = logging.getLogger(__name__)
def _resolve_rust_frontend_path() -> str | None:
"""Resolve the Rust frontend binary path.
Returns None if VLLM_USE_RUST_FRONTEND is not enabled.
When enabled, resolves VLLM_RUST_FRONTEND_PATH ("auto" by default)
to the actual binary path.
"""
use_rust = bool(int(os.environ.get("VLLM_USE_RUST_FRONTEND", "0")))
raw = os.environ.get("VLLM_RUST_FRONTEND_PATH", "auto")
if not use_rust:
if os.environ.get("VLLM_RUST_FRONTEND_PATH") is not None:
logger.warning(
"VLLM_RUST_FRONTEND_PATH is set but VLLM_USE_RUST_FRONTEND "
"is not enabled. The Rust frontend will not be used. "
"Set VLLM_USE_RUST_FRONTEND=1 to enable it."
)
return None
if raw.lower() in ("auto", "1", "true"):
pkg_dir = os.path.dirname(os.path.abspath(__file__))
candidate = os.path.join(pkg_dir, "vllm-rs")
if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
return candidate
raise FileNotFoundError(
"VLLM_RUST_FRONTEND_PATH=auto but the vllm-rs binary was "
f"not found at {candidate}. "
"Build with setuptools-rust or set the path explicitly."
)
return raw
environment_variables: dict[str, Callable[[], Any]] = {
# ================== Installation Time Env Vars ==================
# Target device of vLLM, supporting [cuda (by default),
@@ -551,11 +588,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
# By default this is 1.
# If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
"NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None),
# If set, vllm will use precompiled binaries (*.so)
# If set, vllm will use precompiled native binaries (*.so and vllm-rs).
"VLLM_USE_PRECOMPILED": lambda: (
os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION"))
),
# If set, vllm will use the precompiled Rust frontend binary (vllm-rs).
"VLLM_USE_PRECOMPILED_RUST": lambda: (
os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower() in ("1", "true")
),
# If set, skip adding +precompiled suffix to version string
"VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
@@ -1154,6 +1195,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
# If set to "0", disable LayerName opaque type for layer_name
# parameters in custom ops. Defaults to enabled on torch >= 2.11.
"VLLM_USE_LAYERNAME": lambda: bool(int(os.getenv("VLLM_USE_LAYERNAME", "1"))),
# If set, use the Rust frontend binary instead of the Python API server
# process(es).
"VLLM_USE_RUST_FRONTEND": lambda: bool(
int(os.getenv("VLLM_USE_RUST_FRONTEND", "0"))
),
# Path to the Rust frontend binary. Defaults to "auto" which discovers
# the binary installed with the vllm package. Only used when
# VLLM_USE_RUST_FRONTEND=1.
"VLLM_RUST_FRONTEND_PATH": lambda: _resolve_rust_frontend_path(),
# If set, vllm will run in development mode, which will enable
# some additional endpoints for developing and debugging,
# e.g. `/reset_prefix_cache`
+1
View File
@@ -75,6 +75,7 @@ class EngineCoreReadyResponse:
max_model_len: int
num_gpu_blocks: int
dp_stats_address: str | None
dtype: str | None = None
class EngineCoreRequest(
+1
View File
@@ -1437,6 +1437,7 @@ class EngineCoreProc(EngineCore):
max_model_len=self.vllm_config.model_config.max_model_len,
num_gpu_blocks=self.vllm_config.cache_config.num_gpu_blocks or 0,
dp_stats_address=self.frontend_stats_publish_address,
dtype=str(self.vllm_config.model_config.dtype).removeprefix("torch."),
)
ready_payload = msgspec.msgpack.encode(ready_response)
for input_socket in input_sockets:
+143 -2
View File
@@ -2,6 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import argparse
import contextlib
import json
import multiprocessing
import threading
import time
@@ -233,6 +234,146 @@ class APIServerProcessManager:
shutdown(self.processes, timeout=timeout)
class RustFrontendProcessManager:
"""Manages a single Rust frontend subprocess.
Launches the Rust vllm-rs binary in 'frontend' mode, passing the
listening socket fd and ZMQ transport addresses. Provides the same
interface as APIServerProcessManager for process monitoring.
"""
def __init__(
self,
binary_path: str,
sock: Any,
args: argparse.Namespace,
input_address: str,
output_address: str,
engine_count: int,
stats_update_address: str | None = None,
):
import os
import subprocess
fd = sock.fileno()
os.set_inheritable(fd, True)
cmd = [
binary_path,
"frontend",
"--listen-fd",
str(fd),
"--input-address",
input_address,
"--output-address",
output_address,
"--engine-count",
str(engine_count),
]
if stats_update_address is not None:
cmd.extend(["--coordinator-address", stats_update_address])
from vllm.entrypoints.utils import jsonify_non_default_args
args_json = json.dumps(
jsonify_non_default_args(args, exclude={"api_server_count"}),
sort_keys=True,
)
cmd.extend(["--args-json", args_json])
logger.info("Launching Rust frontend: %s", " ".join(cmd))
self._proc = subprocess.Popen(cmd, pass_fds=(fd,))
# Create a process wrapper with a sentinel fd for monitoring
self.processes: list[_SubprocessWrapper] = [
_SubprocessWrapper(self._proc, "RustFrontend")
]
self._finalizer = weakref.finalize(self, _shutdown_subprocesses, self.processes)
def shutdown(self, timeout: float | None = None) -> None:
if self._finalizer.detach() is not None:
_shutdown_subprocesses(self.processes, timeout=timeout)
class _SubprocessWrapper:
"""Wraps subprocess.Popen to provide the BaseProcess-like interface
needed by wait_for_completion_or_failure."""
def __init__(self, proc, name: str):
self._proc = proc
self.name = name
self.pid = proc.pid
self._sentinel_conn: connection.Connection | None = None
self._sentinel_send: connection.Connection | None = None
# Use a Pipe-based sentinel so subprocess monitoring works uniformly
# across platforms with multiprocessing.connection.wait().
recv, send = connection.Pipe(duplex=False)
self._sentinel_conn = recv
self._sentinel_send = send
def monitor_subprocess() -> None:
try:
proc.wait()
finally:
with contextlib.suppress(Exception):
send.close()
threading.Thread(
target=monitor_subprocess, daemon=True, name=f"{name}Monitor"
).start()
@property
def sentinel(self):
return self._sentinel_conn
@property
def exitcode(self) -> int | None:
return self._proc.returncode if self._proc.poll() is not None else None
def is_alive(self) -> bool:
return self._proc.poll() is None
def terminate(self):
self._proc.terminate()
def join(self, timeout=None):
with contextlib.suppress(Exception):
self._proc.wait(timeout=timeout)
def __del__(self):
with contextlib.suppress(Exception):
if self._sentinel_conn is not None:
self._sentinel_conn.close()
if self._sentinel_send is not None:
self._sentinel_send.close()
def _shutdown_subprocesses(
procs: list[_SubprocessWrapper], timeout: float | None = None
) -> None:
"""Shutdown subprocess wrappers (mirrors the shutdown() function)."""
if timeout is None:
timeout = 0.0
timeout = max(timeout, 5.0)
for proc in procs:
if proc.is_alive():
proc.terminate()
deadline = time.monotonic() + timeout
for proc in procs:
remaining = deadline - time.monotonic()
if remaining <= 0:
break
if proc.is_alive():
proc.join(remaining)
for proc in procs:
if proc.is_alive() and (pid := proc.pid) is not None:
kill_process_tree(pid)
def run_api_server_worker_proc(
listen_address, sock, args, client_config=None, **uvicorn_kwargs
) -> None:
@@ -253,7 +394,7 @@ def run_api_server_worker_proc(
def wait_for_completion_or_failure(
api_server_manager: APIServerProcessManager,
api_server_manager: "APIServerProcessManager | RustFrontendProcessManager",
engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"]
| None = None,
coordinator: "DPCoordinator | None" = None,
@@ -274,7 +415,7 @@ def wait_for_completion_or_failure(
logger.info("Waiting for API servers to complete ...")
# Create a mapping of sentinels to their corresponding processes
# for efficient lookup
sentinel_to_proc: dict[Any, BaseProcess] = {
sentinel_to_proc: dict[Any, BaseProcess | _SubprocessWrapper | None] = {
proc.sentinel: proc for proc in api_server_manager.processes
}