[Frontend][RFC] Rust front-end integration (#40848)

Signed-off-by: Nick Hill <[email protected]> Signed-off-by: Bugen Zhao <[email protected]> Co-authored-by: Bugen Zhao <[email protected]>
2026-08-01 03:18:00 +00:00 · 2026-05-21 12:24:48 +08:00
parent d97ba29fdc
commit f2ace1d57d
35 changed files with 893 additions and 91 deletions
@@ -223,6 +223,13 @@ echo "CACHE_FROM_MAIN: ${CACHE_FROM_MAIN}"

 check_and_skip_if_image_exists

+# The rust frontend lives in a git submodule under rust/. Buildkite's default
+# checkout does not recurse submodules, and the Dockerfile only sees what's in
+# the build context, so initialize the submodule here before invoking bake.
+echo "--- :git: Initializing git submodules"
+git submodule sync --recursive
+git submodule update --init --recursive
+
 echo "--- :docker: Setting up Docker buildx bake"
 echo "Target: ${TARGET}"
 echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}"
@@ -21,6 +21,12 @@ else
  exit 0
 fi

+# The rust frontend lives in a git submodule under rust/. Buildkite's default
+# checkout does not recurse submodules, and the Dockerfile only sees what's in
+# the build context, so initialize the submodule here before building.
+git submodule sync --recursive
+git submodule update --init --recursive
+
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
@@ -21,6 +21,12 @@ else
  exit 0
 fi

+# The rust frontend lives in a git submodule under rust/. Buildkite's default
+# checkout does not recurse submodules, and the Dockerfile only sees what's in
+# the build context, so initialize the submodule here before building.
+git submodule sync --recursive
+git submodule update --init --recursive
+
 # build
 docker build --file docker/Dockerfile.cpu \
  --build-arg max_jobs=16 \
@@ -11,7 +11,7 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true

 # skip build if image already exists
 if [[ -z $(docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-hpu) ]]; then
@@ -11,8 +11,8 @@ REPO=$2
 BUILDKITE_COMMIT=$3

 # authenticate with AWS ECR
-aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY"
-aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com
+aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin "$REGISTRY" || true
+aws ecr get-login-password --region us-east-1 | docker login --username AWS --password-stdin 936637512419.dkr.ecr.us-east-1.amazonaws.com || true

 # skip build if image already exists
 if ! docker manifest inspect "$REGISTRY"/"$REPO":"$BUILDKITE_COMMIT"-xpu &> /dev/null; then
@@ -0,0 +1,107 @@
+group: Rust Frontend
+depends_on:
+  - image-build
+steps:
+- label: Rust Frontend OpenAI Coverage
+  timeout_in_minutes: 90
+  device: h200_18gb
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - rust/
+  - vllm/benchmarks/
+  - vllm/entrypoints/openai/
+  - vllm/entrypoints/serve/
+  - vllm/v1/sample/
+  - tests/utils.py
+  - tests/benchmarks/test_serve_cli.py
+  - tests/entrypoints/openai/chat_completion/test_chat_completion.py
+  # - tests/entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py
+  # - tests/entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+  # - tests/entrypoints/openai/completion/test_prompt_validation.py
+  - tests/entrypoints/openai/completion/test_shutdown.py
+  # - tests/entrypoints/openai/test_return_token_ids.py
+  # - tests/entrypoints/openai/test_uds.py
+  - tests/v1/sample/test_logprobs_e2e.py
+  commands:
+  - export VLLM_USE_RUST_FRONTEND=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s benchmarks/test_serve_cli.py -k "not insecure and not (test_bench_serve and not test_bench_serve_chat)"
+  - pytest -v -s entrypoints/openai/chat_completion/test_chat_completion.py
+  # - pytest -v -s entrypoints/openai/chat_completion/test_chat_logit_bias_validation.py -k "not invalid"
+  # - pytest -v -s entrypoints/openai/chat_completion/test_chat_with_tool_reasoning.py
+  # - pytest -v -s entrypoints/openai/completion/test_prompt_validation.py -k "not prompt_embeds"
+  - pytest -v -s entrypoints/openai/completion/test_shutdown.py -k "not engine_failure and not test_abort_timeout_exits_quickly"
+  # - pytest -v -s entrypoints/openai/test_return_token_ids.py
+  # - pytest -v -s entrypoints/openai/test_uds.py
+  - pytest -v -s v1/sample/test_logprobs_e2e.py -k "test_prompt_logprobs_e2e_server"
+
+- label: Rust Frontend Serve/Admin Coverage
+  timeout_in_minutes: 60
+  device: h200_18gb
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - rust/
+  - vllm/entrypoints/openai/
+  - vllm/entrypoints/serve/
+  - vllm/v1/engine/
+  - tests/utils.py
+  # - tests/entrypoints/rpc/test_collective_rpc.py
+  - tests/entrypoints/serve/disagg/test_serving_tokens.py
+  - tests/entrypoints/serve/instrumentator/test_basic.py
+  - tests/entrypoints/serve/instrumentator/test_metrics.py
+  # - tests/entrypoints/serve/instrumentator/test_sleep.py
+  commands:
+  - export VLLM_USE_RUST_FRONTEND=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  # - pytest -v -s entrypoints/rpc/test_collective_rpc.py
+  - pytest -v -s entrypoints/serve/instrumentator/test_basic.py -k "not show_version and not server_load"
+  - pytest -v -s entrypoints/serve/disagg/test_serving_tokens.py -k "not stream and not lora and not test_generate_logprobs and not stop_string_workflow"
+  - pytest -v -s entrypoints/serve/instrumentator/test_metrics.py -k "text and not show and not run_batch and not test_metrics_counts and not test_metrics_exist"
+  # - pytest -v -s entrypoints/serve/instrumentator/test_sleep.py
+
+- label: Rust Frontend Core Correctness
+  timeout_in_minutes: 30
+  device: h200_18gb
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - rust/
+  - vllm/entrypoints/openai/
+  - tests/utils.py
+  - tests/entrypoints/openai/correctness/test_lmeval.py
+  commands:
+  - export VLLM_USE_RUST_FRONTEND=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -s entrypoints/openai/correctness/test_lmeval.py::test_lm_eval_accuracy_v1_engine
+
+- label: Rust Frontend Tool Use
+  timeout_in_minutes: 60
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - rust/
+  - vllm/entrypoints/openai/
+  - vllm/tool_parsers/
+  - tests/utils.py
+  - tests/tool_use/
+  commands:
+  - export VLLM_USE_RUST_FRONTEND=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - pytest -v -s tool_use --ignore=tool_use/mistral --models llama3.2 -k "not test_response_format_with_tool_choice_required and not test_parallel_tool_calls_false and not test_tool_call_and_choice"
+
+- label: Rust Frontend Distributed
+  timeout_in_minutes: 30
+  num_devices: 4
+  working_dir: "/vllm-workspace/tests"
+  source_file_dependencies:
+  - rust/
+  - vllm/distributed/
+  - vllm/engine/
+  - vllm/executor/
+  - vllm/v1/engine/
+  - vllm/v1/worker/
+  - tests/utils.py
+  - tests/v1/distributed/test_internal_lb_dp.py
+  commands:
+  - export VLLM_USE_RUST_FRONTEND=1
+  - export VLLM_WORKER_MULTIPROC_METHOD=spawn
+  - export NCCL_CUMEM_HOST_ENABLE=0
+  - TP_SIZE=1 DP_SIZE=4 pytest -v -s v1/distributed/test_internal_lb_dp.py -k "not 4 and not server_info"
@@ -2,6 +2,7 @@
 /build
 dist
 vllm/*.so
+vllm/vllm-rs

 # Byte-compiled / optimized / DLL files
 __pycache__/
@@ -31,3 +32,4 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
+rust/target/
@@ -26,6 +26,9 @@ __pycache__/
 # C extensions
 *.so

+# Rust binaries
+vllm/vllm-rs
+
 # Distribution / packaging
 .Python
 build/
@@ -0,0 +1,3 @@
+[submodule "rust"]
+	path = rust
+	url = https://github.com/Inferact/vllm-frontend-rs.git
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Build the vllm-rs Rust frontend binary and install it into the vllm package.
+# Usage: ./build_rust.sh [--debug]
+#
+# By default builds in release mode. Pass --debug for faster compile times
+# during development.
+
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "$0")" && pwd)"
+RUST_DIR="$REPO_ROOT/rust"
+TARGET_PATH="$REPO_ROOT/vllm/vllm-rs"
+
+# Read the required toolchain from rust-toolchain.toml.
+TOOLCHAIN=$(grep '^channel' "$RUST_DIR/rust-toolchain.toml" | sed 's/.*= *"\(.*\)"/\1/')
+
+# Ensure rustup and the required toolchain are available.
+if ! command -v rustup &>/dev/null; then
+    echo "rustup not found, installing..."
+    curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain none
+    source "$HOME/.cargo/env"
+fi
+
+if ! rustup run "$TOOLCHAIN" rustc --version &>/dev/null; then
+    echo "Installing Rust toolchain: $TOOLCHAIN"
+    rustup toolchain install "$TOOLCHAIN"
+fi
+
+if [[ "${1:-}" == "--debug" ]]; then
+    PROFILE_ARGS=()
+    PROFILE_DIR="debug"
+else
+    PROFILE_ARGS=(--release)
+    PROFILE_DIR="release"
+fi
+
+cargo +"$TOOLCHAIN" build "${PROFILE_ARGS[@]}" \
+    --manifest-path "$RUST_DIR/Cargo.toml" \
+    --bin vllm-rs \
+    --features native-tls-vendored
+
+cp "$RUST_DIR/target/$PROFILE_DIR/vllm-rs" "$TARGET_PATH"
+echo "Installed vllm-rs to $TARGET_PATH"
@@ -231,6 +231,63 @@ ARG torch_cuda_arch_list='7.5 8.0 8.6 8.9 9.0 10.0 11.0 12.0+PTX'
 ENV TORCH_CUDA_ARCH_LIST=${torch_cuda_arch_list}
 #################### BUILD BASE IMAGE ####################

+#################### RUST BUILD IMAGE ####################
+# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the main wheel
+# build stage doesn't need the rust toolchain, protoc, or the rust source.
+# This stage runs in parallel with csrc-build/extensions-build.
+FROM ${BUILD_BASE_IMAGE} AS rust-build
+ARG BUILD_OS
+
+ENV DEBIAN_FRONTEND=noninteractive
+
+# Install protoc (used by tonic-build/prost-build) and a basic C toolchain
+# (some rust crates compile C in their build.rs scripts).
+RUN if [ "${BUILD_OS}" = "manylinux" ]; then \
+        dnf install -y --setopt=install_weak_deps=False \
+            ca-certificates curl git gcc gcc-c++ make \
+            protobuf-compiler protobuf-devel \
+        && dnf clean all && rm -rf /var/cache/dnf; \
+    else \
+        apt-get update -y \
+        && apt-get install -y --no-install-recommends \
+            ca-certificates curl git build-essential \
+            protobuf-compiler libprotobuf-dev \
+        && rm -rf /var/lib/apt/lists/*; \
+    fi
+
+# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
+    sh -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /workspace
+
+# Copy only the rust workspace — the binary is the sole artifact we need.
+COPY rust rust
+
+# Fail loudly if the rust submodule was not initialized on the host before
+# `docker build`. Without this check, cargo would emit a confusing error.
+RUN if [ ! -f rust/Cargo.toml ]; then \
+        echo "ERROR: rust/ submodule is not initialized."; \
+        echo "Run 'git submodule update --init --recursive' on the host before building."; \
+        exit 1; \
+    fi
+
+# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
+# Build the release binary. Cache cargo registry/git and target/, but copy the
+# binary out of the target/ cache mount so it persists into the image layer
+# for later COPY --from=rust-build.
+RUN --mount=type=cache,target=/root/.cargo/registry \
+    --mount=type=cache,target=/root/.cargo/git \
+    --mount=type=cache,target=/workspace/rust/target \
+    cd rust \
+    && cargo build --release --bin vllm-rs --features native-tls-vendored \
+    && cp target/release/vllm-rs /workspace/vllm-rs
+#################### RUST BUILD IMAGE ####################
+
 #################### CSRC BUILD IMAGE ####################
 FROM base AS csrc-build
 ARG TARGETPLATFORM
@@ -435,6 +492,10 @@ WORKDIR /workspace
 COPY --from=csrc-build /workspace/dist /precompiled-wheels
 COPY . .

+# Drop the pre-built rust frontend binary into the source tree. setup.py
+# detects it and ships it as-is, skipping the local cargo build.
+COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
+
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != "0" ]; then bash tools/check_repo.sh ; fi
@@ -455,6 +516,7 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=bind,source=.git,target=.git \
    if [ "${vllm_target_device}" = "cuda" ]; then \
+        export VLLM_USE_PRECOMPILED=1; \
        export VLLM_PRECOMPILED_WHEEL_LOCATION=$(ls /precompiled-wheels/*.whl); \
    fi && \
    python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38
@@ -36,6 +36,7 @@ RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh | sh

+# Compiler and linker environment
 ENV CC=/usr/bin/gcc-12 CXX=/usr/bin/g++-12
 ENV CCACHE_DIR=/root/.cache/ccache
 ENV CMAKE_CXX_COMPILER_LAUNCHER=ccache
@@ -80,6 +81,51 @@ FROM base-${TARGETARCH} AS base

 RUN echo 'ulimit -c 0' >> ~/.bashrc

+######################### RUST BUILD IMAGE #########################
+# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the wheel build
+# stage doesn't need the rust toolchain or protoc. This stage runs in parallel
+# with the main vllm-build stage.
+FROM ubuntu:22.04 AS rust-build
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates curl git build-essential \
+        protobuf-compiler libprotobuf-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
+    sh -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /workspace
+
+# Copy only the rust workspace — the binary is the sole artifact we need.
+COPY rust rust
+
+# Fail loudly if the rust submodule was not initialized on the host before
+# `docker build`. Without this check, cargo would emit a confusing error.
+RUN if [ ! -f rust/Cargo.toml ]; then \
+        echo "ERROR: rust/ submodule is not initialized."; \
+        echo "Run 'git submodule update --init --recursive' on the host before building."; \
+        exit 1; \
+    fi
+
+# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
+# Build the release binary. Cache cargo registry/git and target/, but copy the
+# binary out of the target/ cache mount so it persists into the image layer
+# for later COPY --from=rust-build.
+RUN --mount=type=cache,target=/root/.cargo/registry \
+    --mount=type=cache,target=/root/.cargo/git \
+    --mount=type=cache,target=/workspace/rust/target \
+    cd rust \
+    && cargo build --release --bin vllm-rs --features native-tls-vendored \
+    && cp target/release/vllm-rs /workspace/vllm-rs
+
 ######################### BUILD IMAGE #########################
 FROM base AS vllm-build

@@ -114,6 +160,10 @@ RUN --mount=type=cache,target=/root/.cache/uv \

 COPY . .

+# Drop the pre-built rust frontend binary into the source tree. setup.py
+# detects it and ships it as-is, skipping the local cargo build.
+COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
+
 RUN if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh ; fi

 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -94,6 +94,47 @@ RUN cat torch_build_versions.txt

 #################### BASE BUILD IMAGE ####################

+#################### RUST BUILD IMAGE ####################
+# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the wheel build
+# stage doesn't need the rust toolchain or protoc.
+FROM ubuntu:22.04 AS rust-build
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates curl git build-essential \
+        protobuf-compiler libprotobuf-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
+    sh -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /workspace
+
+COPY rust rust
+
+# Fail loudly if the rust submodule was not initialized on the host before
+# `docker build`.
+RUN if [ ! -f rust/Cargo.toml ]; then \
+        echo "ERROR: rust/ submodule is not initialized."; \
+        echo "Run 'git submodule update --init --recursive' on the host before building."; \
+        exit 1; \
+    fi
+
+# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
+RUN --mount=type=cache,target=/root/.cargo/registry \
+    --mount=type=cache,target=/root/.cargo/git \
+    --mount=type=cache,target=/workspace/rust/target \
+    cd rust \
+    && cargo build --release --bin vllm-rs --features native-tls-vendored \
+    && cp target/release/vllm-rs /workspace/vllm-rs
+#################### RUST BUILD IMAGE ####################
+
 #################### WHEEL BUILD IMAGE ####################
 FROM base AS build
 ARG TARGETPLATFORM
@@ -104,6 +145,10 @@ ENV UV_HTTP_TIMEOUT=500

 COPY . .

+# Drop the pre-built rust frontend binary into the source tree. setup.py
+# detects it and ships it as-is, skipping the local cargo build.
+COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
+
 RUN python3 use_existing_torch.py

 RUN --mount=type=cache,target=/root/.cache/uv \
@@ -106,14 +106,61 @@ ONBUILD RUN git clone ${VLLM_REPO} \
 	    && cd vllm \
 	    && git fetch -v --prune -- origin ${VLLM_BRANCH} \
 	    && git checkout FETCH_HEAD \
+	    && git submodule update --init --recursive \
        && if [ ${VLLM_REPO} != "https://github.com/vllm-project/vllm.git" ] ; then \
               git remote add upstream "https://github.com/vllm-project/vllm.git" \
               && git fetch upstream ; fi
 FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm

+# -----------------------
+# Rust build stage
+# Builds the `vllm-rs` frontend in a dedicated stage so the wheel build stages
+# don't need the rust toolchain or protoc. Runs in parallel with the main wheel
+# build for faster end-to-end builds.
+FROM fetch_vllm AS rust-build
+ARG COMMON_WORKDIR
+
+# Fail loudly if the rust submodule was not initialized on the host before
+# `docker build`. The rust frontend source is brought in via the fetch_vllm
+# stage, so an uninitialized submodule would otherwise produce a confusing
+# cargo failure.
+RUN if [ ! -f ${COMMON_WORKDIR}/vllm/rust/Cargo.toml ]; then \
+        echo "ERROR: rust/ submodule is not initialized."; \
+        echo "Run 'git submodule update --init --recursive' on the host before building."; \
+        exit 1; \
+    fi
+
+# protoc is used by tonic-build/prost-build.
+RUN apt-get update -q -y && apt-get install -q -y --no-install-recommends \
+        protobuf-compiler libprotobuf-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
+    sh -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+# Cap cargo parallelism to avoid exhausting the AMD CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
+# Build the release binary. Cache cargo registry/git, and copy the binary out
+# so it persists into the image layer for later COPY --from=rust-build.
+RUN --mount=type=cache,target=/root/.cargo/registry \
+    --mount=type=cache,target=/root/.cargo/git \
+    cd ${COMMON_WORKDIR}/vllm/rust \
+    && cargo build --release --bin vllm-rs --features native-tls-vendored \
+    && cp target/release/vllm-rs /tmp/vllm-rs
+
 # -----------------------
 # vLLM build stages
 FROM fetch_vllm AS build_vllm
+ARG COMMON_WORKDIR
+
+# Drop the pre-built rust frontend binary into the source tree. setup.py
+# detects it and ships it as-is, skipping the local cargo build.
+COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
+
 # Build vLLM (setup.py auto-detects sccache in PATH)
 RUN cd vllm \
    && python3 -m pip install -r requirements/rocm.txt \
@@ -293,6 +340,10 @@ FROM fetch_vllm AS build_vllm_wheel_release

 ARG COMMON_WORKDIR

+# Drop the pre-built rust frontend binary into the source tree. setup.py
+# detects it and ships it as-is, skipping the local cargo build.
+COPY --from=rust-build /tmp/vllm-rs ${COMMON_WORKDIR}/vllm/vllm/vllm-rs
+
 # Create /install directory for custom wheels
 RUN mkdir -p /install

@@ -1,3 +1,43 @@
+######################### RUST BUILD IMAGE #########################
+# Build the Rust frontend (`vllm-rs`) in a dedicated stage so the main image
+# doesn't need the rust toolchain or protoc. Runs in parallel with vllm-base.
+FROM ubuntu:22.04 AS rust-build
+ENV DEBIAN_FRONTEND=noninteractive
+
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+        ca-certificates curl git build-essential \
+        protobuf-compiler libprotobuf-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+# Install rustup; the toolchain itself is pinned by rust/rust-toolchain.toml.
+RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | \
+    sh -s -- -y --profile minimal --default-toolchain none
+ENV PATH="/root/.cargo/bin:${PATH}"
+
+WORKDIR /workspace
+
+COPY rust rust
+
+# Fail loudly if the rust submodule was not initialized on the host before
+# `docker build`.
+RUN if [ ! -f rust/Cargo.toml ]; then \
+        echo "ERROR: rust/ submodule is not initialized."; \
+        echo "Run 'git submodule update --init --recursive' on the host before building."; \
+        exit 1; \
+    fi
+
+# Cap cargo parallelism to avoid exhausting the CI host's open-file limit
+# (rustc spawns enough concurrent processes to hit RLIMIT_NOFILE otherwise).
+ENV CARGO_BUILD_JOBS=4
+
+RUN --mount=type=cache,target=/root/.cargo/registry \
+    --mount=type=cache,target=/root/.cargo/git \
+    --mount=type=cache,target=/workspace/rust/target \
+    cd rust \
+    && cargo build --release --bin vllm-rs --features native-tls-vendored \
+    && cp target/release/vllm-rs /workspace/vllm-rs
+
 FROM intel/deep-learning-essentials:2025.3.2-0-devel-ubuntu24.04 AS vllm-base

 WORKDIR /workspace/
@@ -99,6 +139,11 @@ RUN --mount=type=cache,target=/root/.cache/uv \
 ENV LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/usr/local/lib/"

 COPY . .
+
+# Drop the pre-built rust frontend binary into the source tree. setup.py
+# detects it and ships it as-is, skipping the local cargo build.
+COPY --from=rust-build /workspace/vllm-rs vllm/vllm-rs
+
 ARG GIT_REPO_CHECK=0
 RUN --mount=type=bind,source=.git,target=.git \
    if [ "$GIT_REPO_CHECK" != 0 ]; then bash tools/check_repo.sh; fi
@@ -43,6 +43,13 @@ If you are only developing vLLM's Python code, install vLLM using:
 VLLM_USE_PRECOMPILED=1 uv pip install -e .
 ```

+To rebuild only the Rust frontend binary:
+
+```bash
+./build_rust.sh          # release build
+./build_rust.sh --debug  # faster build for development
+```
+
 If you are developing vLLM's Python and CUDA/C++ code, install Pytorch first:

 ```bash
@@ -136,10 +136,10 @@ When installing vLLM with `VLLM_USE_PRECOMPILED=1`, the `setup.py` script:
 3. **Selects compatible wheel** based on:
    - Package name (`vllm`)
    - Platform tag (architecture match)
-4. **Downloads and extracts** precompiled binaries from the wheel:
-    - C++ extension modules (`.so` files)
-    - Flash Attention Python modules
-    - Triton kernel Python files
+4. **Downloads and extracts** precompiled artifacts from the wheel:
+    - Native extension modules (`.so` files)
+    - The `vllm-rs` Rust frontend binary
+    - Flash Attention Python modules and Triton/FlashMLA Python files
 5. **Patches package_data** to include extracted files in the installation

 !!! note "What is the base commit?"
@@ -101,12 +101,22 @@ This command will do the following:
 1. Look for the current branch in your vLLM clone.
 1. Identify the corresponding base commit in the main branch.
 1. Download the pre-built wheel of the base commit.
-1. Use its compiled libraries in the installation.
+1. Use its compiled libraries and `vllm-rs` binary in the installation.

 !!! note
    1. If you change C++ or kernel code, you cannot use Python-only build; otherwise you will see an import error about library not found or undefined symbol.
    2. If you rebase your dev branch, it is recommended to uninstall vllm and re-run the above command to make sure your libraries are up to date.

+!!! tip "Rebuilding the Rust frontend"
+If you need to recompile the `vllm-rs` Rust frontend binary, you can rebuild and install it without re-running the full pip install:
+
+    ```bash
+    ./build_rust.sh          # release build
+    ./build_rust.sh --debug  # faster build for development
+    ```
+
+    This will install the required Rust toolchain if needed, build the binary, and place it in `vllm/vllm-rs`.
+
 In case you see an error about wheel not found when running the above command, it might be because the commit you based on in the `main` branch was just merged and its precompiled wheel is not available yet. You can wait around an hour and retry, or set `VLLM_PRECOMPILED_WHEEL_COMMIT=nightly` to automatically select the most recent already-built commit on `main`.

 ```bash
@@ -6,6 +6,7 @@ requires = [
    "packaging>=24.2",
    "setuptools>=77.0.3,<81.0.0",
    "setuptools-scm>=8.0",
+    "setuptools-rust>=1.9.0",
    "torch == 2.11.0",
    "wheel",
    "jinja2",
@@ -4,6 +4,7 @@ ninja
 packaging>=24.2
 setuptools==77.0.3 # this version can reuse CMake build dir
 setuptools-scm>=8
+setuptools-rust>=1.9.0
 torch==2.11.0+cpu; platform_machine == "x86_64" or platform_machine == "s390x" or platform_machine == "aarch64"
 torch==2.11.0; platform_system == "Darwin" or platform_machine == "ppc64le"  or platform_machine == "riscv64"
 wheel
@@ -4,6 +4,7 @@ ninja
 packaging>=24.2
 setuptools>=77.0.3,<81.0.0
 setuptools-scm>=8
+setuptools-rust>=1.9.0
 torch==2.11.0
 wheel
 jinja2>=3.1.6
@@ -15,6 +15,7 @@ tensorizer==2.10.1
 packaging>=24.2
 setuptools>=77.0.3,<80.0.0
 setuptools-scm>=8
+setuptools-rust>=1.9.0
 runai-model-streamer[s3,gcs,azure]==0.15.7
 conch-triton-kernels==1.2.1
 timm>=1.0.17
@@ -0,0 +1,2 @@
+[toolchain]
+channel = "1.95"
@@ -18,6 +18,8 @@ import torch
 from packaging.version import Version, parse
 from setuptools import Extension, setup
 from setuptools.command.build_ext import build_ext
+from setuptools_rust import Binding, RustExtension
+from setuptools_rust.build import build_rust
 from setuptools_scm import get_version
 from torch.utils.cpp_extension import CUDA_HOME, ROCM_HOME

@@ -33,11 +35,24 @@ def load_module_from_path(module_name, path):
 ROOT_DIR = Path(__file__).parent
 logger = logging.getLogger(__name__)

+PRECOMPILED_RUST_FRONTEND_PATH = ROOT_DIR / "vllm" / "vllm-rs"
+
 # cannot import envs directly because it depends on vllm,
 #  which is not installed yet
 envs = load_module_from_path("envs", os.path.join(ROOT_DIR, "vllm", "envs.py"))

 VLLM_TARGET_DEVICE = envs.VLLM_TARGET_DEVICE
+USE_PRECOMPILED_EXTENSIONS = envs.VLLM_USE_PRECOMPILED
+# VLLM_USE_PRECOMPILED implies precompiled rust frontend too.
+USE_PRECOMPILED_RUST_FRONTEND = (
+    envs.VLLM_USE_PRECOMPILED or envs.VLLM_USE_PRECOMPILED_RUST
+)
+
+
+def should_require_rust_frontend() -> bool:
+    value = os.getenv("VLLM_REQUIRE_RUST_FRONTEND", "")
+    return value.lower() not in ("", "0", "false", "no")
+

 if sys.platform.startswith("darwin") and VLLM_TARGET_DEVICE != "cpu":
    logger.warning("VLLM_TARGET_DEVICE automatically set to `cpu` due to macOS")
@@ -405,6 +420,24 @@ class precompiled_build_ext(build_ext):
        return


+class precompiled_build_rust(build_rust):
+    """Skips local Rust builds when the precompiled wheel already ships vllm-rs."""
+
+    def run(self) -> None:
+        if PRECOMPILED_RUST_FRONTEND_PATH.exists():
+            logger.info(
+                "Skipping local Rust build: using precompiled %s",
+                PRECOMPILED_RUST_FRONTEND_PATH,
+            )
+            return
+
+        logger.warning(
+            "Precompiled wheel did not provide %s; falling back to local Rust build.",
+            PRECOMPILED_RUST_FRONTEND_PATH,
+        )
+        super().run()
+
+
 class precompiled_wheel_utils:
    """Extracts libraries and other files from an existing wheel."""

@@ -653,7 +686,11 @@ class precompiled_wheel_utils:

    @staticmethod
    def extract_precompiled_and_patch_package(
-        wheel_url_or_path: str, download_filename: str | None
+        wheel_url_or_path: str,
+        download_filename: str | None,
+        *,
+        extract_extensions: bool,
+        extract_rust_frontend: bool,
    ) -> dict:
        import tempfile
        import zipfile
@@ -676,20 +713,26 @@ class precompiled_wheel_utils:
            package_data_patch = {}

            with zipfile.ZipFile(wheel_path) as wheel:
-                files_to_copy = [
-                    "vllm/_C.abi3.so",
-                    "vllm/_C_stable_libtorch.abi3.so",
-                    "vllm/_moe_C.abi3.so",
-                    "vllm/_flashmla_C.abi3.so",
-                    "vllm/_flashmla_extension_C.abi3.so",
-                    "vllm/_sparse_flashmla_C.abi3.so",
-                    "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
-                    "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
-                    "vllm/cumem_allocator.abi3.so",
-                    "vllm/spinloop.abi3.so",
-                    # ROCm-specific libraries
-                    "vllm/_rocm_C.abi3.so",
-                ]
+                exact_members = set()
+                if extract_extensions:
+                    exact_members.update(
+                        {
+                            "vllm/_C.abi3.so",
+                            "vllm/_C_stable_libtorch.abi3.so",
+                            "vllm/_moe_C.abi3.so",
+                            "vllm/_flashmla_C.abi3.so",
+                            "vllm/_flashmla_extension_C.abi3.so",
+                            "vllm/_sparse_flashmla_C.abi3.so",
+                            "vllm/vllm_flash_attn/_vllm_fa2_C.abi3.so",
+                            "vllm/vllm_flash_attn/_vllm_fa3_C.abi3.so",
+                            "vllm/cumem_allocator.abi3.so",
+                            "vllm/spinloop.abi3.so",
+                            # ROCm-specific libraries
+                            "vllm/_rocm_C.abi3.so",
+                        }
+                    )
+                if extract_rust_frontend:
+                    exact_members.add("vllm/vllm-rs")

                flash_attn_regex = re.compile(
                    r"vllm/vllm_flash_attn/(?:[^/.][^/]*/)*(?!\.)[^/]*\.py"
@@ -708,27 +751,25 @@ class precompiled_wheel_utils:
                )
                # DeepGEMM: extract all files (.py, .so, .cuh, .h, .hpp, etc.)
                deep_gemm_regex = re.compile(r"vllm/third_party/deep_gemm/.*")
-                file_members = list(
-                    filter(lambda x: x.filename in files_to_copy, wheel.filelist)
-                )
-                file_members += list(
-                    filter(
-                        lambda x: flash_attn_regex.match(x.filename)
-                        and x.filename not in flash_attn_files_to_skip,
-                        wheel.filelist,
-                    )
-                )
-                file_members += list(
-                    filter(
-                        lambda x: triton_kernels_regex.match(x.filename), wheel.filelist
-                    )
-                )
-                file_members += list(
-                    filter(lambda x: flashmla_regex.match(x.filename), wheel.filelist)
-                )
-                file_members += list(
-                    filter(lambda x: deep_gemm_regex.match(x.filename), wheel.filelist)
-                )
+                file_members = []
+                for member in wheel.filelist:
+                    if member.filename in exact_members:
+                        file_members.append(member)
+                        continue
+
+                    if not extract_extensions:
+                        continue
+
+                    if (
+                        (
+                            flash_attn_regex.match(member.filename)
+                            and member.filename not in flash_attn_files_to_skip
+                        )
+                        or triton_kernels_regex.match(member.filename)
+                        or flashmla_regex.match(member.filename)
+                        or deep_gemm_regex.match(member.filename)
+                    ):
+                        file_members.append(member)

                for file in file_members:
                    print(f"[extract] {file.filename}")
@@ -739,6 +780,9 @@ class precompiled_wheel_utils:
                        open(target_path, "wb") as dst,
                    ):
                        shutil.copyfileobj(src, dst)
+                    mode = file.external_attr >> 16
+                    if mode:
+                        os.chmod(target_path, mode)

                    pkg = os.path.dirname(file.filename).replace("/", ".")
                    package_data_patch.setdefault(pkg, []).append(
@@ -911,7 +955,7 @@ def get_vllm_version() -> str:
        if envs.VLLM_TARGET_DEVICE == "empty":
            version += f"{sep}empty"
    elif _is_cuda():
-        if envs.VLLM_USE_PRECOMPILED and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
+        if USE_PRECOMPILED_EXTENSIONS and not envs.VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX:
            version += f"{sep}precompiled"
        else:
            cuda_version = str(get_nvcc_cuda_version())
@@ -1006,7 +1050,7 @@ if _is_hip():

 if _is_cuda():
    ext_modules.append(CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa2_C"))
-    if envs.VLLM_USE_PRECOMPILED or (
+    if USE_PRECOMPILED_EXTENSIONS or (
        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.3")
    ):
        # FA3 requires CUDA 12.3 or later
@@ -1016,7 +1060,7 @@ if _is_cuda():
    ext_modules.append(
        CMakeExtension(name="vllm.vllm_flash_attn._vllm_fa4_cutedsl_C", optional=True)
    )
-    if envs.VLLM_USE_PRECOMPILED or (
+    if USE_PRECOMPILED_EXTENSIONS or (
        CUDA_HOME and get_nvcc_cuda_version() >= Version("12.9")
    ):
        # FlashMLA requires CUDA 12.9 or later
@@ -1065,15 +1109,25 @@ package_data = {
 }


-# If using precompiled, extract and patch package_data (in advance of setup)
-if envs.VLLM_USE_PRECOMPILED:
+# If using precompiled artifacts, extract and patch package_data in advance.
+if USE_PRECOMPILED_RUST_FRONTEND:
    wheel_url, download_filename = precompiled_wheel_utils.determine_wheel_url()
    patch = precompiled_wheel_utils.extract_precompiled_and_patch_package(
-        wheel_url, download_filename
+        wheel_url,
+        download_filename,
+        extract_extensions=USE_PRECOMPILED_EXTENSIONS,
+        extract_rust_frontend=True,
    )
    for pkg, files in patch.items():
        package_data.setdefault(pkg, []).extend(files)

+# If the rust frontend binary is already present in the source tree (e.g.,
+# pre-built in a separate Docker build stage), ship it as-is.
+if PRECOMPILED_RUST_FRONTEND_PATH.exists():
+    vllm_files = package_data.setdefault("vllm", [])
+    if "vllm-rs" not in vllm_files:
+        vllm_files.append("vllm-rs")
+
 if _no_device():
    ext_modules = []

@@ -1082,14 +1136,32 @@ if not ext_modules:
 else:
    cmdclass = {
        "build_ext": precompiled_build_ext
-        if envs.VLLM_USE_PRECOMPILED
+        if USE_PRECOMPILED_EXTENSIONS
        else cmake_build_ext,
    }
+if USE_PRECOMPILED_RUST_FRONTEND or PRECOMPILED_RUST_FRONTEND_PATH.exists():
+    cmdclass["build_rust"] = precompiled_build_rust
+
+# Rust frontend binary, built via setuptools-rust and installed into the
+# package directory alongside the Python modules.
+# TODO: we may use `RustBin` to directly install it into `bin` directory, but this
+# requires extra work on using precompiled binaries.
+rust_extensions = [
+    RustExtension(
+        target="vllm.vllm-rs",
+        path="rust/src/cmd/Cargo.toml",
+        args=["--bin", "vllm-rs"],
+        features=["native-tls-vendored"],
+        binding=Binding.Exec,
+        optional=not should_require_rust_frontend(),
+    ),
+]

 setup(
    # static metadata should rather go in pyproject.toml
    version=get_vllm_version(),
    ext_modules=ext_modules,
+    rust_extensions=rust_extensions,
    install_requires=get_requirements(),
    extras_require={
        # AMD Zen CPU optimizations via zentorch
@@ -299,22 +299,16 @@ async def test_abort_timeout_exits_quickly(wait_for_engine_idle: float):
        start_time = time.time()
        proc.send_signal(signal.SIGTERM)

-        # abort timeout (0) should stop the server promptly. On ROCm, process
-        # exit can spend extra time in HIP/RCCL/native extension teardown after
-        # the server and engine have already shut down.
-        max_exit_time = 4.0 if _IS_ROCM else 2.1
-
+        # abort timeout (0) should stop the server promptly.
        try:
-            proc.wait(timeout=max_exit_time)
+            proc.wait(timeout=4.0)
        except subprocess.TimeoutExpired:
            proc.kill()
            proc.wait(timeout=5)
            pytest.fail("Process did not exit after SIGTERM with abort timeout")

        exit_time = time.time() - start_time
-        assert exit_time < max_exit_time, (
-            f"Default shutdown took too long: {exit_time:.1f}s"
-        )
+        assert exit_time < 4.1, f"Default shutdown took too long: {exit_time:.1f}s"
        assert proc.returncode in (0, -15, None), f"Unexpected: {proc.returncode}"

        await _assert_children_cleaned_up(child_pids)
@@ -103,6 +103,19 @@ def test_is_envs_cache_enabled() -> None:
    assert not envs._is_envs_cache_enabled()


+def test_precompiled_install_flags_are_orthogonal() -> None:
+    with patch.dict(
+        os.environ,
+        {
+            "VLLM_PRECOMPILED_WHEEL_LOCATION": "/tmp/vllm.whl",
+            "VLLM_USE_PRECOMPILED_RUST": "1",
+        },
+        clear=False,
+    ):
+        assert environment_variables["VLLM_USE_PRECOMPILED"]() is False
+        assert environment_variables["VLLM_USE_PRECOMPILED_RUST"]() is True
+
+
 class TestEnvWithChoices:
    """Test cases for env_with_choices function."""

@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import argparse
+import signal

 import uvloop

@@ -110,6 +111,14 @@ def cmd_init() -> list[CLISubcommand]:

 async def run_launch_fastapi(args: argparse.Namespace) -> None:
    """Run the online serving layer with FastAPI (no GPU inference)."""
+
+    # Interrupt initialization if SIGTERM arrives before uvicorn installs
+    # its own signal handlers. Once uvicorn is running it replaces this.
+    def _interrupt_init(*_) -> None:
+        raise KeyboardInterrupt("terminated")
+
+    signal.signal(signal.SIGTERM, _interrupt_init)
+
    # 1. Socket binding
    listen_address, sock = setup_server(args)

@@ -21,7 +21,11 @@ from vllm.v1.engine.utils import CoreEngineProcManager, launch_core_engines
 from vllm.v1.executor import Executor
 from vllm.v1.executor.multiproc_executor import MultiprocExecutor
 from vllm.v1.metrics.prometheus import setup_multiprocess_prometheus
-from vllm.v1.utils import APIServerProcessManager, wait_for_completion_or_failure
+from vllm.v1.utils import (
+    APIServerProcessManager,
+    RustFrontendProcessManager,
+    wait_for_completion_or_failure,
+)

 logger = init_logger(__name__)

@@ -81,11 +85,12 @@ class ServeSubcommand(CLISubcommand):
            )

        # Default api_server_count if not explicitly set.
+        # - Rust frontend: Use 1 (not applicable as it's multithreaded)
        # - External LB: Leave as 1 (external LB handles distribution)
        # - Hybrid LB: Use local DP size (internal LB for local ranks only)
        # - Internal LB: Use full DP size
        if args.api_server_count is None:
-            if is_external_lb:
+            if is_external_lb or envs.VLLM_RUST_FRONTEND_PATH:
                args.api_server_count = 1
            elif is_hybrid_lb:
                args.api_server_count = args.data_parallel_size_local or 1
@@ -102,6 +107,11 @@ class ServeSubcommand(CLISubcommand):
                        "Defaulting api_server_count to data_parallel_size (%d).",
                        args.api_server_count,
                    )
+        elif envs.VLLM_RUST_FRONTEND_PATH and args.api_server_count > 1:
+            logger.warning(
+                "Ignoring --api-server-count=%d when using rust front-end process"
+            )
+            args.api_server_count = 1

        # Elastic EP currently only supports running with at most one API server.
        if getattr(args, "enable_elastic_ep", False) and args.api_server_count > 1:
@@ -114,7 +124,7 @@ class ServeSubcommand(CLISubcommand):

        if args.api_server_count < 1:
            run_headless(args)
-        elif args.api_server_count > 1:
+        elif args.api_server_count > 1 or envs.VLLM_RUST_FRONTEND_PATH:
            run_multi_api_server(args)
        else:
            # Single API server (this process).
@@ -230,9 +240,15 @@ def run_headless(args: argparse.Namespace):

 def run_multi_api_server(args: argparse.Namespace):
    assert not args.headless
+    rust_frontend_path = envs.VLLM_RUST_FRONTEND_PATH
    num_api_servers: int = args.api_server_count
    assert num_api_servers > 0

+    if rust_frontend_path and num_api_servers > 1:
+        raise ValueError(
+            "VLLM_RUST_FRONTEND_PATH does not support api_server_count > 1"
+        )
+
    if num_api_servers > 1:
        setup_multiprocess_prometheus()

@@ -270,7 +286,9 @@ def run_multi_api_server(args: argparse.Namespace):
    dp_rank = parallel_config.data_parallel_rank
    assert parallel_config.local_engines_only or dp_rank == 0

-    api_server_manager: APIServerProcessManager | None = None
+    api_server_manager: APIServerProcessManager | RustFrontendProcessManager | None = (
+        None
+    )

    from vllm.v1.engine.utils import get_engine_zmq_addresses

@@ -279,23 +297,34 @@ def run_multi_api_server(args: argparse.Namespace):
    with launch_core_engines(
        vllm_config, executor_class, log_stats, addresses, num_api_servers
    ) as (local_engine_manager, coordinator, addresses, tensor_queue):
-        # Construct common args for the APIServerProcessManager up-front.
-        stats_update_address = None
-        if coordinator:
-            stats_update_address = coordinator.get_stats_publish_address()
-
-        # Start API servers.
-        api_server_manager = APIServerProcessManager(
-            listen_address=listen_address,
-            sock=sock,
-            args=args,
-            num_servers=num_api_servers,
-            input_addresses=addresses.inputs,
-            output_addresses=addresses.outputs,
-            stats_update_address=stats_update_address,
-            tensor_queue=tensor_queue,
+        stats_update_address = (
+            coordinator.get_stats_publish_address() if coordinator else None
        )

+        if rust_frontend_path:
+            # Start rust front-end process.
+            api_server_manager = RustFrontendProcessManager(
+                binary_path=rust_frontend_path,
+                sock=sock,
+                args=args,
+                input_address=addresses.inputs[0],
+                output_address=addresses.outputs[0],
+                engine_count=parallel_config.data_parallel_size,
+                stats_update_address=stats_update_address,
+            )
+        else:
+            # Start API server(s).
+            api_server_manager = APIServerProcessManager(
+                listen_address=listen_address,
+                sock=sock,
+                args=args,
+                num_servers=num_api_servers,
+                input_addresses=addresses.inputs,
+                output_addresses=addresses.outputs,
+                stats_update_address=stats_update_address,
+                tensor_queue=tensor_queue,
+            )
+
    # Wait for API servers.
    try:
        wait_for_completion_or_failure(
@@ -533,8 +533,7 @@ def validate_api_server_args(args):

@instrument(span_name="API server setup")
 def setup_server(args):
-    """Validate API server args, set up signal handler, create socket
-    ready to serve."""
+    """Validate API server args and create the server socket."""

    log_version_and_model(logger, VLLM_VERSION, args.model)
    log_non_default_args(args)
@@ -560,12 +559,6 @@ def setup_server(args):
    # many concurrent requests active
    set_ulimit()

-    def signal_handler(*_) -> None:
-        # Interrupt server on sigterm while initializing
-        raise KeyboardInterrupt("terminated")
-
-    signal.signal(signal.SIGTERM, signal_handler)
-
    if args.uds:
        listen_address = f"unix:{args.uds}"
    else:
@@ -675,6 +668,13 @@ async def run_server(args, **uvicorn_kwargs) -> None:
    # Add process-specific prefix to stdout and stderr.
    decorate_logs("APIServer")

+    # Interrupt initialization if SIGTERM arrives before uvicorn installs its
+    # own signal handlers. Once uvicorn is running it replaces this.
+    def _interrupt_init(*_) -> None:
+        raise KeyboardInterrupt("terminated")
+
+    signal.signal(signal.SIGTERM, _interrupt_init)
+
    listen_address, sock = setup_server(args)
    await run_server_worker(listen_address, sock, args, **uvicorn_kwargs)

@@ -9,6 +9,7 @@ from argparse import Namespace
 from http import HTTPStatus
 from logging import Logger
 from string import Template
+from typing import Any

 import regex as re
 from fastapi import Request
@@ -210,7 +211,7 @@ def get_max_tokens(
    )


-def log_non_default_args(args: Namespace | EngineArgs):
+def get_non_default_args(args: Namespace | EngineArgs) -> dict[str, Any]:
    from vllm.entrypoints.openai.cli_args import make_arg_parser

    non_default_args = {}
@@ -237,6 +238,43 @@ def log_non_default_args(args: Namespace | EngineArgs):
            "Unsupported argument type. Must be Namespace or EngineArgs instance."
        )

+    return non_default_args
+
+
+def _jsonify_arg_value(value: Any) -> Any:
+    if value is None or isinstance(value, bool | int | float | str):
+        return value
+    if dataclasses.is_dataclass(value) and not isinstance(value, type):
+        return {
+            key: _jsonify_arg_value(val)
+            for key, val in dataclasses.asdict(value).items()
+        }
+    if isinstance(value, dict):
+        return {str(key): _jsonify_arg_value(val) for key, val in value.items()}
+    if isinstance(value, tuple | list):
+        return [_jsonify_arg_value(item) for item in value]
+    if (model_dump := getattr(value, "model_dump", None)) is not None:
+        return _jsonify_arg_value(model_dump(mode="json"))
+    if (to_dict := getattr(value, "dict", None)) is not None:
+        return _jsonify_arg_value(to_dict())
+    return repr(value)
+
+
+def jsonify_non_default_args(
+    args: Namespace | EngineArgs,
+    *,
+    exclude: set[str] | None = None,
+) -> dict[str, Any]:
+    non_default_args = get_non_default_args(args)
+    if exclude is not None:
+        for key in exclude:
+            non_default_args.pop(key, None)
+
+    return {key: _jsonify_arg_value(value) for key, value in non_default_args.items()}
+
+
+def log_non_default_args(args: Namespace | EngineArgs):
+    non_default_args = get_non_default_args(args)
    logger.info("non-default args: %s", non_default_args)


@@ -86,6 +86,7 @@ if TYPE_CHECKING:
    MAX_JOBS: str | None = None
    NVCC_THREADS: str | None = None
    VLLM_USE_PRECOMPILED: bool = False
+    VLLM_USE_PRECOMPILED_RUST: bool = False
    VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX: bool = False
    VLLM_DOCKER_BUILD_CONTEXT: bool = False
    VLLM_KEEP_ALIVE_ON_ENGINE_DEATH: bool = False
@@ -134,6 +135,8 @@ if TYPE_CHECKING:
    Q_SCALE_CONSTANT: int = 200
    K_SCALE_CONSTANT: int = 200
    V_SCALE_CONSTANT: int = 100
+    VLLM_USE_RUST_FRONTEND: bool = False
+    VLLM_RUST_FRONTEND_PATH: str | None = "auto"
    VLLM_SERVER_DEV_MODE: bool = False
    VLLM_V1_OUTPUT_PROC_CHUNK_SIZE: int = 128
    VLLM_MLA_DISABLE: bool = False
@@ -514,6 +517,40 @@ def get_env_or_set_default(

 logger = logging.getLogger(__name__)

+
+def _resolve_rust_frontend_path() -> str | None:
+    """Resolve the Rust frontend binary path.
+
+    Returns None if VLLM_USE_RUST_FRONTEND is not enabled.
+    When enabled, resolves VLLM_RUST_FRONTEND_PATH ("auto" by default)
+    to the actual binary path.
+    """
+    use_rust = bool(int(os.environ.get("VLLM_USE_RUST_FRONTEND", "0")))
+    raw = os.environ.get("VLLM_RUST_FRONTEND_PATH", "auto")
+
+    if not use_rust:
+        if os.environ.get("VLLM_RUST_FRONTEND_PATH") is not None:
+            logger.warning(
+                "VLLM_RUST_FRONTEND_PATH is set but VLLM_USE_RUST_FRONTEND "
+                "is not enabled. The Rust frontend will not be used. "
+                "Set VLLM_USE_RUST_FRONTEND=1 to enable it."
+            )
+        return None
+
+    if raw.lower() in ("auto", "1", "true"):
+        pkg_dir = os.path.dirname(os.path.abspath(__file__))
+        candidate = os.path.join(pkg_dir, "vllm-rs")
+        if os.path.isfile(candidate) and os.access(candidate, os.X_OK):
+            return candidate
+
+        raise FileNotFoundError(
+            "VLLM_RUST_FRONTEND_PATH=auto but the vllm-rs binary was "
+            f"not found at {candidate}. "
+            "Build with setuptools-rust or set the path explicitly."
+        )
+    return raw
+
+
 environment_variables: dict[str, Callable[[], Any]] = {
    # ================== Installation Time Env Vars ==================
    # Target device of vLLM, supporting [cuda (by default),
@@ -551,11 +588,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # By default this is 1.
    # If set, `MAX_JOBS` will be reduced to avoid oversubscribing the CPU.
    "NVCC_THREADS": lambda: os.getenv("NVCC_THREADS", None),
-    # If set, vllm will use precompiled binaries (*.so)
+    # If set, vllm will use precompiled native binaries (*.so and vllm-rs).
    "VLLM_USE_PRECOMPILED": lambda: (
        os.environ.get("VLLM_USE_PRECOMPILED", "").strip().lower() in ("1", "true")
        or bool(os.environ.get("VLLM_PRECOMPILED_WHEEL_LOCATION"))
    ),
+    # If set, vllm will use the precompiled Rust frontend binary (vllm-rs).
+    "VLLM_USE_PRECOMPILED_RUST": lambda: (
+        os.environ.get("VLLM_USE_PRECOMPILED_RUST", "").strip().lower() in ("1", "true")
+    ),
    # If set, skip adding +precompiled suffix to version string
    "VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX": lambda: bool(
        int(os.environ.get("VLLM_SKIP_PRECOMPILED_VERSION_SUFFIX", "0"))
@@ -1154,6 +1195,15 @@ environment_variables: dict[str, Callable[[], Any]] = {
    # If set to "0", disable LayerName opaque type for layer_name
    # parameters in custom ops.  Defaults to enabled on torch >= 2.11.
    "VLLM_USE_LAYERNAME": lambda: bool(int(os.getenv("VLLM_USE_LAYERNAME", "1"))),
+    # If set, use the Rust frontend binary instead of the Python API server
+    # process(es).
+    "VLLM_USE_RUST_FRONTEND": lambda: bool(
+        int(os.getenv("VLLM_USE_RUST_FRONTEND", "0"))
+    ),
+    # Path to the Rust frontend binary. Defaults to "auto" which discovers
+    # the binary installed with the vllm package. Only used when
+    # VLLM_USE_RUST_FRONTEND=1.
+    "VLLM_RUST_FRONTEND_PATH": lambda: _resolve_rust_frontend_path(),
    # If set, vllm will run in development mode, which will enable
    # some additional endpoints for developing and debugging,
    # e.g. `/reset_prefix_cache`
@@ -75,6 +75,7 @@ class EngineCoreReadyResponse:
    max_model_len: int
    num_gpu_blocks: int
    dp_stats_address: str | None
+    dtype: str | None = None


 class EngineCoreRequest(
@@ -1437,6 +1437,7 @@ class EngineCoreProc(EngineCore):
                max_model_len=self.vllm_config.model_config.max_model_len,
                num_gpu_blocks=self.vllm_config.cache_config.num_gpu_blocks or 0,
                dp_stats_address=self.frontend_stats_publish_address,
+                dtype=str(self.vllm_config.model_config.dtype).removeprefix("torch."),
            )
            ready_payload = msgspec.msgpack.encode(ready_response)
            for input_socket in input_sockets:
@@ -2,6 +2,7 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 import argparse
 import contextlib
+import json
 import multiprocessing
 import threading
 import time
@@ -233,6 +234,146 @@ class APIServerProcessManager:
            shutdown(self.processes, timeout=timeout)


+class RustFrontendProcessManager:
+    """Manages a single Rust frontend subprocess.
+
+    Launches the Rust vllm-rs binary in 'frontend' mode, passing the
+    listening socket fd and ZMQ transport addresses. Provides the same
+    interface as APIServerProcessManager for process monitoring.
+    """
+
+    def __init__(
+        self,
+        binary_path: str,
+        sock: Any,
+        args: argparse.Namespace,
+        input_address: str,
+        output_address: str,
+        engine_count: int,
+        stats_update_address: str | None = None,
+    ):
+        import os
+        import subprocess
+
+        fd = sock.fileno()
+        os.set_inheritable(fd, True)
+
+        cmd = [
+            binary_path,
+            "frontend",
+            "--listen-fd",
+            str(fd),
+            "--input-address",
+            input_address,
+            "--output-address",
+            output_address,
+            "--engine-count",
+            str(engine_count),
+        ]
+        if stats_update_address is not None:
+            cmd.extend(["--coordinator-address", stats_update_address])
+        from vllm.entrypoints.utils import jsonify_non_default_args
+
+        args_json = json.dumps(
+            jsonify_non_default_args(args, exclude={"api_server_count"}),
+            sort_keys=True,
+        )
+        cmd.extend(["--args-json", args_json])
+
+        logger.info("Launching Rust frontend: %s", " ".join(cmd))
+        self._proc = subprocess.Popen(cmd, pass_fds=(fd,))
+
+        # Create a process wrapper with a sentinel fd for monitoring
+        self.processes: list[_SubprocessWrapper] = [
+            _SubprocessWrapper(self._proc, "RustFrontend")
+        ]
+
+        self._finalizer = weakref.finalize(self, _shutdown_subprocesses, self.processes)
+
+    def shutdown(self, timeout: float | None = None) -> None:
+        if self._finalizer.detach() is not None:
+            _shutdown_subprocesses(self.processes, timeout=timeout)
+
+
+class _SubprocessWrapper:
+    """Wraps subprocess.Popen to provide the BaseProcess-like interface
+    needed by wait_for_completion_or_failure."""
+
+    def __init__(self, proc, name: str):
+        self._proc = proc
+        self.name = name
+        self.pid = proc.pid
+        self._sentinel_conn: connection.Connection | None = None
+        self._sentinel_send: connection.Connection | None = None
+
+        # Use a Pipe-based sentinel so subprocess monitoring works uniformly
+        # across platforms with multiprocessing.connection.wait().
+        recv, send = connection.Pipe(duplex=False)
+        self._sentinel_conn = recv
+        self._sentinel_send = send
+
+        def monitor_subprocess() -> None:
+            try:
+                proc.wait()
+            finally:
+                with contextlib.suppress(Exception):
+                    send.close()
+
+        threading.Thread(
+            target=monitor_subprocess, daemon=True, name=f"{name}Monitor"
+        ).start()
+
+    @property
+    def sentinel(self):
+        return self._sentinel_conn
+
+    @property
+    def exitcode(self) -> int | None:
+        return self._proc.returncode if self._proc.poll() is not None else None
+
+    def is_alive(self) -> bool:
+        return self._proc.poll() is None
+
+    def terminate(self):
+        self._proc.terminate()
+
+    def join(self, timeout=None):
+        with contextlib.suppress(Exception):
+            self._proc.wait(timeout=timeout)
+
+    def __del__(self):
+        with contextlib.suppress(Exception):
+            if self._sentinel_conn is not None:
+                self._sentinel_conn.close()
+            if self._sentinel_send is not None:
+                self._sentinel_send.close()
+
+
+def _shutdown_subprocesses(
+    procs: list[_SubprocessWrapper], timeout: float | None = None
+) -> None:
+    """Shutdown subprocess wrappers (mirrors the shutdown() function)."""
+    if timeout is None:
+        timeout = 0.0
+    timeout = max(timeout, 5.0)
+
+    for proc in procs:
+        if proc.is_alive():
+            proc.terminate()
+
+    deadline = time.monotonic() + timeout
+    for proc in procs:
+        remaining = deadline - time.monotonic()
+        if remaining <= 0:
+            break
+        if proc.is_alive():
+            proc.join(remaining)
+
+    for proc in procs:
+        if proc.is_alive() and (pid := proc.pid) is not None:
+            kill_process_tree(pid)
+
+
 def run_api_server_worker_proc(
    listen_address, sock, args, client_config=None, **uvicorn_kwargs
 ) -> None:
@@ -253,7 +394,7 @@ def run_api_server_worker_proc(


 def wait_for_completion_or_failure(
-    api_server_manager: APIServerProcessManager,
+    api_server_manager: "APIServerProcessManager | RustFrontendProcessManager",
    engine_manager: Union["CoreEngineProcManager", "CoreEngineActorManager"]
    | None = None,
    coordinator: "DPCoordinator | None" = None,
@@ -274,7 +415,7 @@ def wait_for_completion_or_failure(
        logger.info("Waiting for API servers to complete ...")
        # Create a mapping of sentinels to their corresponding processes
        # for efficient lookup
-        sentinel_to_proc: dict[Any, BaseProcess] = {
+        sentinel_to_proc: dict[Any, BaseProcess | _SubprocessWrapper | None] = {
            proc.sentinel: proc for proc in api_server_manager.processes
        }