Revert "[CPU] Experimentally enable Triton and MRV2 (#43225)"

This reverts commit 65b7a812a2.
This commit is contained in:
khluu
2026-05-29 02:28:43 -07:00
parent 799c3afa5d
commit 0b3ba88f16
11 changed files with 13 additions and 169 deletions
-14
View File
@@ -54,20 +54,6 @@ steps:
pytest -x -v -s tests/models/language/generation -m cpu_model
pytest -x -v -s tests/models/language/pooling -m cpu_model"
- label: CPU-ModelRunnerV2 Tests
depends_on: []
device: intel_cpu
no_plugin: true
soft_fail: true
source_file_dependencies:
- vllm/v1/worker/cpu/
- vllm/v1/worker/gpu/
commands:
- |
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d
VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model"
- label: CPU-Quantization Model Tests
depends_on: []
device: intel_cpu
+6 -5
View File
@@ -27,14 +27,11 @@ WORKDIR /workspace
ARG PYTHON_VERSION=3.12
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
# Install minimal dependencies and uv
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
--mount=type=cache,target=/var/lib/apt,sharing=locked \
apt-get update -y \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates zlib1g-dev \
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
&& curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -126,6 +123,9 @@ RUN --mount=type=cache,target=/root/.cargo/registry \
######################### BUILD IMAGE #########################
FROM base AS vllm-build
ARG max_jobs=32
ENV MAX_JOBS=${max_jobs}
ARG GIT_REPO_CHECK=0
# Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
ARG VLLM_CPU_X86=0
@@ -257,7 +257,8 @@ WORKDIR /vllm-workspace
RUN --mount=type=cache,target=/root/.cache/uv \
--mount=type=cache,target=/root/.cache/ccache \
--mount=type=bind,from=vllm-build,src=/vllm-workspace/dist,target=dist \
uv pip install "$(realpath dist/*.whl)[audio,triton-cpu]"
uv pip install dist/*.whl && \
uv pip install "vllm[audio]"
# Add labels to document build configuration
LABEL org.opencontainers.image.title="vLLM CPU"
-5
View File
@@ -1195,11 +1195,6 @@ setup(
"opentelemetry-exporter-otlp>=1.26.0",
"opentelemetry-semantic-conventions-ai>=0.4.1",
],
"triton-cpu": [
"triton @ "
"git+https://github.com/triton-lang/triton-cpu.git@270e696d ; "
"platform_machine == 'x86_64'",
], # Remove after stable release
},
cmdclass=cmdclass,
package_data=package_data,
+3 -20
View File
@@ -14,7 +14,6 @@ from vllm.logger import init_logger
from vllm.utils.cpu_resource_utils import (
DEVICE_CONTROL_ENV_VAR,
get_memory_node_info,
get_visible_memory_node,
)
from vllm.utils.mem_constants import GiB_bytes
from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -136,13 +135,9 @@ class CpuPlatform(Platform):
scheduler_config.async_scheduling = False
parallel_config = vllm_config.parallel_config
if (
os.environ.get("VLLM_ENABLE_V1_MULTIPROCESSING", "1") == "1"
and parallel_config.distributed_executor_backend == "uni"
):
# OMP requires the MP executor to function correctly, UniProc
# is not supported as it is not possible to set the OMP
# environment correctly
# OMP requires the MP executor to function correctly, UniProc is not
# supported as it is not possible to set the OMP environment correctly
if parallel_config.distributed_executor_backend == "uni":
parallel_config.distributed_executor_backend = "mp"
if parallel_config.worker_cls == "auto":
@@ -486,15 +481,3 @@ class CpuPlatform(Platform):
slot_mapping,
isa,
)
@classmethod
def get_current_memory_usage(
    cls, device: torch.types.Device | None = None
) -> float:
    """Return the memory currently in use across the visible memory nodes.

    ``device`` is accepted but not consulted here. For every node returned
    by ``get_visible_memory_node()``, usage is taken as
    ``total_memory - available_memory`` from ``get_memory_node_info``, and
    the per-node values are added together.
    """
    used_per_node = (
        node.total_memory - node.available_memory
        for node in map(get_memory_node_info, get_visible_memory_node())
    )
    return sum(used_per_node)
-12
View File
@@ -3,7 +3,6 @@
import os
import types
from importlib.metadata import version
from importlib.util import find_spec
from vllm.logger import init_logger
@@ -49,17 +48,6 @@ if HAS_TRITON:
len(active_drivers),
)
HAS_TRITON = False
# Check Triton CPU
if "cpu" in version("vllm"):
if "cpu" in backends:
HAS_TRITON = True
else:
logger.warning(
"Triton is installed, but doesn't include CPU backend. "
"Disabling Triton."
)
HAS_TRITON = False
except ImportError:
# This can occur if Triton is partially installed or triton.backends
# is missing.
+1 -3
View File
@@ -50,10 +50,8 @@ def is_pin_memory_available() -> bool:
def is_uva_available() -> bool:
"""Check if Unified Virtual Addressing (UVA) is available."""
# UVA requires pinned memory.
from vllm.platforms import current_platform
# TODO: Add more requirements for UVA if needed.
return is_pin_memory_available() or current_platform.is_cpu()
return is_pin_memory_available()
@cache
View File
-16
View File
@@ -1,16 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from collections.abc import Sequence
import torch
from vllm.utils.platform_utils import is_uva_available
class UvaBuffer:
    """CPU-backed buffer exposing ``cpu``, ``np`` and ``uva`` attributes.

    All three attributes are derived from one zero-initialised CPU tensor:
    ``cpu`` and ``uva`` are that tensor, ``np`` is the result of calling
    ``.numpy()`` on it.
    """

    def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
        """Allocate the buffer.

        Args:
            size: Shape passed straight to ``torch.zeros``.
            dtype: Element type of the tensor.

        Raises:
            RuntimeError: If ``is_uva_available()`` returns a falsy value.
        """
        uva_supported = is_uva_available()
        if not uva_supported:
            raise RuntimeError("UVA is not available")

        host_tensor = torch.zeros(size, dtype=dtype, device="cpu")
        self.cpu = host_tensor
        self.np = host_tensor.numpy()
        self.uva = host_tensor
-16
View File
@@ -1,16 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from vllm.logger import init_logger
from vllm.v1.worker.gpu.model_runner import GPUModelRunner
logger = init_logger(__name__)
class CPUModelRunner(GPUModelRunner):
    """GPUModelRunner subclass that defines a warm-up step."""

    # TBD: should this be moved to the Worker?
    def warming_up_model(self) -> None:
        """Run a single ``profile_run()``, logging before and after it."""
        start_msg = "Warming up model for the compilation..."
        done_msg = "Warming up done."

        logger.info(start_msg)
        # Only generate graph for the generic shape
        self.profile_run()
        logger.info(done_msg)
-62
View File
@@ -1,62 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# isort: skip_file
# ruff: noqa: E402
# mypy: disable-error-code="misc, assignment"
from typing import Any
# Patch torch APIs
import torch
def noop(*args: Any, **kwargs: Any) -> None:
    """Accept any positional or keyword arguments, do nothing, return None."""
    return None
class _EventPlaceholder:
    """Stand-in event object whose ``record`` and ``synchronize`` do nothing."""

    def __init__(self, *args, **kwargs) -> None:
        # Constructor arguments are accepted and ignored; both operations are
        # set per instance to the module-level no-op function.
        self.record = self.synchronize = noop
class _StreamPlaceholder:
def __init__(self, *args, **kwargs) -> None:
self.wait_stream = noop
def __enter__(self, *args, **kwargs):
return self
def __exit__(self, exc_type, exc_val, exc_tb):
pass
# Swap torch's event/stream entry points for the placeholders defined above.
# These are plain attribute assignments on the torch modules, so they affect
# every later lookup of these names in the process.
torch.Event = _EventPlaceholder
torch.cuda.Event = _EventPlaceholder
torch.cuda.Stream = _StreamPlaceholder
torch.cuda.set_stream = noop
# Each call hands back a fresh placeholder stream rather than a shared one.
torch.cuda.current_stream = lambda *args, **kwargs: _StreamPlaceholder()
# Synchronisation and cache clearing become no-ops that return None.
torch.accelerator.synchronize = noop
torch.accelerator.empty_cache = noop
# Patch vLLM torch utils
import vllm.utils.torch_utils as torch_utils
def async_tensor_h2d(
data: list,
dtype: torch.dtype,
device: str | torch.device,
pin_memory: bool = False,
) -> torch.Tensor:
return torch.tensor(data, dtype=dtype, device="cpu")
# Replace vLLM's async_tensor_h2d with the CPU-only version defined above, so
# lookups through vllm.utils.torch_utils get a function that always builds a
# CPU tensor.
torch_utils.async_tensor_h2d = async_tensor_h2d
# Patch model runner APIs
import vllm.v1.worker.gpu.buffer_utils as gpu_buffer_utils
import vllm.v1.worker.cpu.buffer_utils as cpu_buffer_utils
# Make the GPU buffer utilities module resolve UvaBuffer to the CPU class.
# NOTE(review): this rebinds the module attribute only; any module that ran
# `from ...gpu.buffer_utils import UvaBuffer` before this line keeps the
# original class — confirm this module is imported early enough.
gpu_buffer_utils.UvaBuffer = cpu_buffer_utils.UvaBuffer
+3 -16
View File
@@ -1,9 +1,5 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# Must be imported first
import vllm.v1.worker.cpu.shm # noqa # isort: skip
import math
import os
import sys
@@ -105,8 +101,6 @@ class CPUWorker(Worker):
)
def init_device(self):
self.device = torch.device("cpu")
# Check whether critical libraries are loaded
def check_preloaded_libs(name: str):
ld_preload_list = os.environ.get("LD_PRELOAD", "")
@@ -147,16 +141,9 @@ class CPUWorker(Worker):
set_random_seed(self.model_config.seed)
# Construct the model runner
if self.use_v2_model_runner:
from vllm.v1.worker.cpu.model_runner import (
CPUModelRunner as CPUModelRunnerV2,
)
self.model_runner: CPUModelRunner = CPUModelRunnerV2( # type: ignore
self.vllm_config, self.device
)
else:
self.model_runner = CPUModelRunner(self.vllm_config, torch.device("cpu"))
self.model_runner: CPUModelRunner = CPUModelRunner(
self.vllm_config, torch.device("cpu")
)
def sleep(self, level: int = 1) -> None:
logger.warning("sleep mode is not supported on CPU, ignore it.")