mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
Revert "[CPU] Experimentally enable Triton and MRV2 (#43225)"
This reverts commit 65b7a812a2.
This commit is contained in:
@@ -54,20 +54,6 @@ steps:
|
||||
pytest -x -v -s tests/models/language/generation -m cpu_model
|
||||
pytest -x -v -s tests/models/language/pooling -m cpu_model"
|
||||
|
||||
- label: CPU-ModelRunnerV2 Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
no_plugin: true
|
||||
soft_fail: true
|
||||
source_file_dependencies:
|
||||
- vllm/v1/worker/cpu/
|
||||
- vllm/v1/worker/gpu/
|
||||
commands:
|
||||
- |
|
||||
bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
|
||||
uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d
|
||||
VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model"
|
||||
|
||||
- label: CPU-Quantization Model Tests
|
||||
depends_on: []
|
||||
device: intel_cpu
|
||||
|
||||
@@ -27,14 +27,11 @@ WORKDIR /workspace
|
||||
ARG PYTHON_VERSION=3.12
|
||||
ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
|
||||
|
||||
ARG max_jobs=32
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
# Install minimal dependencies and uv
|
||||
RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
|
||||
--mount=type=cache,target=/var/lib/apt,sharing=locked \
|
||||
apt-get update -y \
|
||||
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates zlib1g-dev \
|
||||
&& apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
|
||||
gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
|
||||
&& update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
|
||||
&& curl -LsSf https://astral.sh/uv/install.sh | sh
|
||||
@@ -126,6 +123,9 @@ RUN --mount=type=cache,target=/root/.cargo/registry \
|
||||
######################### BUILD IMAGE #########################
|
||||
FROM base AS vllm-build
|
||||
|
||||
ARG max_jobs=32
|
||||
ENV MAX_JOBS=${max_jobs}
|
||||
|
||||
ARG GIT_REPO_CHECK=0
|
||||
# Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
|
||||
ARG VLLM_CPU_X86=0
|
||||
@@ -257,7 +257,8 @@ WORKDIR /vllm-workspace
|
||||
RUN --mount=type=cache,target=/root/.cache/uv \
|
||||
--mount=type=cache,target=/root/.cache/ccache \
|
||||
--mount=type=bind,from=vllm-build,src=/vllm-workspace/dist,target=dist \
|
||||
uv pip install "$(realpath dist/*.whl)[audio,triton-cpu]"
|
||||
uv pip install dist/*.whl && \
|
||||
uv pip install "vllm[audio]"
|
||||
|
||||
# Add labels to document build configuration
|
||||
LABEL org.opencontainers.image.title="vLLM CPU"
|
||||
|
||||
@@ -1195,11 +1195,6 @@ setup(
|
||||
"opentelemetry-exporter-otlp>=1.26.0",
|
||||
"opentelemetry-semantic-conventions-ai>=0.4.1",
|
||||
],
|
||||
"triton-cpu": [
|
||||
"triton @ "
|
||||
"git+https://github.com/triton-lang/triton-cpu.git@270e696d ; "
|
||||
"platform_machine == 'x86_64'",
|
||||
], # Remove after stable release
|
||||
},
|
||||
cmdclass=cmdclass,
|
||||
package_data=package_data,
|
||||
|
||||
+3
-20
@@ -14,7 +14,6 @@ from vllm.logger import init_logger
|
||||
from vllm.utils.cpu_resource_utils import (
|
||||
DEVICE_CONTROL_ENV_VAR,
|
||||
get_memory_node_info,
|
||||
get_visible_memory_node,
|
||||
)
|
||||
from vllm.utils.mem_constants import GiB_bytes
|
||||
from vllm.v1.attention.backends.registry import AttentionBackendEnum
|
||||
@@ -136,13 +135,9 @@ class CpuPlatform(Platform):
|
||||
scheduler_config.async_scheduling = False
|
||||
|
||||
parallel_config = vllm_config.parallel_config
|
||||
if (
|
||||
os.environ.get("VLLM_ENABLE_V1_MULTIPROCESSING", "1") == "1"
|
||||
and parallel_config.distributed_executor_backend == "uni"
|
||||
):
|
||||
# OMP requires the MP executor to function correctly, UniProc
|
||||
# is not supported as it is not possible to set the OMP
|
||||
# environment correctly
|
||||
# OMP requires the MP executor to function correctly, UniProc is not
|
||||
# supported as it is not possible to set the OMP environment correctly
|
||||
if parallel_config.distributed_executor_backend == "uni":
|
||||
parallel_config.distributed_executor_backend = "mp"
|
||||
|
||||
if parallel_config.worker_cls == "auto":
|
||||
@@ -486,15 +481,3 @@ class CpuPlatform(Platform):
|
||||
slot_mapping,
|
||||
isa,
|
||||
)
|
||||
|
||||
@classmethod
|
||||
def get_current_memory_usage(
|
||||
cls, device: torch.types.Device | None = None
|
||||
) -> float:
|
||||
allowed_mem_node_list = get_visible_memory_node()
|
||||
mem_status_list = [get_memory_node_info(i) for i in allowed_mem_node_list]
|
||||
memory_usage = 0
|
||||
for s in mem_status_list:
|
||||
memory_usage += s.total_memory - s.available_memory
|
||||
|
||||
return memory_usage
|
||||
|
||||
@@ -3,7 +3,6 @@
|
||||
|
||||
import os
|
||||
import types
|
||||
from importlib.metadata import version
|
||||
from importlib.util import find_spec
|
||||
|
||||
from vllm.logger import init_logger
|
||||
@@ -49,17 +48,6 @@ if HAS_TRITON:
|
||||
len(active_drivers),
|
||||
)
|
||||
HAS_TRITON = False
|
||||
|
||||
# Check Triton CPU
|
||||
if "cpu" in version("vllm"):
|
||||
if "cpu" in backends:
|
||||
HAS_TRITON = True
|
||||
else:
|
||||
logger.warning(
|
||||
"Triton is installed, but doesn't include CPU backend. "
|
||||
"Disabling Triton."
|
||||
)
|
||||
HAS_TRITON = False
|
||||
except ImportError:
|
||||
# This can occur if Triton is partially installed or triton.backends
|
||||
# is missing.
|
||||
|
||||
@@ -50,10 +50,8 @@ def is_pin_memory_available() -> bool:
|
||||
def is_uva_available() -> bool:
|
||||
"""Check if Unified Virtual Addressing (UVA) is available."""
|
||||
# UVA requires pinned memory.
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
# TODO: Add more requirements for UVA if needed.
|
||||
return is_pin_memory_available() or current_platform.is_cpu()
|
||||
return is_pin_memory_available()
|
||||
|
||||
|
||||
@cache
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from collections.abc import Sequence
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.utils.platform_utils import is_uva_available
|
||||
|
||||
|
||||
class UvaBuffer:
|
||||
def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
|
||||
if not is_uva_available():
|
||||
raise RuntimeError("UVA is not available")
|
||||
self.cpu = torch.zeros(size, dtype=dtype, device="cpu")
|
||||
self.np = self.cpu.numpy()
|
||||
self.uva = self.cpu
|
||||
@@ -1,16 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.v1.worker.gpu.model_runner import GPUModelRunner
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class CPUModelRunner(GPUModelRunner):
|
||||
# TBD: Whether need to move this to Worker?
|
||||
def warming_up_model(self) -> None:
|
||||
logger.info("Warming up model for the compilation...")
|
||||
# Only generate graph for the generic shape
|
||||
self.profile_run()
|
||||
logger.info("Warming up done.")
|
||||
@@ -1,62 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# isort: skip_file
|
||||
# ruff: noqa: E402
|
||||
# mypy: disable-error-code="misc, assignment"
|
||||
|
||||
from typing import Any
|
||||
|
||||
# Patch torch APIs
|
||||
import torch
|
||||
|
||||
|
||||
def noop(*args: Any, **kwargs: Any) -> None:
|
||||
pass
|
||||
|
||||
|
||||
class _EventPlaceholder:
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
self.record = noop
|
||||
self.synchronize = noop
|
||||
|
||||
|
||||
class _StreamPlaceholder:
|
||||
def __init__(self, *args, **kwargs) -> None:
|
||||
self.wait_stream = noop
|
||||
|
||||
def __enter__(self, *args, **kwargs):
|
||||
return self
|
||||
|
||||
def __exit__(self, exc_type, exc_val, exc_tb):
|
||||
pass
|
||||
|
||||
|
||||
torch.Event = _EventPlaceholder
|
||||
torch.cuda.Event = _EventPlaceholder
|
||||
torch.cuda.Stream = _StreamPlaceholder
|
||||
torch.cuda.set_stream = noop
|
||||
torch.cuda.current_stream = lambda *args, **kwargs: _StreamPlaceholder()
|
||||
torch.accelerator.synchronize = noop
|
||||
torch.accelerator.empty_cache = noop
|
||||
|
||||
# Patch vLLM torch utils
|
||||
import vllm.utils.torch_utils as torch_utils
|
||||
|
||||
|
||||
def async_tensor_h2d(
|
||||
data: list,
|
||||
dtype: torch.dtype,
|
||||
device: str | torch.device,
|
||||
pin_memory: bool = False,
|
||||
) -> torch.Tensor:
|
||||
return torch.tensor(data, dtype=dtype, device="cpu")
|
||||
|
||||
|
||||
torch_utils.async_tensor_h2d = async_tensor_h2d
|
||||
|
||||
# Patch model runner APIs
|
||||
import vllm.v1.worker.gpu.buffer_utils as gpu_buffer_utils
|
||||
import vllm.v1.worker.cpu.buffer_utils as cpu_buffer_utils
|
||||
|
||||
gpu_buffer_utils.UvaBuffer = cpu_buffer_utils.UvaBuffer
|
||||
@@ -1,9 +1,5 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
# Must be imported firstly
|
||||
import vllm.v1.worker.cpu.shm # noqa # isort: skip
|
||||
|
||||
import math
|
||||
import os
|
||||
import sys
|
||||
@@ -105,8 +101,6 @@ class CPUWorker(Worker):
|
||||
)
|
||||
|
||||
def init_device(self):
|
||||
self.device = torch.device("cpu")
|
||||
|
||||
# Check whether critical libraries are loaded
|
||||
def check_preloaded_libs(name: str):
|
||||
ld_preload_list = os.environ.get("LD_PRELOAD", "")
|
||||
@@ -147,16 +141,9 @@ class CPUWorker(Worker):
|
||||
set_random_seed(self.model_config.seed)
|
||||
|
||||
# Construct the model runner
|
||||
if self.use_v2_model_runner:
|
||||
from vllm.v1.worker.cpu.model_runner import (
|
||||
CPUModelRunner as CPUModelRunnerV2,
|
||||
)
|
||||
|
||||
self.model_runner: CPUModelRunner = CPUModelRunnerV2( # type: ignore
|
||||
self.vllm_config, self.device
|
||||
)
|
||||
else:
|
||||
self.model_runner = CPUModelRunner(self.vllm_config, torch.device("cpu"))
|
||||
self.model_runner: CPUModelRunner = CPUModelRunner(
|
||||
self.vllm_config, torch.device("cpu")
|
||||
)
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
|
||||
logger.warning("sleep mode is not supported on CPU, ignore it.")
|
||||
|
||||
Reference in New Issue
Block a user