Revert "[CPU] Experimentally enable Triton and MRV2 (#43225)"

This reverts commit 65b7a812a2.
2026-06-06 00:16:14 +00:00 · 2026-05-29 02:28:43 -07:00
parent 799c3afa5d
commit 0b3ba88f16
11 changed files with 13 additions and 169 deletions
@@ -54,20 +54,6 @@ steps:
      pytest -x -v -s tests/models/language/generation -m cpu_model
      pytest -x -v -s tests/models/language/pooling -m cpu_model"

- label: CPU-ModelRunnerV2 Tests
-  depends_on: []
-  device: intel_cpu
-  no_plugin: true
-  soft_fail: true
-  source_file_dependencies:
-  - vllm/v1/worker/cpu/
-  - vllm/v1/worker/gpu/
-  commands:
-    - |
-      bash .buildkite/scripts/hardware_ci/run-cpu-test.sh 30m "
-      uv pip install git+https://github.com/triton-lang/triton-cpu.git@270e696d
-      VLLM_USE_V2_MODEL_RUNNER=1 pytest -x -v -s tests/models/language/generation/test_granite.py -m cpu_model"
-
 - label: CPU-Quantization Model Tests
  depends_on: []
  device: intel_cpu
@@ -27,14 +27,11 @@ WORKDIR /workspace
 ARG PYTHON_VERSION=3.12
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"

-ARG max_jobs=32
-ENV MAX_JOBS=${max_jobs}
-
 # Install minimal dependencies and uv
 RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
    --mount=type=cache,target=/var/lib/apt,sharing=locked \
    apt-get update -y \
-    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates zlib1g-dev \
+    && apt-get install -y --no-install-recommends sudo ccache git curl wget ca-certificates \
    gcc-12 g++-12 libtcmalloc-minimal4 libnuma-dev ffmpeg libsm6 libxext6 libgl1 jq lsof make xz-utils \
    && update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 10 --slave /usr/bin/g++ g++ /usr/bin/g++-12 \
    && curl -LsSf https://astral.sh/uv/install.sh | sh
@@ -126,6 +123,9 @@ RUN --mount=type=cache,target=/root/.cargo/registry \
 ######################### BUILD IMAGE #########################
 FROM base AS vllm-build

+ARG max_jobs=32
+ENV MAX_JOBS=${max_jobs}
+
 ARG GIT_REPO_CHECK=0
 # Support for cross-compilation with x86 ISA including AVX2 and AVX512: docker build --build-arg VLLM_CPU_X86="true" ...
 ARG VLLM_CPU_X86=0
@@ -257,7 +257,8 @@ WORKDIR /vllm-workspace
 RUN --mount=type=cache,target=/root/.cache/uv \
    --mount=type=cache,target=/root/.cache/ccache \
    --mount=type=bind,from=vllm-build,src=/vllm-workspace/dist,target=dist \
-    uv pip install "$(realpath dist/*.whl)[audio,triton-cpu]"
+    uv pip install dist/*.whl && \
+    uv pip install "vllm[audio]"

 # Add labels to document build configuration
 LABEL org.opencontainers.image.title="vLLM CPU"
@@ -1195,11 +1195,6 @@ setup(
            "opentelemetry-exporter-otlp>=1.26.0",
            "opentelemetry-semantic-conventions-ai>=0.4.1",
        ],
-        "triton-cpu": [
-            "triton @ "
-            "git+https://github.com/triton-lang/triton-cpu.git@270e696d ; "
-            "platform_machine == 'x86_64'",
-        ],  # Remove after stable release
    },
    cmdclass=cmdclass,
    package_data=package_data,
@@ -14,7 +14,6 @@ from vllm.logger import init_logger
 from vllm.utils.cpu_resource_utils import (
    DEVICE_CONTROL_ENV_VAR,
    get_memory_node_info,
-    get_visible_memory_node,
 )
 from vllm.utils.mem_constants import GiB_bytes
 from vllm.v1.attention.backends.registry import AttentionBackendEnum
@@ -136,13 +135,9 @@ class CpuPlatform(Platform):
        scheduler_config.async_scheduling = False

        parallel_config = vllm_config.parallel_config
-        if (
-            os.environ.get("VLLM_ENABLE_V1_MULTIPROCESSING", "1") == "1"
-            and parallel_config.distributed_executor_backend == "uni"
-        ):
-            # OMP requires the MP executor to function correctly, UniProc
-            # is not supported as it is not possible to set the OMP
-            # environment correctly
+        # OMP requires the MP executor to function correctly, UniProc is not
+        # supported as it is not possible to set the OMP environment correctly
+        if parallel_config.distributed_executor_backend == "uni":
            parallel_config.distributed_executor_backend = "mp"

        if parallel_config.worker_cls == "auto":
@@ -486,15 +481,3 @@ class CpuPlatform(Platform):
            slot_mapping,
            isa,
        )
-
-    @classmethod
-    def get_current_memory_usage(
-        cls, device: torch.types.Device | None = None
-    ) -> float:
-        allowed_mem_node_list = get_visible_memory_node()
-        mem_status_list = [get_memory_node_info(i) for i in allowed_mem_node_list]
-        memory_usage = 0
-        for s in mem_status_list:
-            memory_usage += s.total_memory - s.available_memory
-
-        return memory_usage
@@ -3,7 +3,6 @@

 import os
 import types
-from importlib.metadata import version
 from importlib.util import find_spec

 from vllm.logger import init_logger
@@ -49,17 +48,6 @@ if HAS_TRITON:
                len(active_drivers),
            )
            HAS_TRITON = False
-
-        # Check Triton CPU
-        if "cpu" in version("vllm"):
-            if "cpu" in backends:
-                HAS_TRITON = True
-            else:
-                logger.warning(
-                    "Triton is installed, but doesn't include CPU backend. "
-                    "Disabling Triton."
-                )
-                HAS_TRITON = False
    except ImportError:
        # This can occur if Triton is partially installed or triton.backends
        # is missing.
@@ -50,10 +50,8 @@ def is_pin_memory_available() -> bool:
 def is_uva_available() -> bool:
    """Check if Unified Virtual Addressing (UVA) is available."""
    # UVA requires pinned memory.
-    from vllm.platforms import current_platform
-
    # TODO: Add more requirements for UVA if needed.
-    return is_pin_memory_available() or current_platform.is_cpu()
+    return is_pin_memory_available()


@cache
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-from collections.abc import Sequence
-
-import torch
-
-from vllm.utils.platform_utils import is_uva_available
-
-
-class UvaBuffer:
-    def __init__(self, size: int | Sequence[int], dtype: torch.dtype):
-        if not is_uva_available():
-            raise RuntimeError("UVA is not available")
-        self.cpu = torch.zeros(size, dtype=dtype, device="cpu")
-        self.np = self.cpu.numpy()
-        self.uva = self.cpu
@@ -1,16 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-from vllm.logger import init_logger
-from vllm.v1.worker.gpu.model_runner import GPUModelRunner
-
-logger = init_logger(__name__)
-
-
-class CPUModelRunner(GPUModelRunner):
-    # TBD: Whether need to move this to Worker?
-    def warming_up_model(self) -> None:
-        logger.info("Warming up model for the compilation...")
-        # Only generate graph for the generic shape
-        self.profile_run()
-        logger.info("Warming up done.")
@@ -1,62 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# isort: skip_file
-# ruff: noqa: E402
-# mypy: disable-error-code="misc, assignment"
-
-from typing import Any
-
-# Patch torch APIs
-import torch
-
-
-def noop(*args: Any, **kwargs: Any) -> None:
-    pass
-
-
-class _EventPlaceholder:
-    def __init__(self, *args, **kwargs) -> None:
-        self.record = noop
-        self.synchronize = noop
-
-
-class _StreamPlaceholder:
-    def __init__(self, *args, **kwargs) -> None:
-        self.wait_stream = noop
-
-    def __enter__(self, *args, **kwargs):
-        return self
-
-    def __exit__(self, exc_type, exc_val, exc_tb):
-        pass
-
-
-torch.Event = _EventPlaceholder
-torch.cuda.Event = _EventPlaceholder
-torch.cuda.Stream = _StreamPlaceholder
-torch.cuda.set_stream = noop
-torch.cuda.current_stream = lambda *args, **kwargs: _StreamPlaceholder()
-torch.accelerator.synchronize = noop
-torch.accelerator.empty_cache = noop
-
-# Patch vLLM torch utils
-import vllm.utils.torch_utils as torch_utils
-
-
-def async_tensor_h2d(
-    data: list,
-    dtype: torch.dtype,
-    device: str | torch.device,
-    pin_memory: bool = False,
-) -> torch.Tensor:
-    return torch.tensor(data, dtype=dtype, device="cpu")
-
-
-torch_utils.async_tensor_h2d = async_tensor_h2d
-
-# Patch model runner APIs
-import vllm.v1.worker.gpu.buffer_utils as gpu_buffer_utils
-import vllm.v1.worker.cpu.buffer_utils as cpu_buffer_utils
-
-gpu_buffer_utils.UvaBuffer = cpu_buffer_utils.UvaBuffer
@@ -1,9 +1,5 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-# Must be imported firstly
-import vllm.v1.worker.cpu.shm  # noqa # isort: skip
-
 import math
 import os
 import sys
@@ -105,8 +101,6 @@ class CPUWorker(Worker):
            )

    def init_device(self):
-        self.device = torch.device("cpu")
-
        # Check whether critical libraries are loaded
        def check_preloaded_libs(name: str):
            ld_preload_list = os.environ.get("LD_PRELOAD", "")
@@ -147,16 +141,9 @@ class CPUWorker(Worker):
        set_random_seed(self.model_config.seed)

        # Construct the model runner
-        if self.use_v2_model_runner:
-            from vllm.v1.worker.cpu.model_runner import (
-                CPUModelRunner as CPUModelRunnerV2,
-            )
-
-            self.model_runner: CPUModelRunner = CPUModelRunnerV2(  # type: ignore
-                self.vllm_config, self.device
-            )
-        else:
-            self.model_runner = CPUModelRunner(self.vllm_config, torch.device("cpu"))
+        self.model_runner: CPUModelRunner = CPUModelRunner(
+            self.vllm_config, torch.device("cpu")
+        )

    def sleep(self, level: int = 1) -> None:
        logger.warning("sleep mode is not supported on CPU, ignore it.")