From a970fb5a1a5800c552c74cf3278d6ee7c1c3fca1 Mon Sep 17 00:00:00 2001 From: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> Date: Tue, 26 May 2026 20:59:40 +0800 Subject: [PATCH] Fix CuPy runtime deps and restore humming (#43530) Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com> --- docker/Dockerfile | 6 +++-- requirements/cuda.txt | 5 +++- requirements/kv_connectors.txt | 3 +++ setup.py | 2 ++ .../layers/quantization/humming.py | 23 ++++++++----------- 5 files changed, 23 insertions(+), 16 deletions(-) diff --git a/docker/Dockerfile b/docker/Dockerfile index cae909862b5..47b17fc2d10 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -220,7 +220,8 @@ COPY use_existing_torch.py use_existing_torch.py COPY pyproject.toml pyproject.toml RUN --mount=type=cache,target=/opt/uv/cache \ if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \ - sed -i 's/^nvidia-cutlass-dsl\[cu13\]>=/nvidia-cutlass-dsl>=/' requirements/cuda.txt; \ + sed -i 's/^nvidia-cutlass-dsl\[cu13\]/nvidia-cutlass-dsl/' requirements/cuda.txt; \ + sed -i 's/^humming-kernels\[cu13\]/humming-kernels[cu12]/' requirements/cuda.txt; \ fi \ && if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \ echo "Installing torch nightly..." \ @@ -746,7 +747,8 @@ COPY requirements/common.txt /tmp/common.txt COPY requirements/cuda.txt /tmp/requirements-cuda.txt RUN --mount=type=cache,target=/opt/uv/cache \ if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \ - sed -i 's/^nvidia-cutlass-dsl\[cu13\]>=/nvidia-cutlass-dsl>=/' /tmp/requirements-cuda.txt; \ + sed -i 's/^nvidia-cutlass-dsl\[cu13\]/nvidia-cutlass-dsl/' /tmp/requirements-cuda.txt; \ + sed -i 's/^humming-kernels\[cu13\]/humming-kernels[cu12]/' /tmp/requirements-cuda.txt; \ fi && \ uv pip install --system -r /tmp/requirements-cuda.txt \ --extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \ diff --git a/requirements/cuda.txt b/requirements/cuda.txt index 14482744f0c..1e0a945e5a7 100644 --- a/requirements/cuda.txt +++ b/requirements/cuda.txt @@ -25,4 +25,7 @@ nvidia-cutlass-dsl[cu13]==4.5.0 quack-kernels>=0.3.3 # Tokenspeed_MLA for faster mla with spec decode -tokenspeed-mla==0.1.2 \ No newline at end of file +tokenspeed-mla==0.1.2 + +# Humming kernels for quantization gemm +humming-kernels[cu13]==0.1.2 diff --git a/requirements/kv_connectors.txt b/requirements/kv_connectors.txt index 6699442872c..7a5b5f25c37 100644 --- a/requirements/kv_connectors.txt +++ b/requirements/kv_connectors.txt @@ -1,3 +1,6 @@ lmcache >= 0.3.9 +# CuPy 14.1.0 imports pytest from cupy.testing._random. Use <14.1.0 +# until a fixed newer release is verified for runtime images. +cupy-cuda13x < 14.1.0 nixl >= 1.1.0 # Required for disaggregated prefill mooncake-transfer-engine >= 0.3.8 diff --git a/setup.py b/setup.py index b0b5337a925..a95ee3451b5 100644 --- a/setup.py +++ b/setup.py @@ -1017,6 +1017,8 @@ def get_requirements() -> list[str]: if "nvidia-cutlass-dsl[cu13]" in req and cuda_major == "12": # [cu13] extra is the default; strip it on CUDA 12 builds. req = req.replace("nvidia-cutlass-dsl[cu13]", "nvidia-cutlass-dsl") + if "humming-kernels[cu13]" in req and cuda_major == "12": + req = req.replace("humming-kernels[cu13]", "humming-kernels[cu12]") modified_requirements.append(req) requirements = modified_requirements elif _is_hip(): diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py index 8139b2441b7..12bb07a4022 100644 --- a/vllm/model_executor/layers/quantization/humming.py +++ b/vllm/model_executor/layers/quantization/humming.py @@ -43,12 +43,9 @@ from vllm.model_executor.parameter import ( RowvLLMParameter, ) from vllm.model_executor.utils import set_weight_attrs +from vllm.platforms import current_platform -if TYPE_CHECKING: - from vllm.model_executor.models.utils import WeightsMapper - - -try: +if current_platform.is_cuda(): from humming.dtypes import DataType from humming.layer import HummingMethod from humming.schema import ( @@ -65,16 +62,17 @@ try: HummingIndexedExperts, get_humming_moe_gemm_type, ) -except ModuleNotFoundError: - HummingMethod = None - -def assert_humming_available(): - assert HummingMethod is not None, ( - "humming is not available, please run " - "'pip install git+https://github.com/inclusionAI/humming' to install it." +if TYPE_CHECKING: + from humming.schema import ( + BaseInputSchema, + BaseWeightSchema, + HummingInputSchema, + HummingWeightSchema, ) + from vllm.model_executor.models.utils import WeightsMapper + def prepare_padded_shape(shape, x): padded_shape = math.ceil(shape / x) * x @@ -186,7 +184,6 @@ class HummingConfig(QuantizationConfig): packed_modules_mapping: dict[str, list[str]] = {} def __init__(self, full_config: dict[str, Any] | None = None): - assert_humming_available() self.full_config: dict[str, Any] = full_config or {} @classmethod