Fix CuPy runtime deps and restore humming (#43530)

Signed-off-by: Mohammad Miadh Angkad <176301910+mmangkad@users.noreply.github.com>
(cherry picked from commit a970fb5a1a)
This commit is contained in:
Mohammad Miadh Angkad
2026-05-26 20:59:40 +08:00
committed by khluu
parent 6f955986e1
commit b0e9ae808e
5 changed files with 23 additions and 16 deletions
+4 -2
View File
@@ -220,7 +220,8 @@ COPY use_existing_torch.py use_existing_torch.py
COPY pyproject.toml pyproject.toml
RUN --mount=type=cache,target=/opt/uv/cache \
if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \
sed -i 's/^nvidia-cutlass-dsl\[cu13\]>=/nvidia-cutlass-dsl>=/' requirements/cuda.txt; \
sed -i 's/^nvidia-cutlass-dsl\[cu13\]/nvidia-cutlass-dsl/' requirements/cuda.txt; \
sed -i 's/^humming-kernels\[cu13\]/humming-kernels[cu12]/' requirements/cuda.txt; \
fi \
&& if [ "${PYTORCH_NIGHTLY}" = "1" ]; then \
echo "Installing torch nightly..." \
@@ -746,7 +747,8 @@ COPY requirements/common.txt /tmp/common.txt
COPY requirements/cuda.txt /tmp/requirements-cuda.txt
RUN --mount=type=cache,target=/opt/uv/cache \
if [ "$(echo $CUDA_VERSION | cut -d. -f1)" = "12" ]; then \
sed -i 's/^nvidia-cutlass-dsl\[cu13\]>=/nvidia-cutlass-dsl>=/' /tmp/requirements-cuda.txt; \
sed -i 's/^nvidia-cutlass-dsl\[cu13\]/nvidia-cutlass-dsl/' /tmp/requirements-cuda.txt; \
sed -i 's/^humming-kernels\[cu13\]/humming-kernels[cu12]/' /tmp/requirements-cuda.txt; \
fi && \
uv pip install --system -r /tmp/requirements-cuda.txt \
--extra-index-url ${PYTORCH_CUDA_INDEX_BASE_URL}/cu$(echo $CUDA_VERSION | cut -d. -f1,2 | tr -d '.') && \
+4 -1
View File
@@ -25,4 +25,7 @@ nvidia-cutlass-dsl[cu13]==4.5.0
quack-kernels>=0.3.3
# Tokenspeed_MLA for faster mla with spec decode
tokenspeed-mla==0.1.2
tokenspeed-mla==0.1.2
# Humming kernels for quantization gemm
humming-kernels[cu13]==0.1.2
+3
View File
@@ -1,3 +1,6 @@
lmcache >= 0.3.9
# CuPy 14.1.0 imports pytest from cupy.testing._random. Use <14.1.0
# until a fixed newer release is verified for runtime images.
cupy-cuda13x < 14.1.0
nixl >= 1.1.0 # Required for disaggregated prefill
mooncake-transfer-engine >= 0.3.8
+2
View File
@@ -1017,6 +1017,8 @@ def get_requirements() -> list[str]:
if "nvidia-cutlass-dsl[cu13]" in req and cuda_major == "12":
# [cu13] extra is the default; strip it on CUDA 12 builds.
req = req.replace("nvidia-cutlass-dsl[cu13]", "nvidia-cutlass-dsl")
if "humming-kernels[cu13]" in req and cuda_major == "12":
req = req.replace("humming-kernels[cu13]", "humming-kernels[cu12]")
modified_requirements.append(req)
requirements = modified_requirements
elif _is_hip():
@@ -43,12 +43,9 @@ from vllm.model_executor.parameter import (
RowvLLMParameter,
)
from vllm.model_executor.utils import set_weight_attrs
from vllm.platforms import current_platform
if TYPE_CHECKING:
from vllm.model_executor.models.utils import WeightsMapper
try:
if current_platform.is_cuda():
from humming.dtypes import DataType
from humming.layer import HummingMethod
from humming.schema import (
@@ -65,16 +62,17 @@ try:
HummingIndexedExperts,
get_humming_moe_gemm_type,
)
except ModuleNotFoundError:
HummingMethod = None
def assert_humming_available():
assert HummingMethod is not None, (
"humming is not available, please run "
"'pip install git+https://github.com/inclusionAI/humming' to install it."
if TYPE_CHECKING:
from humming.schema import (
BaseInputSchema,
BaseWeightSchema,
HummingInputSchema,
HummingWeightSchema,
)
from vllm.model_executor.models.utils import WeightsMapper
def prepare_padded_shape(shape, x):
padded_shape = math.ceil(shape / x) * x
@@ -186,7 +184,6 @@ class HummingConfig(QuantizationConfig):
packed_modules_mapping: dict[str, list[str]] = {}
def __init__(self, full_config: dict[str, Any] | None = None):
assert_humming_available()
self.full_config: dict[str, Any] = full_config or {}
@classmethod