Fix cumem allocator default to probe C extension availability

When enable_cumem_allocator was left unset (None), its default was
resolved with is_sleep_mode_available(), which is only a static
CUDA/ROCm platform check. This caused a ValueError on CI nodes where the
cumem C extension isn't loadable. Use is_cumem_allocator_available()
instead, which does a runtime import probe of the extension.

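Also fix test_cumem_required_for_sleep: the code auto-enables cumem
when sleep mode is on (lines 532-536) rather than raising ValueError.
The test is renamed to test_sleep_mode_auto_enables_cumem and now
asserts that enable_cumem_allocator ends up True.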

Co-authored-by: Claude <noreply@anthropic.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
This commit is contained in:
Tyler Michael Smith
2026-06-05 15:09:28 -04:00
parent f99571b229
commit fa54d88f51
2 changed files with 14 additions and 11 deletions
+8 -8
View File
@@ -257,16 +257,16 @@ def test_cumem_without_sleep_mode():
assert output[0].outputs[0].text assert output[0].outputs[0].text
def test_cumem_required_for_sleep(): def test_sleep_mode_auto_enables_cumem():
"""Verify config validation rejects sleep mode without cumem.""" """Verify sleep mode automatically enables cumem allocator."""
from vllm.config.model import ModelConfig from vllm.config.model import ModelConfig
with pytest.raises(ValueError, match="cumem allocator"): cfg = ModelConfig(
ModelConfig( "hmellor/tiny-random-LlamaForCausalLM",
"hmellor/tiny-random-LlamaForCausalLM", enable_sleep_mode=True,
enable_sleep_mode=True, enable_cumem_allocator=False,
enable_cumem_allocator=False, )
) assert cfg.enable_cumem_allocator is True
@requires_fp8 @requires_fp8
+6 -3
View File
@@ -300,8 +300,9 @@ class ModelConfig:
"""Enable the custom cumem allocator to leverage advanced GPU memory """Enable the custom cumem allocator to leverage advanced GPU memory
allocation features such as multi-node NVLink support. allocation features such as multi-node NVLink support.
Defaults to True on CUDA and ROCm platforms. Sleep mode automatically Defaults to True when the cumem C extension is available (CUDA and ROCm
enables this allocator. Only cuda and hip platforms are supported. platforms with a full build). Sleep mode automatically enables this
allocator. Only cuda and hip platforms are supported.
""" """
model_impl: str | ModelImpl = "auto" model_impl: str | ModelImpl = "auto"
"""Which implementation of the model to use: """Which implementation of the model to use:
@@ -525,7 +526,9 @@ class ModelConfig:
) )
if self.enable_cumem_allocator is None: if self.enable_cumem_allocator is None:
self.enable_cumem_allocator = current_platform.is_sleep_mode_available() self.enable_cumem_allocator = (
current_platform.is_cumem_allocator_available()
)
if self.enable_sleep_mode: if self.enable_sleep_mode:
if not current_platform.is_sleep_mode_available(): if not current_platform.is_sleep_mode_available():
raise ValueError("Sleep mode is not supported on current platform.") raise ValueError("Sleep mode is not supported on current platform.")