From fa54d88f51f3179aac2b2434a8868aad468cd749 Mon Sep 17 00:00:00 2001 From: Tyler Michael Smith Date: Fri, 5 Jun 2026 15:09:28 -0400 Subject: [PATCH] Fix cumem allocator default to probe C extension availability The default resolution for enable_cumem_allocator used is_sleep_mode_available(), which is a static CUDA/ROCm platform check. This caused a ValueError on CI nodes where the cumem C extension isn't loadable. Use is_cumem_allocator_available() instead, which does a runtime import probe of the extension. Also fix test_cumem_required_for_sleep: when sleep mode is on, ModelConfig auto-enables cumem (lines 532-536 of vllm/config/model.py) rather than raising ValueError, so the test is renamed to test_sleep_mode_auto_enables_cumem and asserts that behavior. Co-authored-by: Claude Signed-off-by: Tyler Michael Smith --- tests/basic_correctness/test_cumem.py | 16 ++++++++-------- vllm/config/model.py | 9 ++++++--- 2 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/basic_correctness/test_cumem.py b/tests/basic_correctness/test_cumem.py index d74aa48bfb7..3010b8af31f 100644 --- a/tests/basic_correctness/test_cumem.py +++ b/tests/basic_correctness/test_cumem.py @@ -257,16 +257,16 @@ def test_cumem_without_sleep_mode(): assert output[0].outputs[0].text -def test_cumem_required_for_sleep(): - """Verify config validation rejects sleep mode without cumem.""" +def test_sleep_mode_auto_enables_cumem(): + """Verify sleep mode automatically enables cumem allocator.""" from vllm.config.model import ModelConfig - with pytest.raises(ValueError, match="cumem allocator"): - ModelConfig( - "hmellor/tiny-random-LlamaForCausalLM", - enable_sleep_mode=True, - enable_cumem_allocator=False, - ) + cfg = ModelConfig( + "hmellor/tiny-random-LlamaForCausalLM", + enable_sleep_mode=True, + enable_cumem_allocator=False, + ) + assert cfg.enable_cumem_allocator is True @requires_fp8 diff --git a/vllm/config/model.py b/vllm/config/model.py index 544a2fb2252..f8731ea534a 100644 --- a/vllm/config/model.py +++ b/vllm/config/model.py @@ -300,8 +300,9 @@ class ModelConfig: """Enable the custom cumem 
allocator to leverage advanced GPU memory allocation features such as multi-node NVLink support. - Defaults to True on CUDA and ROCm platforms. Sleep mode automatically - enables this allocator. Only cuda and hip platforms are supported. + Defaults to True when the cumem C extension is available (CUDA and ROCm + platforms with a full build). Sleep mode automatically enables this + allocator. Only cuda and hip platforms are supported. """ model_impl: str | ModelImpl = "auto" """Which implementation of the model to use: @@ -525,7 +526,9 @@ class ModelConfig: ) if self.enable_cumem_allocator is None: - self.enable_cumem_allocator = current_platform.is_sleep_mode_available() + self.enable_cumem_allocator = ( + current_platform.is_cumem_allocator_available() + ) if self.enable_sleep_mode: if not current_platform.is_sleep_mode_available(): raise ValueError("Sleep mode is not supported on current platform.")