diff --git a/requirements/common.txt b/requirements/common.txt index 8141dc8ea6b..8b37f3cd30c 100644 --- a/requirements/common.txt +++ b/requirements/common.txt @@ -32,7 +32,7 @@ partial-json-parser # used for parsing partial JSON outputs pyzmq >= 25.0.0 msgspec gguf >= 0.17.0 -mistral_common[image] >= 1.11.2 +mistral_common[image] >= 1.11.3 opencv-python-headless >= 4.13.0 # required for video IO pyyaml six>=1.16.0; python_version > '3.11' # transitive dependency of pandas that needs to be the latest version for python 3.12 diff --git a/requirements/test/cuda.in b/requirements/test/cuda.in index 344a58ec1bb..8d7ad7d0aa2 100644 --- a/requirements/test/cuda.in +++ b/requirements/test/cuda.in @@ -31,7 +31,7 @@ torchaudio==2.11.0 torchvision==0.26.0 transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.11.2 # required for voxtral test +mistral_common[image,audio] >= 1.11.3 # required for voxtral test num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless >= 4.13.0 # required for video test diff --git a/requirements/test/cuda.txt b/requirements/test/cuda.txt index 7d847d10577..a3e1466c763 100644 --- a/requirements/test/cuda.txt +++ b/requirements/test/cuda.txt @@ -409,7 +409,7 @@ mbstrdecoder==1.1.3 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.11.2 +mistral-common==1.11.3 # via # -c requirements/common.txt # -r requirements/test/cuda.in diff --git a/requirements/test/nightly-torch.txt b/requirements/test/nightly-torch.txt index 89fd4ea9b43..10eb7a62191 100644 --- a/requirements/test/nightly-torch.txt +++ b/requirements/test/nightly-torch.txt @@ -23,7 +23,7 @@ jiwer # required for audio tests timm # required for internvl test transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio] >= 1.11.2 # required for voxtral test +mistral_common[image,audio] >= 1.11.3 # required for voxtral test num2words # required for smolvlm test opencv-python-headless >= 4.13.0 # required for video test datamodel_code_generator # required for minicpm3 test diff --git a/requirements/test/rocm.in b/requirements/test/rocm.in index 0a615831774..ed10270f565 100644 --- a/requirements/test/rocm.in +++ b/requirements/test/rocm.in @@ -30,7 +30,7 @@ tblib # for pickling test exceptions timm>=1.0.17 # required for internvl and gemma3n-mm test transformers_stream_generator # required for qwen-vl test matplotlib # required for qwen-vl test -mistral_common[image,audio]>=1.11.2 # required for voxtral test +mistral_common[image,audio]>=1.11.3 # required for voxtral test num2words # required for smolvlm test open_clip_torch==2.32.0 # Required for nemotron_vl test, Nemotron Parse in test_common.py opencv-python-headless>=4.13.0 # required for video test diff --git a/requirements/test/rocm.txt b/requirements/test/rocm.txt index e0232d8b6d3..eced7117116 100644 --- a/requirements/test/rocm.txt +++ b/requirements/test/rocm.txt @@ -512,7 +512,7 @@ mcp==1.27.0 # via -r requirements/test/../common.txt mdurl==0.1.2 # via markdown-it-py -mistral-common==1.11.2 +mistral-common==1.11.3 # via # -c requirements/common.txt # -r requirements/test/../common.txt diff --git a/requirements/test/xpu.txt b/requirements/test/xpu.txt index 5581d0a079c..6242ff420f6 100644 --- a/requirements/test/xpu.txt +++ b/requirements/test/xpu.txt @@ -266,7 +266,7 @@ mbstrdecoder==1.1.4 # typepy mdurl==0.1.2 # via markdown-it-py -mistral-common==1.11.2 +mistral-common==1.11.3 # via # -c requirements/common.txt # -r requirements/test/xpu.in diff --git a/tests/model_executor/test_eagle_quantization.py b/tests/model_executor/test_eagle_quantization.py index 481715da9cd..72c189d8331 100644 --- a/tests/model_executor/test_eagle_quantization.py +++ b/tests/model_executor/test_eagle_quantization.py @@ -100,32 +100,6 @@ def test_fc_layer_quant_config_usage(default_vllm_config, dist_init, device) -> assert output.shape == (2, output_size) -def test_kv_cache_scale_name_handling(): - # Mock a quant config that supports cache scales - mock_quant_config = Mock() - mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale") - - # Condition check in load_weights - name = "layers.0.self_attn.k_proj.weight" - scale_name = mock_quant_config.get_cache_scale(name) - - # Check if get_cache_scale is called and returns expected value - mock_quant_config.get_cache_scale.assert_called_once_with(name) - assert scale_name == "layers.0.self_attn.kv_scale" - - -def test_kv_cache_scale_name_no_scale(): - # Mock a quant config that returns None for get_cache_scale - mock_quant_config = Mock() - mock_quant_config.get_cache_scale = Mock(return_value=None) - - name = "layers.0.mlp.gate_proj.weight" - scale_name = mock_quant_config.get_cache_scale(name) - - # Should return None for weights that don't have cache scales - assert scale_name is None - - def test_maybe_remap_kv_scale_name(): from vllm.model_executor.model_loader.weight_utils import maybe_remap_kv_scale_name @@ -183,33 +157,3 @@ def test_eagle3_lm_head_receives_quant_config(): assert call_kwargs["quant_config"] is mock_quant_config, ( "ParallelLMHead must receive the draft model's quant_config" ) - - -def test_load_weights_kv_scale_handling(): - kv_scale_param = Mock() - kv_scale_param.weight_loader = Mock() - - params_dict = { - "layers.0.self_attn.kv_scale": kv_scale_param, - } - - mock_quant_config = Mock() - mock_quant_config.get_cache_scale = Mock(return_value="layers.0.self_attn.kv_scale") - - # Load_weights logic for KV cache scales - name = "layers.0.self_attn.k_proj.weight" - loaded_weight_tensor = torch.tensor([1.0, 2.0]) - - if mock_quant_config is not None: - scale_name = mock_quant_config.get_cache_scale(name) - if scale_name: - param = params_dict[scale_name] - assert param is kv_scale_param - weight_to_load = ( - loaded_weight_tensor - if loaded_weight_tensor.dim() == 0 - else loaded_weight_tensor[0] - ) - - assert scale_name == "layers.0.self_attn.kv_scale" - assert weight_to_load == loaded_weight_tensor[0] diff --git a/vllm/model_executor/layers/quantization/base_config.py b/vllm/model_executor/layers/quantization/base_config.py index 344ddd8abd2..3c03ff2233b 100644 --- a/vllm/model_executor/layers/quantization/base_config.py +++ b/vllm/model_executor/layers/quantization/base_config.py @@ -162,7 +162,13 @@ class QuantizationConfig(ABC): """ raise NotImplementedError - def get_cache_scale(self, name: str) -> str | None: + def get_cache_scale_mapper(self) -> "WeightsMapper | None": + """Mapping from checkpoint KV-cache scale names to vLLM scale names. + + Returning a mapper here causes `AutoWeightsLoader` to apply it to the + weight stream automatically; individual model `load_weights` methods + do not need to know about KV-cache scales. + """ return None def apply_vllm_mapper( # noqa: B027 diff --git a/vllm/model_executor/layers/quantization/fp8.py b/vllm/model_executor/layers/quantization/fp8.py index 41e2e19785c..a6461900d13 100644 --- a/vllm/model_executor/layers/quantization/fp8.py +++ b/vllm/model_executor/layers/quantization/fp8.py @@ -207,25 +207,18 @@ class Fp8Config(QuantizationConfig): return Fp8KVCacheMethod(self) return None - def get_cache_scale(self, name: str) -> str | None: - """ - Check whether the param name matches the format for k/v cache scales - in compressed-tensors. If this is the case, return its equivalent - param name expected by vLLM + def get_cache_scale_mapper(self) -> "WeightsMapper": + """Map compressed-tensors KV-cache scale names to vLLM names.""" + from vllm.model_executor.models.utils import WeightsMapper - :param name: param name - :return: matching param name for KV cache scale in vLLM - """ - if name.endswith(".output_scale") and ".k_proj" in name: - return name.replace(".k_proj.output_scale", ".attn.k_scale") - if name.endswith(".output_scale") and ".v_proj" in name: - return name.replace(".v_proj.output_scale", ".attn.v_scale") - if name.endswith(".output_scale") and ".q_proj" in name: - return name.replace(".q_proj.output_scale", ".attn.q_scale") - if name.endswith("self_attn.prob_output_scale"): - return name.replace(".prob_output_scale", ".attn.prob_scale") - # If no matches, return None - return None + return WeightsMapper( + orig_to_new_suffix={ + ".k_proj.output_scale": ".attn.k_scale", + ".v_proj.output_scale": ".attn.v_scale", + ".q_proj.output_scale": ".attn.q_scale", + ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale", + } + ) class CopyNumelCounter(TorchDispatchMode): diff --git a/vllm/model_executor/layers/quantization/kv_cache.py b/vllm/model_executor/layers/quantization/kv_cache.py index 726ac2232af..739de9236cf 100644 --- a/vllm/model_executor/layers/quantization/kv_cache.py +++ b/vllm/model_executor/layers/quantization/kv_cache.py @@ -15,6 +15,30 @@ from vllm.v1.kv_cache_interface import kv_cache_uses_per_token_head_scales logger = init_logger(__name__) +class KVCacheScaleParameter(torch.nn.Parameter): + """Scalar parameter for KV-cache scales. + + Initialized to -1.0 (an invalid sentinel) so call sites just write + `KVCacheScaleParameter()`. The `weight_loader` accepts shape `()` or + `(1,)` and rejects anything else — per-head scales go through a separate + path (compressed-tensors' `_tp_aware_loader`), not this one. Per-instance + overrides still work because instance attribute assignment shadows this + class-level loader. + """ + + def __new__(cls) -> "KVCacheScaleParameter": + return super().__new__(cls, torch.tensor(-1.0), requires_grad=False) + + @staticmethod + def weight_loader(param: torch.nn.Parameter, loaded_weight: torch.Tensor) -> None: + if loaded_weight.numel() != 1: + raise ValueError( + f"KV-cache scale expects a scalar weight, got shape " + f"{tuple(loaded_weight.shape)}" + ) + param.data.copy_(loaded_weight.reshape(())) + + class BaseKVCacheMethod(QuantizeMethodBase): """ Quant method that adds `_k_scale` and `_v_scale` attributes to the @@ -37,11 +61,11 @@ class BaseKVCacheMethod(QuantizeMethodBase): # Initialize the Q and KV cache scales to -1.0, an invalid value. # If the q and k/v_scales appear in the checkpoint, it will be # overwritten when loading weights. - layer.q_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False) - layer.k_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False) - layer.v_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False) + layer.q_scale = KVCacheScaleParameter() + layer.k_scale = KVCacheScaleParameter() + layer.v_scale = KVCacheScaleParameter() # Initialize P = softmax(QK^T) scales - layer.prob_scale = torch.nn.Parameter(torch.tensor(-1.0), requires_grad=False) + layer.prob_scale = KVCacheScaleParameter() def apply(self, layer: torch.nn.Module) -> torch.Tensor: raise RuntimeError(f"{self.__class__.__name__}.apply should not be called.") diff --git a/vllm/model_executor/layers/quantization/quark/quark.py b/vllm/model_executor/layers/quantization/quark/quark.py index d1f7a169ee7..54dea48973b 100644 --- a/vllm/model_executor/layers/quantization/quark/quark.py +++ b/vllm/model_executor/layers/quantization/quark/quark.py @@ -646,26 +646,16 @@ class QuarkConfig(QuantizationConfig): return scheme - def get_cache_scale(self, name: str) -> str | None: - """ - Check whether the param name matches the format for k/v cache scales - in quark. If this is the case, return its equivalent param name - expected by vLLM - - :param name: param name - :return: matching param name for KV cache scale in vLLM - """ - if name.endswith(".output_scale") and ".k_proj" in name: - return name.replace(".k_proj.output_scale", ".attn.k_scale") - if name.endswith(".output_scale") and ".v_proj" in name: - return name.replace(".v_proj.output_scale", ".attn.v_scale") - if name.endswith(".output_scale") and ".q_proj" in name: - return name.replace(".q_proj.output_scale", ".attn.q_scale") - if name.endswith("self_attn.prob_output_scale"): - return name.replace(".prob_output_scale", ".attn.prob_scale") - - # If no matches, return None - return None + def get_cache_scale_mapper(self) -> "WeightsMapper": + """Map Quark KV-cache scale names to vLLM names.""" + return WeightsMapper( + orig_to_new_suffix={ + ".k_proj.output_scale": ".attn.k_scale", + ".v_proj.output_scale": ".attn.v_scale", + ".q_proj.output_scale": ".attn.q_scale", + ".self_attn.prob_output_scale": ".self_attn.attn.prob_scale", + } + ) class QuarkLinearMethod(LinearMethodBase): diff --git a/vllm/model_executor/model_loader/weight_utils.py b/vllm/model_executor/model_loader/weight_utils.py index cbb191ebb62..76a83898989 100644 --- a/vllm/model_executor/model_loader/weight_utils.py +++ b/vllm/model_executor/model_loader/weight_utils.py @@ -1541,6 +1541,11 @@ def maybe_remap_kv_scale_name(name: str, params_dict: dict) -> str | None: if no remapping is needed. None: If the remapped name is not found in params_dict. """ + # Already in vLLM's expected form (e.g. weights pre-renamed by a + # `WeightsMapper` from the quant config). Skip the regex remap, which + # would otherwise double-apply the `.attn` prefix and drop the weight. + if name in params_dict: + return name if name.endswith(".kv_scale"): logger.warning_once( "DEPRECATED. Found kv_scale in the checkpoint. " diff --git a/vllm/model_executor/models/apertus.py b/vllm/model_executor/models/apertus.py index 5905a198b28..0711fb03f84 100644 --- a/vllm/model_executor/models/apertus.py +++ b/vllm/model_executor/models/apertus.py @@ -430,18 +430,6 @@ class ApertusModel(nn.Module, EagleModelMixin): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name or "zero_point" in name: # Remapping the name of FP8 kv-scale. name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/arcee.py b/vllm/model_executor/models/arcee.py index eb8c3e3f65e..d25c954fc19 100644 --- a/vllm/model_executor/models/arcee.py +++ b/vllm/model_executor/models/arcee.py @@ -293,18 +293,6 @@ class ArceeModel(nn.Module, EagleModelMixin): if "rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - if "scale" in name or "zero_point" in name: remapped_name = maybe_remap_kv_scale_name(name, params_dict) if remapped_name is None: diff --git a/vllm/model_executor/models/aria.py b/vllm/model_executor/models/aria.py index 55bc64cd94a..e453e6e6cf6 100644 --- a/vllm/model_executor/models/aria.py +++ b/vllm/model_executor/models/aria.py @@ -363,18 +363,6 @@ class AriaTextModel(LlamaModel, SupportsQuant): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/cohere2_moe.py b/vllm/model_executor/models/cohere2_moe.py index aa8adff188f..16e299ea27a 100644 --- a/vllm/model_executor/models/cohere2_moe.py +++ b/vllm/model_executor/models/cohere2_moe.py @@ -464,18 +464,6 @@ class Cohere2MoeModel(nn.Module): if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/cohere_eagle.py b/vllm/model_executor/models/cohere_eagle.py index 5c22d6e34dd..7b57c739ffe 100644 --- a/vllm/model_executor/models/cohere_eagle.py +++ b/vllm/model_executor/models/cohere_eagle.py @@ -150,18 +150,6 @@ class CohereEagleModel(nn.Module): if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/commandr.py b/vllm/model_executor/models/commandr.py index e73dfb1f01e..317269ec3b6 100644 --- a/vllm/model_executor/models/commandr.py +++ b/vllm/model_executor/models/commandr.py @@ -352,19 +352,6 @@ class CohereModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/dbrx.py b/vllm/model_executor/models/dbrx.py index 6c798bf2f36..32f4f15c7a3 100644 --- a/vllm/model_executor/models/dbrx.py +++ b/vllm/model_executor/models/dbrx.py @@ -394,19 +394,6 @@ class DbrxModel(nn.Module): loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - if name.endswith(("w1", "w2", "v1")): name = name + "_weight" for param_name, weight_name in expert_params_mapping: diff --git a/vllm/model_executor/models/deepseek_eagle3.py b/vllm/model_executor/models/deepseek_eagle3.py index 9b96cdec830..dc153ac9e0b 100644 --- a/vllm/model_executor/models/deepseek_eagle3.py +++ b/vllm/model_executor/models/deepseek_eagle3.py @@ -284,19 +284,6 @@ class DeepseekV2Eagle3Model(nn.Module): if "midlayer." in name: name = name.replace("midlayer.", "layers.0.") - # Handle kv cache quantization scales - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - # Remapping the name FP8 kv-scale if "scale" in name: name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/exaone.py b/vllm/model_executor/models/exaone.py index b633fd28508..dca05f72c69 100644 --- a/vllm/model_executor/models/exaone.py +++ b/vllm/model_executor/models/exaone.py @@ -391,18 +391,6 @@ class ExaoneModel(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/exaone4.py b/vllm/model_executor/models/exaone4.py index 04708de93d3..e38dbb5ee29 100644 --- a/vllm/model_executor/models/exaone4.py +++ b/vllm/model_executor/models/exaone4.py @@ -389,18 +389,6 @@ class Exaone4Model(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/exaone_moe.py b/vllm/model_executor/models/exaone_moe.py index 80b7e0957e8..3373983f5c9 100644 --- a/vllm/model_executor/models/exaone_moe.py +++ b/vllm/model_executor/models/exaone_moe.py @@ -374,18 +374,6 @@ class ExaoneMoeModel(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/gemma2.py b/vllm/model_executor/models/gemma2.py index 425ecc65195..733eb3ed3c1 100644 --- a/vllm/model_executor/models/gemma2.py +++ b/vllm/model_executor/models/gemma2.py @@ -328,16 +328,6 @@ class Gemma2Model(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache scales for compressed-tensors quantization - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/gemma3.py b/vllm/model_executor/models/gemma3.py index f61f7c6f780..7bae2b1a5e7 100644 --- a/vllm/model_executor/models/gemma3.py +++ b/vllm/model_executor/models/gemma3.py @@ -386,17 +386,6 @@ class Gemma3Model(nn.Module): ): loaded_weight -= 1 - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache scales for compressed-tensors quantization - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - # Check if this is a scale parameter that needs remapping first if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")): # Try to remap the scale name first diff --git a/vllm/model_executor/models/gemma3n.py b/vllm/model_executor/models/gemma3n.py index 770424ba0fd..ad8b21d86b4 100644 --- a/vllm/model_executor/models/gemma3n.py +++ b/vllm/model_executor/models/gemma3n.py @@ -1056,16 +1056,6 @@ class Gemma3nTextModel(nn.Module, SupportsQuant): ): name = f"self_decoder.{name}" - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache scales for compressed-tensors quantization - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, shard_name, shard_id in stacked_params_mapping: if shard_name not in name: continue diff --git a/vllm/model_executor/models/gemma4.py b/vllm/model_executor/models/gemma4.py index 5d0e3efe2e1..d14a767df5f 100644 --- a/vllm/model_executor/models/gemma4.py +++ b/vllm/model_executor/models/gemma4.py @@ -1409,16 +1409,6 @@ class Gemma4Model(nn.Module, EagleModelMixin): params_dict.update(dict(self.named_buffers())) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")): remapped_name = maybe_remap_kv_scale_name(name, params_dict) if remapped_name is not None and remapped_name in params_dict: diff --git a/vllm/model_executor/models/glm4.py b/vllm/model_executor/models/glm4.py index 89447927d5c..4587a692766 100644 --- a/vllm/model_executor/models/glm4.py +++ b/vllm/model_executor/models/glm4.py @@ -258,18 +258,6 @@ class Glm4Model(LlamaModel): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name or "zero_point" in name: # Remapping the name of FP8 kv-scale or zero point. name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/glm_ocr_mtp.py b/vllm/model_executor/models/glm_ocr_mtp.py index 34e602bb669..3d283c101ca 100644 --- a/vllm/model_executor/models/glm_ocr_mtp.py +++ b/vllm/model_executor/models/glm_ocr_mtp.py @@ -166,6 +166,10 @@ class GlmOcrMTP(nn.Module, SupportsPP): return self.model.compute_logits(hidden_states, spec_step_idx) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]: + if self.quant_config is not None and ( + cache_scale_mapper := self.quant_config.get_cache_scale_mapper() + ): + weights = cache_scale_mapper.apply(weights) stacked_params_mapping = [ # (param_name, shard_name, shard_id) (".qkv_proj", ".q_proj", "q"), @@ -189,19 +193,6 @@ class GlmOcrMTP(nn.Module, SupportsPP): name = self._rewrite_spec_layer_name(spec_layer, name) - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - if "scale" in name or "zero_point" in name: # Remapping the name of FP8 kv-scale or zero point. name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/gpt_j.py b/vllm/model_executor/models/gpt_j.py index c29103c6d52..30da9b4dea2 100644 --- a/vllm/model_executor/models/gpt_j.py +++ b/vllm/model_executor/models/gpt_j.py @@ -254,19 +254,6 @@ class GPTJModel(nn.Module): if "attn.bias" in name or "attn.masked_bias" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/gpt_oss.py b/vllm/model_executor/models/gpt_oss.py index d12db96c5d4..920d43392cf 100644 --- a/vllm/model_executor/models/gpt_oss.py +++ b/vllm/model_executor/models/gpt_oss.py @@ -635,52 +635,6 @@ class GptOssModel(nn.Module, EagleModelMixin): moe_quant_method = _get_moe_weight_dtype(layer_id=layer_id) - def kv_cache_scale_loader( - quant_config: QuantizationConfig, - name: str, - params_dict: dict[str, typing.Any], - weight: torch.Tensor, - default_weight_loader: Callable[..., None], - loaded_params: set[str], - ) -> tuple[bool, set[str]]: - """ - Load KV cache output scales. - Returns: - Tuple of (bool, set): - - bool: True if KV-cache scale was loaded into loaded_params - - set: Updated set of loaded_params if True else the original set - """ - # load explicit cached KV output scale from quant_config - if quant_config is not None and ( - scale_name := quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - if weight.numel() != 1: - raise ValueError( - f"KV cache scale '{scale_name}' is expected to be a " - f"scalar, but got a tensor of shape {weight.shape}." - ) - # Ensure weight is a scalar before passing to loader. - weight_loader(param, weight.flatten()[0]) - loaded_params.add(scale_name) - return True, loaded_params - - return False, loaded_params - - load_kv_cache_scale_completed, loaded_params = kv_cache_scale_loader( - self.quant_config, - name, - params_dict, - loaded_weight, - default_weight_loader, - loaded_params, - ) - if load_kv_cache_scale_completed: - continue - if ( all(key in name for key in ["input_scale", "mlp.experts"]) and expert_id is not None diff --git a/vllm/model_executor/models/granite.py b/vllm/model_executor/models/granite.py index 4b486ede443..2adc29f8d25 100644 --- a/vllm/model_executor/models/granite.py +++ b/vllm/model_executor/models/granite.py @@ -334,18 +334,6 @@ class GraniteModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/granitemoe.py b/vllm/model_executor/models/granitemoe.py index e3585a6dd74..5909604bd54 100644 --- a/vllm/model_executor/models/granitemoe.py +++ b/vllm/model_executor/models/granitemoe.py @@ -365,19 +365,6 @@ class GraniteMoeModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/granitemoehybrid.py b/vllm/model_executor/models/granitemoehybrid.py index 1ab069e3ba3..4f861fc5016 100644 --- a/vllm/model_executor/models/granitemoehybrid.py +++ b/vllm/model_executor/models/granitemoehybrid.py @@ -495,18 +495,6 @@ class GraniteMoeHybridModel(nn.Module): if "A_log" in n: n = n.replace("A_log", "A") - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(n) - ): - # Loading kv cache quantization scales - loaded_weight = p - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - _load(scale_name, loaded_weight) - loaded_params.add(scale_name) - continue - if _load_quant_expert(n, p): continue diff --git a/vllm/model_executor/models/grok1.py b/vllm/model_executor/models/grok1.py index f06122a7fd1..3fc3d1a2d2c 100644 --- a/vllm/model_executor/models/grok1.py +++ b/vllm/model_executor/models/grok1.py @@ -548,18 +548,6 @@ class Grok1Model(nn.Module): for old_pattern, new_pattern in self.weight_name_remapping.items(): if old_pattern in name: name = name.replace(old_pattern, new_pattern) - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: diff --git a/vllm/model_executor/models/hunyuan_v1.py b/vllm/model_executor/models/hunyuan_v1.py index b900c0ed83e..ec3cfbd017b 100644 --- a/vllm/model_executor/models/hunyuan_v1.py +++ b/vllm/model_executor/models/hunyuan_v1.py @@ -771,15 +771,6 @@ class HunYuanModel(nn.Module, EagleModelMixin): # processed with quantization, LoRA, fine-tuning, etc. if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache scales for compressed-tensors quantization - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - continue is_found = False for param_name, weight_name, shard_id in stacked_params_mapping: diff --git a/vllm/model_executor/models/hy_v3.py b/vllm/model_executor/models/hy_v3.py index bfff84b8049..6142f2c086a 100644 --- a/vllm/model_executor/models/hy_v3.py +++ b/vllm/model_executor/models/hy_v3.py @@ -531,17 +531,6 @@ class HYV3Model(nn.Module): for name, loaded_weight in weights: if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name: # Remapping the name of FP8 kv-scale. name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/hy_v3_mtp.py b/vllm/model_executor/models/hy_v3_mtp.py index 8594a38c3ab..37ace1dada3 100644 --- a/vllm/model_executor/models/hy_v3_mtp.py +++ b/vllm/model_executor/models/hy_v3_mtp.py @@ -264,6 +264,10 @@ class HYV3MTP(nn.Module): return torch.concat((q, k, v)) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): + if self.quant_config is not None and ( + cache_scale_mapper := self.quant_config.get_cache_scale_mapper() + ): + weights = cache_scale_mapper.apply(weights) cla_factor = _get_cla_factor(self.config) stacked_params_mapping = [ # (param_name, shard_name, shard_id) @@ -336,14 +340,6 @@ class HYV3MTP(nn.Module): continue if self.config.tie_word_embeddings and "lm_head.weight" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - continue spec_layer = get_spec_layer_idx_from_weight_name(self.config, name) if spec_layer is None: continue diff --git a/vllm/model_executor/models/hyperclovax.py b/vllm/model_executor/models/hyperclovax.py index 3176c428413..2f54f78e758 100644 --- a/vllm/model_executor/models/hyperclovax.py +++ b/vllm/model_executor/models/hyperclovax.py @@ -395,18 +395,6 @@ class HyperCLOVAXModel(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name or "zero_point" in name: # Remapping the name of FP8 kv-scale or zero point. remapped_name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/iquest_loopcoder.py b/vllm/model_executor/models/iquest_loopcoder.py index 24c004ff4c2..3755cba5d1a 100644 --- a/vllm/model_executor/models/iquest_loopcoder.py +++ b/vllm/model_executor/models/iquest_loopcoder.py @@ -476,18 +476,6 @@ class IQuestLoopCoderModel(nn.Module): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if "gate_projections" in name: continue diff --git a/vllm/model_executor/models/jais2.py b/vllm/model_executor/models/jais2.py index 4e03eb12ee4..dafa0f03ae9 100644 --- a/vllm/model_executor/models/jais2.py +++ b/vllm/model_executor/models/jais2.py @@ -386,16 +386,6 @@ class Jais2Model(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache scales for compressed-tensors quantization - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name: name = maybe_remap_kv_scale_name(name, params_dict) if name is None: diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py index 86d7edc25f9..8a0b7ceea5f 100644 --- a/vllm/model_executor/models/keye.py +++ b/vllm/model_executor/models/keye.py @@ -790,21 +790,6 @@ class KeyeSiglipVisionModel(nn.Module): continue if "head.mlp" in name or "head.probe" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr( - param, - "weight_loader", - default_weight_loader, - ) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for ( param_name, weight_name, diff --git a/vllm/model_executor/models/laguna.py b/vllm/model_executor/models/laguna.py index f79f6097c61..5572481565b 100644 --- a/vllm/model_executor/models/laguna.py +++ b/vllm/model_executor/models/laguna.py @@ -724,20 +724,6 @@ class LagunaModel(nn.Module, EagleModelMixin): loaded_params.add(name) continue - # Handle KV cache quantization scales - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - assert loaded_weight.numel() == 1, ( - f"KV scale numel {loaded_weight.numel()} != 1" - ) - loaded_weight = loaded_weight.squeeze() - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - # Handle stacked params (QKV, gate_up for # non-expert layers and shared_expert) for param_name, weight_name, shard_id in stacked_params_mapping: diff --git a/vllm/model_executor/models/llama.py b/vllm/model_executor/models/llama.py index 3c797d05e93..cf59ccdf750 100644 --- a/vllm/model_executor/models/llama.py +++ b/vllm/model_executor/models/llama.py @@ -451,18 +451,6 @@ class LlamaModel(nn.Module, EagleModelMixin): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name or "zero_point" in name: # Remapping the name of FP8 kv-scale or zero point. name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/llama4.py b/vllm/model_executor/models/llama4.py index bfcb72a6a74..07ca2714ed2 100644 --- a/vllm/model_executor/models/llama4.py +++ b/vllm/model_executor/models/llama4.py @@ -588,21 +588,6 @@ class Llama4Model(LlamaModel): fused_experts_params = True expert_params_mapping = expert_params_mapping_fused - # If kv cache quantization scales exist and the weight name - # corresponds to one of the kv cache quantization scales, load - # them. - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - # Iterate over stacked_params_mapping to check if the current weight # is one of the stacked parameters. If so, load the weight with the # corresponding shard id. Note that MoE weights are handled @@ -625,9 +610,9 @@ class Llama4Model(LlamaModel): if is_pp_missing_parameter(name, self): continue - # Remap kv cache scale names for ModelOpt checkpoints. - # TODO: ModelOpt should implement get_cache_scale() such that - # kv cache scale name remapping can be done there. + # Remap kv cache scale names for any checkpoint format the + # quant config's `get_cache_scale_mapper` does not cover + # (idempotent for names already renamed by the mapper). if name.endswith("scale"): name = maybe_remap_kv_scale_name(name, params_dict) if name is None: diff --git a/vllm/model_executor/models/llama_eagle.py b/vllm/model_executor/models/llama_eagle.py index 585c8f6dbd2..14842a75fea 100644 --- a/vllm/model_executor/models/llama_eagle.py +++ b/vllm/model_executor/models/llama_eagle.py @@ -127,19 +127,6 @@ class LlamaModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - # Handle kv cache quantization scales - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue # Remapping the name FP8 kv-scale or zero point. if "scale" in name or "zero_point" in name: name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/llama_eagle3.py b/vllm/model_executor/models/llama_eagle3.py index 9fd6652aa24..bb1bbb85537 100644 --- a/vllm/model_executor/models/llama_eagle3.py +++ b/vllm/model_executor/models/llama_eagle3.py @@ -267,19 +267,6 @@ class LlamaModel(nn.Module): for name, loaded_weight in weights: if "midlayer." in name: name = name.replace("midlayer.", "layers.0.") - # Handle kv cache quantization scales - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue # Remapping the name FP8 kv-scale or zero point. if "scale" in name or "zero_point" in name: name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/mimo.py b/vllm/model_executor/models/mimo.py index a7699f0d598..4f67d468ace 100644 --- a/vllm/model_executor/models/mimo.py +++ b/vllm/model_executor/models/mimo.py @@ -104,18 +104,6 @@ class MiMoModel(Qwen2Model): continue if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/mimo_v2.py b/vllm/model_executor/models/mimo_v2.py index 3f466162649..7c6d5363c0a 100644 --- a/vllm/model_executor/models/mimo_v2.py +++ b/vllm/model_executor/models/mimo_v2.py @@ -555,22 +555,6 @@ class MiMoV2Model(nn.Module): if "mtp" in name: continue - if self.quant_config is not None: - cache_scale_name = self.quant_config.get_cache_scale(name) - if cache_scale_name is not None and cache_scale_name in params_dict: - param = params_dict[cache_scale_name] - weight_loader = getattr( - param, "weight_loader", default_weight_loader - ) - - kv_scale = loaded_weight - if kv_scale.dim() > 0 and kv_scale.numel() > 1: - kv_scale = kv_scale.view(-1)[0] - - weight_loader(param, kv_scale) - loaded_params.add(cache_scale_name) - continue - expert_matched = False for param_name, weight_name, expert_id, shard_id in expert_params_mapping: if weight_name not in name: diff --git a/vllm/model_executor/models/mixtral.py b/vllm/model_executor/models/mixtral.py index cbfc254dda3..53c1c87cfce 100644 --- a/vllm/model_executor/models/mixtral.py +++ b/vllm/model_executor/models/mixtral.py @@ -388,19 +388,6 @@ class MixtralModel(nn.Module): loaded_params: set[str] = set() expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/nemotron.py b/vllm/model_executor/models/nemotron.py index 15d43a9ddf9..7b2e6b93b27 100644 --- a/vllm/model_executor/models/nemotron.py +++ b/vllm/model_executor/models/nemotron.py @@ -375,18 +375,6 @@ class NemotronModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/nemotron_nas.py b/vllm/model_executor/models/nemotron_nas.py index f2f3811c064..b974a3eb085 100644 --- a/vllm/model_executor/models/nemotron_nas.py +++ b/vllm/model_executor/models/nemotron_nas.py @@ -334,18 +334,6 @@ class DeciModel(nn.Module): # Models trained using ColossalAI may include these tensors in # the checkpoint. Skip them. continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name or "zero_point" in name: # Remapping the name of FP8 kv-scale. name = maybe_remap_kv_scale_name(name, params_dict) diff --git a/vllm/model_executor/models/ouro.py b/vllm/model_executor/models/ouro.py index 56505ec7be2..503d4b5c834 100644 --- a/vllm/model_executor/models/ouro.py +++ b/vllm/model_executor/models/ouro.py @@ -390,18 +390,6 @@ class OuroModel(nn.Module): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py index cd88009c739..0bf10e3ce77 100644 --- a/vllm/model_executor/models/paddleocr_vl.py +++ b/vllm/model_executor/models/paddleocr_vl.py @@ -944,21 +944,6 @@ class SiglipVisionModel(nn.Module): continue if "packing_position_embedding" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr( - param, - "weight_loader", - default_weight_loader, - ) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for ( param_name, weight_name, diff --git a/vllm/model_executor/models/phimoe.py b/vllm/model_executor/models/phimoe.py index 5770420ce56..a49e8ce2e82 100644 --- a/vllm/model_executor/models/phimoe.py +++ b/vllm/model_executor/models/phimoe.py @@ -537,19 +537,6 @@ class PhiMoEModel(nn.Module): loaded_params: set[str] = set() expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen2.py b/vllm/model_executor/models/qwen2.py index b83fedc70db..9c39c649708 100644 --- a/vllm/model_executor/models/qwen2.py +++ b/vllm/model_executor/models/qwen2.py @@ -439,18 +439,6 @@ class Qwen2Model(nn.Module, EagleModelMixin): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/qwen3_dflash.py b/vllm/model_executor/models/qwen3_dflash.py index 25f139f26cb..820260f795c 100644 --- a/vllm/model_executor/models/qwen3_dflash.py +++ b/vllm/model_executor/models/qwen3_dflash.py @@ -469,17 +469,6 @@ class DFlashQwen3Model(nn.Module): for name, loaded_weight in weights: if "midlayer." in name: name = name.replace("midlayer.", "layers.0.") - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name: name = maybe_remap_kv_scale_name(name, params_dict) if name is None: diff --git a/vllm/model_executor/models/qwen3_moe.py b/vllm/model_executor/models/qwen3_moe.py index 4ec1be3367d..6980184cc8a 100644 --- a/vllm/model_executor/models/qwen3_moe.py +++ b/vllm/model_executor/models/qwen3_moe.py @@ -552,19 +552,6 @@ class Qwen3MoeModel(nn.Module, EagleModelMixin): loaded_params: set[str] = set() expert_params_mapping = self.get_expert_mapping() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - assert loaded_weight.numel() == 1, ( - f"KV scale numel {loaded_weight.numel()} != 1" - ) - loaded_weight = loaded_weight.squeeze() - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue if "scale" in name or "zero_point" in name: name = maybe_remap_kv_scale_name(name, params_dict) if name is None: diff --git a/vllm/model_executor/models/rnj1.py b/vllm/model_executor/models/rnj1.py index f83577b7a39..68c3722e2bc 100644 --- a/vllm/model_executor/models/rnj1.py +++ b/vllm/model_executor/models/rnj1.py @@ -350,16 +350,6 @@ class Rnj1Model(nn.Module): ): loaded_weight -= 1 - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = loaded_weight[0] - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue - if name.endswith((".k_scale", ".v_scale", ".q_scale", ".prob_scale")): remapped_name = maybe_remap_kv_scale_name(name, params_dict) if remapped_name is not None and remapped_name in params_dict: diff --git a/vllm/model_executor/models/seed_oss.py b/vllm/model_executor/models/seed_oss.py index d90174911fb..48147f7334e 100644 --- a/vllm/model_executor/models/seed_oss.py +++ b/vllm/model_executor/models/seed_oss.py @@ -376,18 +376,6 @@ class SeedOssModel(nn.Module): for name, loaded_weight in weights: if "rotary_emb.inv_freq" in name: continue - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/solar.py b/vllm/model_executor/models/solar.py index bff866d0d0c..454a0e97112 100644 --- a/vllm/model_executor/models/solar.py +++ b/vllm/model_executor/models/solar.py @@ -360,18 +360,6 @@ class SolarModel(nn.Module): params_dict = dict(self.named_parameters()) loaded_params: set[str] = set() for name, loaded_weight in weights: - if self.quant_config is not None and ( - scale_name := self.quant_config.get_cache_scale(name) - ): - # Loading kv cache quantization scales - param = params_dict[scale_name] - weight_loader = getattr(param, "weight_loader", default_weight_loader) - loaded_weight = ( - loaded_weight if loaded_weight.dim() == 0 else loaded_weight[0] - ) - weight_loader(param, loaded_weight) - loaded_params.add(scale_name) - continue for param_name, weight_name, shard_id in stacked_params_mapping: if weight_name not in name: continue diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 095d0e363d5..83d113415dc 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -52,6 +52,7 @@ class WeightsMapper: def __or__(self, other: "WeightsMapper") -> "WeightsMapper": """Combine two `WeightsMapper`s by merging their mappings.""" return WeightsMapper( + orig_to_new_regex={**self.orig_to_new_regex, **other.orig_to_new_regex}, orig_to_new_substr={**self.orig_to_new_substr, **other.orig_to_new_substr}, orig_to_new_prefix={**self.orig_to_new_prefix, **other.orig_to_new_prefix}, orig_to_new_suffix={**self.orig_to_new_suffix, **other.orig_to_new_suffix}, @@ -343,6 +344,20 @@ class AutoWeightsLoader: *, mapper: WeightsMapper | None = None, ) -> set[str]: + # Many models store quant_config in the base model instead of the causal model. + # We look at the causal model's direct children for this reason. + modules = (self.module, *self.module.children()) + iterator = (m.quant_config for m in modules if hasattr(m, "quant_config")) + quant_config = next(iterator, None) + cache_scale_mapper = ( + quant_config.get_cache_scale_mapper() if quant_config is not None else None + ) + if cache_scale_mapper is not None: + mapper = ( + mapper | cache_scale_mapper + if mapper is not None + else cache_scale_mapper + ) if mapper is not None: weights = mapper.apply(weights) # filter out weights with first-prefix/substr to skip in name diff --git a/vllm/models/deepseek_v4/attention.py b/vllm/models/deepseek_v4/attention.py index 4bdf6acc079..29302584880 100644 --- a/vllm/models/deepseek_v4/attention.py +++ b/vllm/models/deepseek_v4/attention.py @@ -330,10 +330,34 @@ class DeepseekV4Attention(nn.Module, AttentionLayerBase, ABC): device=hidden_states.device, ) + # Metadata-independent input GEMMs + RMSNorm stay in the captured + # graph; the metadata-dependent rest (q up-proj + kv-insert, indexer, + # compressor, MLA attention) runs in the eager break. + qr_kv, kv_score, indexer_kv_score, indexer_weights = ( + self.attn_gemm_parallel_execute(hidden_states) + ) + qr, kv = qr_kv.split([self.q_lora_rank, self.head_dim], dim=-1) + qr, kv = fused_q_kv_rmsnorm( + qr, + kv, + self.q_norm.weight.data, + self.kv_norm.weight.data, + self.eps, + ) + # attention_impl is wrapped with @eager_break_during_capture: this is # where the breakable cudagraph capture breaks (the attention op runs # eagerly between captured graph segments). - self.attention_impl(hidden_states, positions, o_padded) + self.attention_impl( + hidden_states, + qr, + kv, + kv_score, + indexer_kv_score, + indexer_weights, + positions, + o_padded, + ) o = o_padded[:, : self.n_local_heads, :] # Inverse-RoPE + wo_a + wo_b output projection (platform-specific). @@ -403,25 +427,17 @@ class DeepseekV4Attention(nn.Module, AttentionLayerBase, ABC): def attention_impl( self, hidden_states: torch.Tensor, + qr: torch.Tensor, + kv: torch.Tensor, + kv_score: torch.Tensor, + indexer_kv_score: torch.Tensor, + indexer_weights: torch.Tensor, positions: torch.Tensor, out: torch.Tensor, # [num_tokens, padded_heads, head_dim], written in place ) -> None: forward_context = get_forward_context() attn_metadata = forward_context.attn_metadata - qr_kv, kv_score, indexer_kv_score, indexer_weights = ( - self.attn_gemm_parallel_execute(hidden_states) - ) - - qr, kv = qr_kv.split([self.q_lora_rank, self.head_dim], dim=-1) - qr, kv = fused_q_kv_rmsnorm( - qr, - kv, - self.q_norm.weight.data, - self.kv_norm.weight.data, - self.eps, - ) - # wq_b + kv_insert (+ MLA compressor when an indexer is present) ride # on the default stream so q stays on its consumer stream (forward_mqa # downstream reads q on default). Indexer/compressor go on aux for diff --git a/vllm/platforms/cpu.py b/vllm/platforms/cpu.py index cf4319ac722..b1414665869 100644 --- a/vllm/platforms/cpu.py +++ b/vllm/platforms/cpu.py @@ -92,6 +92,11 @@ class CpuPlatform(Platform): return meminfo.total_memory + @classmethod + def mem_get_info(cls) -> tuple[int, int]: + meminfo = get_memory_node_info() + return meminfo.available_memory, meminfo.total_memory + @classmethod def set_device(cls, device: torch.device) -> None: """ diff --git a/vllm/reasoning/cohere_command_reasoning_parser.py b/vllm/reasoning/cohere_command_reasoning_parser.py index b28a59089e7..949c9ff5d99 100644 --- a/vllm/reasoning/cohere_command_reasoning_parser.py +++ b/vllm/reasoning/cohere_command_reasoning_parser.py @@ -90,7 +90,7 @@ MODEL_TO_TAG_STYLE: dict[str, CohereTagStyle] = { tools=COMMAND_A_TOOLS_TAG, ), "Cohere2MoeForCausalLM": CohereTagStyle( - json_tags=(COMMAND_A_JSON_TAG,), + json_tags=(COMMAND_A_JSON_TAG, COMMAND_A_PLUS_JSON_TAG), tools=COMMAND_A_TOOLS_TAG, ), }