diff --git a/vllm/model_executor/layers/quantization/humming.py b/vllm/model_executor/layers/quantization/humming.py index 12bb07a4022..e4d27efe370 100644 --- a/vllm/model_executor/layers/quantization/humming.py +++ b/vllm/model_executor/layers/quantization/humming.py @@ -810,8 +810,8 @@ class HummingMoEMethod(FusedMoEMethodBase): param = torch.nn.Parameter(tensor, requires_grad=False) setattr(layer, name, param) - layer.weight_schemas[sublayer_name] = weight_schema - layer.input_schemas[sublayer_name] = input_schema + layer.weight_schemas[sublayer_name] = weight_schema + layer.input_schemas[sublayer_name] = input_schema # force requant (origin quant setting -> fp16/bf16 -> new_quant setting) assert isinstance(weight_schema, HummingWeightSchema) @@ -865,6 +865,7 @@ class HummingMoEMethod(FusedMoEMethodBase): # use moe modular experts: HummingIndexedExperts | HummingGroupedExperts + layer.ensure_moe_quant_config_init() assert self.moe_quant_config is not None if get_humming_moe_gemm_type() == "indexed": experts = HummingIndexedExperts(layer, self.moe, self.moe_quant_config)