mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
135 lines
5.1 KiB
Python
135 lines
5.1 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""Unit tests for QuantizationConfigArgs parsing."""
|
|
|
|
import pytest
|
|
|
|
from vllm.config.quantization import (
|
|
QUANT_KEY_NAMES,
|
|
QuantizationConfigArgs,
|
|
QuantSpec,
|
|
resolve_quantization_config,
|
|
)
|
|
from vllm.model_executor.layers.quantization.utils.quant_utils import (
|
|
kFp8Dynamic128Sym,
|
|
kFp8DynamicTokenSym,
|
|
kFp8Static128BlockSym,
|
|
kFp8StaticTensorSym,
|
|
kInt8StaticChannelSym,
|
|
kMxfp8Dynamic,
|
|
)
|
|
|
|
# ---- QuantSpec ------------------------------------------------------------
|
|
|
|
|
|
def test_quant_spec_resolves_string_to_quant_key():
    """String names given to ``QuantSpec`` come back as quant key objects."""
    resolved = QuantSpec(weight="mxfp8", activation="fp8_per_token")

    # Each field is checked on its own so a failure points at the bad slot.
    weight_key = resolved.weight
    assert weight_key == kMxfp8Dynamic

    activation_key = resolved.activation
    assert activation_key == kFp8DynamicTokenSym
|
|
|
|
|
|
def test_quant_spec_accepts_quant_key_directly():
    """A quant key object passed as ``weight`` is kept as the same object."""
    given_key = kFp8StaticTensorSym
    built = QuantSpec(weight=given_key)

    # Identity check: the stored weight must be the very object passed in.
    assert built.weight is given_key
    # ``activation`` was not supplied, so it is expected to stay None.
    assert built.activation is None
|
|
|
|
|
|
def test_quant_spec_rejects_unknown_name():
    """An unrecognized weight name makes ``QuantSpec`` raise ``ValueError``."""
    bogus_name = "not_a_real_format"
    # The error message must match "unknown quantization name".
    with pytest.raises(ValueError, match="unknown quantization name"):
        QuantSpec(weight=bogus_name)
|
|
|
|
|
|
# ---- QuantizationConfigArgs string shorthand on linear/moe ----------------
|
|
|
|
|
|
def test_args_linear_string_resolves_via_quant_key_names():
    """A ``QUANT_KEY_NAMES`` string on ``linear`` becomes a weight-only spec."""
    parsed = QuantizationConfigArgs(linear="fp8_per_block_static")

    # The bare name is shorthand for QuantSpec(weight=<key>).
    want_linear = QuantSpec(weight=kFp8Static128BlockSym)
    assert parsed.linear == want_linear
    # ``moe`` was not supplied, so it is expected to stay None.
    assert parsed.moe is None
|
|
|
|
|
|
def test_args_moe_string_resolves_via_online_shorthand():
    """An online-shorthand string on ``moe`` resolves to a ``QuantSpec``."""
    # Per-slot strings pull the matching slot from _ONLINE_SHORTHANDS, so
    # `linear: "fp8_per_block"` and `moe: "fp8_per_block"` give the same
    # per-layer-kind spec that `--quantization fp8_per_block` would.
    parsed = QuantizationConfigArgs(moe="fp8_per_block")

    want_moe = QuantSpec(weight=kFp8Static128BlockSym)
    assert parsed.moe == want_moe
|
|
|
|
|
|
def test_args_string_shorthand_missing_slot_raises():
    """A shorthand lacking a spec for the requested slot raises ``ValueError``."""
    # int8_per_channel_weight_only defines only a `moe` spec, so naming it
    # on `linear` must fail loudly instead of silently doing nothing.
    moe_only_shorthand = "int8_per_channel_weight_only"
    with pytest.raises(ValueError, match="does not define a linear spec"):
        QuantizationConfigArgs(linear=moe_only_shorthand)
|
|
|
|
|
|
def test_args_accepts_dict_form():
    """A dict given for ``moe`` is converted to the equivalent ``QuantSpec``."""
    moe_overrides = {"activation": "mxfp8"}
    parsed = QuantizationConfigArgs(moe=moe_overrides)

    # Only `activation` was given, so `weight` is expected to be None.
    assert parsed.moe == QuantSpec(weight=None, activation=kMxfp8Dynamic)
|
|
|
|
|
|
# ---- resolve_quantization_config -----------------------------------------
|
|
|
|
|
|
def test_resolve_shorthand_only_populates_both_slots():
    """With only the ``fp8_per_block`` shorthand, linear and moe are both set."""
    resolved = resolve_quantization_config("fp8_per_block", None)

    # Both slots are expected to hold an equal weight-only block-FP8 spec.
    block_fp8 = QuantSpec(weight=kFp8Static128BlockSym)
    assert resolved.linear == block_fp8
    assert resolved.moe == block_fp8
|
|
|
|
|
|
def test_resolve_int8_shorthand_leaves_linear_unset():
    """The int8 weight-only shorthand yields a moe spec and no linear spec."""
    resolved = resolve_quantization_config("int8_per_channel_weight_only", None)

    # This shorthand is MoE-only. `linear` staying None is what lets
    # OnlineQuantizationConfig leave Linear layers in full precision.
    assert resolved.linear is None
    assert resolved.moe == QuantSpec(weight=kInt8StaticChannelSym)
|
|
|
|
|
|
def test_resolve_quantization_config_only():
    """Without a shorthand, the explicit ``quantization_config`` stands alone."""
    # Covers e.g. an already-quantized checkpoint that only needs an
    # activation override: the given config is returned as-is.
    explicit_config = {"moe": {"activation": "mxfp8"}}
    resolved = resolve_quantization_config(None, explicit_config)

    assert resolved.linear is None
    assert resolved.moe == QuantSpec(weight=None, activation=kMxfp8Dynamic)
|
|
|
|
|
|
def test_resolve_merges_explicit_over_shorthand():
    """Explicit slots override the shorthand; missing slots inherit from it."""
    shorthand = "fp8_per_tensor"
    explicit_config = {"linear": "fp8_per_block"}
    resolved = resolve_quantization_config(shorthand, explicit_config)

    # `linear` was given explicitly, so it wins over the shorthand.
    assert resolved.linear == QuantSpec(weight=kFp8Static128BlockSym)
    # `moe` was not given, so it falls back to the shorthand's slot.
    assert resolved.moe == QuantSpec(weight=kFp8StaticTensorSym)
|
|
|
|
|
|
def test_resolve_rejects_quantization_config_with_non_shorthand_quant():
    """A config combined with a non-shorthand method raises ``ValueError``."""
    # When --quantization is not an online shorthand, quantization_config
    # is not accepted through this path; checkpoint quant paths read it
    # directly off ModelConfig instead.
    explicit_config = {"linear": "fp8_per_block"}
    with pytest.raises(ValueError, match="quantization_config is only supported"):
        resolve_quantization_config("gptq", explicit_config)
|
|
|
|
|
|
# ---- QUANT_KEY_NAMES coverage --------------------------------------------
|
|
|
|
|
|
def test_quant_key_names_round_trip():
    """Every ``QUANT_KEY_NAMES`` entry resolves to its key in both spec fields."""
    # Each advertised name must be accepted by QuantSpec without error and
    # come back as the QuantKey it maps to, as a weight and as an activation.
    for alias, quant_key in QUANT_KEY_NAMES.items():
        as_weight = QuantSpec(weight=alias)
        assert as_weight.weight == quant_key, alias

        as_activation = QuantSpec(activation=alias)
        assert as_activation.activation == quant_key, alias
|
|
|
|
|
|
def test_static_block_weight_paired_with_dynamic_block_activation():
    """Pin the keys behind the block-FP8 weight/activation name pair."""
    # Block-FP8 pairs 128x128 static weights with 1x128 dynamic activations.
    # Pinned here so a rename in QUANT_KEY_NAMES cannot quietly rewire the
    # kernel dispatch.
    pair = QuantSpec(
        weight="fp8_per_block_static",
        activation="fp8_per_block_dynamic",
    )

    assert pair.weight == kFp8Static128BlockSym
    assert pair.activation == kFp8Dynamic128Sym
|