mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
0a62f5eec9
Signed-off-by: Hissu Hyvarinen <hissu.hyvarinen@amd.com>
186 lines
6.2 KiB
Python
186 lines
6.2 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""Tests for Cutlass W4A16 (Machete) kernel on Hopper.
|
|
|
|
Verifies that W4A16 quantized models loaded through vllm select the
|
|
MacheteLinearKernel on sm_90 GPUs, that weights are correctly repacked,
|
|
and that inference produces valid output.
|
|
|
|
Run `pytest tests/quantization/test_cutlass_w4a16.py`.
|
|
"""
|
|
|
|
import pytest
|
|
import torch
|
|
|
|
from vllm.platforms import current_platform
|
|
|
|
if not current_platform.has_device_capability(90) or current_platform.is_rocm():
|
|
pytest.skip(
|
|
"Machete W4A16 requires Hopper (sm_90).",
|
|
allow_module_level=True,
|
|
)
|
|
|
|
from vllm.model_executor.kernels.linear import (
|
|
MPLinearLayerConfig,
|
|
choose_mp_linear_kernel,
|
|
)
|
|
from vllm.model_executor.kernels.linear.mixed_precision import (
|
|
MacheteLinearKernel,
|
|
)
|
|
from vllm.model_executor.layers.quantization.compressed_tensors.compressed_tensors import ( # noqa: E501
|
|
CompressedTensorsLinearMethod,
|
|
CompressedTensorsWNA16,
|
|
)
|
|
from vllm.scalar_type import scalar_types
|
|
|
|
|
|
@pytest.fixture(scope="function", autouse=True)
|
|
def enable_pickle(monkeypatch):
|
|
"""`LLM.apply_model` requires pickling a function."""
|
|
monkeypatch.setenv("VLLM_ALLOW_INSECURE_SERIALIZATION", "1")
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"act_type,weight_type,group_size,zero_points",
|
|
[
|
|
(torch.float16, scalar_types.uint4b8, 128, False),
|
|
(torch.bfloat16, scalar_types.uint4b8, 128, False),
|
|
(torch.float16, scalar_types.uint4, 128, True),
|
|
(torch.float16, scalar_types.uint4b8, -1, False),
|
|
],
|
|
ids=[
|
|
"fp16-gptq-g128",
|
|
"bf16-gptq-g128",
|
|
"fp16-awq-g128",
|
|
"fp16-channelwise",
|
|
],
|
|
)
|
|
def test_machete_kernel_selected(act_type, weight_type, group_size, zero_points):
|
|
"""Verify choose_mp_linear_kernel picks MacheteLinearKernel."""
|
|
config = MPLinearLayerConfig(
|
|
full_weight_shape=(4096, 4096),
|
|
partition_weight_shape=(4096, 4096),
|
|
act_type=act_type,
|
|
weight_type=weight_type,
|
|
group_size=group_size,
|
|
zero_points=zero_points,
|
|
has_g_idx=False,
|
|
)
|
|
kernel = choose_mp_linear_kernel(config)
|
|
assert kernel is MacheteLinearKernel, (
|
|
f"Expected MacheteLinearKernel, got {kernel.__name__}"
|
|
)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"full_shape,part_shape,weight_type,group_size,has_g_idx,expected_reason",
|
|
[
|
|
((4096, 4096), (2048, 4096), scalar_types.uint4b8, 128, True, "Act reordering"),
|
|
(
|
|
(4096, 4096),
|
|
(4096, 4096),
|
|
scalar_types.float6_e3m2f,
|
|
128,
|
|
False,
|
|
"Quant type",
|
|
),
|
|
((4096, 4096), (4096, 4096), scalar_types.uint4b8, 32, False, "Group size"),
|
|
],
|
|
ids=["partitioned-g_idx", "unsupported-quant-type", "unsupported-group-size"],
|
|
)
|
|
def test_machete_rejects_invalid_config(
|
|
full_shape, part_shape, weight_type, group_size, has_g_idx, expected_reason
|
|
):
|
|
"""Verify Machete rejects unsupported configurations."""
|
|
config = MPLinearLayerConfig(
|
|
full_weight_shape=full_shape,
|
|
partition_weight_shape=part_shape,
|
|
act_type=torch.float16,
|
|
weight_type=weight_type,
|
|
group_size=group_size,
|
|
zero_points=False,
|
|
has_g_idx=has_g_idx,
|
|
)
|
|
can_impl, reason = MacheteLinearKernel.can_implement(config)
|
|
assert not can_impl
|
|
assert expected_reason in reason
|
|
|
|
|
|
def test_kernel_selection_with_disabled_machete(monkeypatch):
|
|
"""Verify kernel selection falls back when Machete is disabled."""
|
|
monkeypatch.setattr("vllm.envs.VLLM_DISABLED_KERNELS", ["MacheteLinearKernel"])
|
|
|
|
config = MPLinearLayerConfig(
|
|
full_weight_shape=(4096, 4096),
|
|
partition_weight_shape=(4096, 4096),
|
|
act_type=torch.float16,
|
|
weight_type=scalar_types.uint4b8,
|
|
group_size=128,
|
|
zero_points=False,
|
|
has_g_idx=False,
|
|
)
|
|
kernel = choose_mp_linear_kernel(config)
|
|
assert kernel is not MacheteLinearKernel, "MacheteLinearKernel should be disabled"
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"model_name",
|
|
[
|
|
"nm-testing/tinyllama-oneshot-w4a16-channel-v2",
|
|
"nm-testing/TinyLlama-1.1B-Chat-v1.0-W4A16-G128-Asym-Updated-ActOrder",
|
|
],
|
|
)
|
|
def test_w4a16_machete_e2e(vllm_runner, model_name):
|
|
"""Load a W4A16 model, verify Machete kernel is used, and generate."""
|
|
with vllm_runner(model_name, enforce_eager=True, gpu_memory_utilization=0.5) as llm:
|
|
|
|
def check_model(model):
|
|
layer = model.model.layers[0]
|
|
qkv_proj = layer.self_attn.qkv_proj
|
|
|
|
assert isinstance(qkv_proj.quant_method, CompressedTensorsLinearMethod)
|
|
assert isinstance(qkv_proj.scheme, CompressedTensorsWNA16)
|
|
assert isinstance(qkv_proj.scheme.kernel, MacheteLinearKernel), (
|
|
f"Expected MacheteLinearKernel on Hopper, "
|
|
f"got {type(qkv_proj.scheme.kernel).__name__}"
|
|
)
|
|
|
|
assert hasattr(qkv_proj, "weight_packed")
|
|
assert hasattr(qkv_proj, "weight_scale")
|
|
assert qkv_proj.weight_packed.dtype == torch.int32
|
|
|
|
llm.apply_model(check_model)
|
|
|
|
output = llm.generate_greedy("Hello my name is", max_tokens=10)
|
|
assert output
|
|
assert len(output[0][1]) > 0
|
|
|
|
|
|
def test_w4a16_machete_bfloat16_deterministic(vllm_runner):
|
|
"""Verify Machete works with bf16 activations and is deterministic."""
|
|
model_name = "nm-testing/tinyllama-oneshot-w4a16-channel-v2"
|
|
prompt = "The capital of France is"
|
|
|
|
with vllm_runner(
|
|
model_name,
|
|
enforce_eager=True,
|
|
dtype="bfloat16",
|
|
gpu_memory_utilization=0.5,
|
|
) as llm:
|
|
|
|
def check_kernel_type(model):
|
|
layer = model.model.layers[0]
|
|
scheme = layer.self_attn.qkv_proj.scheme
|
|
assert isinstance(scheme.kernel, MacheteLinearKernel), (
|
|
f"Expected MacheteLinearKernel with bf16, "
|
|
f"got {type(scheme.kernel).__name__}"
|
|
)
|
|
|
|
llm.apply_model(check_kernel_type)
|
|
|
|
out1 = llm.generate_greedy(prompt, max_tokens=10)
|
|
out2 = llm.generate_greedy(prompt, max_tokens=10)
|
|
assert out1[0][1] == out2[0][1], (
|
|
f"Non-deterministic: '{out1[0][1]}' vs '{out2[0][1]}'"
|
|
)
|