mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[ROCm][CI] Add missing quantization methods and fix online quant test failures (#39801)
Signed-off-by: Andreas Karatzas <akaratza@amd.com>
This commit is contained in:
@@ -54,13 +54,13 @@ MODEL_ARG_EXPTYPES = [
|
||||
(
|
||||
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ",
|
||||
None,
|
||||
"awq_marlin" if current_platform.is_cuda() else "awq",
|
||||
"awq_marlin" if current_platform.is_cuda_alike() else "awq",
|
||||
),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "awq", "awq"),
|
||||
(
|
||||
"TheBloke/OpenHermes-2.5-Mistral-7B-AWQ",
|
||||
"marlin",
|
||||
"awq_marlin" if current_platform.is_cuda() else "ERROR",
|
||||
"awq_marlin" if current_platform.is_cuda_alike() else "ERROR",
|
||||
),
|
||||
("TheBloke/OpenHermes-2.5-Mistral-7B-AWQ", "gptq", "ERROR"),
|
||||
]
|
||||
|
||||
@@ -43,7 +43,7 @@ TEST_CONFIGS = {
|
||||
"amd/Qwen3-8B-WMXFP4FP8-AMXFP4FP8-AMP-KVFP8": {"arc_challenge": 0.52, "mmlu": 0.72},
|
||||
# Non-mixed-precision (PTQ) model
|
||||
# - Reference for pipeline compatibility verification -> No conflicts or breakings
|
||||
"amd/Llama-2-70b-chat-hf-FP8-MLPerf-fp8_attn_quark_format": {
|
||||
"amd/Llama-2-70b-chat-hf_FP8_MLPerf_V2": {
|
||||
"arc_challenge": 0.53,
|
||||
"mmlu": 0.61,
|
||||
},
|
||||
|
||||
@@ -352,7 +352,10 @@ def generate_rotation_matrix(d: int, seed: int, device: str = "cpu") -> torch.Te
|
||||
gen = torch.Generator(device="cpu")
|
||||
gen.manual_seed(seed)
|
||||
G = torch.randn(d, d, generator=gen, device="cpu", dtype=torch.float32)
|
||||
Q, R = torch.linalg.qr(G)
|
||||
# torch.linalg.qr on CPU requires LAPACK, which some torch wheels
|
||||
# (ROCm) ship without. Run QR on accelerator instead
|
||||
qr_device = "cuda" if torch.cuda.is_available() else "cpu"
|
||||
Q, R = torch.linalg.qr(G.to(qr_device))
|
||||
diag_sign = torch.sign(torch.diag(R))
|
||||
diag_sign[diag_sign == 0] = 1.0
|
||||
Q = Q * diag_sign.unsqueeze(0)
|
||||
|
||||
@@ -80,14 +80,13 @@ def _test_online_quant_peak_mem_impl(
|
||||
print(f"GPU memory used after loading weights: {model_memory_gib} GiB")
|
||||
print(f"Peak GPU memory usage while loading weights: {peak_memory_gib} GiB")
|
||||
|
||||
# model specific, allenai/OLMoE-1B-7B-0125-Instruct fp8 online quant
|
||||
# uses 6.65 GiB for weight loading (bf16 checkpoint is ~12.89 GiB)
|
||||
expected_model_memory_gib = 6.7
|
||||
|
||||
# for allenai/OLMoE-1B-7B-0125-Instruct the number we see today is 9.06
|
||||
# GiB, which is 1.36x above model_memory_gib. A slightly higher number is
|
||||
# expected as when we load and quantize weights in a streaming fashion we
|
||||
# need to have individual weights in bf16 + fp8 alive at the same time.
|
||||
# GiB on CUDA, which is 1.36x above model_memory_gib. A slightly higher
|
||||
# number is expected as when we load and quantize weights in a streaming
|
||||
# fashion we need to have individual weights in bf16 + fp8 alive at the
|
||||
# same time.
|
||||
expected_peak_memory_gib = expected_model_memory_gib * 1.4
|
||||
|
||||
assert model_memory_gib < expected_model_memory_gib, (
|
||||
|
||||
@@ -144,13 +144,21 @@ class RowWiseTorchFP8ScaledMMLinearKernel(TorchFP8ScaledMMLinearKernel):
|
||||
# For CUDA platform please validate if the torch._scaled_mm supports
|
||||
# rowwise scaled GEMM before using it
|
||||
|
||||
# torch._scaled_mm rowwise requires scale_a = (m, 1), scale_b = (1, n).
|
||||
# CompressedTensors stores weight_scale as (n, 1), so `.t()` yields (1, n).
|
||||
# ModelOpt FP8_PER_CHANNEL_PER_TOKEN stores it as 1-D (n,); reshape to
|
||||
# (1, n) so both paths satisfy the rowwise contract.
|
||||
scale_b = Bs.view(1, -1) if Bs.dim() == 1 else Bs.t()
|
||||
if As.dim() == 1:
|
||||
As = As.view(-1, 1)
|
||||
|
||||
# Fused GEMM_DQ Rowwise GEMM
|
||||
output = torch._scaled_mm(
|
||||
A,
|
||||
B,
|
||||
out_dtype=out_dtype,
|
||||
scale_a=As,
|
||||
scale_b=Bs.t(),
|
||||
scale_b=scale_b,
|
||||
bias=bias,
|
||||
)
|
||||
|
||||
|
||||
@@ -1025,9 +1025,15 @@ class QuarkOCP_MX_MoEMethod(QuarkMoEMethod):
|
||||
get_current_vllm_config().model_config.hf_config, "model_type", None
|
||||
)
|
||||
|
||||
# TODO(aiter): extend once rocm_aiter_fused_experts gains dispatch
|
||||
# for the other OCP MX schemes. Today its CK MoE kernel only has an
|
||||
# entry for `w_mxfp4` (w4a16); mixed schemes like `w_mxfp4_a_mxfp6_*`
|
||||
# fall through to QuantMethod.NO and raise "Unsupported kernel config
|
||||
# for moe heuristic dispatch".
|
||||
_AITER_NATIVE_OCP_MX_SCHEMES = ("w_mxfp4",)
|
||||
self.emulate = (
|
||||
not current_platform.supports_mx()
|
||||
or not self.ocp_mx_scheme.startswith("w_mxfp4")
|
||||
or self.ocp_mx_scheme not in _AITER_NATIVE_OCP_MX_SCHEMES
|
||||
) and (
|
||||
self.mxfp4_backend is Mxfp4MoeBackend.NONE or not self.use_rocm_aiter_moe
|
||||
)
|
||||
|
||||
@@ -376,11 +376,15 @@ class QuarkOCP_MX(QuarkScheme):
|
||||
dq_w = self.dequant_func(layer.weight, layer.weight_scale, x.dtype)
|
||||
qdq_x = self.quant_dequant_func(x)
|
||||
return F.linear(qdq_x, dq_w, bias)
|
||||
else:
|
||||
return torch.ops.vllm.gemm_with_dynamic_quant(
|
||||
x,
|
||||
layer.weight,
|
||||
layer.weight_scale,
|
||||
self.rocm_use_aiter_fp4_asm_gemm,
|
||||
self.out_dtype,
|
||||
)
|
||||
y = torch.ops.vllm.gemm_with_dynamic_quant(
|
||||
x,
|
||||
layer.weight,
|
||||
layer.weight_scale,
|
||||
self.rocm_use_aiter_fp4_asm_gemm,
|
||||
self.out_dtype,
|
||||
)
|
||||
# gemm_with_dynamic_quant has no bias argument; add it here so the
|
||||
# native path matches F.linear (e.g. qkv_proj with qkv_bias=True).
|
||||
if bias is not None:
|
||||
y = y + bias
|
||||
return y
|
||||
|
||||
@@ -65,7 +65,7 @@ class BaseModelLoader(ABC):
|
||||
|
||||
# Log peak GPU memory after loading weights. This is needed
|
||||
# to have test coverage on peak memory for online quantization.
|
||||
if current_platform.is_cuda():
|
||||
if current_platform.is_cuda_alike():
|
||||
peak_memory = torch.accelerator.max_memory_allocated()
|
||||
logger.debug_once(
|
||||
"Peak GPU memory after loading weights: %s GiB",
|
||||
|
||||
+10
-3
@@ -414,10 +414,17 @@ class RocmPlatform(Platform):
|
||||
"gguf",
|
||||
"quark",
|
||||
"mxfp4",
|
||||
"gpt_oss_mxfp4",
|
||||
"mxfp8",
|
||||
"torchao",
|
||||
"bitsandbytes",
|
||||
"modelopt",
|
||||
"modelopt_fp4",
|
||||
"modelopt_mxfp8",
|
||||
"modelopt_mixed",
|
||||
"fp8_per_tensor",
|
||||
"fp8_per_block",
|
||||
"online",
|
||||
"gpt_oss_mxfp4",
|
||||
]
|
||||
|
||||
@classmethod
|
||||
@@ -785,9 +792,9 @@ class RocmPlatform(Platform):
|
||||
def get_current_memory_usage(
|
||||
cls, device: torch.types.Device | None = None
|
||||
) -> float:
|
||||
torch.cuda.empty_cache()
|
||||
torch.cuda.reset_peak_memory_stats(device)
|
||||
free_mem, total_mem = torch.cuda.mem_get_info(device)
|
||||
return total_mem - free_mem
|
||||
return torch.cuda.max_memory_allocated(device)
|
||||
|
||||
@classmethod
|
||||
def get_device_communicator_cls(cls) -> str:
|
||||
|
||||
Reference in New Issue
Block a user