From a3a5a5ece59b31e1e218c3fe47ce02a90522b4a3 Mon Sep 17 00:00:00 2001 From: Chaojun Zhang Date: Tue, 2 Jun 2026 11:09:21 +0800 Subject: [PATCH] [XPU][Bugfix] Fix per_token_group_fp8_quant missing dummy args on XPU (#43930) Signed-off-by: Chaojun,Zhang Co-authored-by: Kunshang Ji --- vllm/_xpu_ops.py | 11 ++++++++++- .../layers/quantization/utils/fp8_utils.py | 8 +------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/vllm/_xpu_ops.py b/vllm/_xpu_ops.py index 0df0338a648..6d6a85d26d1 100644 --- a/vllm/_xpu_ops.py +++ b/vllm/_xpu_ops.py @@ -338,7 +338,16 @@ def _xpu_mxfp8_quantize_impl( shape = x.shape[:-1] + (x.shape[-1] // MXFP8_BLOCK_SIZE,) x_s = torch.empty(shape, device=x.device, dtype=torch.float32) torch.ops._C.per_token_group_fp8_quant( - x, x_q, x_s, MXFP8_BLOCK_SIZE, eps, fp8_min, fp8_max, True + x, + x_q, + x_s, + MXFP8_BLOCK_SIZE, + eps, + fp8_min, + fp8_max, + True, + False, + False, # dummy_is_scale_transposed, dummy_is_tma_aligned ) x_s = x_s.to(torch.float8_e8m0fnu) return x_q, x_s diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py index 8b20c13a97f..d715b944940 100644 --- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py @@ -575,7 +575,7 @@ def per_token_group_quant_fp8( # prefer CUDA/XPU kernel if available # TODO(bnell): this causes some fp8 moe test to fail. - if current_platform.is_cuda() and x.is_contiguous(): + if (current_platform.is_cuda() or current_platform.is_xpu()) and x.is_contiguous(): torch.ops._C.per_token_group_fp8_quant( x, x_q, @@ -590,12 +590,6 @@ def per_token_group_quant_fp8( ) return x_q, x_s - if current_platform.is_xpu() and x.is_contiguous(): - torch.ops._C.per_token_group_fp8_quant( - x, x_q, x_s, group_size, eps, fp8_min, fp8_max, use_ue8m0 - ) - return x_q, x_s - # TRITON FALLBACK M = x.numel() // group_size N = group_size