From a3a5a5ece59b31e1e218c3fe47ce02a90522b4a3 Mon Sep 17 00:00:00 2001
From: Chaojun Zhang <chaojun.zhang@intel.com>
Date: Tue, 2 Jun 2026 11:09:21 +0800
Subject: [PATCH] [XPU][Bugfix] Fix per_token_group_fp8_quant missing dummy
 args on XPU (#43930)

Signed-off-by: Chaojun,Zhang <chaojun.zhang@intel.com>
Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
---
 vllm/_xpu_ops.py                                      | 11 ++++++++++-
 .../layers/quantization/utils/fp8_utils.py            |  8 +-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/vllm/_xpu_ops.py b/vllm/_xpu_ops.py
index 0df0338a648..6d6a85d26d1 100644
--- a/vllm/_xpu_ops.py
+++ b/vllm/_xpu_ops.py
@@ -338,7 +338,16 @@ def _xpu_mxfp8_quantize_impl(
     shape = x.shape[:-1] + (x.shape[-1] // MXFP8_BLOCK_SIZE,)
     x_s = torch.empty(shape, device=x.device, dtype=torch.float32)
     torch.ops._C.per_token_group_fp8_quant(
-        x, x_q, x_s, MXFP8_BLOCK_SIZE, eps, fp8_min, fp8_max, True
+        x,
+        x_q,
+        x_s,
+        MXFP8_BLOCK_SIZE,
+        eps,
+        fp8_min,
+        fp8_max,
+        True,
+        False,  # dummy_is_scale_transposed
+        False,  # dummy_is_tma_aligned
     )
     x_s = x_s.to(torch.float8_e8m0fnu)
     return x_q, x_s
diff --git a/vllm/model_executor/layers/quantization/utils/fp8_utils.py b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
index 8b20c13a97f..d715b944940 100644
--- a/vllm/model_executor/layers/quantization/utils/fp8_utils.py
+++ b/vllm/model_executor/layers/quantization/utils/fp8_utils.py
@@ -575,7 +575,7 @@ def per_token_group_quant_fp8(
 
     # prefer CUDA/XPU kernel if available
     # TODO(bnell): this causes some fp8 moe test to fail.
-    if current_platform.is_cuda() and x.is_contiguous():
+    if (current_platform.is_cuda() or current_platform.is_xpu()) and x.is_contiguous():
         torch.ops._C.per_token_group_fp8_quant(
             x,
             x_q,
@@ -590,12 +590,6 @@ def per_token_group_quant_fp8(
         )
         return x_q, x_s
 
-    if current_platform.is_xpu() and x.is_contiguous():
-        torch.ops._C.per_token_group_fp8_quant(
-            x, x_q, x_s, group_size, eps, fp8_min, fp8_max, use_ue8m0
-        )
-        return x_q, x_s
-
     # TRITON FALLBACK
     M = x.numel() // group_size
     N = group_size