diff --git a/.github/mergify.yml b/.github/mergify.yml index baf65e14a88..b96d6b81ac0 100644 --- a/.github/mergify.yml +++ b/.github/mergify.yml @@ -262,7 +262,7 @@ pull_request_rules: - files~=^docker/Dockerfile.xpu - files~=^\\.buildkite/intel_jobs/ - files=\.buildkite/ci_config_intel.yaml - - files=vllm/model_executor/layers/fused_moe/xpu_fused_moe.py + - files=vllm/model_executor/layers/fused_moe/experts/xpu_moe.py - files=vllm/model_executor/kernels/linear/mixed_precision/xpu.py - files=vllm/model_executor/kernels/linear/mxfp8/xpu.py - files=vllm/model_executor/kernels/linear/scaled_mm/xpu.py diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index a154ede547b..1d273bd31e4 100644 --- a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -85,6 +85,11 @@ if HAS_TRITON: from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import ( DeepGemmExperts, ) + from vllm.model_executor.layers.fused_moe.experts.xpu_moe import ( + XPUExperts, + XPUExpertsFp8, + XPUExpertsMXFp4, + ) from vllm.model_executor.layers.fused_moe.fused_batched_moe import ( BatchedTritonExperts, ) @@ -106,10 +111,6 @@ if HAS_TRITON: from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import ( TritonOrDeepGemmExperts, ) - from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( - XPUExperts, - XPUExpertsFp8, - ) __all__ += [ "AiterExperts", @@ -129,6 +130,7 @@ if HAS_TRITON: "TritonOrDeepGemmExperts", "XPUExperts", "XPUExpertsFp8", + "XPUExpertsMXFp4", ] else: # Some model classes directly use the custom ops. Add placeholders diff --git a/vllm/model_executor/layers/fused_moe/xpu_fused_moe.py b/vllm/model_executor/layers/fused_moe/experts/xpu_moe.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/xpu_fused_moe.py rename to vllm/model_executor/layers/fused_moe/experts/xpu_moe.py diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index 584c2bf7928..ca13d0d901d 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -180,7 +180,7 @@ def backend_to_kernel_cls( return [CutlassBatchedExpertsFp8] elif backend == Fp8MoeBackend.XPU: - from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( + from vllm.model_executor.layers.fused_moe.experts.xpu_moe import ( XPUExpertsFp8, ) @@ -470,7 +470,7 @@ def convert_to_fp8_moe_kernel_format( is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM), ) elif fp8_backend == Fp8MoeBackend.XPU: - from vllm.model_executor.layers.fused_moe.xpu_fused_moe import ( + from vllm.model_executor.layers.fused_moe.experts.xpu_moe import ( prepare_fp8_moe_layer_for_xpu, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py index 6306d0e2e9d..9d2c9f8baff 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/mxfp4.py @@ -141,7 +141,7 @@ def backend_to_kernel_cls( return [AiterExperts] elif backend == Mxfp4MoeBackend.XPU: - from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExpertsMXFp4 + from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExpertsMXFp4 return [XPUExpertsMXFp4] diff --git a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py index cdfd6bb8c02..00fe914ad9d 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/unquantized.py +++ b/vllm/model_executor/layers/fused_moe/oracle/unquantized.py @@ -121,7 +121,7 @@ def backend_to_kernel_cls( return BatchedTritonExperts elif backend == UnquantizedMoeBackend.XPU: - from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExperts + from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExperts return XPUExperts