mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[MoE] Move xpu moe to fused_moe/experts/ (#40568)
Signed-off-by: Jackmin801 <ongjackm@gmail.com> Co-authored-by: Claude <noreply@anthropic.com> Co-authored-by: Kunshang Ji <kunshang.ji@intel.com>
This commit is contained in:
+1
-1
@@ -262,7 +262,7 @@ pull_request_rules:
|
||||
- files~=^docker/Dockerfile.xpu
|
||||
- files~=^\\.buildkite/intel_jobs/
|
||||
- files=\.buildkite/ci_config_intel.yaml
|
||||
- files=vllm/model_executor/layers/fused_moe/xpu_fused_moe.py
|
||||
- files=vllm/model_executor/layers/fused_moe/experts/xpu_moe.py
|
||||
- files=vllm/model_executor/kernels/linear/mixed_precision/xpu.py
|
||||
- files=vllm/model_executor/kernels/linear/mxfp8/xpu.py
|
||||
- files=vllm/model_executor/kernels/linear/scaled_mm/xpu.py
|
||||
|
||||
@@ -85,6 +85,11 @@ if HAS_TRITON:
|
||||
from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import (
|
||||
DeepGemmExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
|
||||
XPUExperts,
|
||||
XPUExpertsFp8,
|
||||
XPUExpertsMXFp4,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.fused_batched_moe import (
|
||||
BatchedTritonExperts,
|
||||
)
|
||||
@@ -106,10 +111,6 @@ if HAS_TRITON:
|
||||
from vllm.model_executor.layers.fused_moe.triton_deep_gemm_moe import (
|
||||
TritonOrDeepGemmExperts,
|
||||
)
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
|
||||
XPUExperts,
|
||||
XPUExpertsFp8,
|
||||
)
|
||||
|
||||
__all__ += [
|
||||
"AiterExperts",
|
||||
@@ -129,6 +130,7 @@ if HAS_TRITON:
|
||||
"TritonOrDeepGemmExperts",
|
||||
"XPUExperts",
|
||||
"XPUExpertsFp8",
|
||||
"XPUExpertsMXFp4",
|
||||
]
|
||||
else:
|
||||
# Some model classes directly use the custom ops. Add placeholders
|
||||
|
||||
@@ -180,7 +180,7 @@ def backend_to_kernel_cls(
|
||||
return [CutlassBatchedExpertsFp8]
|
||||
|
||||
elif backend == Fp8MoeBackend.XPU:
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
|
||||
from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
|
||||
XPUExpertsFp8,
|
||||
)
|
||||
|
||||
@@ -470,7 +470,7 @@ def convert_to_fp8_moe_kernel_format(
|
||||
is_trtllm=(fp8_backend == Fp8MoeBackend.FLASHINFER_TRTLLM),
|
||||
)
|
||||
elif fp8_backend == Fp8MoeBackend.XPU:
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import (
|
||||
from vllm.model_executor.layers.fused_moe.experts.xpu_moe import (
|
||||
prepare_fp8_moe_layer_for_xpu,
|
||||
)
|
||||
|
||||
|
||||
@@ -141,7 +141,7 @@ def backend_to_kernel_cls(
|
||||
return [AiterExperts]
|
||||
|
||||
elif backend == Mxfp4MoeBackend.XPU:
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExpertsMXFp4
|
||||
from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExpertsMXFp4
|
||||
|
||||
return [XPUExpertsMXFp4]
|
||||
|
||||
|
||||
@@ -121,7 +121,7 @@ def backend_to_kernel_cls(
|
||||
return BatchedTritonExperts
|
||||
|
||||
elif backend == UnquantizedMoeBackend.XPU:
|
||||
from vllm.model_executor.layers.fused_moe.xpu_fused_moe import XPUExperts
|
||||
from vllm.model_executor.layers.fused_moe.experts.xpu_moe import XPUExperts
|
||||
|
||||
return XPUExperts
|
||||
|
||||
|
||||
Reference in New Issue
Block a user