From 079a4cf399ad548d442fd92bfffbfbe460b66133 Mon Sep 17 00:00:00 2001 From: Jackmin801 <56836461+Jackmin801@users.noreply.github.com> Date: Thu, 23 Apr 2026 23:05:49 -0700 Subject: [PATCH] [MoE] Move cutlass moe to fused_moe/experts/ (#40574) Signed-off-by: Jackmin801 Co-authored-by: Claude --- benchmarks/kernels/benchmark_cutlass_moe_fp8.py | 2 +- benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py | 2 +- benchmarks/kernels/benchmark_grouped_gemm_cutlass.py | 2 +- docs/design/moe_kernel_features.md | 4 ++-- tests/kernels/moe/modular_kernel_tools/mk_objects.py | 4 +++- tests/kernels/moe/test_cutlass_moe.py | 2 +- tests/kernels/moe/test_nvfp4_moe.py | 2 +- vllm/model_executor/layers/fused_moe/__init__.py | 8 ++++---- .../layers/fused_moe/{ => experts}/cutlass_moe.py | 0 vllm/model_executor/layers/fused_moe/oracle/fp8.py | 2 +- vllm/model_executor/layers/fused_moe/oracle/nvfp4.py | 2 +- .../model_executor/layers/fused_moe/triton_cutlass_moe.py | 2 +- .../compressed_tensors_moe_w4a4_mxfp4.py | 4 ++-- .../compressed_tensors_moe_w4a8_fp8.py | 2 +- 14 files changed, 20 insertions(+), 18 deletions(-) rename vllm/model_executor/layers/fused_moe/{ => experts}/cutlass_moe.py (100%) diff --git a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py index 3f80b024e10..03d7fb386f7 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_fp8.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_fp8.py @@ -16,7 +16,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( maybe_make_prepare_finalize, ) from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk from vllm.platforms import current_platform from vllm.utils.argparse_utils import 
FlexibleArgumentParser diff --git a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py index 2d4afd38c09..7379bf85888 100644 --- a/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py +++ b/benchmarks/kernels/benchmark_cutlass_moe_nvfp4.py @@ -22,7 +22,7 @@ from vllm.model_executor.layers.fused_moe.config import ( fp8_w8a8_moe_quant_config, nvfp4_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.cutlass_moe import ( +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassExpertsFp4, ) from vllm.model_executor.layers.fused_moe.fused_moe import fused_experts, fused_topk diff --git a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py index dd4060bbdb9..04fc2960d1e 100644 --- a/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py +++ b/benchmarks/kernels/benchmark_grouped_gemm_cutlass.py @@ -13,7 +13,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( maybe_make_prepare_finalize, ) from vllm.model_executor.layers.fused_moe.config import fp8_w8a8_moe_quant_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fused_moe import ( fused_experts, fused_topk, diff --git a/docs/design/moe_kernel_features.md b/docs/design/moe_kernel_features.md index 231bca3646f..4e3706645ef 100644 --- a/docs/design/moe_kernel_features.md +++ b/docs/design/moe_kernel_features.md @@ -83,8 +83,8 @@ To be used with a particular `FusedMoEPrepareAndFinalizeModular` subclass, MoE k | triton | standard | all1 | G,A,T | silu, gelu,
swigluoai,
silu_no_mul,
gelu_no_mul | Y | Y | [`fused_experts`][vllm.model_executor.layers.fused_moe.fused_moe.fused_experts],
[`TritonExperts`][vllm.model_executor.layers.fused_moe.fused_moe.TritonExperts] | | triton (batched) | batched | all1 | G,A,T | silu, gelu | 6 | Y | [`BatchedTritonExperts`][vllm.model_executor.layers.fused_moe.fused_batched_moe.BatchedTritonExperts] | | deep gemm | standard,
batched | fp8 | G(128),A,T | silu, gelu | 6 | Y |
[`DeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe.DeepGemmExperts],
[`BatchedDeepGemmExperts`][vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe.BatchedDeepGemmExperts] | -| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp4] | -| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassExpertsFp8],
[`CutlasBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.cutlass_moe.CutlassBatchedExpertsFp8] | +| cutlass_fp4 | standard,
batched | nvfp4 | A,T | silu | Y | Y | [`CutlassExpertsFp4`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp4] | +| cutlass_fp8 | standard,
batched | fp8 | A,T | silu, gelu | Y | Y | [`CutlassExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassExpertsFp8],
[`CutlassBatchedExpertsFp8`][vllm.model_executor.layers.fused_moe.experts.cutlass_moe.CutlassBatchedExpertsFp8] | | flashinfer | standard | nvfp4,
fp8 | T | 5 | N | Y | [`FlashInferExperts`][vllm.model_executor.layers.fused_moe.flashinfer_cutlass_moe.FlashInferExperts] | | gpt oss triton | standard | N/A | N/A | 5 | Y | Y | [`triton_kernel_fused_experts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.triton_kernel_fused_experts],
[`OAITritonExperts`][vllm.model_executor.layers.fused_moe.experts.gpt_oss_triton_kernels_moe.OAITritonExperts] | | marlin | standard,
batched | 3 / N/A | 3 / N/A | silu,
swigluoai | Y | Y | [`fused_marlin_moe`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.fused_marlin_moe],
[`MarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.MarlinExperts],
[`BatchedMarlinExperts`][vllm.model_executor.layers.fused_moe.fused_marlin_moe.BatchedMarlinExperts] | diff --git a/tests/kernels/moe/modular_kernel_tools/mk_objects.py b/tests/kernels/moe/modular_kernel_tools/mk_objects.py index 23ddc7011ac..812164ea287 100644 --- a/tests/kernels/moe/modular_kernel_tools/mk_objects.py +++ b/tests/kernels/moe/modular_kernel_tools/mk_objects.py @@ -367,7 +367,9 @@ else: CutlassExpertsFp8 = None if cutlass_fp4_supported(): - from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp4 + from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( + CutlassExpertsFp4, + ) register_experts( CutlassExpertsFp4, diff --git a/tests/kernels/moe/test_cutlass_moe.py b/tests/kernels/moe/test_cutlass_moe.py index e06672f41d0..a613e7d2e29 100644 --- a/tests/kernels/moe/test_cutlass_moe.py +++ b/tests/kernels/moe/test_cutlass_moe.py @@ -21,7 +21,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, fp8_w8a8_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.cutlass_moe import ( +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassExpertsFp8, run_cutlass_moe_fp8, ) diff --git a/tests/kernels/moe/test_nvfp4_moe.py b/tests/kernels/moe/test_nvfp4_moe.py index e12659729c9..e2a6cd1a7dc 100644 --- a/tests/kernels/moe/test_nvfp4_moe.py +++ b/tests/kernels/moe/test_nvfp4_moe.py @@ -19,7 +19,7 @@ from vllm.model_executor.layers.fused_moe.all2all_utils import ( maybe_make_prepare_finalize, ) from vllm.model_executor.layers.fused_moe.config import nvfp4_moe_quant_config -from vllm.model_executor.layers.fused_moe.cutlass_moe import ( +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassExpertsFp4, ) from vllm.model_executor.layers.fused_moe.prepare_finalize import ( diff --git a/vllm/model_executor/layers/fused_moe/__init__.py b/vllm/model_executor/layers/fused_moe/__init__.py index 1d273bd31e4..75a9faddc1f 100644 --- 
a/vllm/model_executor/layers/fused_moe/__init__.py +++ b/vllm/model_executor/layers/fused_moe/__init__.py @@ -73,15 +73,15 @@ __all__ = [ if HAS_TRITON: # import to register the custom ops - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import ( + BatchedDeepGemmExperts, + ) + from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassBatchedExpertsFp8, CutlassExpertsFp8, CutlassExpertsW4A8Fp8, cutlass_moe_w4a8_fp8, ) - from vllm.model_executor.layers.fused_moe.experts.batched_deep_gemm_moe import ( - BatchedDeepGemmExperts, - ) from vllm.model_executor.layers.fused_moe.experts.deep_gemm_moe import ( DeepGemmExperts, ) diff --git a/vllm/model_executor/layers/fused_moe/cutlass_moe.py b/vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py similarity index 100% rename from vllm/model_executor/layers/fused_moe/cutlass_moe.py rename to vllm/model_executor/layers/fused_moe/experts/cutlass_moe.py diff --git a/vllm/model_executor/layers/fused_moe/oracle/fp8.py b/vllm/model_executor/layers/fused_moe/oracle/fp8.py index ca13d0d901d..2e75e6f4ae7 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/fp8.py +++ b/vllm/model_executor/layers/fused_moe/oracle/fp8.py @@ -173,7 +173,7 @@ def backend_to_kernel_cls( return [TritonOrCutlassExperts] elif backend == Fp8MoeBackend.BATCHED_VLLM_CUTLASS: - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassBatchedExpertsFp8, ) diff --git a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py index 724f6d5399b..48e48a97ef9 100644 --- a/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py +++ b/vllm/model_executor/layers/fused_moe/oracle/nvfp4.py @@ -107,7 +107,7 @@ def backend_to_kernel_cls( return [FlashInferCuteDSLBatchedExperts] elif backend == NvFp4MoeBackend.VLLM_CUTLASS: - 
from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassExpertsFp4, ) diff --git a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py index 4aa396d24b0..70431878932 100644 --- a/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py +++ b/vllm/model_executor/layers/fused_moe/triton_cutlass_moe.py @@ -10,7 +10,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEConfig, FusedMoEQuantConfig, ) -from vllm.model_executor.layers.fused_moe.cutlass_moe import CutlassExpertsFp8 +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import CutlassExpertsFp8 from vllm.model_executor.layers.fused_moe.fallback import FallbackExperts from vllm.model_executor.layers.fused_moe.fused_moe import TritonExperts from vllm.platforms import current_platform diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py index 9d3e0e7a787..629e1c5ef1b 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a4_mxfp4.py @@ -14,7 +14,7 @@ from vllm.model_executor.layers.fused_moe.config import ( FusedMoEQuantConfig, mxfp4_moe_quant_config, ) -from vllm.model_executor.layers.fused_moe.cutlass_moe import ( +from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( CutlassExpertsMxfp4, ) from vllm.model_executor.layers.fused_moe.fused_marlin_moe import ( @@ -149,7 +149,7 @@ class CompressedTensorsW4A4Mxfp4MoEMethod(CompressedTensorsMoEMethod): if self.use_cutlass_mxfp4: # Swizzle weight scales from flat checkpoint layout [E, N, 
K//32] # to CUTLASS tiled layout [E, numMTiles*numKTiles*512]. - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( swizzle_mxfp4_scales, ) diff --git a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py index ab805591dee..b14571fe501 100644 --- a/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py +++ b/vllm/model_executor/layers/quantization/compressed_tensors/compressed_tensors_moe/compressed_tensors_moe_w4a8_fp8.py @@ -315,7 +315,7 @@ class CompressedTensorsW4A8Fp8MoEMethod(CompressedTensorsMoEMethod): ) assert self.moe_quant_config is not None - from vllm.model_executor.layers.fused_moe.cutlass_moe import ( + from vllm.model_executor.layers.fused_moe.experts.cutlass_moe import ( cutlass_moe_w4a8_fp8, )