diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py b/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py
index c7f5b22a4a..651306f765 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/communication/communication_factory.py
@@ -154,7 +154,7 @@ class CommunicationFactory:
             logger.debug(f"NVLinkTwoSided not available: {e}")
 
         # Try DeepEP (if enabled and weight dtype is bfloat16)
-        if os.environ.get("TRTLLM_CAN_USE_DEEP_EP", "0") == "1" and act_dtype == torch.bfloat16:
+        if os.environ.get("TRTLLM_CAN_USE_DEEP_EP", "1") == "1" and act_dtype == torch.bfloat16:
             try:
                 strategy = DeepEP(
                     mapping,
diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep.py b/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep.py
index a8d71a1d6b..4f50b9d12d 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep.py
@@ -81,8 +81,6 @@ class DeepEP(Communication):
         """
         Check if DeepEP is supported on the current platform
         """
-        if os.environ.get("TRTLLM_CAN_USE_DEEP_EP", "0") != "1":
-            return False
         return deep_ep_installed
 
     @staticmethod
@@ -145,6 +143,13 @@ class DeepEP(Communication):
         """
         all_rank_max_num_tokens = max(all_rank_num_tokens)
 
+        # DeepEP C++ kernel requires topk_weights (token_final_scales) to be float32,
+        # but downstream backends (e.g. TRTLLM) may require the original dtype.
+        # Convert to float32 for dispatch, then restore afterward.
+        original_scales_dtype = token_final_scales.dtype if token_final_scales is not None else None
+        if token_final_scales is not None and token_final_scales.dtype != torch.float32:
+            token_final_scales = token_final_scales.to(torch.float32)
+
         if not self.supports_post_quant_dispatch():
             # Pre-quant dispatch (unquantized data)
             (
@@ -215,6 +220,14 @@ class DeepEP(Communication):
                 "padded": padded,
             }
 
+        # Restore token_final_scales to original dtype for downstream consumers
+        if (
+            token_final_scales is not None
+            and original_scales_dtype is not None
+            and token_final_scales.dtype != original_scales_dtype
+        ):
+            token_final_scales = token_final_scales.to(original_scales_dtype)
+
         return hidden_states, hidden_states_sf, token_selected_slots, token_final_scales
 
     def combine(
diff --git a/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py b/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py
index 656f4957fe..8fdcf7c330 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/communication/deep_ep_low_latency.py
@@ -37,6 +37,15 @@ class DeepEPLowLatency(Communication):
     DeepEP Low Latency strategy supporting both pre-quant and post-quant
     """
 
+    SUPPORTED_HIDDEN_SIZES = {2048, 2560, 3584, 4096, 5120, 6144, 7168}
+    """set[int]: Hidden sizes supported by the low-latency DeepEP kernel (SWITCH_HIDDEN in launch.cuh)."""
+
+    SUPPORTED_HIDDEN_SIZES_EXTENSION = {4096, 6144, 7168}
+    """set[int]: Hidden sizes supported by extension kernels (nvfp4 post-quant/low-precision combine).
+
+    Sourced from SWITCH_HIDDEN_FOR_EXTENSION_KERNELS in extension_kernels.cu.
+    """
+
     def __init__(
         self,
         mapping: Mapping,
@@ -51,6 +60,13 @@ class DeepEPLowLatency(Communication):
     ):
         super().__init__(mapping)
 
+        # Validate hidden_size against kernel constraints
+        if hidden_size not in self.SUPPORTED_HIDDEN_SIZES:
+            raise RuntimeError(
+                f"DeepEPLowLatency does not support hidden_size={hidden_size}. "
+                f"Supported hidden sizes: {sorted(self.SUPPORTED_HIDDEN_SIZES)}"
+            )
+
         # Store needed parameters
         self.num_slots = num_slots
         self.hidden_size = hidden_size
@@ -86,8 +102,6 @@ class DeepEPLowLatency(Communication):
         """
         Check if DeepEP Low Latency is supported on the current platform
         """
-        if os.environ.get("TRTLLM_CAN_USE_DEEP_EP", "0") != "1":
-            return False
         if not deep_ep_installed:
             return False
         return True
@@ -95,15 +109,35 @@ class DeepEPLowLatency(Communication):
     def supports_post_quant_dispatch(self) -> bool:
         """
         DeepEP Low Latency supports post-quant for: fp8_qdq, nvfp4, w4afp8
+
+        Note: nvfp4 post-quant dispatch uses extension kernels which require
+        hidden_size in SUPPORTED_HIDDEN_SIZES_EXTENSION.
+
+        Note: fp8_qdq and w4afp8 post-quant dispatch views fp8 (1 byte) as
+        bf16 (2 bytes) via .view(torch.bfloat16), halving the hidden dimension.
+        The halved dimension must be in SUPPORTED_HIDDEN_SIZES for the dispatch
+        kernel (SWITCH_HIDDEN in internode_ll.cu) to work.
         """
         if not self.enable_postquant_alltoall:
             return False
-        return self._has_nvfp4() or self._has_fp8_qdq() or self._has_w4afp8()
+        if self._has_nvfp4():
+            # nvfp4 dispatch uses extension kernels with stricter hidden_size requirement
+            return self.hidden_size in self.SUPPORTED_HIDDEN_SIZES_EXTENSION
+        if self._has_fp8_qdq() or self._has_w4afp8():
+            # fp8/w4afp8 post-quant dispatch views fp8 (1 byte) as bf16 (2 bytes),
+            # halving the hidden dimension. The kernel must support the halved size.
+            return (self.hidden_size // 2) in self.SUPPORTED_HIDDEN_SIZES
+        return False
 
     def supports_low_precision_combine(self) -> bool:
         """
         DeepEP Low Latency supports low-precision combine for: fp8_qdq, nvfp4, w4afp8
+
+        Note: low-precision combine uses extension kernels which require
+        hidden_size in SUPPORTED_HIDDEN_SIZES_EXTENSION.
         """
+        if self.hidden_size not in self.SUPPORTED_HIDDEN_SIZES_EXTENSION:
+            return False
         return self._has_nvfp4() or self._has_fp8_qdq() or self._has_w4afp8()
 
     def is_workload_feasible(self, all_rank_num_tokens: List[int], num_chunks: int) -> bool:
diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
index b99de2086d..d1b7024c88 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py
@@ -100,25 +100,25 @@ class ConfigurableMoE(MoE):
         cls,
         quant_algo,
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ):
         """
         ConfigurableMoE is a wrapper class that delegates to specific backends.
 
         To check capability, query the specific backend class directly:
-        - CutlassFusedMoE.can_implement(quant_algo, dtype_activation, gptoss_style)
-        - TRTLLMGenFusedMoE.can_implement(quant_algo, dtype_activation, gptoss_style)
+        - CutlassFusedMoE.can_implement(quant_algo, dtype_activation, swiglu_gptoss_style)
+        - TRTLLMGenFusedMoE.can_implement(quant_algo, dtype_activation, swiglu_gptoss_style)
         - etc.
 
         Args:
             quant_algo: The quantization algorithm to check (None for unquantized)
             dtype_activation: The activation data type
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled
 
         Returns:
             Tuple[bool, Optional[str]]: Always returns (False, reason)
         """
-        del quant_algo, dtype_activation, gptoss_style  # Unused - wrapper class
+        del quant_algo, dtype_activation, swiglu_gptoss_style  # Unused - wrapper class
         return False, (
             "ConfigurableMoE is a wrapper class. "
             "Query the specific backend (CutlassFusedMoE, TRTLLMGenFusedMoE, etc.) directly."
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
index ca3e6c1a20..b15abc4ccc 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
@@ -318,7 +318,7 @@ class CuteDslFusedMoE(CutlassFusedMoE):
         cls,
         quant_algo: Optional[QuantAlgo],
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ) -> Tuple[bool, Optional[str]]:
         """
         Check if CuteDslFusedMoE can implement the given quantization algorithm.
@@ -327,14 +327,14 @@ class CuteDslFusedMoE(CutlassFusedMoE):
         - NVFP4: SM in {100, 103}
 
         Does NOT support unquantized mode. Output dtype is hardcoded to bfloat16.
-        Does NOT support gptoss_style (bias/swiglu with custom alpha/beta/limit).
+        Does NOT support swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit).
 
         Args:
             quant_algo: The quantization algorithm to check (None for unquantized)
             dtype_activation: The activation input data type. Only bfloat16 is supported
                 because output dtype is hardcoded to bfloat16 (input/output dtype must match).
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
-                CuteDslFusedMoE does NOT support gptoss_style.
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
+                CuteDslFusedMoE does NOT support swiglu_gptoss_style.
 
         Returns:
             Tuple[bool, Optional[str]]: (can_implement, skip_reason)
@@ -360,10 +360,10 @@ class CuteDslFusedMoE(CutlassFusedMoE):
             return _warn_and_return(
                 "CuteDslFusedMoE does not support unquantized mode")
 
-        # CuteDslFusedMoE does NOT support gptoss_style
-        if gptoss_style:
+        # CuteDslFusedMoE does NOT support swiglu_gptoss_style
+        if swiglu_gptoss_style:
             return _warn_and_return(
-                "CuteDslFusedMoE does not support gptoss_style (bias/swiglu with custom alpha/beta/limit)"
+                "CuteDslFusedMoE does not support swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit)"
             )
 
         # NVFP4 - SM in {100, 103}
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
index ff23a103bb..83aae9a06a 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -113,15 +113,15 @@ class CutlassFusedMoE(MoE):
         },
     }
 
-    # Quantization algorithms that support gptoss_style
     _GPTOSS_SUPPORTED_ALGOS = {QuantAlgo.W4A8_MXFP4_MXFP8}
+    """set[QuantAlgo]: Quantization algorithms that support swiglu_gptoss_style."""
 
     @classmethod
     def can_implement(
         cls,
         quant_algo: Optional[QuantAlgo],
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ) -> Tuple[bool, Optional[str]]:
         """
         Check if CutlassFusedMoE can implement the given quantization algorithm.
@@ -145,8 +145,8 @@ class CutlassFusedMoE(MoE):
                 - FP8/FP8_BLOCK_SCALES/W4A8_MXFP4_FP8: float16, bfloat16, float32
                 - NVFP4: float16, bfloat16, float8_e4m3fn
                 - W4A16_MXFP4/W4A8_AWQ/W8A16/W4A8_MXFP4_MXFP8: float16, bfloat16
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
-                CutlassFusedMoE only supports gptoss_style for W4A8_MXFP4_MXFP8 quantization.
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
+                CutlassFusedMoE only supports swiglu_gptoss_style for W4A8_MXFP4_MXFP8 quantization.
 
         Returns:
             Tuple[bool, Optional[str]]: (can_implement, skip_reason)
@@ -160,10 +160,10 @@ class CutlassFusedMoE(MoE):
             return _warn_and_return(
                 f"CutlassFusedMoE requires SM >= 80, got SM{sm_version}")
 
-        # Check gptoss_style support
-        if gptoss_style and quant_algo not in cls._GPTOSS_SUPPORTED_ALGOS:
+        # Check swiglu_gptoss_style support
+        if swiglu_gptoss_style and quant_algo not in cls._GPTOSS_SUPPORTED_ALGOS:
             return _warn_and_return(
-                f"CutlassFusedMoE gptoss_style only supports W4A8_MXFP4_MXFP8 "
+                f"CutlassFusedMoE swiglu_gptoss_style only supports W4A8_MXFP4_MXFP8 "
                 f"(got quant_algo={quant_algo})")
 
         # Check if quant_algo is supported
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
index 671df5285e..f1e88cf743 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
@@ -382,7 +382,7 @@ class DeepGemmFusedMoE(CutlassFusedMoE):
         cls,
         quant_algo: Optional[QuantAlgo],
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ) -> Tuple[bool, Optional[str]]:
         """
         Check if DeepGemmFusedMoE can implement the given quantization algorithm.
@@ -391,15 +391,15 @@ class DeepGemmFusedMoE(CutlassFusedMoE):
         - FP8_BLOCK_SCALES: SM in {100, 103}
 
         Does NOT support unquantized mode. Output dtype is hardcoded to bfloat16.
-        Does NOT support gptoss_style (bias/swiglu with custom alpha/beta/limit).
+        Does NOT support swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit).
 
         Args:
             quant_algo: The quantization algorithm to check (None for unquantized)
             dtype_activation: The activation input data type. Supported types are
                 float32, bfloat16, and float16 (required by moe_permute_op kernel).
                 Note: Output dtype is always bfloat16 regardless of input dtype.
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
-                DeepGemmFusedMoE does NOT support gptoss_style.
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
+                DeepGemmFusedMoE does NOT support swiglu_gptoss_style.
 
         Returns:
             Tuple[bool, Optional[str]]: (can_implement, skip_reason)
@@ -425,10 +425,10 @@ class DeepGemmFusedMoE(CutlassFusedMoE):
             return _warn_and_return(
                 "DeepGemmFusedMoE does not support unquantized mode")
 
-        # DeepGemmFusedMoE does NOT support gptoss_style
-        if gptoss_style:
+        # DeepGemmFusedMoE does NOT support swiglu_gptoss_style
+        if swiglu_gptoss_style:
             return _warn_and_return(
-                "DeepGemmFusedMoE does not support gptoss_style (bias/swiglu with custom alpha/beta/limit)"
+                "DeepGemmFusedMoE does not support swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit)"
             )
 
         # Only FP8_BLOCK_SCALES is supported
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
index 68ec51c18a..fbb9491353 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
@@ -1283,12 +1283,12 @@ class TritonFusedMoE(MoE):
         cls,
         quant_algo: Optional["QuantAlgo"],
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ) -> Tuple[bool, Optional[str]]:
         """
         Check if TritonFusedMoE can implement the given quantization algorithm.
 
-        TritonFusedMoE supports (SM90 only, gptoss_style=True only):
+        TritonFusedMoE supports (SM90 only, swiglu_gptoss_style=True only):
         - Unquantized (BF16 only)
         - FP8 per-tensor (QDQ)
         - W4A8_MXFP4_FP8
@@ -1298,8 +1298,8 @@ class TritonFusedMoE(MoE):
             quant_algo: The quantization algorithm to check (None for unquantized)
             dtype_activation: The activation data type. In unquantized mode, activation,
                 weight, and output dtypes must all match (only bfloat16 supported).
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
-                TritonFusedMoE ONLY supports gptoss_style=True.
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
+                TritonFusedMoE ONLY supports swiglu_gptoss_style=True.
 
         Returns:
             Tuple[bool, Optional[str]]: (can_implement, skip_reason)
@@ -1316,10 +1316,10 @@ class TritonFusedMoE(MoE):
             return _warn_and_return(
                 f"TritonFusedMoE only supports SM90, got SM{sm_version}")
 
-        # TritonFusedMoE ONLY supports gptoss_style=True
-        if not gptoss_style:
+        # TritonFusedMoE ONLY supports swiglu_gptoss_style=True
+        if not swiglu_gptoss_style:
             return _warn_and_return(
-                "TritonFusedMoE only supports gptoss_style=True")
+                "TritonFusedMoE only supports swiglu_gptoss_style=True")
 
         # Unquantized mode - only bfloat16 is supported
         if quant_algo is None:
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
index ea259cc162..8026a7799b 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py
@@ -87,7 +87,7 @@ class TRTLLMGenFusedMoE(MoE):
         QuantAlgo.W4A8_MXFP4_MXFP8,
     }
 
-    # Quantization algorithms that support gptoss_style
+    # Quantization algorithms that support swiglu_gptoss_style
     _GPTOSS_SUPPORTED_ALGOS = {
         QuantAlgo.NVFP4,
         QuantAlgo.W4A16_MXFP4,
@@ -100,7 +100,7 @@ class TRTLLMGenFusedMoE(MoE):
         cls,
         quant_algo: Optional[QuantAlgo],
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ) -> Tuple[bool, Optional[str]]:
         """
         Check if TRTLLMGenFusedMoE can implement the given quantization algorithm.
@@ -119,7 +119,7 @@ class TRTLLMGenFusedMoE(MoE):
             quant_algo: The quantization algorithm to check (None for unquantized)
             dtype_activation: The activation input data type. Only bfloat16 is supported.
                 See: forward_impl() assert x.dtype == torch.bfloat16 (line 722).
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
                 Only supported for nvfp4 and mxfp4 variants.
 
         Returns:
@@ -151,10 +151,10 @@ class TRTLLMGenFusedMoE(MoE):
             return _warn_and_return(
                 f"TRTLLMGenFusedMoE does not support quant_algo={quant_algo}")
 
-        # Check gptoss_style support: only supported for nvfp4 and mxfp4 variants
-        if gptoss_style and quant_algo not in cls._GPTOSS_SUPPORTED_ALGOS:
+        # Check swiglu_gptoss_style support: only supported for nvfp4 and mxfp4 variants
+        if swiglu_gptoss_style and quant_algo not in cls._GPTOSS_SUPPORTED_ALGOS:
             return _warn_and_return(
-                f"TRTLLMGenFusedMoE supports gptoss_style (bias/swiglu) only for nvfp4 and mxfp4 variants, "
+                f"TRTLLMGenFusedMoE supports swiglu_gptoss_style (bias/swiglu) only for nvfp4 and mxfp4 variants, "
                 f"got quant_algo={quant_algo}")
 
         return True, None
diff --git a/tensorrt_llm/_torch/modules/fused_moe/interface.py b/tensorrt_llm/_torch/modules/fused_moe/interface.py
index 7138cc9cfe..f5e8e1e6f5 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/interface.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/interface.py
@@ -1,3 +1,18 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import os
 import weakref
 from abc import abstractmethod
@@ -158,7 +173,7 @@ class MoE(nn.Module):
         cls,
         quant_algo: Optional[QuantAlgo],
         dtype_activation: torch.dtype = torch.bfloat16,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
     ) -> Tuple[bool, Optional[str]]:
         """
         Check if this MoE backend can implement the given quantization algorithm.
@@ -176,7 +191,7 @@ class MoE(nn.Module):
         Args:
             quant_algo: The quantization algorithm to check (None for unquantized)
             dtype_activation: The activation data type.
-            gptoss_style: Whether gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
+            swiglu_gptoss_style: Whether swiglu_gptoss_style (bias/swiglu with custom alpha/beta/limit) is enabled.
 
         Returns:
             Tuple[bool, Optional[str]]: (can_implement, skip_reason)
diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
index 21fd1940a3..85fb1332a5 100644
--- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py
@@ -1,6 +1,22 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import inspect
 import math
 from abc import ABC, abstractmethod
+from enum import Enum, auto
 from typing import Dict, List, NamedTuple, Optional, Tuple, Union
 
 import torch
@@ -173,11 +189,38 @@ def interleave_linear_and_gate(x: torch.Tensor,
     return x
 
 
+class EplbSupportStatus(Enum):
+    """EPLB support status for FusedMoEMethod classes."""
+    SUPPORTED = auto()
+    NOT_SUPPORTED = auto()
+    NOT_VERIFIED = auto()
+
+
 class FusedMoEMethodBase(ABC):
     """
     Base class for all fused MoE methods.
     """
     weight_alignment: int = 1
+    """Required byte alignment for MoE weight tensors."""
+
+    eplb_support_status: EplbSupportStatus = EplbSupportStatus.NOT_SUPPORTED
+    """Online EPLB support status for this quantization method.
+
+    Defaults to NOT_SUPPORTED for safety so that new subclasses do not
+    silently claim EPLB compatibility.  Subclasses that have been verified
+    to work with online EPLB should override this to SUPPORTED; those that
+    have not yet been tested may set it to NOT_VERIFIED.
+    """
+
+    @classmethod
+    def supports_online_eplb(cls) -> bool:
+        """
+        Check if this FusedMoEMethod supports online EPLB.
+
+        Returns:
+            True if online EPLB is supported, False otherwise.
+        """
+        return cls.eplb_support_status == EplbSupportStatus.SUPPORTED
 
     @classmethod
     def need_load_shared_weights(cls, module):
@@ -552,6 +595,7 @@ class FusedMoEMethodBase(ABC):
 
 
 class UnquantizedFusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.SUPPORTED
 
     def create_weights(self, module: torch.nn.Module):
         weight_dtype = module.dtype
@@ -656,6 +700,7 @@ def requantize_expert_w3_w1_weight_fp8_qdq(module: torch.nn.Module,
 
 
 class FP8QDQFusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.NOT_SUPPORTED
 
     def create_weights(self, module: torch.nn.Module):
         weight_dtype = torch.float8_e4m3fn
@@ -832,6 +877,7 @@ class FP8QDQFusedMoEMethod(FusedMoEMethodBase):
 
 
 class DeepSeekFP8BlockScalesFusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.NOT_VERIFIED
 
     def create_weights(self, module: torch.nn.Module):
         weight_dtype = torch.float8_e4m3fn
@@ -1091,6 +1137,7 @@ class DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm(
 
 
 class INT8WoqPerChannelFusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.NOT_SUPPORTED
 
     def create_weights(self, module: torch.nn.Module):
         module.sm_version = get_sm_version()
@@ -1224,6 +1271,7 @@ class INT8WoqPerChannelFusedMoEMethod(FusedMoEMethodBase):
 
 
 class WInt4AFP8FusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.NOT_SUPPORTED
 
     def create_weights(self, module: torch.nn.Module):
         module.sm_version = get_sm_version()
@@ -1657,6 +1705,7 @@ class WInt4AFP8FusedMoEMethod(FusedMoEMethodBase):
 
 
 class WFP4A16FusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.NOT_SUPPORTED
 
     group_size = 32
 
@@ -1866,6 +1915,7 @@ class NVFP4FusedMoEMethod(FusedMoEMethodBase):
     """
     Base class for NVFP4 fused MoE methods for all backends.
     """
+    eplb_support_status = EplbSupportStatus.SUPPORTED
 
     def get_weights_shapes(self, module: torch.nn.Module, weight_vec_size: int,
                            block_scales_vec_size: int):
@@ -3157,6 +3207,7 @@ class NVFP4TRTLLMGenFusedMoEMethod(NVFP4TRTLLMGenFusedMoEBaseMethod):
 
 
 class W4A8NVFP4FP8TRTLLMGenFusedMoEMethod(NVFP4TRTLLMGenFusedMoEBaseMethod):
+    eplb_support_status = EplbSupportStatus.NOT_VERIFIED
 
     def create_weights(self, module: torch.nn.Module):
         weight_vec_size = torch.iinfo(self.weight_dtype).bits // 4
@@ -3215,6 +3266,7 @@ def _get_weight_alignment(weight_alignment, scaling_vector_size, tp_size,
 
 
 class MXFP4WeightFusedMoEMethod(FusedMoEMethodBase):
+    eplb_support_status = EplbSupportStatus.SUPPORTED
 
     def create_weights(self,
                        module: torch.nn.Module,
@@ -3362,6 +3414,7 @@ class MXFP4WeightFusedMoEMethod(FusedMoEMethodBase):
 
 
 class MXFP4WeightCutlassFusedMoEMethod(MXFP4WeightFusedMoEMethod):
+    eplb_support_status = EplbSupportStatus.NOT_VERIFIED
     weight_dtype = FUSED_MOE_MXFP4_WEIGHT_DTYPE
     block_scales_dtype = FUSED_MOE_MXFP4_WEIGHT_BLOCK_SCALE_DTYPE
     # Cutlass MoE backend requires weight elements to be 128 aligned.
@@ -3534,6 +3587,7 @@ class W4A16MXFP4CutlassFusedMoEMethod(MXFP4WeightCutlassFusedMoEMethod):
 
 
 class W4A8MXFP4MXFP8CutlassFusedMoEMethod(MXFP4WeightCutlassFusedMoEMethod):
+    eplb_support_status = EplbSupportStatus.NOT_VERIFIED
 
     def create_weights(self, module: torch.nn.Module):
         fake_input_scale = nn.Parameter(torch.empty(
@@ -3564,6 +3618,7 @@ class W4A8MXFP4MXFP8CutlassFusedMoEMethod(MXFP4WeightCutlassFusedMoEMethod):
 
 
 class W4A8MXFP4FP8CutlassFusedMoEMethod(MXFP4WeightCutlassFusedMoEMethod):
+    eplb_support_status = EplbSupportStatus.NOT_SUPPORTED
 
     def create_weights(self, module: torch.nn.Module):
         fc31_input_scale = nn.Parameter(torch.tensor(1., dtype=torch.float32),
@@ -3970,6 +4025,7 @@ class W4A16MXFP4TRTLLMGenFusedMoEMethod(MXFP4WeightTRTLLMGenFusedMoEMethod):
 
 
 class W4A8MXFP4FP8TRTLLMGenFusedMoEMethod(MXFP4WeightTRTLLMGenFusedMoEMethod):
+    eplb_support_status = EplbSupportStatus.NOT_SUPPORTED
 
     def create_weights(self, module: torch.nn.Module):
         fc31_input_dequant = nn.Parameter(torch.empty(
diff --git a/tests/unittest/_torch/modules/moe/moe_test_utils.py b/tests/unittest/_torch/modules/moe/moe_test_utils.py
new file mode 100644
index 0000000000..b3987bafae
--- /dev/null
+++ b/tests/unittest/_torch/modules/moe/moe_test_utils.py
@@ -0,0 +1,727 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Shared utilities for MoE test files (test_moe_backend.py and test_moe_module.py).
+
+This module contains common code extracted from both test files:
+- MoeBackendType enum and get_backend_class()
+- MoeModelConfig dataclass
+- Skip logic functions (should_skip_trtllm, should_skip_cutedsl, should_skip_routing_method, etc.)
+- get_quick_skip_reason() - unified version supporting both backend and module tests
+- supports_autotuner_capture()
+- replay_tactics_and_check()
+- module_timer fixture
+- create_test_param() helper
+- Common test parameter constants
+"""
+
+import logging
+import time
+from dataclasses import dataclass
+from enum import Enum
+from itertools import product
+from typing import Callable, Optional, Type
+
+import pytest
+import torch
+
+from tensorrt_llm._torch.autotuner import AutoTuner
+from tensorrt_llm._torch.modules.fused_moe.fused_moe_cute_dsl import CuteDslFusedMoE
+from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE
+from tensorrt_llm._torch.modules.fused_moe.fused_moe_deepgemm import DeepGemmFusedMoE
+from tensorrt_llm._torch.modules.fused_moe.fused_moe_trtllm_gen import TRTLLMGenFusedMoE
+from tensorrt_llm._torch.modules.fused_moe.interface import MoE
+from tensorrt_llm.models.modeling_utils import QuantAlgo
+
+G_LOGGER = logging.getLogger(__name__)
+
+
+# ============================================================================
+# MoE Backend Types
+# ============================================================================
+class MoeBackendType(str, Enum):
+    """Enum for MoE backend types."""
+
+    CUTLASS = "CUTLASS"
+    TRTLLM = "TRTLLM"
+    CUTEDSL = "CUTEDSL"
+    DEEPGEMM = "DEEPGEMM"
+
+
+def get_backend_class(backend_type: MoeBackendType) -> Type[MoE]:
+    """Get the MoE backend class for a given backend type."""
+    backend_class_map = {
+        MoeBackendType.CUTLASS: CutlassFusedMoE,
+        MoeBackendType.TRTLLM: TRTLLMGenFusedMoE,
+        MoeBackendType.CUTEDSL: CuteDslFusedMoE,
+        MoeBackendType.DEEPGEMM: DeepGemmFusedMoE,
+    }
+    return backend_class_map[backend_type]
+
+
+# ============================================================================
+# Model Configuration
+# ============================================================================
+@dataclass
+class MoeModelConfig:
+    """MoE model configuration: (num_experts, top_k, hidden_size, intermediate_size)."""
+
+    num_experts: int
+    top_k: int
+    hidden_size: int
+    intermediate_size: int
+
+    def __str__(self) -> str:
+        return f"e{self.num_experts}_k{self.top_k}_h{self.hidden_size}_i{self.intermediate_size}"
+
+
+# ============================================================================
+# Skip Logic Functions
+# ============================================================================
+def should_skip_trtllm(
+    backend_type: MoeBackendType,
+    quant_algo: Optional[QuantAlgo],
+    model_config: "MoeModelConfig",
+    routing_method_cls=None,
+    swiglu_gptoss_style: bool = False,
+    comm_method: Optional[str] = None,
+) -> Optional[str]:
+    """
+    Check TRTLLM Gen backend specific constraints.
+
+    The TRTLLM Gen MoE kernels have hardware-level constraints that must be satisfied.
+    These constraints are enforced in C++ layer.
+
+    Args:
+        backend_type: The MoE backend type
+        quant_algo: The quantization algorithm
+        model_config: The MoE model configuration
+        routing_method_cls: Optional routing method class for compatibility checks
+            (used by test_moe_module.py)
+        swiglu_gptoss_style: Whether using swiglu gptoss style
+        comm_method: Optional communication method (e.g. "DEEPEP", "DEEPEPLOWLATENCY")
+            for multi-GPU EP mode checks
+
+    Returns:
+        Skip reason string if test should be skipped, None otherwise
+    """
+    if backend_type != MoeBackendType.TRTLLM:
+        return None
+
+    # Routing method compatibility check (used by test_moe_module.py)
+    # TRTLLMGen C++ routing kernel (runner.cu) only implements:
+    # - DeepSeekV3 (requires float32 routing_logits)
+    # - Llama4 (requires top_k=1)
+    # - Renormalize
+    # - RenormalizeNaive
+    # See: cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.cu:77-212
+    if routing_method_cls is not None:
+        from tensorrt_llm._torch.modules.fused_moe import (
+            DeepSeekV3MoeRoutingMethod,
+            DefaultMoeRoutingMethod,
+            Llama4RenormalizeMoeRoutingMethod,
+            MiniMaxM2MoeRoutingMethod,
+        )
+
+        # Routing methods NOT implemented in C++ kernel
+        trtllm_unimplemented_routing = (
+            DefaultMoeRoutingMethod,  # runner.cu:210 - "Unimplemented routing method"
+            MiniMaxM2MoeRoutingMethod,  # runner.cu:210 - "Unimplemented routing method"
+        )
+        if routing_method_cls in trtllm_unimplemented_routing:
+            routing_name = routing_method_cls.__name__
+            return (
+                f"TRTLLMGen C++ routing kernel does not implement {routing_name}. See runner.cu:210"
+            )
+
+        # Llama4 routing only supports top_k=1
+        # See: runner.cu:113 - TLLM_CHECK_WITH_INFO(topK == 1, ...)
+        if routing_method_cls == Llama4RenormalizeMoeRoutingMethod:
+            if model_config is not None and model_config.top_k != 1:
+                return (
+                    f"TRTLLMGen Llama4 routing only supports top_k=1 "
+                    f"(got top_k={model_config.top_k}). See runner.cu:113"
+                )
+
+        # DeepSeekV3 routing requires num_experts >= 22
+        # See: RoutingDeepSeek.cu:32,664 - MaxSupportedTopExperts = 22
+        if routing_method_cls == DeepSeekV3MoeRoutingMethod:
+            if model_config is not None and model_config.num_experts < 22:
+                return (
+                    f"TRTLLMGen DeepSeekV3 routing requires num_experts >= 22 "
+                    f"(got num_experts={model_config.num_experts}). See RoutingDeepSeek.cu:664"
+                )
+
+            # DeepSeekV3 routing kernel only supports topk_group <= 4.
+            # topk_group is computed from num_experts in _create_routing_method:
+            #   n_group = max(1, num_experts // 2)
+            #   topk_group = min(n_group, max(1, n_group // 2))
+            if model_config is not None:
+                n_group = max(1, model_config.num_experts // 2)
+                topk_group = min(n_group, max(1, n_group // 2))
+                if topk_group > 4:
+                    return (
+                        f"TRTLLMGen DeepSeekV3 routing kernel only supports "
+                        f"topk_group <= 4 (got topk_group={topk_group} from "
+                        f"num_experts={model_config.num_experts})"
+                    )
+
+    if model_config is None:
+        return None
+
+    # These quantization algorithms use TRTLLM Gen kernels with the constraints
+    trtllm_gen_quant_algos = {
+        QuantAlgo.NVFP4,
+        QuantAlgo.FP8_BLOCK_SCALES,
+        QuantAlgo.W4A8_NVFP4_FP8,
+        QuantAlgo.W4A16_MXFP4,
+        QuantAlgo.W4A8_MXFP4_MXFP8,
+    }
+
+    if quant_algo not in trtllm_gen_quant_algos:
+        return None
+
+    num_experts = model_config.num_experts
+    top_k = model_config.top_k
+    intermediate_size = model_config.intermediate_size
+
+    # Check: num_experts must be divisible by 4
+    # Routing kernel uses vectorized operations that require this alignment
+    if num_experts % 4 != 0:
+        return (
+            f"TRTLLMGenFusedMoE routing kernel requires num_experts divisible by 4 "
+            f"(got num_experts={num_experts})"
+        )
+
+    # Check: num_experts must be greater than top_k
+    # Routing logic cannot handle the case where all experts are selected
+    if num_experts <= top_k:
+        return (
+            f"TRTLLMGenFusedMoE requires num_experts > top_k "
+            f"(got num_experts={num_experts}, top_k={top_k})"
+        )
+    # W4A8_MXFP4_MXFP8 with non-128-aligned hidden_size or intermediate_size
+    # causes block_scale_interleave_reverse to fail with
+    # "rows of Interleaved block scales should be multiple of 128".
+    if quant_algo == QuantAlgo.W4A8_MXFP4_MXFP8:
+        hidden_size = model_config.hidden_size
+        if hidden_size % 128 != 0 or intermediate_size % 128 != 0:
+            return (
+                f"TRTLLMGenFusedMoE W4A8_MXFP4_MXFP8 with non-128-aligned "
+                f"sizes (h={hidden_size}, i={intermediate_size}) causes "
+                f"block_scale_interleave_reverse rows must be multiple of 128."
+            )
+
+    # -----------------Potential issues------------------
+    # These are known issues that need investigation. Skipping to avoid test failures
+    # and CUDA errors that can cascade to subsequent tests.
+
+    # Issue: W4A8_NVFP4_FP8 with top_k=1 causes CUDA illegal memory access
+    if quant_algo == QuantAlgo.W4A8_NVFP4_FP8 and top_k == 1:
+        return (
+            "[Potential Bug] TRTLLMGenFusedMoE W4A8_NVFP4_FP8 with top_k=1 "
+            "causes CUDA illegal memory access."
+        )
+
+    # Issue: NVFP4 with large intermediate_size has known accuracy issues
+    if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336:
+        return (
+            f"[Potential Bug] TRTLLMGenFusedMoE NVFP4 with large intermediate_size "
+            f"has known accuracy issues (intermediate_size={intermediate_size} >= 14336)."
+        )
+
+    # Issue: W4A8_MXFP4_MXFP8 has accuracy issues on certain model configs
+    if quant_algo == QuantAlgo.W4A8_MXFP4_MXFP8:
+        if intermediate_size >= 14336:
+            return (
+                f"[Potential Bug] TRTLLMGenFusedMoE W4A8_MXFP4_MXFP8 with large "
+                f"intermediate_size has accuracy issues (intermediate_size={intermediate_size} >= 14336)."
+            )
+        if num_experts >= 60 and intermediate_size >= 1408:
+            return (
+                f"[Potential Bug] TRTLLMGenFusedMoE W4A8_MXFP4_MXFP8 with many experts "
+                f"has accuracy issues (num_experts={num_experts} >= 60)."
+            )
+        # Issue: W4A8_MXFP4_MXFP8 with swiglu_gptoss_style and top_k=1 has accuracy
+        # issues on TRTLLM backend. Observed mismatch ~20-22% exceeds the 20% threshold.
+        # CUTLASS backend with the same configuration passes.
+        if swiglu_gptoss_style and top_k == 1:
+            return (
+                f"[Potential Bug] TRTLLMGenFusedMoE W4A8_MXFP4_MXFP8 with "
+                f"swiglu_gptoss_style and top_k={top_k} has accuracy issues "
+                f"(mismatch ~20-22%). CUTLASS backend with the same config passes."
+            )
+
+    # Issue: Certain TRTLLM kernel runners crash with CUDA errors in multi-GPU
+    # DeepEP mode. the crash is specific to EP with DeepEP.
+    # Verified on 4 GPUs with DEP + DEEPEP + TRTLLM (e60_k4_h2048_i1408):
+    #   - FP8_BLOCK_SCALES:  CRASH   (fp8_block_scale_moe_runner -> CUDA_ERROR_INVALID_HANDLE)
+    #   - W4A16_MXFP4:       CRASH   (bf16_mxe2m1_block_scale_moe_runner -> illegal memory access)
+    #   - W4A8_MXFP4_MXFP8:  likely crash (same mxe2m1 kernel family as W4A16_MXFP4)
+    if comm_method in ("DEEPEP", "DEEPEPLOWLATENCY"):
+        deepep_crash_quant_algos = {
+            QuantAlgo.FP8_BLOCK_SCALES,
+            QuantAlgo.W4A16_MXFP4,
+            QuantAlgo.W4A8_MXFP4_MXFP8,
+        }
+        if quant_algo in deepep_crash_quant_algos:
+            return (
+                f"[Potential Bug] TRTLLMGenFusedMoE {quant_algo} crashes with "
+                f"CUDA error in multi-GPU DeepEP mode (comm={comm_method}). "
+                f"Single-GPU tests pass; issue is in the kernel runner under EP."
+            )
+
+    return None
+
+
+def should_skip_cutedsl(
+    backend_type: MoeBackendType,
+    quant_algo: Optional[QuantAlgo],
+    model_config: "MoeModelConfig" = None,
+    comm_method: Optional[str] = None,
+    routing_method_cls=None,
+) -> Optional[str]:
+    """
+    Check CuteDSL backend specific constraints.
+
+    Returns:
+        Skip reason string if test should be skipped, None otherwise
+    """
+    if backend_type != MoeBackendType.CUTEDSL:
+        return None
+
+    # DeepEPLowLatency _modify_output_to_adapt_fused_moe converts dispatch output
+    # to a format where token_selected_slots has shape [num_local_experts, tokens_per_expert]
+    # instead of [num_tokens, top_k]. CuteDSL moe_sort asserts
+    # token_selected_experts.size(1) == top_k, which fails with this format.
+    if comm_method == "DEEPEPLOWLATENCY":
+        return (
+            "[Potential Bug] CuteDslFusedMoE is incompatible with DeepEPLowLatency: "
+            "DeepEPLowLatency _modify_output_to_adapt_fused_moe reshapes "
+            "token_selected_slots to [num_local_experts, tokens_per_expert] "
+            "(effectively top_k=1), but CuteDSL moe_sort requires "
+            "token_selected_experts.size(1) == top_k."
+        )
+
+    if model_config is None:
+        return None
+
+    intermediate_size = model_config.intermediate_size
+    num_experts = model_config.num_experts
+
+    # NVFP4 with large intermediate_size has known accuracy issues
+    if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336:
+        return (
+            f"[Potential Bug] CuteDslFusedMoE NVFP4 with large intermediate_size "
+            f"has known accuracy issues (intermediate_size={intermediate_size} >= 14336)."
+        )
+
+    # NVFP4 with prime num_experts causes CUDA_ERROR_ILLEGAL_ADDRESS
+    prime_experts_with_issues = {7, 13}
+    if quant_algo == QuantAlgo.NVFP4 and num_experts in prime_experts_with_issues:
+        return (
+            f"[Potential Bug] CuteDslFusedMoE NVFP4 with prime num_experts={num_experts} "
+            f"causes CUDA_ERROR_ILLEGAL_ADDRESS due to autotuner cache bucket mapping."
+        )
+
+    # NVFP4 with Llama4Renormalize routing has significant accuracy issues on bfloat16.
+    # Observed mismatch up to 34.6% (threshold 2% at rtol=0.01, percent=0.98).
+    if routing_method_cls is not None:
+        from tensorrt_llm._torch.modules.fused_moe import Llama4RenormalizeMoeRoutingMethod
+
+        if (
+            quant_algo == QuantAlgo.NVFP4
+            and routing_method_cls == Llama4RenormalizeMoeRoutingMethod
+        ):
+            return (
+                "[Potential Bug] CuteDslFusedMoE NVFP4 with Llama4Renormalize "
+                "routing has significant accuracy issues (mismatch up to 34.6%%)."
+            )
+
+    return None
+
+
+def should_skip_deepgemm(
+    backend_type: MoeBackendType,
+    comm_method: Optional[str] = None,
+    quant_algo: Optional[QuantAlgo] = None,
+    model_config: "MoeModelConfig" = None,
+) -> Optional[str]:
+    """
+    Check DeepGemm backend specific constraints.
+
+    Returns:
+        Skip reason string if test should be skipped, None otherwise
+    """
+    if backend_type != MoeBackendType.DEEPGEMM:
+        return None
+
+    # DeepGemm workspace allocation in set_strides (fused_moe_deepgemm.py) uses a
+    # storage size that is 4x too small when combined with DeepEPLowLatency dispatch.
+    # The workspace is allocated based on assumptions that do not account for the
+    # DeepEPLowLatency output format ([num_local_experts, ep_size * max_tokens, hidden_size]).
+    if comm_method == "DEEPEPLOWLATENCY":
+        return (
+            "[Potential Bug] DeepGemmFusedMoE workspace allocation is incompatible "
+            "with DeepEPLowLatency: set_strides requires storage of "
+            "[num_local_experts * tokens * hidden_size] bytes but the allocated "
+            "workspace is ~4x too small, causing setStorage out of bounds."
+        )
+
+    # Issue: DEEPGEMM + FP8_BLOCK_SCALES crashes with CUDA illegal memory access
+    # on large expert counts (e.g. e384_k8_h7168_i2048) during post_load_weights().
+    # The crash occurs in get_col_major_tma_aligned_packed_tensor (fp8_utils.py)
+    # when resmoothing FP8 E8M0 scales on SM100f (Blackwell).
+    # Small configs (e.g. e60_k4_h2048_i1408) pass fine.
+    if quant_algo == QuantAlgo.FP8_BLOCK_SCALES and model_config is not None:
+        if model_config.num_experts > 128:
+            return (
+                f"[Potential Bug] DeepGemmFusedMoE FP8_BLOCK_SCALES crashes with "
+                f"CUDA illegal memory access on large expert count "
+                f"(num_experts={model_config.num_experts}). The crash occurs in "
+                f"get_col_major_tma_aligned_packed_tensor during "
+                f"post_load_weights() FP8 E8M0 scale resmoothing on SM100f."
+            )
+
+    return None
+
+
+def should_skip_multi_gpu(
+    parallel_mode: str,
+    model_config: "MoeModelConfig",
+    world_size: int = 4,
+) -> Optional[str]:
+    """
+    Check if a multi-GPU test should be skipped due to EP partitioning constraints.
+
+    In EP modes (DEP, TEP), num_experts must be divisible by ep_size (= world_size)
+    when EPLB (Expert Load Balancing) is not enabled. Otherwise the assertion
+    `num_experts % ep_size == 0` in interface.py _init_load_balancer will fail.
+
+    Args:
+        parallel_mode: Parallelism strategy ("DEP", "TEP", "DTP", "TTP")
+        model_config: MoE model configuration containing num_experts
+        world_size: Total number of GPUs (default: 4)
+
+    Returns:
+        Skip reason string if test should be skipped, None otherwise
+    """
+    # Only EP modes have ep_size = world_size; TP modes have ep_size = 1
+    if parallel_mode not in ("DEP", "TEP"):
+        return None
+
+    ep_size = world_size
+    num_experts = model_config.num_experts
+    if num_experts % ep_size != 0:
+        return (
+            f"num_experts={num_experts} is not divisible by ep_size={ep_size} "
+            f"in {parallel_mode} mode. Requires EPLB to handle non-uniform "
+            f"expert partitioning (tested separately in test_ConfigurableMoE_multi_gpu_eplb)."
+        )
+
+    return None
+
+
+def should_skip_routing_method(
+    routing_method_cls,
+    model_config: "MoeModelConfig",
+) -> Optional[str]:
+    """
+    Check routing method specific constraints that are independent of backend.
+
+    Args:
+        routing_method_cls: The routing method class
+        model_config: The MoE model configuration
+
+    Returns:
+        Skip reason string if test should be skipped, None otherwise
+    """
+    if routing_method_cls is None or model_config is None:
+        return None
+
+    from tensorrt_llm._torch.modules.fused_moe import DeepSeekV3MoeRoutingMethod
+
+    # DeepSeekV3 routing: num_experts must be divisible by n_group for the
+    # view operation in noaux_tc (routing.py:298). n_group = max(1, num_experts // 2),
+    # so odd num_experts (e.g. 7, 13) fail because num_experts % n_group != 0.
+    if routing_method_cls == DeepSeekV3MoeRoutingMethod:
+        num_experts = model_config.num_experts
+        experts_per_group = 2
+        n_group = max(1, num_experts // experts_per_group)
+        if n_group > 1 and num_experts % n_group != 0:
+            return (
+                f"DeepSeekV3 routing requires num_experts divisible by n_group "
+                f"(num_experts={num_experts}, n_group={n_group}). "
+                f"noaux_tc view([n_group, num_experts // n_group]) fails."
+            )
+
+    return None
+
+
+def supports_autotuner_capture(
+    backend_type: MoeBackendType,
+    _quant_algo: Optional[QuantAlgo],
+) -> bool:
+    """
+    Determine if a backend+quant_algo combination supports AutoTuner capture/replay.
+
+    Args:
+        backend_type: The MoE backend type
+        _quant_algo: The quantization algorithm (None for unquantized).
+            Reserved for future per-algorithm gating; currently unused.
+
+    Returns:
+        True if autotuner capture/replay is supported, False otherwise
+    """
+    # DEEPGEMM does not support autotuner capture
+    if backend_type == MoeBackendType.DEEPGEMM:
+        return False
+
+    return True
+
+
+def get_quick_skip_reason(
+    backend_type: MoeBackendType,
+    quant_algo: Optional[QuantAlgo],
+    dtype: torch.dtype,
+    model_config: "MoeModelConfig",
+    routing_method_cls=None,
+    swiglu_gptoss_style: bool = False,
+) -> Optional[str]:
+    """
+    Fast skip check that calls backend's can_implement() method.
+
+    Unified version supporting both backend-level and module-level tests:
+    - routing_method_cls: Used by test_moe_module.py for routing method compatibility checks
+    - swiglu_gptoss_style: Used by test_moe_backend.py for SwiGLU parameter checks
+
+    Returns:
+        Skip reason string if test should be skipped, None otherwise
+    """
+    import logging as _logging
+
+    # Suppress logger warnings during parameter generation
+    trtllm_logger = _logging.getLogger("tensorrt_llm")
+    original_level = trtllm_logger.level
+    trtllm_logger.setLevel(_logging.ERROR)
+
+    try:
+        # Call backend's can_implement for dtype/quant_algo checks
+        backend_cls = get_backend_class(backend_type)
+        can_impl_kwargs = {"dtype_activation": dtype}
+        if swiglu_gptoss_style:
+            can_impl_kwargs["swiglu_gptoss_style"] = swiglu_gptoss_style
+        can_impl, skip_reason = backend_cls.can_implement(quant_algo, **can_impl_kwargs)
+        if not can_impl:
+            return skip_reason
+
+        # Chain skip checks: routing method, then per-backend constraints
+        skip_checks = [
+            lambda: should_skip_routing_method(routing_method_cls, model_config),
+            lambda: should_skip_trtllm(
+                backend_type, quant_algo, model_config, routing_method_cls, swiglu_gptoss_style
+            ),
+            lambda: should_skip_cutedsl(
+                backend_type, quant_algo, model_config, routing_method_cls=routing_method_cls
+            ),
+            lambda: should_skip_deepgemm(
+                backend_type, quant_algo=quant_algo, model_config=model_config
+            ),
+        ]
+        for check in skip_checks:
+            skip_reason = check()
+            if skip_reason:
+                return skip_reason
+
+        # DEEPGEMM: float16 reference module constraint
+        if backend_type == MoeBackendType.DEEPGEMM and dtype == torch.float16:
+            return "DeepGemmFusedMoE reference module requires bfloat16 input"
+
+        # 128-alignment requirement for quantization
+        if quant_algo is not None:
+            hidden_size = model_config.hidden_size
+            intermediate_size = model_config.intermediate_size
+            is_hidden_128_aligned = hidden_size % 128 == 0
+            is_intermediate_128_aligned = intermediate_size % 128 == 0
+
+            if not is_hidden_128_aligned or not is_intermediate_128_aligned:
+                # TRTLLM with MXFP4 variants automatically pads to 128 alignment
+                is_mxfp4_variant = quant_algo in {QuantAlgo.W4A16_MXFP4, QuantAlgo.W4A8_MXFP4_MXFP8}
+                is_trtllm_backend = backend_type == MoeBackendType.TRTLLM
+                if not (is_trtllm_backend and is_mxfp4_variant):
+                    return (
+                        f"Non-128-aligned sizes (h={hidden_size}, i={intermediate_size}) "
+                        f"require TRTLLM backend with MXFP4 quantization"
+                    )
+
+        return None
+
+    finally:
+        trtllm_logger.setLevel(original_level)
+
+
+# ============================================================================
+# Autotuner Tactic Replay
+# ============================================================================
+def replay_tactics_and_check(
+    all_tactics,
+    run_moe_fn: Callable[[], torch.Tensor],
+    check_accuracy_fn: Callable[[torch.Tensor, torch.Tensor], None],
+    ref_output: torch.Tensor,
+    backend_type: MoeBackendType,
+    quant_algo: Optional[QuantAlgo],
+    fail_fast: bool = False,
+) -> None:
+    """
+    Replay all tactics and check accuracy.
+
+    Args:
+        all_tactics: TacticsCapture object from AutoTuner.capture()
+        run_moe_fn: Function to run MoE computation
+        check_accuracy_fn: Function to check accuracy (output, ref_output) -> None
+        ref_output: Reference output tensor
+        backend_type: Backend type for error reporting
+        quant_algo: Quantization algorithm for error reporting
+        fail_fast: If True, fail on first error. If False, run all and report summary.
+    """
+    tactics_list = list(all_tactics)
+    passed_tactics = []
+    failed_tactics = []
+    G_LOGGER.info(f"Replay tactics : {len(tactics_list)} and check accuracy")
+    for idx, tactic in enumerate(tactics_list):
+        with AutoTuner.get().replay(tactic), torch.inference_mode():
+            output = run_moe_fn()
+            try:
+                check_accuracy_fn(output, ref_output)
+                passed_tactics.append((idx, tactic))
+            except Exception as e:
+                if fail_fast:
+                    pytest.fail(
+                        f"Accuracy check failed for tactic[{idx}/{len(tactics_list)}]={tactic}, "
+                        f"backend={backend_type}, quant_algo={quant_algo}: {e}"
+                    )
+                failed_tactics.append((idx, tactic, str(e)))
+
+    # Report results (only when fail_fast=False)
+    total = len(tactics_list)
+    num_passed = len(passed_tactics)
+    num_failed = len(failed_tactics)
+    if failed_tactics:
+        fail_details = "\n".join(
+            f"  tactic[{idx}]={tactic}: {err}" for idx, tactic, err in failed_tactics
+        )
+        pytest.fail(
+            f"backend={backend_type}, quant_algo={quant_algo}: "
+            f"{num_passed}/{total} passed, {num_failed}/{total} failed\n"
+            f"Failed tactics:\n{fail_details}"
+        )
+
+
+# ============================================================================
+# Test Parameter Helpers
+# ============================================================================
+def create_test_param(param_values, test_id, skip_reason=None):
+    """Create a pytest.param with optional skip mark."""
+    if skip_reason:
+        return pytest.param(*param_values, id=test_id, marks=pytest.mark.skip(reason=skip_reason))
+    return pytest.param(*param_values, id=test_id)
+
+
+# ============================================================================
+# Timing Fixture
+# ============================================================================
+@pytest.fixture(scope="module", autouse=True)
+def module_timer(request):
+    """Fixture to measure and log total module execution time."""
+    start = time.perf_counter()
+    yield
+    elapsed = time.perf_counter() - start
+    G_LOGGER.info(
+        "[TIMING] Total %s: %.3fs (%.2f min)",
+        request.module.__name__,
+        elapsed,
+        elapsed / 60,
+    )
+
+
+# ============================================================================
+# Base Test Config Iterator
+# ============================================================================
+def iter_base_test_configs(
+    swiglu_combos, model_configs, seq_lens, dtypes, backend_types, quant_algos, routing_methods=None
+):
+    """
+    Iterate over base test configurations using itertools.product.
+
+    This is shared by test_moe_backend.py and test_moe_module.py.
+    When routing_methods is None, defaults to [RenormalizeMoeRoutingMethod].
+
+    Args:
+        swiglu_combos: List of (swiglu_alpha, swiglu_beta, swiglu_limit) tuples
+        model_configs: List of MoeModelConfig
+        seq_lens: List of sequence lengths
+        dtypes: List of data types
+        backend_types: List of backend types
+        quant_algos: List of quantization algorithms
+        routing_methods: List of routing method classes (default: [RenormalizeMoeRoutingMethod])
+
+    Yields:
+        Tuple of (swiglu_alpha, swiglu_beta, swiglu_limit, model_config, seq_len,
+                  dtype, backend_type, quant_algo, routing_method_cls, skip_reason, base_test_id)
+    """
+    if routing_methods is None:
+        from tensorrt_llm._torch.modules.fused_moe import RenormalizeMoeRoutingMethod
+
+        routing_methods = [RenormalizeMoeRoutingMethod]
+
+    for (
+        swiglu_alpha,
+        swiglu_beta,
+        swiglu_limit,
+    ), model_config, seq_len, dtype, backend_type, quant_algo, routing_method_cls in product(
+        swiglu_combos, model_configs, seq_lens, dtypes, backend_types, quant_algos, routing_methods
+    ):
+        swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf")
+        skip_reason = get_quick_skip_reason(
+            backend_type,
+            quant_algo,
+            dtype,
+            model_config,
+            routing_method_cls,
+            swiglu_gptoss_style=swiglu_gptoss_style,
+        )
+        routing_name = routing_method_cls.__name__.replace("MoeRoutingMethod", "")
+        swiglu_id = (
+            f"alpha={swiglu_alpha}_beta={swiglu_beta}_limit={swiglu_limit}-"
+            if swiglu_gptoss_style
+            else ""
+        )
+        base_test_id = (
+            f"{swiglu_id}{model_config}-seq={seq_len}-dtype={dtype}-"
+            f"backend={backend_type.value}-quant={quant_algo}-routing={routing_name}"
+        )
+        yield (
+            swiglu_alpha,
+            swiglu_beta,
+            swiglu_limit,
+            model_config,
+            seq_len,
+            dtype,
+            backend_type,
+            quant_algo,
+            routing_method_cls,
+            skip_reason,
+            base_test_id,
+        )
diff --git a/tests/unittest/_torch/modules/moe/quantize_utils.py b/tests/unittest/_torch/modules/moe/quantize_utils.py
index 57fbb4f832..24652a1c06 100644
--- a/tests/unittest/_torch/modules/moe/quantize_utils.py
+++ b/tests/unittest/_torch/modules/moe/quantize_utils.py
@@ -217,7 +217,7 @@ class RefGatedMLPFusedMoE(nn.Module):
             model_config = ModelConfig()
         self.quant_config = model_config.quant_config
 
-        # Custom swiglu activation for gptoss_style
+        # Custom swiglu activation for swiglu_gptoss_style
         def custom_swiglu(x):
             gate, value = x.chunk(2, dim=-1)
             if swiglu_limit is not None and swiglu_limit != float("inf"):
@@ -314,7 +314,7 @@ class BaseQuantizeUtil(ABC):
     """
     BaseQuantizeUtil serves as a base class for MoE correctess testing which provides interface
     to create quantized weights and reference modules. It can be extended for different quantization algorithms.
-    Supports gptoss_style with custom swiglu parameters.
+    Supports swiglu_gptoss_style with custom swiglu parameters.
     """
 
     def __init__(
@@ -325,10 +325,11 @@ class BaseQuantizeUtil(ABC):
         hidden_size: int,
         quant_config: QuantConfig,
         bias: bool = False,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
         swiglu_alpha: Optional[float] = None,
         swiglu_beta: Optional[float] = None,
         swiglu_limit: Optional[float] = None,
+        num_local_experts: Optional[int] = None,
     ):
         self.num_experts = num_experts
         self.dtype = dtype
@@ -336,38 +337,48 @@ class BaseQuantizeUtil(ABC):
         self.hidden_size = hidden_size
         self.quant_config = quant_config
         self.bias = bias
-        self._gptoss_style = gptoss_style
+        self._swiglu_gptoss_style = swiglu_gptoss_style
         self.swiglu_alpha = swiglu_alpha
         self.swiglu_beta = swiglu_beta
         self.swiglu_limit = swiglu_limit
+        # In EP mode, swiglu tensors must be sized per local experts
+        # (see modeling_gpt_oss.py: num_slots // moe_ep_size)
+        self.num_local_experts = num_local_experts if num_local_experts is not None else num_experts
 
-        # Pre-create swiglu tensors if gptoss_style is enabled
-        if self._gptoss_style:
+        # Pre-create swiglu tensors if swiglu_gptoss_style is enabled
+        if self._swiglu_gptoss_style:
+            self.swiglu_alpha = 1.0 if self.swiglu_alpha is None else self.swiglu_alpha
+            self.swiglu_beta = 0.0 if self.swiglu_beta is None else self.swiglu_beta
+            self.swiglu_limit = float("inf") if self.swiglu_limit is None else self.swiglu_limit
             self._swiglu_tensors = self._create_swiglu_tensors()
         else:
             self._swiglu_tensors = None
 
     @property
-    def gptoss_style(self) -> bool:
-        """Check if gptoss_style is enabled."""
-        return self._gptoss_style
+    def swiglu_gptoss_style(self) -> bool:
+        """Check if swiglu_gptoss_style is enabled."""
+        return self._swiglu_gptoss_style
 
     def _create_swiglu_tensors(self) -> Dict[str, torch.Tensor]:
         """
         Internal method to create swiglu tensors for MoE backend.
 
+        Uses num_local_experts (= num_experts // ep_size in EP mode) to match
+        the kernel expectation. See modeling_gpt_oss.py for reference:
+        swiglu_alpha is created with size (num_slots // moe_ep_size).
+
         Returns:
             Dict with 'swiglu_alpha', 'swiglu_beta', 'swiglu_limit' tensors.
         """
         return {
             "swiglu_alpha": torch.full(
-                (self.num_experts,), self.swiglu_alpha, device="cuda", dtype=torch.float
+                (self.num_local_experts,), self.swiglu_alpha, device="cuda", dtype=torch.float
             ),
             "swiglu_beta": torch.full(
-                (self.num_experts,), self.swiglu_beta, device="cuda", dtype=torch.float
+                (self.num_local_experts,), self.swiglu_beta, device="cuda", dtype=torch.float
             ),
             "swiglu_limit": torch.full(
-                (self.num_experts,), self.swiglu_limit, device="cuda", dtype=torch.float
+                (self.num_local_experts,), self.swiglu_limit, device="cuda", dtype=torch.float
             ),
         }
 
@@ -376,7 +387,7 @@ class BaseQuantizeUtil(ABC):
         Get pre-created swiglu tensors.
 
         Returns:
-            Dict with swiglu tensors if gptoss_style is enabled, None otherwise.
+            Dict with swiglu tensors if swiglu_gptoss_style is enabled, None otherwise.
         """
         return self._swiglu_tensors
 
@@ -428,11 +439,12 @@ class FP8RefGatedMLPFusedMoE(RefGatedMLPFusedMoE):
     expected_quant_algo = QuantAlgo.FP8
 
     def check_accuracy(self, output, ref_output):
-        # Relaxed percent from 0.99 to 0.97 to account for FP8 quantization error accumulation
-        # in large intermediate dimensions and multi-expert routing computations.
+        # Relaxed percent from 0.97 to 0.95 to account for FP8 quantization error accumulation
+        # in large intermediate dimensions and multi-expert routing computations,
+        # especially with Llama4Renormalize sigmoid-based routing.
         # Theoretical basis: FP8 (E4M3) has ~12.5% unit error, accumulated error grows as sqrt(K)
-        # where K is GEMM reduction dimension. Max observed mismatch is ~2.1% < 3%.
-        check_accuracy(output, ref_output, rtol=4e-2, atol=1e-1, percent=0.97)
+        # where K is GEMM reduction dimension. Max observed mismatch is ~4.8% < 5%.
+        check_accuracy(output, ref_output, rtol=4e-2, atol=1e-1, percent=0.95)
 
 
 class FP8QuantizeUtil(BaseQuantizeUtil):
@@ -447,7 +459,6 @@ class FP8QuantizeUtil(BaseQuantizeUtil):
         assert self.quant_config is not None and self.quant_config.quant_algo == QuantAlgo.FP8, (
             "expect quant_algo to be fp8"
         )
-        bias = quant_kwargs.get("bias", False)
         weights = {}
         for expert_id in range(self.num_experts):
             w1_weight = torch.randn(
@@ -489,7 +500,7 @@ class FP8QuantizeUtil(BaseQuantizeUtil):
             weights[f"{expert_id}.w2.input_scale"] = w2_input_scale
             weights[f"{expert_id}.w3.input_scale"] = w3_input_scale
 
-            if bias:
+            if self.bias:
                 weights[f"{expert_id}.w1.bias"] = torch.randn(
                     (self.intermediate_size,), dtype=self.dtype, device="cuda"
                 )
@@ -514,22 +525,25 @@ class NVFP4RefGatedMLPFusedMoE(RefGatedMLPFusedMoE):
     scale_keys = ["weight_scale", "input_scale", "weight_scale_2"]
     expected_quant_algo = QuantAlgo.NVFP4
 
-    def __init__(self, *args, gptoss_style: bool = False, **kwargs):
+    def __init__(self, *args, swiglu_gptoss_style: bool = False, **kwargs):
         super().__init__(*args, **kwargs)
-        self.gptoss_style = gptoss_style
+        self.swiglu_gptoss_style = swiglu_gptoss_style
 
     def check_accuracy(self, output, ref_output):
-        if self.gptoss_style:
-            # gptoss_style uses relaxed tolerance
+        if self.swiglu_gptoss_style:
+            # swiglu_gptoss_style uses relaxed tolerance
             check_accuracy(output, ref_output, rtol=0.1, atol=0.1, percent=0.95)
         else:
-            check_accuracy(output, ref_output, rtol=1e-2, atol=0.15, percent=0.98)
+            # Relaxed percent from 0.98 to 0.97 to account for NVFP4 quantization
+            # error accumulation with certain routing methods (e.g. Llama4Renormalize).
+            # Max observed mismatch in non-skipped cases is ~2.7% < 3%.
+            check_accuracy(output, ref_output, rtol=1e-2, atol=0.15, percent=0.97)
 
 
 class NVFP4QuantizeUtil(BaseQuantizeUtil):
     """
     NVFP4QuantizeUtil inherits from BaseQuantizeUtil to support correctness testing for NVFP4 quantized MoE modules.
-    Supports gptoss_style with custom swiglu parameters (inherited from BaseQuantizeUtil).
+    Supports swiglu_gptoss_style with custom swiglu parameters (inherited from BaseQuantizeUtil).
     """
 
     def create_weights(self, **quant_kwargs) -> Dict[str, torch.Tensor]:
@@ -629,7 +643,7 @@ class NVFP4QuantizeUtil(BaseQuantizeUtil):
         self, routing_method, ref_cls=NVFP4RefGatedMLPFusedMoE
     ) -> torch.nn.Module:
         """
-        Create a reference module for correctness testing with gptoss_style support.
+        Create a reference module for correctness testing with swiglu_gptoss_style support.
         """
         ref_fused_moe = ref_cls(
             num_experts=self.num_experts,
@@ -639,7 +653,7 @@ class NVFP4QuantizeUtil(BaseQuantizeUtil):
             dtype=self.dtype,
             model_config=ModelConfig(quant_config=self.quant_config),
             bias=self.bias,
-            gptoss_style=self.gptoss_style,
+            swiglu_gptoss_style=self.swiglu_gptoss_style,
             swiglu_alpha=self.swiglu_alpha,
             swiglu_beta=self.swiglu_beta,
             swiglu_limit=self.swiglu_limit,
@@ -667,6 +681,11 @@ class FP8BlockScalesQuantizeUtil(BaseQuantizeUtil):
     for FP8 block-wise quantized MoE modules.
     """
 
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        # Will be set by create_weights() if ref_cls is provided in quant_kwargs
+        self._ref_cls = None
+
     def create_weights(self, **quant_kwargs) -> Dict[str, torch.Tensor]:
         """
         Create quantized weights for MoE experts using FP8 block-wise quantization.
@@ -675,12 +694,18 @@ class FP8BlockScalesQuantizeUtil(BaseQuantizeUtil):
             use_e8m0_scale: If True, use per_block_cast_to_fp8_e8m0 which produces E8M0
                             format scales required by DEEPGEMM and TRTLLM backends.
                             If False, use per_block_cast_to_fp8 with regular float scales.
+            ref_cls: Optional reference module class to use for accuracy testing.
+                     If provided, will be stored and used by create_ref_module().
         """
         assert (
             self.quant_config is not None
             and self.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
         ), "expect quant_algo to be FP8_BLOCK_SCALES"
 
+        # Store ref_cls if provided for later use in create_ref_module()
+        if "ref_cls" in quant_kwargs:
+            self._ref_cls = quant_kwargs.pop("ref_cls")
+
         # Select quantization function based on scale format requirement
         use_e8m0_scale = quant_kwargs.get("use_e8m0_scale", False)
         quant_fn = per_block_cast_to_fp8_e8m0 if use_e8m0_scale else per_block_cast_to_fp8
@@ -712,12 +737,19 @@ class FP8BlockScalesQuantizeUtil(BaseQuantizeUtil):
             weights[f"{expert_id}.w3.weight_scale_inv"] = w3_weight_scale
         return weights
 
-    def create_ref_module(
-        self, routing_method, ref_cls=FP8BlockScalesRefGatedMLPFusedMoE
-    ) -> torch.nn.Module:
+    def create_ref_module(self, routing_method, ref_cls=None) -> torch.nn.Module:
         """
         Create a reference module for correctness testing.
+
+        Uses ref_cls in the following priority:
+        1. Explicitly passed ref_cls argument
+        2. ref_cls stored from create_weights() call (via quant_kwargs)
+        3. Default FP8BlockScalesRefGatedMLPFusedMoE
         """
+        if ref_cls is None:
+            ref_cls = (
+                self._ref_cls if self._ref_cls is not None else FP8BlockScalesRefGatedMLPFusedMoE
+            )
         return super().create_ref_module(routing_method, ref_cls)
 
     def create_input(self, seq_len: int) -> torch.Tensor:
@@ -783,14 +815,19 @@ class DeepGemmFP8BlockScalesRefFusedMoE(FP8BlockScalesRefGatedMLPFusedMoE):
         self.w2_weights_stacked = torch.stack(w2_list, dim=0)
         self.w2_scales_stacked = torch.stack(w2_scale_list, dim=0)
 
-    def cuda(self):
-        """Move all weights to CUDA."""
-        super().cuda()
+    def cuda(self, device=None):
+        """Move all weights to CUDA.
+
+        Args:
+            device: Optional device specification (e.g., 'cuda:0', 0, or torch.device).
+                    If None, uses the current CUDA device.
+        """
+        super().cuda(device)
         if self.w3_w1_weights is not None:
-            self.w3_w1_weights = self.w3_w1_weights.cuda()
-            self.w3_w1_scales = self.w3_w1_scales.cuda()
-            self.w2_weights_stacked = self.w2_weights_stacked.cuda()
-            self.w2_scales_stacked = self.w2_scales_stacked.cuda()
+            self.w3_w1_weights = self.w3_w1_weights.cuda(device)
+            self.w3_w1_scales = self.w3_w1_scales.cuda(device)
+            self.w2_weights_stacked = self.w2_weights_stacked.cuda(device)
+            self.w2_scales_stacked = self.w2_scales_stacked.cuda(device)
         return self
 
     def _swiglu(self, x):
@@ -918,6 +955,20 @@ class DeepGemmFP8BlockScalesRefFusedMoE(FP8BlockScalesRefGatedMLPFusedMoE):
 
         return output
 
+    def check_accuracy(self, output, ref_output):
+        """
+        Check accuracy with relaxed tolerance for DEEPGEMM FP8 block scale kernel.
+
+        DEEPGEMM with FP8 block scaling has specific numerical behavior due to:
+        - E8M0 scale format quantization
+        - Manual grouped GEMM computation pattern
+        - Different routing methods (especially MiniMaxM2 with sigmoid + manual normalization)
+
+        Relaxed from rtol=0.01 to rtol=0.02 to accommodate these numerical differences
+        while still catching significant errors.
+        """
+        torch.testing.assert_close(output, ref_output, rtol=2e-2, atol=1.5)
+
 
 class DeepGemmFP8BlockScalesQuantizeUtil(BaseQuantizeUtil):
     """
@@ -1124,7 +1175,7 @@ class MXFP4MXFP8RefGatedMLPFusedMoE(RefGatedMLPFusedMoE):
         model_config: Optional[ModelConfig] = None,
         bias=False,
         hidden_size_unpadded: Optional[int] = None,
-        gptoss_style: bool = False,
+        swiglu_gptoss_style: bool = False,
         **kwargs,
     ):
         super().__init__(
@@ -1142,7 +1193,7 @@ class MXFP4MXFP8RefGatedMLPFusedMoE(RefGatedMLPFusedMoE):
         self.hidden_size_unpadded = (
             hidden_size_unpadded if hidden_size_unpadded is not None else hidden_size
         )
-        self.gptoss_style = gptoss_style
+        self.swiglu_gptoss_style = swiglu_gptoss_style
 
     def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> torch.Tensor:
         # Pad input if hidden_size_unpadded < hidden_size
@@ -1158,8 +1209,14 @@ class MXFP4MXFP8RefGatedMLPFusedMoE(RefGatedMLPFusedMoE):
         return output
 
     def check_accuracy(self, output, ref_output):
-        if self.gptoss_style:
+        if self.swiglu_gptoss_style:
             check_accuracy(output, ref_output, rtol=0.1, atol=0.2, percent=0.8)
+        elif self.hidden_size >= 4096:
+            # Relax tolerance for large hidden_size (e.g., DeepSeek-V3 h=7168).
+            # MXFP4 (4-bit) weights + MXFP8 (8-bit) activations accumulate more
+            # quantization error in large GEMM reduction dimensions: error ~ sqrt(K).
+            # Observed mismatch: ~17-19% for h=7168 vs <15% for h=512.
+            check_accuracy(output, ref_output, rtol=0.15, atol=0.3, percent=0.85)
         else:
             check_accuracy(output, ref_output, rtol=0.10, atol=0.2, percent=0.85)
 
@@ -1244,7 +1301,6 @@ class MXFP4MXFP8QuantizeUtil(BaseQuantizeUtil):
         intermediate_size_unpadded = quant_kwargs.get(
             "intermediate_size_unpadded", self.intermediate_size
         )
-        bias = quant_kwargs.get("bias", False)
         pad_zero_or_val = quant_kwargs.get("pad_zero_or_val", True)
         weight_alignment = quant_kwargs.get("weight_alignment", 128)
         input_hidden_alignment = quant_kwargs.get("input_hidden_alignment", 512)
@@ -1256,7 +1312,7 @@ class MXFP4MXFP8QuantizeUtil(BaseQuantizeUtil):
 
         weights = {}
         for expert_id in range(self.num_experts):
-            if bias:
+            if self.bias:
                 w1_bias = torch.randn((intermediate_size_unpadded,), dtype=self.dtype).cuda() * 0.1
                 w2_bias = torch.randn((hidden_size_unpadded,), dtype=self.dtype).cuda() * 0.1
                 w3_bias = torch.randn((intermediate_size_unpadded,), dtype=self.dtype).cuda() * 0.1
@@ -1434,7 +1490,7 @@ class MXFP4MXFP8QuantizeUtil(BaseQuantizeUtil):
             model_config=ModelConfig(quant_config=self.quant_config),
             bias=self.bias,
             hidden_size_unpadded=hs_unpadded,
-            gptoss_style=self.gptoss_style,
+            swiglu_gptoss_style=self.swiglu_gptoss_style,
             swiglu_alpha=self.swiglu_alpha,
             swiglu_beta=self.swiglu_beta,
             swiglu_limit=self.swiglu_limit,
@@ -1547,7 +1603,7 @@ class WFP4A16QuantizeUtil(BaseQuantizeUtil):
     """
     WFP4A16QuantizeUtil inherits from BaseQuantizeUtil to support correctness testing
     for W4A16_MXFP4 quantized MoE modules.
-    Supports gptoss_style with custom swiglu parameters (inherited from BaseQuantizeUtil).
+    Supports swiglu_gptoss_style with custom swiglu parameters (inherited from BaseQuantizeUtil).
     """
 
     def create_weights(self, **quant_kwargs) -> Dict[str, torch.Tensor]:
@@ -1620,7 +1676,7 @@ class WFP4A16QuantizeUtil(BaseQuantizeUtil):
             weights[f"{expert_id}.w2.weight_scale_inv"] = w2_scale
             weights[f"{expert_id}.w3.weight_scale_inv"] = w3_scale
 
-            # Bias for gptoss_style
+            # Bias for swiglu_gptoss_style
             if self.bias:
                 weights[f"{expert_id}.w1.bias"] = torch.randn(
                     self.intermediate_size, device="cuda", dtype=torch.float
@@ -1637,7 +1693,7 @@ class WFP4A16QuantizeUtil(BaseQuantizeUtil):
         self, routing_method, ref_cls=WFP4A16RefGatedMLPFusedMoE
     ) -> torch.nn.Module:
         """
-        Create a reference module for correctness testing with gptoss_style support.
+        Create a reference module for correctness testing with swiglu_gptoss_style support.
         """
         return super().create_ref_module(routing_method, ref_cls)
 
diff --git a/tests/unittest/_torch/modules/moe/test_moe_backend.py b/tests/unittest/_torch/modules/moe/test_moe_backend.py
index f80f26bd47..b31c696ab0 100644
--- a/tests/unittest/_torch/modules/moe/test_moe_backend.py
+++ b/tests/unittest/_torch/modules/moe/test_moe_backend.py
@@ -28,13 +28,20 @@ Design Goals:
 
 import itertools
 import logging
-import time
-from dataclasses import dataclass
-from enum import Enum
-from typing import Callable, List, Optional, Type
+from typing import List, Optional
 
 import pytest
 import torch
+from _torch.modules.moe.moe_test_utils import (
+    MoeBackendType,
+    MoeModelConfig,
+    create_test_param,
+    get_backend_class,
+    iter_base_test_configs,
+    module_timer,  # noqa: F401 - imported for pytest fixture registration
+    replay_tactics_and_check,
+    supports_autotuner_capture,
+)
 from _torch.modules.moe.quantize_utils import get_test_quant_params
 from transformers.configuration_utils import PretrainedConfig
 
@@ -42,10 +49,6 @@ from tensorrt_llm._torch.autotuner import AutoTuner, autotune
 from tensorrt_llm._torch.model_config import ModelConfig
 from tensorrt_llm._torch.modules.fused_moe import RenormalizeMoeRoutingMethod
 from tensorrt_llm._torch.modules.fused_moe.create_moe import create_moe_backend
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_cute_dsl import CuteDslFusedMoE
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_cutlass import CutlassFusedMoE
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_deepgemm import DeepGemmFusedMoE
-from tensorrt_llm._torch.modules.fused_moe.fused_moe_trtllm_gen import TRTLLMGenFusedMoE
 from tensorrt_llm._torch.modules.fused_moe.interface import MoE
 from tensorrt_llm._utils import mpi_rank
 from tensorrt_llm.mapping import Mapping
@@ -54,249 +57,39 @@ from tensorrt_llm.models.modeling_utils import QuantAlgo
 logger = logging.getLogger(__name__)
 
 
-class MoeBackendType(str, Enum):
-    """Enum for MoE backend types."""
-
-    CUTLASS = "CUTLASS"
-    TRTLLM = "TRTLLM"
-    CUTEDSL = "CUTEDSL"
-    DEEPGEMM = "DEEPGEMM"
-
-
-def get_backend_class(backend_type: MoeBackendType) -> Type[MoE]:
-    """Get the MoE backend class for a given backend type."""
-    backend_class_map = {
-        MoeBackendType.CUTLASS: CutlassFusedMoE,
-        MoeBackendType.TRTLLM: TRTLLMGenFusedMoE,
-        MoeBackendType.CUTEDSL: CuteDslFusedMoE,
-        MoeBackendType.DEEPGEMM: DeepGemmFusedMoE,
-    }
-    return backend_class_map[backend_type]
-
-
-def should_skip_TRTLLM(
-    backend_type: MoeBackendType,
-    quant_algo: Optional[QuantAlgo],
-    model_config: "MoeModelConfig",
-) -> Optional[str]:
-    """
-    Check TRTLLM Gen backend specific constraints.
-
-    The TRTLLM Gen MoE kernels have hardware-level constraints that must be satisfied.
-    These constraints are enforced in C++ layer.
-
-    Constraints:
-    1. num_experts must be divisible by 4 (routing kernel vectorization requirement)
-    2. num_experts must be greater than top_k (routing logic requirement)
-
-    Args:
-        backend_type: The MoE backend type
-        quant_algo: The quantization algorithm
-        model_config: The MoE model configuration
-
-    Returns:
-        Skip reason string if test should be skipped, None otherwise
-    """
-    if backend_type != MoeBackendType.TRTLLM:
-        return None
-
-    if model_config is None:
-        return None
-
-    # These quantization algorithms use TRTLLM Gen kernels with the constraints
-    trtllm_gen_quant_algos = {
-        QuantAlgo.NVFP4,
-        QuantAlgo.FP8_BLOCK_SCALES,
-        QuantAlgo.W4A8_NVFP4_FP8,
-        QuantAlgo.W4A16_MXFP4,
-        QuantAlgo.W4A8_MXFP4_MXFP8,
-    }
-
-    if quant_algo not in trtllm_gen_quant_algos:
-        return None
-
-    num_experts = model_config.num_experts
-    top_k = model_config.top_k
-    intermediate_size = model_config.intermediate_size
-
-    # Check: num_experts must be divisible by 4
-    # Routing kernel uses vectorized operations that require this alignment
-    if num_experts % 4 != 0:
-        return (
-            f"TRTLLMGenFusedMoE routing kernel requires num_experts divisible by 4 "
-            f"(got num_experts={num_experts})"
-        )
-
-    # Check: num_experts must be greater than top_k
-    # Routing logic cannot handle the case where all experts are selected
-    if num_experts <= top_k:
-        return (
-            f"TRTLLMGenFusedMoE requires num_experts > top_k "
-            f"(got num_experts={num_experts}, top_k={top_k})"
-        )
-
-    # -----------------Potential issues------------------
-    # These are known issues that need investigation. Skipping to avoid test failures
-    # and CUDA errors that can cascade to subsequent tests.
-
-    # Issue 1: W4A8_NVFP4_FP8 with top_k=1 causes CUDA illegal memory access
-    # This triggers GPU state corruption that affects all subsequent tests.
-    # Affected config: e8_k1_h512_i512
-    if quant_algo == QuantAlgo.W4A8_NVFP4_FP8 and top_k == 1:
-        return (
-            "[Potential Bug] TRTLLMGenFusedMoE W4A8_NVFP4_FP8 with top_k=1 "
-            "causes CUDA illegal memory access. Needs kernel investigation."
-        )
-
-    # Issue 2: NVFP4 with large intermediate_size has known accuracy issues
-    # Observed mismatch: 18%~25% vs expected <7.5% (per test_moe.py baseline)
-    # Affected configs: e8_k2_h4096_i14336, e8_k2_h6144_i32768
-    if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336:
-        return (
-            f"[Potential Bug] TRTLLMGenFusedMoE NVFP4 with large intermediate_size "
-            f"has known accuracy issues (intermediate_size={intermediate_size} >= 14336). "
-            f"Observed mismatch 18%~25% exceeds expected threshold."
-        )
-
-    # Issue 3: W4A8_MXFP4_MXFP8 has accuracy issues on certain model configs
-    # Observed mismatch: 14%~18% vs expected <15% (percent=0.85)
-    # Affected configs: large intermediate_size or many experts
-    # e8_k2_h4096_i14336, e64_k6_h2048_i1408, e60_k4_h2048_i1408,
-    # e256_k8_h7168_i2048, e8_k2_h6144_i32768, e128_k4_h2880_i2880
-    if quant_algo == QuantAlgo.W4A8_MXFP4_MXFP8:
-        # Large intermediate_size (>= 14336) has precision issues
-        if intermediate_size >= 14336:
-            return (
-                f"[Potential Bug] TRTLLMGenFusedMoE W4A8_MXFP4_MXFP8 with large "
-                f"intermediate_size has accuracy issues (intermediate_size={intermediate_size} >= 14336). "
-                f"Observed mismatch 14%~18% exceeds 15% threshold."
-            )
-        # Many experts (>= 60) with moderate intermediate_size has precision issues
-        if num_experts >= 60 and intermediate_size >= 1408:
-            return (
-                f"[Potential Bug] TRTLLMGenFusedMoE W4A8_MXFP4_MXFP8 with many experts "
-                f"has accuracy issues (num_experts={num_experts} >= 60, intermediate_size={intermediate_size}). "
-                f"Observed mismatch 14%~18% exceeds 15% threshold."
-            )
-
-    return None
-
-
-def should_skip_CUTEDSL(
-    backend_type: MoeBackendType,
-    quant_algo: Optional[QuantAlgo],
-    model_config: "MoeModelConfig" = None,
-) -> Optional[str]:
-    """
-    Check CuteDSL backend specific constraints.
-
-    The CuteDSL MoE kernels have known accuracy issues with certain configurations.
-
-    Args:
-        backend_type: The MoE backend type
-        quant_algo: The quantization algorithm
-        model_config: The MoE model configuration
-
-    Returns:
-        Skip reason string if test should be skipped, None otherwise
-    """
-    if backend_type != MoeBackendType.CUTEDSL:
-        return None
-
-    if model_config is None:
-        return None
-
-    intermediate_size = model_config.intermediate_size
-
-    # -----------------Potential issues------------------
-    # NVFP4 with large intermediate_size has known accuracy issues (same as TRTLLM)
-    # Observed mismatch: 8%~26% vs expected <2%
-    # Affected configs: e8_k2_h4096_i14336, e8_k2_h6144_i32768
-    if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336:
-        return (
-            f"[Potential Bug] CuteDslFusedMoE NVFP4 with large intermediate_size "
-            f"has known accuracy issues (intermediate_size={intermediate_size} >= 14336). "
-            f"Observed mismatch 8%~26% exceeds 2% threshold."
-        )
-
-    # NVFP4 with prime num_experts (7, 13) causes CUDA_ERROR_ILLEGAL_ADDRESS
-    # Root cause: Autotuner cache bucket mapping issue
-    # - When tests run in batch, previous tests cache tactics to buckets
-    # - Prime num_experts shapes map to same bucket as other configs
-    # - The cached tactic (e.g., ((128, 256), (1, 2), False)) works for other configs
-    #   but causes illegal memory access for prime num_experts' actual shape
-    # - Single test run passes because fallback tactic ((128, 128), (1, 1), False) is used
-    # Affected configs: e7_k2_h256_i512, e13_k3_h256_i512
-    num_experts = model_config.num_experts
-    prime_experts_with_issues = {7, 13}
-    if quant_algo == QuantAlgo.NVFP4 and num_experts in prime_experts_with_issues:
-        return (
-            f"[Potential Bug] CuteDslFusedMoE NVFP4 with prime num_experts={num_experts} "
-            f"causes CUDA_ERROR_ILLEGAL_ADDRESS due to autotuner cache bucket mapping. "
-            f"Cached tactic from other configs is incompatible with this shape."
-        )
-
-    return None
-
-
 def should_skip_gptoss(
     backend_type: MoeBackendType,
     quant_algo: Optional[QuantAlgo],
-    gptoss_style: bool,
+    swiglu_gptoss_style: bool,
 ) -> Optional[str]:
     """
-    Check if gptoss_style test should be skipped for this backend.
+    Check if swiglu_gptoss_style test should be skipped for this backend.
 
-    Only CUTLASS and TRTLLM backends support gptoss_style (SwiGlu with custom
+    Only CUTLASS and TRTLLM backends support swiglu_gptoss_style (SwiGlu with custom
     alpha/beta/limit parameters and bias).
 
     Args:
         backend_type: The MoE backend type
         quant_algo: The quantization algorithm
-        gptoss_style: Whether gptoss_style is enabled
+        swiglu_gptoss_style: Whether swiglu_gptoss_style is enabled
 
     Returns:
         Skip reason string if test should be skipped, None otherwise
     """
-    if not gptoss_style:
+    if not swiglu_gptoss_style:
         return None
 
-    # Only CUTLASS and TRTLLM backends support gptoss_style
+    # Only CUTLASS and TRTLLM backends support swiglu_gptoss_style
     supported_backends = {MoeBackendType.CUTLASS, MoeBackendType.TRTLLM}
     if backend_type not in supported_backends:
         return (
-            f"gptoss_style is only supported by CUTLASS and TRTLLM backends "
+            f"swiglu_gptoss_style is only supported by CUTLASS and TRTLLM backends "
             f"(got backend_type={backend_type.value})"
         )
 
     return None
 
 
-def supports_autotuner_capture(
-    backend_type: MoeBackendType,
-    quant_algo: Optional[QuantAlgo],
-) -> bool:
-    """
-    Determine if a backend+quant_algo combination supports AutoTuner capture/replay.
-
-    AutoTuner capture/replay requires AutoTuner.choose_one() to be called during
-    run_moe execution.
-
-    Args:
-        backend_type: The MoE backend type
-        quant_algo: The quantization algorithm (None for unquantized)
-
-    Returns:
-        True if autotuner capture/replay is supported, False otherwise
-    """
-    # DEEPGEMM does not support autotuner capture
-    # Evidence: fused_moe_deepgemm.py has no AutoTuner/choose_one references
-    if backend_type == MoeBackendType.DEEPGEMM:
-        return False
-
-    return True
-
-
 def create_test_backend(
     backend_type: MoeBackendType,
     routing_method: RenormalizeMoeRoutingMethod,
@@ -397,60 +190,6 @@ def run_backend_moe(
     return backend.run_moe(**args)
 
 
-def replay_tactics_and_check(
-    all_tactics,
-    run_moe_fn: Callable[[], torch.Tensor],
-    check_accuracy_fn: Callable[[torch.Tensor, torch.Tensor], None],
-    ref_output: torch.Tensor,
-    backend_type: MoeBackendType,
-    quant_algo: Optional[QuantAlgo],
-    fail_fast: bool = False,
-) -> None:
-    """
-    Replay all tactics and check accuracy.
-
-    Args:
-        all_tactics: TacticsCapture object from AutoTuner.capture()
-        run_moe_fn: Function to run MoE computation
-        check_accuracy_fn: Function to check accuracy (output, ref_output) -> None
-        ref_output: Reference output tensor
-        backend_type: Backend type for error reporting
-        quant_algo: Quantization algorithm for error reporting
-        fail_fast: If True, fail on first error. If False, run all and report summary.
-    """
-    tactics_list = list(all_tactics)
-    passed_tactics = []
-    failed_tactics = []
-    logger.info(f"Replay tactics : {len(tactics_list)} and check accuracy")
-    for idx, tactic in enumerate(tactics_list):
-        with AutoTuner.get().replay(tactic), torch.inference_mode():
-            output = run_moe_fn()
-            try:
-                check_accuracy_fn(output, ref_output)
-                passed_tactics.append((idx, tactic))
-            except Exception as e:
-                if fail_fast:
-                    pytest.fail(
-                        f"Accuracy check failed for tactic[{idx}/{len(tactics_list)}]={tactic}, "
-                        f"backend={backend_type}, quant_algo={quant_algo}: {e}"
-                    )
-                failed_tactics.append((idx, tactic, str(e)))
-
-    # Report results (only when fail_fast=False)
-    total = len(tactics_list)
-    num_passed = len(passed_tactics)
-    num_failed = len(failed_tactics)
-    if failed_tactics:
-        fail_details = "\n".join(
-            f"  tactic[{idx}]={tactic}: {err}" for idx, tactic, err in failed_tactics
-        )
-        pytest.fail(
-            f"backend={backend_type}, quant_algo={quant_algo}: "
-            f"{num_passed}/{total} passed, {num_failed}/{total} failed\n"
-            f"Failed tactics:\n{fail_details}"
-        )
-
-
 # ============================================================================
 # Test Parameters
 # ============================================================================
@@ -482,23 +221,6 @@ DTYPES_TO_TEST = [
     torch.bfloat16,
 ]
 
-
-# ============================================================================
-# Model MoE Configurations
-# ============================================================================
-@dataclass
-class MoeModelConfig:
-    """MoE model configuration: (num_experts, top_k, hidden_size, intermediate_size)."""
-
-    num_experts: int
-    top_k: int
-    hidden_size: int
-    intermediate_size: int
-
-    def __str__(self) -> str:
-        return f"e{self.num_experts}_k{self.top_k}_h{self.hidden_size}_i{self.intermediate_size}"
-
-
 # Format: (num_experts, top_k, hidden_size, intermediate_size)
 MOE_MODEL_CONFIGS = [
     # === Real Model Configs ===
@@ -521,89 +243,10 @@ MOE_MODEL_CONFIGS = [
 # Sequence lengths to test
 SEQ_LENS_TO_TEST = [1, 8]
 
-# SwiGLU parameters for gptoss_style testing
-SWIGLU_ALPHAS = [1, 0.1]
-SWIGLU_BETAS = [0, 1]
-SWIGLU_LIMITS = [float("inf"), 1]
-
-
-# ============================================================================
-# Fast Skip Check (for parametrize-level skip, avoids entering test function)
-# ============================================================================
-def get_quick_skip_reason(
-    backend_type: MoeBackendType,
-    quant_algo: Optional[QuantAlgo],
-    dtype: torch.dtype,
-    model_config: "MoeModelConfig",
-    gptoss_style: bool,
-) -> Optional[str]:
-    """
-    Fast skip check that calls backend's can_implement() method.
-
-    This function calls the backend's can_implement() classmethod to check
-    dtype/quant_algo/gptoss_style support, then uses should_skip_* functions
-    for additional model_config specific checks.
-
-    Note: Logging is temporarily suppressed to avoid excessive warning output
-    during test parameter generation.
-
-    Returns:
-        Skip reason string if test should be skipped, None otherwise
-    """
-    import logging as _logging
-
-    # Suppress logger warnings during parameter generation to avoid excessive output
-    trtllm_logger = _logging.getLogger("tensorrt_llm")
-    original_level = trtllm_logger.level
-    trtllm_logger.setLevel(_logging.ERROR)
-
-    try:
-        # ===== Call backend's can_implement for dtype/quant_algo/gptoss_style checks =====
-        backend_cls = get_backend_class(backend_type)
-        can_impl, skip_reason = backend_cls.can_implement(
-            quant_algo, dtype_activation=dtype, gptoss_style=gptoss_style
-        )
-        if not can_impl:
-            return skip_reason
-
-        # ===== Additional model_config specific checks =====
-
-        # TRTLLM: num_experts constraints and accuracy issues
-        skip_reason = should_skip_TRTLLM(backend_type, quant_algo, model_config)
-        if skip_reason:
-            return skip_reason
-
-        # CUTEDSL: accuracy issues with specific configs
-        skip_reason = should_skip_CUTEDSL(backend_type, quant_algo, model_config)
-        if skip_reason:
-            return skip_reason
-
-        # DEEPGEMM: float16 reference module constraint
-        if backend_type == MoeBackendType.DEEPGEMM and dtype == torch.float16:
-            return "DeepGemmFusedMoE reference module (FP8BlockScalesLinearMethod) requires bfloat16 input"
-
-        # 128-alignment requirement for quantization
-        if quant_algo is not None:
-            hidden_size = model_config.hidden_size
-            intermediate_size = model_config.intermediate_size
-            is_hidden_128_aligned = hidden_size % 128 == 0
-            is_intermediate_128_aligned = intermediate_size % 128 == 0
-
-            if not is_hidden_128_aligned or not is_intermediate_128_aligned:
-                # TRTLLM with MXFP4 variants automatically pads to 128 alignment
-                is_mxfp4_variant = quant_algo in {QuantAlgo.W4A16_MXFP4, QuantAlgo.W4A8_MXFP4_MXFP8}
-                is_trtllm_backend = backend_type == MoeBackendType.TRTLLM
-                if not (is_trtllm_backend and is_mxfp4_variant):
-                    return (
-                        f"Non-128-aligned sizes (h={hidden_size}, i={intermediate_size}) "
-                        f"require TRTLLM backend with MXFP4 quantization"
-                    )
-
-        return None
-
-    finally:
-        # Restore logger level
-        trtllm_logger.setLevel(original_level)
+# SwiGLU parameters for swiglu_gptoss_style testing
+SWIGLU_ALPHAS = [1, 1.702]  # default, GPT-OSS (modeling_gpt_oss.py)
+SWIGLU_BETAS = [0, 1.0]  # default, GPT-OSS
+SWIGLU_LIMITS = [float("inf"), 7.0]  # default, GPT-OSS
 
 
 def generate_test_params() -> List:
@@ -617,57 +260,41 @@ def generate_test_params() -> List:
     Returns:
         List of pytest.param objects with appropriate skip marks
     """
-    params: List = []
-
-    # Generate all combinations
     swiglu_combos = list(itertools.product(SWIGLU_ALPHAS, SWIGLU_BETAS, SWIGLU_LIMITS))
 
-    for swiglu_alpha, swiglu_beta, swiglu_limit in swiglu_combos:
-        for model_config in MOE_MODEL_CONFIGS:
-            for seq_len in SEQ_LENS_TO_TEST:
-                for dtype in DTYPES_TO_TEST:
-                    for backend_type in BACKEND_TYPES_TO_TEST:
-                        for quant_algo in QUANT_ALGOS_TO_TEST:
-                            # Determine gptoss_style
-                            gptoss_style = (
-                                swiglu_alpha != 1
-                                or swiglu_beta != 0
-                                or swiglu_limit != float("inf")
-                            )
-
-                            # Generate test ID
-                            test_id = (
-                                f"alpha={swiglu_alpha}_beta={swiglu_beta}_limit={swiglu_limit}-"
-                                f"{model_config}-seq={seq_len}-dtype={dtype}-"
-                                f"backend={backend_type.value}-quant_algo={quant_algo}"
-                            )
-
-                            # Check if should skip
-                            skip_reason = get_quick_skip_reason(
-                                backend_type, quant_algo, dtype, model_config, gptoss_style
-                            )
-
-                            param_values = (
-                                dtype,
-                                backend_type,
-                                quant_algo,
-                                seq_len,
-                                model_config,
-                                swiglu_alpha,
-                                swiglu_beta,
-                                swiglu_limit,
-                            )
-
-                            if skip_reason:
-                                params.append(
-                                    pytest.param(
-                                        *param_values,
-                                        id=test_id,
-                                        marks=pytest.mark.skip(reason=skip_reason),
-                                    )
-                                )
-                            else:
-                                params.append(pytest.param(*param_values, id=test_id))
+    params: List = []
+    for (
+        swiglu_alpha,
+        swiglu_beta,
+        swiglu_limit,
+        model_config,
+        seq_len,
+        dtype,
+        backend_type,
+        quant_algo,
+        routing_method_cls,
+        skip_reason,
+        test_id,
+    ) in iter_base_test_configs(
+        swiglu_combos,
+        MOE_MODEL_CONFIGS,
+        SEQ_LENS_TO_TEST,
+        DTYPES_TO_TEST,
+        BACKEND_TYPES_TO_TEST,
+        QUANT_ALGOS_TO_TEST,
+    ):
+        param_values = (
+            dtype,
+            backend_type,
+            quant_algo,
+            seq_len,
+            model_config,
+            routing_method_cls,
+            swiglu_alpha,
+            swiglu_beta,
+            swiglu_limit,
+        )
+        params.append(create_test_param(param_values, test_id, skip_reason))
 
     return params
 
@@ -676,23 +303,6 @@ def generate_test_params() -> List:
 TEST_PARAMS = generate_test_params()
 
 
-# ============================================================================
-# Timing Fixtures
-# ============================================================================
-@pytest.fixture(scope="module", autouse=True)
-def module_timer(request):
-    """Fixture to measure and log total module execution time."""
-    start = time.perf_counter()
-    yield
-    elapsed = time.perf_counter() - start
-    logger.info(
-        "[TIMING] Total %s: %.3fs (%.2f min)",
-        request.module.__name__,
-        elapsed,
-        elapsed / 60,
-    )
-
-
 # ============================================================================
 # Test Implementation
 # ============================================================================
@@ -740,15 +350,16 @@ def module_timer(request):
 # Skip Logic
 # =============================================================================
 # Tests are automatically skipped for unsupported configurations using:
-# - backend.can_implement(): Check dtype/quant_algo/gptoss_style support
-# - should_skip_TRTLLM(): TRTLLM-specific constraints (num_experts % 4, etc.)
-# - should_skip_CUTEDSL(): CuteDSL-specific accuracy issues
+# - backend.can_implement(): Check dtype/quant_algo/swiglu_gptoss_style support
+# - should_skip_trtllm(): TRTLLM-specific constraints (num_experts % 4, etc.)
+# - should_skip_cutedsl(): CuteDSL-specific accuracy issues
 # - 128-alignment requirements for quantization
 #
 # =============================================================================
 @pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test")
 @pytest.mark.parametrize(
-    "dtype_activation,backend_type,quant_algo,seq_len,model_config,swiglu_alpha,swiglu_beta,swiglu_limit",
+    "dtype_activation,backend_type,quant_algo,seq_len,model_config,"
+    "routing_method_cls,swiglu_alpha,swiglu_beta,swiglu_limit",
     TEST_PARAMS,
 )
 def test_moe_backend(
@@ -757,6 +368,7 @@ def test_moe_backend(
     quant_algo: Optional[QuantAlgo],
     seq_len: int,
     model_config: MoeModelConfig,
+    routing_method_cls,
     swiglu_alpha: float,
     swiglu_beta: float,
     swiglu_limit: float,
@@ -768,12 +380,12 @@ def test_moe_backend(
     1. Autotune works correctly with the backend
     2. All tactics are captured properly
     3. Different sequence lengths use appropriate tactics
-    4. gptoss_style (SwiGlu with custom parameters) works correctly
+    4. swiglu_gptoss_style (SwiGlu with custom parameters) works correctly
     """
-    # Determine gptoss_style based on swiglu parameters
-    # gptoss_style is True when any swiglu parameter deviates from default
+    # Determine swiglu_gptoss_style based on swiglu parameters
+    # swiglu_gptoss_style is True when any swiglu parameter deviates from default
     # Default values: alpha=1, beta=0, limit=inf
-    gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf")
+    swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf")
 
     # Note: Skip logic is now handled at parametrize level via get_quick_skip_reason()
     # which calls backend's can_implement() and should_skip_* functions.
@@ -797,8 +409,8 @@ def test_moe_backend(
         # Setup autotuner distributed state
         AutoTuner.get().setup_distributed_state(mapping)
 
-        # Create routing method
-        routing_method = RenormalizeMoeRoutingMethod(top_k=top_k)
+        # Create routing method from parametrized class
+        routing_method = routing_method_cls(top_k=top_k)
 
         # Create test inputs
         x = torch.randn((seq_len, hidden_size), dtype=dtype_activation, device="cuda")
@@ -810,21 +422,21 @@ def test_moe_backend(
             quant_algo, x, backend_type
         )
 
-        # Create quantize utility with gptoss_style parameters
+        # Create quantize utility with swiglu_gptoss_style parameters
         quantize_util = quantize_util_cls(
             num_experts=num_experts,
             dtype=dtype_activation,
             intermediate_size=intermediate_size,
             hidden_size=hidden_size,
             quant_config=quant_config,
-            bias=gptoss_style,
-            gptoss_style=gptoss_style,
-            swiglu_alpha=swiglu_alpha if gptoss_style else None,
-            swiglu_beta=swiglu_beta if gptoss_style else None,
-            swiglu_limit=swiglu_limit if gptoss_style else None,
+            bias=swiglu_gptoss_style,
+            swiglu_gptoss_style=swiglu_gptoss_style,
+            swiglu_alpha=swiglu_alpha if swiglu_gptoss_style else None,
+            swiglu_beta=swiglu_beta if swiglu_gptoss_style else None,
+            swiglu_limit=swiglu_limit if swiglu_gptoss_style else None,
         )
 
-        # Get swiglu tensors if gptoss_style is enabled
+        # Get swiglu tensors if swiglu_gptoss_style is enabled
         swiglu_tensors = quantize_util.get_swiglu_tensors()
 
         # Create backend first (needed for MXFP4_MXFP8 to get shapes)
@@ -837,7 +449,7 @@ def test_moe_backend(
             dtype=dtype_activation,
             quant_config=quant_config,
             mapping=mapping,
-            bias=gptoss_style,
+            bias=swiglu_gptoss_style,
             swiglu_alpha=swiglu_tensors["swiglu_alpha"] if swiglu_tensors else None,
             swiglu_beta=swiglu_tensors["swiglu_beta"] if swiglu_tensors else None,
             swiglu_limit=swiglu_tensors["swiglu_limit"] if swiglu_tensors else None,
diff --git a/tests/unittest/_torch/modules/moe/test_moe_module.py b/tests/unittest/_torch/modules/moe/test_moe_module.py
index becc8df849..bc4060b5a6 100644
--- a/tests/unittest/_torch/modules/moe/test_moe_module.py
+++ b/tests/unittest/_torch/modules/moe/test_moe_module.py
@@ -12,33 +12,89 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+"""
+MoE Module Unit Tests
+
+This module provides a unified test framework for testing MoE modules through the
+high-level create_moe() + forward() interface, rather than the backend-level interfaces.
+
+Design Goals:
+1. Test MoE module via: create_moe -> load_weights -> forward
+2. Cover key quantization + backend combinations
+3. Support EPLB (Expert Load Balancing) testing
+4. Support autotune and tactic capture testing
+"""
+
 import copy
+import logging
 import os
 import pickle
 import sys
 from contextlib import nullcontext
+from itertools import product
+from typing import List, Optional
 
 import cloudpickle
 import pytest
 import torch
+from _torch.modules.moe.moe_test_utils import (
+    MoeBackendType,
+    MoeModelConfig,
+    create_test_param,
+    get_quick_skip_reason,
+    iter_base_test_configs,
+    module_timer,  # noqa: F401 - imported for pytest fixture registration
+    replay_tactics_and_check,
+    should_skip_cutedsl,
+    should_skip_deepgemm,
+    should_skip_multi_gpu,
+    should_skip_trtllm,
+    supports_autotuner_capture,
+)
 from _torch.modules.moe.quantize_utils import get_test_quant_params
 from mpi4py import MPI
 from mpi4py.futures import MPIPoolExecutor
 from transformers.configuration_utils import PretrainedConfig
-from utils.util import getSMVersion
 
 import tensorrt_llm.bindings.internal.runtime as _tbr
+from tensorrt_llm._torch.autotuner import AutoTuner, autotune
 from tensorrt_llm._torch.model_config import ModelConfig
-from tensorrt_llm._torch.modules.fused_moe import RenormalizeMoeRoutingMethod, create_moe
+from tensorrt_llm._torch.modules.fused_moe import (
+    DeepSeekV3MoeRoutingMethod,
+    DefaultMoeRoutingMethod,
+    Llama4RenormalizeMoeRoutingMethod,
+    MiniMaxM2MoeRoutingMethod,
+    RenormalizeMoeRoutingMethod,
+    RenormalizeNaiveMoeRoutingMethod,
+    create_moe,
+)
 from tensorrt_llm._torch.modules.fused_moe.moe_load_balancer import (
     MoeLoadBalancer,
     MoeLoadBalancerIterContext,
 )
+from tensorrt_llm._torch.modules.fused_moe.quantization import (
+    DeepSeekFP8BlockScalesFusedMoEMethod,
+    FP8QDQFusedMoEMethod,
+    INT8WoqPerChannelFusedMoEMethod,
+    NVFP4CutlassFusedMoEMethod,
+    NVFP4TRTLLMGenFusedMoEMethod,
+    UnquantizedFusedMoEMethod,
+    W4A8MXFP4FP8CutlassFusedMoEMethod,
+    W4A8MXFP4FP8TRTLLMGenFusedMoEMethod,
+    W4A8MXFP4MXFP8CutlassFusedMoEMethod,
+    W4A8MXFP4MXFP8TRTLLMGenFusedMoEMethod,
+    W4A8NVFP4FP8TRTLLMGenFusedMoEMethod,
+    W4A16MXFP4TRTLLMGenFusedMoEMethod,
+    WFP4A16FusedMoEMethod,
+    WInt4AFP8FusedMoEMethod,
+)
 from tensorrt_llm._utils import mpi_rank
 from tensorrt_llm.llmapi.llm_args import MoeLoadBalancerConfig
 from tensorrt_llm.mapping import Mapping
 from tensorrt_llm.models.modeling_utils import QuantAlgo
 
+logger = logging.getLogger(__name__)
+
 cloudpickle.register_pickle_by_value(sys.modules[__name__])
 MPI.pickle.__init__(
     cloudpickle.dumps,
@@ -47,9 +103,250 @@ MPI.pickle.__init__(
 )
 
 
-def _skip_helper(quant_algo):
-    if quant_algo == QuantAlgo.NVFP4 and getSMVersion() < 100:
-        pytest.skip("This test is not supported in pre-Blackwell architecture")
+def _create_mapping_for_parallel_mode(world_size, parallel_mode):
+    """Create Mapping for different parallelism strategies.
+
+    Args:
+        world_size: Total number of GPUs
+        parallel_mode: One of "DEP", "TEP", "DTP", "TTP"
+            - DEP: Attention uses DP, MoE uses EP
+            - TEP: Attention uses TP, MoE uses EP
+            - DTP: Attention uses DP, MoE uses TP
+            - TTP: Attention uses TP, MoE uses TP
+
+    Returns:
+        Mapping object configured for the specified parallel mode
+    """
+    configs = {
+        "DEP": {  # Attention DP, MoE EP
+            "moe_ep_size": world_size,
+            "moe_tp_size": 1,
+            "enable_attention_dp": True,
+        },
+        "TEP": {  # Attention TP, MoE EP
+            "moe_ep_size": world_size,
+            "moe_tp_size": 1,
+            "enable_attention_dp": False,
+        },
+        "DTP": {  # Attention DP, MoE TP
+            "moe_ep_size": 1,
+            "moe_tp_size": world_size,
+            "enable_attention_dp": True,
+        },
+        "TTP": {  # Attention TP, MoE TP
+            "moe_ep_size": 1,
+            "moe_tp_size": world_size,
+            "enable_attention_dp": False,
+        },
+    }
+    if parallel_mode not in configs:
+        raise ValueError(
+            f"Unknown parallel_mode: {parallel_mode}. Must be one of {list(configs.keys())}"
+        )
+
+    cfg = configs[parallel_mode]
+    return Mapping(
+        world_size=world_size,
+        tp_size=world_size,
+        moe_ep_size=cfg["moe_ep_size"],
+        moe_tp_size=cfg["moe_tp_size"],
+        enable_attention_dp=cfg["enable_attention_dp"],
+    )
+
+
+def _create_moe_load_balancer(model_cfg, enable_eplb):
+    """Create MoeLoadBalancer if EPLB is enabled, otherwise return nullcontext."""
+    if not enable_eplb:
+        return nullcontext()
+
+    ep_rank = model_cfg.mapping.moe_ep_rank
+    ep_size = model_cfg.mapping.moe_ep_size
+    model_cfg.moe_load_balancer.setup(ep_rank=ep_rank, ep_size=ep_size)
+    return MoeLoadBalancer(
+        ep_rank=ep_rank,
+        ep_size=ep_size,
+        layer_updates_per_iter=model_cfg.moe_load_balancer.layer_updates_per_iter,
+    )
+
+
+def _setup_autotuner_for_test(mapping):
+    """Configure AutoTuner for faster unit test profiling."""
+    AutoTuner.get().setup_distributed_state(mapping)
+    AutoTuner.get().clear_cache()
+    autotuner = AutoTuner.get()
+    autotuner.warmup = 0  # default: 2
+    autotuner.repeat = 1  # default: 10
+    autotuner.stream_delay_micro_secs = 10  # default: 1000
+
+
+def _create_model_config(
+    num_experts,
+    hidden_size,
+    intermediate_size,
+    dtype,
+    mapping,
+    quant_config,
+    moe_backend,
+    enable_eplb=False,
+    num_slots=-1,
+    layer_updates_per_iter=-1,
+):
+    """Create PretrainedConfig and ModelConfig for MoE testing."""
+    pretrained_config = PretrainedConfig()
+    pretrained_config.num_experts = num_experts
+    pretrained_config.hidden_size = hidden_size
+    pretrained_config.intermediate_size = intermediate_size
+    pretrained_config.torch_dtype = dtype
+
+    moe_load_balancer_config = (
+        MoeLoadBalancerConfig(
+            num_slots=num_slots,
+            layer_updates_per_iter=layer_updates_per_iter,
+        )
+        if enable_eplb
+        else None
+    )
+
+    return ModelConfig(
+        pretrained_config=pretrained_config,
+        mapping=mapping,
+        quant_config=quant_config,
+        moe_backend=moe_backend,
+        moe_disable_finalize_fusion=False,
+        moe_load_balancer=moe_load_balancer_config,
+    )
+
+
+def _run_autotune_test(
+    run_forward_fn, ref_fused_moe, ref_output, backend_type, quant_algo, run_all_tactics=False
+):
+    """Run autotune phase and tactic replay test.
+
+    Args:
+        run_forward_fn: Forward function to run
+        ref_fused_moe: Reference MoE module for accuracy check
+        ref_output: Reference output for comparison
+        backend_type: MoE backend type
+        quant_algo: Quantization algorithm
+        run_all_tactics: If False, skip full tactic replay and only run simple accuracy check
+    """
+    # Autotune phase
+    with torch.inference_mode(), autotune(cache_path="/tmp/moe_module_autotuner_cache.json"):
+        _ = run_forward_fn()
+
+    # Check if we should run full tactic replay
+    if not run_all_tactics or not supports_autotuner_capture(backend_type, quant_algo):
+        # Simple accuracy check for unsupported backends or when run_all_tactics is False
+        with torch.inference_mode():
+            output = run_forward_fn()
+            ref_fused_moe.check_accuracy(output, ref_output)
+        return
+
+    # Capture phase: record which tactics are used
+    with AutoTuner.get().capture() as all_tactics, torch.inference_mode():
+        _ = run_forward_fn()
+
+    # Replay phase: test each tactic for correctness
+    replay_tactics_and_check(
+        all_tactics=all_tactics,
+        run_moe_fn=run_forward_fn,
+        check_accuracy_fn=ref_fused_moe.check_accuracy,
+        ref_output=ref_output,
+        backend_type=backend_type,
+        quant_algo=quant_algo,
+        fail_fast=False,
+    )
+
+
+def _run_eplb_test(
+    run_forward_fn, ref_fused_moe, ref_output, moe_load_balancer, initial_expert_ids
+):
+    """Run EPLB multi-iteration test.
+
+    Args:
+        run_forward_fn: Forward function to run
+        ref_fused_moe: Reference MoE module for accuracy check
+        ref_output: Reference output for comparison
+        moe_load_balancer: MoeLoadBalancer instance
+        initial_expert_ids: Expert IDs recorded immediately after MoE initialization (before any forward)
+    """
+    assert isinstance(moe_load_balancer, MoeLoadBalancer), (
+        "Moe load balancer should be created when eplb is enabled"
+    )
+    assert initial_expert_ids is not None, (
+        "initial_expert_ids should be recorded before any forward pass"
+    )
+
+    extra_steps = 1
+    for _ in range(extra_steps):
+        output = run_forward_fn()
+        ref_fused_moe.check_accuracy(output, ref_output)
+
+    current_expert_ids = copy.deepcopy(
+        moe_load_balancer.single_layer_load_balancers[0].get_old_rank_expert_ids()
+    )
+
+    # EPLB should have updated expert_ids from initial state
+    assert initial_expert_ids != current_expert_ids, (
+        f"Expert ids after eplb update should be different from the initial loaded ones. "
+        f"Initial: {initial_expert_ids}, Current: {current_expert_ids}"
+    )
+
+
+def _create_routing_method(routing_method_cls, top_k, num_experts, dtype):
+    """
+    Create a routing method instance with appropriate parameters for each routing method type.
+
+    Args:
+        routing_method_cls: The routing method class to instantiate
+        top_k: Number of experts to select per token
+        num_experts: Total number of experts
+        dtype: Data type for tensors
+
+    Returns:
+        An instance of the routing method
+    """
+    # Routing methods with force_enable_pytorch_op support
+    if routing_method_cls in (RenormalizeMoeRoutingMethod, DefaultMoeRoutingMethod):
+        return routing_method_cls(top_k=top_k, force_enable_pytorch_op=True)
+
+    # Simple routing methods (only top_k)
+    if routing_method_cls in (RenormalizeNaiveMoeRoutingMethod, Llama4RenormalizeMoeRoutingMethod):
+        return routing_method_cls(top_k=top_k)
+
+    # DeepSeekV3 routing method requires special parameters
+    if routing_method_cls == DeepSeekV3MoeRoutingMethod:
+        # DeepSeek-V3 routing: groups experts, selects top groups, then selects top_k from those
+        # The routing logic does topk(k=2) within each group, so each group must have >= 2 experts
+        # Calculate n_group such that each group has at least 2 experts
+        experts_per_group = 2
+        n_group = max(1, num_experts // experts_per_group)
+        # topk_group should be <= n_group and reasonable for the selection
+        topk_group = min(n_group, max(1, n_group // 2))
+        routed_scaling_factor = 1.0
+        # Create e_score_correction_bias as a zero tensor (no bias correction in test)
+        e_score_correction_bias = torch.zeros(num_experts, dtype=dtype, device="cuda")
+        return routing_method_cls(
+            top_k=top_k,
+            n_group=n_group,
+            topk_group=topk_group,
+            routed_scaling_factor=routed_scaling_factor,
+            callable_e_score_correction_bias=lambda: e_score_correction_bias,
+            is_fused=False,  # Use PyTorch implementation for testing
+        )
+
+    # MiniMaxM2 routing method requires special parameters
+    if routing_method_cls == MiniMaxM2MoeRoutingMethod:
+        # Create e_score_correction_bias as a zero tensor (no bias correction in test)
+        e_score_correction_bias = torch.zeros(num_experts, dtype=dtype, device="cuda")
+        return routing_method_cls(
+            top_k=top_k,
+            num_experts=num_experts,
+            callable_e_score_correction_bias=lambda: e_score_correction_bias,
+        )
+
+    # Fallback: try with just top_k
+    return routing_method_cls(top_k=top_k)
 
 
 def _test_moe_worker(
@@ -60,119 +357,202 @@ def _test_moe_worker(
     enable_eplb=False,
     layer_updates_per_iter=-1,
     num_slots=-1,
+    model_config: Optional[MoeModelConfig] = None,
+    seq_len: int = 4,
+    enable_autotune: bool = False,
+    routing_method_cls=RenormalizeMoeRoutingMethod,
+    dtype_routing_logits=None,
+    swiglu_alpha: float = 1,
+    swiglu_beta: float = 0,
+    swiglu_limit: float = float("inf"),
 ):
-    # Hardcode some parameters for testing
-    # activation and weight related
-    seq_len = 4
-    top_k = 2
-    num_experts = 8
-    hidden_size = 512
-    intermediate_size = 512
+    """
+    Test MoE module worker function.
 
-    # Other parameters
-    finalize_fusion = True
+    This test verifies:
+    1. MoE module forward pass produces correct results
+    2. EPLB (Expert Load Balancing) works correctly when enabled
+    3. Autotune works correctly with the module when enabled
+    4. All tactics are captured and replayed properly when autotune is enabled
 
+    Args:
+        routing_method_cls: Routing method class to use (default: RenormalizeMoeRoutingMethod)
+        dtype_routing_logits: Data type for routing logits (default: same as dtype).
+                              DeepSeekV3 routing requires torch.float32.
+        swiglu_alpha: SwiGLU alpha parameter (default=1, non-gptoss)
+        swiglu_beta: SwiGLU beta parameter (default=0, non-gptoss)
+        swiglu_limit: SwiGLU limit parameter (default=inf, non-gptoss)
+    """
+    import traceback
+
+    try:
+        _test_moe_worker_impl(
+            moe_backend=moe_backend,
+            dtype=dtype,
+            quant_algo=quant_algo,
+            mapping=mapping,
+            enable_eplb=enable_eplb,
+            layer_updates_per_iter=layer_updates_per_iter,
+            num_slots=num_slots,
+            model_config=model_config,
+            seq_len=seq_len,
+            enable_autotune=enable_autotune,
+            routing_method_cls=routing_method_cls,
+            dtype_routing_logits=dtype_routing_logits,
+            swiglu_alpha=swiglu_alpha,
+            swiglu_beta=swiglu_beta,
+            swiglu_limit=swiglu_limit,
+        )
+    except Exception:
+        traceback.print_exc()
+        raise
+
+
+def _test_moe_worker_impl(
+    moe_backend,
+    dtype,
+    quant_algo,
+    mapping=None,
+    enable_eplb=False,
+    layer_updates_per_iter=-1,
+    num_slots=-1,
+    model_config: Optional[MoeModelConfig] = None,
+    seq_len: int = 4,
+    enable_autotune: bool = False,
+    routing_method_cls=RenormalizeMoeRoutingMethod,
+    dtype_routing_logits=None,
+    swiglu_alpha: float = 1,
+    swiglu_beta: float = 0,
+    swiglu_limit: float = float("inf"),
+):
+    """Actual implementation of _test_moe_worker."""
+    # Default routing logits dtype to model dtype if not specified
+    if dtype_routing_logits is None:
+        dtype_routing_logits = dtype
+    # Parse model config
+    if model_config is not None:
+        num_experts = model_config.num_experts
+        top_k = model_config.top_k
+        hidden_size = model_config.hidden_size
+        intermediate_size = model_config.intermediate_size
+    else:
+        num_experts, top_k, hidden_size, intermediate_size = 8, 2, 512, 512
+
+    # Setup mapping
     mapping = mapping or Mapping()
     mapping.rank = mpi_rank()
-
     all_rank_num_tokens = [seq_len] * mapping.world_size
-
     torch.cuda.set_device(mapping.rank)
 
     with torch.device(f"cuda:{mapping.rank}"):
         torch.manual_seed(0)
         torch.cuda.manual_seed(0)
 
-        # Create route method
-        routing_method = RenormalizeMoeRoutingMethod(top_k=top_k, force_enable_pytorch_op=True)
-
-        # Create activation and weight
+        # Create routing method and input tensors
+        routing_method = _create_routing_method(
+            routing_method_cls, top_k=top_k, num_experts=num_experts, dtype=dtype
+        )
         x = torch.randn((seq_len, hidden_size), dtype=dtype, device="cuda")
         if enable_eplb:
-            # Here we create same router_logits for all tokens to force the eplb update weights
-            router_logits = torch.randn((1, num_experts), dtype=dtype, device="cuda").repeat(
-                seq_len, 1
-            )
+            # Same router_logits for all tokens to force the eplb update weights
+            router_logits = torch.randn(
+                (1, num_experts), dtype=dtype_routing_logits, device="cuda"
+            ).repeat(seq_len, 1)
         else:
-            router_logits = torch.randn((seq_len, num_experts), dtype=dtype, device="cuda")
+            router_logits = torch.randn(
+                (seq_len, num_experts), dtype=dtype_routing_logits, device="cuda"
+            )
 
-        quantize_util_cls, quant_config, quant_kwargs = get_test_quant_params(quant_algo, x)
+        # Determine swiglu_gptoss_style
+        swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf")
+
+        # In EP mode, swiglu tensors must be sized per local experts
+        # (C++ kernels check: swiglu_alpha.size(0) == num_experts_on_rank)
+        num_local_experts = num_experts // mapping.moe_ep_size
+
+        # Setup quantization
+        backend_type = MoeBackendType(moe_backend)
+        quantize_util_cls, quant_config, quant_kwargs = get_test_quant_params(
+            quant_algo, x, backend_type
+        )
         quantize_util = quantize_util_cls(
             num_experts=num_experts,
             dtype=dtype,
             intermediate_size=intermediate_size,
             hidden_size=hidden_size,
             quant_config=quant_config,
+            bias=swiglu_gptoss_style,
+            swiglu_gptoss_style=swiglu_gptoss_style,
+            swiglu_alpha=swiglu_alpha if swiglu_gptoss_style else None,
+            swiglu_beta=swiglu_beta if swiglu_gptoss_style else None,
+            swiglu_limit=swiglu_limit if swiglu_gptoss_style else None,
+            num_local_experts=num_local_experts,
         )
         weights = quantize_util.create_weights(**quant_kwargs)
 
+        # For EPLB, keep weights on CPU
         if enable_eplb:
-            # Keep the tensor on CPU for eplb
             for key in weights:
                 if isinstance(weights[key], torch.Tensor):
                     weights[key] = weights[key].to("cpu")
-
-        # Deepcopy the CPU weight since when eplb turns on, fused moe may advise_tensor_pageout in post load weight.
         ref_weights = copy.deepcopy(weights) if enable_eplb else weights
 
-        # Create pretrained config
-        pretrained_config = PretrainedConfig()
-        pretrained_config.num_experts = num_experts
-        pretrained_config.hidden_size = hidden_size
-        pretrained_config.intermediate_size = intermediate_size
-        pretrained_config.torch_dtype = dtype
-
-        if enable_eplb:
-            moe_load_balancer_config = MoeLoadBalancerConfig(
-                num_slots=num_slots,
-                layer_updates_per_iter=layer_updates_per_iter,
-            )
-        else:
-            moe_load_balancer_config = None
-
-        model_config = ModelConfig(
-            pretrained_config=pretrained_config,
+        # Create configs
+        model_cfg = _create_model_config(
+            num_experts=num_experts,
+            hidden_size=hidden_size,
+            intermediate_size=intermediate_size,
+            dtype=dtype,
             mapping=mapping,
             quant_config=quant_config,
             moe_backend=moe_backend,
-            moe_disable_finalize_fusion=not finalize_fusion,
-            moe_load_balancer=moe_load_balancer_config,
+            enable_eplb=enable_eplb,
+            num_slots=num_slots,
+            layer_updates_per_iter=layer_updates_per_iter,
         )
 
-        moe_load_balancer = nullcontext()
-        if enable_eplb:
-            # A simple implementation of maybe_create_moe_load_balancer for unit test.
-            ep_rank = model_config.mapping.moe_ep_rank
-            ep_size = model_config.mapping.moe_ep_size
-            model_config.moe_load_balancer.setup(ep_rank=ep_rank, ep_size=ep_size)
-            moe_load_balancer = MoeLoadBalancer(
-                ep_rank=ep_rank,
-                ep_size=ep_size,
-                layer_updates_per_iter=model_config.moe_load_balancer.layer_updates_per_iter,
-            )
+        # Create MoE load balancer
+        moe_load_balancer = _create_moe_load_balancer(model_cfg, enable_eplb)
+
+        # Get swiglu tensors if swiglu_gptoss_style is enabled
+        swiglu_tensors = quantize_util.get_swiglu_tensors()
 
         with moe_load_balancer:
-            # Create fused MoE module
+            # Create and setup fused MoE module
             fused_moe = create_moe(
-                routing_method=routing_method, reduce_results=True, model_config=model_config
+                routing_method=routing_method,
+                reduce_results=True,
+                model_config=model_cfg,
+                bias=swiglu_gptoss_style,
+                swiglu_alpha=swiglu_tensors["swiglu_alpha"] if swiglu_tensors else None,
+                swiglu_beta=swiglu_tensors["swiglu_beta"] if swiglu_tensors else None,
+                swiglu_limit=swiglu_tensors["swiglu_limit"] if swiglu_tensors else None,
             )
-
             fused_moe.load_weights([weights])
             fused_moe.post_load_weights()
             fused_moe.cuda(f"cuda:{mapping.rank}")
 
+            # Record initial expert_ids before any forward pass (for EPLB test)
+            initial_expert_ids = None
             if isinstance(moe_load_balancer, MoeLoadBalancer):
                 moe_load_balancer.register_weight_slots_after_to_cuda()
                 moe_load_balancer.finalize_model()
+                moe_load_balancer.set_iter_info(enable_statistic=True, enable_update_weights=True)
+                # Record initial expert_ids immediately after initialization
+                # Use deepcopy to avoid reference issues if the list is modified in-place
+                initial_expert_ids = copy.deepcopy(
+                    moe_load_balancer.single_layer_load_balancers[0].get_old_rank_expert_ids()
+                )
+                logger.info(f"[EPLB Debug] Initial expert_ids (after init): {initial_expert_ids}")
 
+            # Create reference module
             ref_fused_moe = quantize_util.create_ref_module(routing_method)
             ref_fused_moe.load_weights([ref_weights])
             ref_fused_moe.cuda(f"cuda:{mapping.rank}")
 
-            # Evaluate the outputs
-            def _run_forward(x, router_logits, skip_ref=False):
+            # Define forward function
+            def run_forward():
                 with torch.inference_mode():
-                    ref_output = None if skip_ref else ref_fused_moe.forward(x, router_logits)
                     if isinstance(moe_load_balancer, MoeLoadBalancer):
                         with MoeLoadBalancerIterContext(moe_load_balancer):
                             output = fused_moe.forward(
@@ -183,72 +563,24 @@ def _test_moe_worker(
                             x, router_logits, all_rank_num_tokens=all_rank_num_tokens
                         )
                 torch.cuda.synchronize()
-                return ref_output, output
+                return output
 
-            load_expert_ids = None
-            if isinstance(moe_load_balancer, MoeLoadBalancer):
-                moe_load_balancer.set_iter_info(enable_statistic=True, enable_update_weights=True)
-                load_expert_ids = moe_load_balancer.single_layer_load_balancers[
-                    0
-                ].get_old_rank_expert_ids()
+            # Get reference output
+            with torch.inference_mode():
+                ref_output = ref_fused_moe.forward(x, router_logits)
 
-            ref_output, output = _run_forward(x, router_logits)
-            ref_fused_moe.check_accuracy(output, ref_output)
+            # Run tests
+            if enable_autotune:
+                _setup_autotuner_for_test(mapping)
+                _run_autotune_test(run_forward, ref_fused_moe, ref_output, backend_type, quant_algo)
+            else:
+                output = run_forward()
+                ref_fused_moe.check_accuracy(output, ref_output)
 
             if enable_eplb:
-                # Multi iter run for eplb
-                assert isinstance(moe_load_balancer, MoeLoadBalancer), (
-                    "Moe load balancer should be created when eplb is enabled"
+                _run_eplb_test(
+                    run_forward, ref_fused_moe, ref_output, moe_load_balancer, initial_expert_ids
                 )
-                extra_steps = 3
-                for _ in range(extra_steps):
-                    _, output = _run_forward(x, router_logits, skip_ref=True)
-                    ref_fused_moe.check_accuracy(output, ref_output)
-                assert moe_load_balancer.iter_id == extra_steps + 1, (
-                    "Iter id should be equal to extra steps + 1 after multiple iterations"
-                )
-
-                current_expert_ids = moe_load_balancer.single_layer_load_balancers[
-                    0
-                ].get_old_rank_expert_ids()
-                assert load_expert_ids != current_expert_ids, (
-                    "Expert ids after eplb update should be different from the initial loaded ones"
-                )
-
-
-@pytest.mark.parametrize(
-    "quant_algo",
-    [
-        None,
-        QuantAlgo.FP8,
-        QuantAlgo.NVFP4,
-    ],
-    ids=lambda val: f"quant_algo={val}",
-)
-@pytest.mark.parametrize(
-    "moe_backend",
-    [
-        "CUTLASS",
-        "TRTLLM",
-    ],
-    ids=lambda val: f"moe_backend={val}",
-)
-@pytest.mark.parametrize(
-    "dtype",
-    [
-        torch.float16,
-        torch.bfloat16,
-    ],
-    ids=lambda val: f"dtype={val}",
-)
-def test_moe(dtype, moe_backend, quant_algo):
-    # Enable configurable moe by default
-    if moe_backend == "TRTLLM":
-        if dtype == torch.float16 and quant_algo == QuantAlgo.NVFP4:
-            pytest.skip("TRTLLM NVFP4 MoE backend does not support float16 yet")
-    _skip_helper(quant_algo)
-
-    _test_moe_worker(moe_backend=moe_backend, dtype=dtype, quant_algo=quant_algo)
 
 
 def _test_moe_multi_gpu(
@@ -256,12 +588,43 @@ def _test_moe_multi_gpu(
     moe_backend,
     quant_algo,
     dtype,
-    ep_size,
     world_size,
+    parallel_mode="DEP",
     enable_eplb=False,
     layer_updates_per_iter=-1,
     num_slots=-1,
+    model_config: Optional[MoeModelConfig] = None,
+    seq_len: int = 4,
+    enable_autotune: bool = False,
+    routing_method_cls=RenormalizeMoeRoutingMethod,
+    dtype_routing_logits=None,
+    swiglu_alpha: float = 1,
+    swiglu_beta: float = 0,
+    swiglu_limit: float = float("inf"),
 ):
+    """
+    Test MoE module with multi-GPU support.
+
+    Args:
+        comm_method_type: Communication method type
+        moe_backend: Backend type string
+        quant_algo: Quantization algorithm
+        dtype: Activation data type
+        world_size: Total world size
+        parallel_mode: Parallelism strategy ("DEP", "TEP", "DTP", "TTP")
+        enable_eplb: Enable Expert Load Balancing
+        layer_updates_per_iter: EPLB layer updates per iteration
+        num_slots: EPLB number of slots
+        model_config: MoE model configuration
+        seq_len: Sequence length for test input
+        enable_autotune: Enable autotune and tactic capture/replay testing
+        routing_method_cls: Routing method class to use
+        dtype_routing_logits: Data type for routing logits (default: same as dtype)
+        swiglu_alpha: SwiGLU alpha parameter (default=1, non-gptoss)
+        swiglu_beta: SwiGLU beta parameter (default=0, non-gptoss)
+        swiglu_limit: SwiGLU limit parameter (default=inf, non-gptoss)
+    """
+
     def init_worker(custom_paths, comm_method_type):
         # Update the sys.path to align with main process for submodule import
         for custom_path in custom_paths:
@@ -271,6 +634,8 @@ def _test_moe_multi_gpu(
         # Set comm method
         os.environ["TRTLLM_FORCE_COMM_METHOD"] = comm_method_type
 
+    mapping = _create_mapping_for_parallel_mode(world_size, parallel_mode)
+
     with MPIPoolExecutor(
         initializer=init_worker, initargs=(sys.path, comm_method_type), max_workers=world_size
     ) as executor:
@@ -282,16 +647,18 @@ def _test_moe_multi_gpu(
                         moe_backend,
                         dtype,
                         quant_algo,
-                        Mapping(
-                            world_size=world_size,
-                            tp_size=world_size,
-                            moe_ep_size=ep_size,
-                            moe_tp_size=world_size // ep_size,
-                            enable_attention_dp=True,
-                        ),
+                        mapping,
                         enable_eplb,
                         layer_updates_per_iter,
                         num_slots,
+                        model_config,
+                        seq_len,
+                        enable_autotune,
+                        routing_method_cls,
+                        dtype_routing_logits,
+                        swiglu_alpha,
+                        swiglu_beta,
+                        swiglu_limit,
                     )
                 ]
                 * world_size
@@ -301,104 +668,639 @@ def _test_moe_multi_gpu(
             assert r is None
 
 
+# ============================================================================
+# Test Parameters Configuration
+# ============================================================================
+
+# Quantization algorithms to test
+QUANT_ALGOS = [
+    None,  # Unquantized
+    QuantAlgo.FP8,
+    QuantAlgo.NVFP4,
+    QuantAlgo.FP8_BLOCK_SCALES,
+    QuantAlgo.W4A8_NVFP4_FP8,
+    QuantAlgo.W4A16_MXFP4,
+    QuantAlgo.W4A8_MXFP4_MXFP8,
+    QuantAlgo.W8A16,
+    QuantAlgo.W4A8_AWQ,
+]
+
+# Backend types to test
+BACKEND_TYPES = [
+    MoeBackendType.CUTLASS,
+    MoeBackendType.TRTLLM,
+    MoeBackendType.CUTEDSL,
+    MoeBackendType.DEEPGEMM,
+]
+
+# Data types to test
+DTYPES = [
+    torch.float16,
+    torch.bfloat16,
+]
+
+# Model configurations for testing
+# (num_experts, top_k, hidden_size, intermediate_size)
+#
+# Default runs the full local config matrix (TRTLLM_TEST_MOE_CI=0).
+# Set TRTLLM_TEST_MOE_CI=1 in CI to run only the smaller subset for speed.
+CI_MOE_MODEL_CONFIGS = [
+    MoeModelConfig(60, 4, 2048, 1408),  # Qwen1.5-MoE-A2.7B
+    MoeModelConfig(32, 8, 7168, 2048),  # DeepSeek-V3 (reduced from 256 experts to accelerate test)
+    MoeModelConfig(128, 4, 2880, 2880),  # GPT-OSS-120B
+    MoeModelConfig(8, 1, 512, 512),  # boundary: top_k=1, single expert activated
+]
+
+LOCAL_MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS + [
+    MoeModelConfig(64, 6, 2048, 1408),  # DeepSeek-MoE-16B / DeepSeek-V2-Lite
+    MoeModelConfig(384, 8, 7168, 2048),  # Kimi-K2
+    # === Boundary Tests: num_experts / top_k ===
+    MoeModelConfig(4, 4, 512, 512),  # top_k=num_experts, all experts activated
+    MoeModelConfig(7, 2, 256, 512),  # prime num_experts
+    MoeModelConfig(13, 3, 256, 512),  # prime num_experts, odd top_k
+    # === Boundary Tests: small sizes ===
+    MoeModelConfig(4, 2, 64, 128),  # very small hidden_size
+    MoeModelConfig(4, 2, 128, 64),  # intermediate < hidden
+]
+
+MOE_MODEL_CONFIGS = (
+    CI_MOE_MODEL_CONFIGS
+    if os.environ.get("TRTLLM_TEST_MOE_CI", "0") == "1"
+    else LOCAL_MOE_MODEL_CONFIGS
+)
+
+# Sequence lengths to test
+SEQ_LENS = [1, 8]
+
+# Routing methods to test
+ROUTING_METHODS = [
+    RenormalizeMoeRoutingMethod,  # TopK -> Softmax (Mixtral, etc.)
+    DefaultMoeRoutingMethod,  # Softmax -> TopK
+    RenormalizeNaiveMoeRoutingMethod,  # Softmax -> TopK -> Renormalize (Qwen3)
+    Llama4RenormalizeMoeRoutingMethod,  # Top1 -> Sigmoid (Llama4)
+    DeepSeekV3MoeRoutingMethod,  # Sigmoid -> BiasAdd -> Group TopK (DeepSeek-V3)
+    MiniMaxM2MoeRoutingMethod,  # Sigmoid -> BiasAdd -> TopK -> Renormalize (MiniMax-M2)
+]
+
+
+MULTI_GPU_ROUTING_METHODS = [
+    RenormalizeMoeRoutingMethod,  # TopK -> Softmax (Mixtral, etc.)
+    DeepSeekV3MoeRoutingMethod,  # Sigmoid -> BiasAdd -> Group TopK (DeepSeek-V3)
+]
+
+
+# ============================================================================
+# Multi-GPU Test Configuration
+# ============================================================================
+# Parallel modes to test
+PARALLEL_MODES = [
+    "DEP",  # Attention DP, MoE EP
+    "TEP",  # Attention TP, MoE EP
+    "DTP",  # Attention DP, MoE TP
+    "TTP",  # Attention TP, MoE TP
+]
+
+# Communication methods to test
+COMM_METHODS = [
+    "NVLINK_ONE_SIDED",
+    "NVLINK_TWO_SIDED",
+    "DEEPEP",
+    "DEEPEPLOWLATENCY",
+]
+
+# SwiGLU parameters for swiglu_gptoss_style testing
+SWIGLU_ALPHAS = [1, 1.702]  # default, GPT-OSS (modeling_gpt_oss.py)
+SWIGLU_BETAS = [0, 1.0]  # default, GPT-OSS
+SWIGLU_LIMITS = [float("inf"), 7.0]  # default, GPT-OSS
+
+# Single-GPU: full product of all SwiGLU combos
+SWIGLU_COMBOS = list(product(SWIGLU_ALPHAS, SWIGLU_BETAS, SWIGLU_LIMITS))
+
+# Multi-GPU: only non-gptoss (default) and one gptoss combo
+MULTI_GPU_SWIGLU_COMBOS = [
+    (1, 0, float("inf")),  # non-gptoss (default SwiGLU)
+    (1.702, 1.0, 7.0),  # gptoss style (GPT-OSS real values)
+]
+
+
+def _get_comm_method_skip_reason(
+    comm_method: str,
+    model_config: "MoeModelConfig",
+) -> Optional[str]:
+    """
+    Check if a communication method is compatible with the given model config.
+
+    Returns a skip reason string if incompatible, None otherwise.
+    """
+    from tensorrt_llm._torch.modules.fused_moe.communication.deep_ep_low_latency import (
+        DeepEPLowLatency,
+    )
+
+    if comm_method == "DEEPEPLOWLATENCY":
+        if model_config.hidden_size not in DeepEPLowLatency.SUPPORTED_HIDDEN_SIZES:
+            return (
+                f"DeepEPLowLatency does not support hidden_size={model_config.hidden_size}, "
+                f"requires one of {sorted(DeepEPLowLatency.SUPPORTED_HIDDEN_SIZES)}"
+            )
+    return None
+
+
+def generate_multi_gpu_test_params(
+    parallel_modes,
+    comm_methods,
+    swiglu_combos,
+    model_configs,
+    seq_lens,
+    dtypes,
+    backend_types,
+    quant_algos,
+    routing_methods,
+) -> List:
+    """
+    Generate test parameter combinations for multi-GPU tests.
+
+    Args:
+        parallel_modes: List of parallel modes
+        comm_methods: List of communication methods
+        swiglu_combos: List of (swiglu_alpha, swiglu_beta, swiglu_limit) tuples
+        model_configs: List of MoeModelConfig
+        seq_lens: List of sequence lengths
+        dtypes: List of data types
+        backend_types: List of backend types
+        quant_algos: List of quantization algorithms
+        routing_methods: List of routing method classes
+
+    Returns:
+        List of pytest.param objects with appropriate skip marks
+    """
+    params: List = []
+    for parallel_mode, comm_method in product(parallel_modes, comm_methods):
+        for (
+            swiglu_alpha,
+            swiglu_beta,
+            swiglu_limit,
+            model_config,
+            seq_len,
+            dtype,
+            backend_type,
+            quant_algo,
+            routing_method_cls,
+            skip_reason,
+            base_test_id,
+        ) in iter_base_test_configs(
+            swiglu_combos,
+            model_configs,
+            seq_lens,
+            dtypes,
+            backend_types,
+            quant_algos,
+            routing_methods,
+        ):
+            # Check multi-GPU specific skip conditions
+            if not skip_reason:
+                skip_reason = _get_comm_method_skip_reason(comm_method, model_config)
+            if not skip_reason:
+                skip_reason = should_skip_trtllm(
+                    backend_type, quant_algo, model_config, comm_method=comm_method
+                )
+            if not skip_reason:
+                skip_reason = should_skip_cutedsl(
+                    backend_type, quant_algo, model_config, comm_method
+                )
+            if not skip_reason:
+                skip_reason = should_skip_deepgemm(
+                    backend_type, comm_method, quant_algo=quant_algo, model_config=model_config
+                )
+            if not skip_reason:
+                skip_reason = should_skip_multi_gpu(parallel_mode, model_config, world_size=4)
+
+            test_id = f"parallel={parallel_mode}-comm={comm_method}-{base_test_id}"
+            param_values = (
+                parallel_mode,
+                comm_method,
+                dtype,
+                backend_type.value,
+                quant_algo,
+                seq_len,
+                model_config,
+                routing_method_cls,
+                swiglu_alpha,
+                swiglu_beta,
+                swiglu_limit,
+            )
+            params.append(create_test_param(param_values, test_id, skip_reason))
+
+    return params
+
+
+def generate_base_test_params(
+    swiglu_combos, model_configs, seq_lens, dtypes, backend_types, quant_algos, routing_methods
+) -> List:
+    """
+    Generate test parameter combinations for base tests.
+
+    Args:
+        swiglu_combos: List of (swiglu_alpha, swiglu_beta, swiglu_limit) tuples
+        model_configs: List of MoeModelConfig
+        seq_lens: List of sequence lengths
+        dtypes: List of data types
+        backend_types: List of backend types
+        quant_algos: List of quantization algorithms
+        routing_methods: List of routing method classes
+
+    Returns:
+        List of pytest.param objects with appropriate skip marks
+    """
+    params: List = []
+    for (
+        swiglu_alpha,
+        swiglu_beta,
+        swiglu_limit,
+        model_config,
+        seq_len,
+        dtype,
+        backend_type,
+        quant_algo,
+        routing_method_cls,
+        skip_reason,
+        base_test_id,
+    ) in iter_base_test_configs(
+        swiglu_combos, model_configs, seq_lens, dtypes, backend_types, quant_algos, routing_methods
+    ):
+        param_values = (
+            dtype,
+            backend_type.value,
+            quant_algo,
+            seq_len,
+            model_config,
+            routing_method_cls,
+            swiglu_alpha,
+            swiglu_beta,
+            swiglu_limit,
+        )
+        params.append(create_test_param(param_values, base_test_id, skip_reason))
+
+    return params
+
+
+# ============================================================================
+# MoE Single GPU Tests
+# ============================================================================
+# Pre-generate test parameters at module load time
+BASE_TEST_PARAMS = generate_base_test_params(
+    swiglu_combos=SWIGLU_COMBOS,
+    model_configs=MOE_MODEL_CONFIGS,
+    seq_lens=SEQ_LENS,
+    dtypes=DTYPES,
+    backend_types=BACKEND_TYPES,
+    quant_algos=QUANT_ALGOS,
+    routing_methods=ROUTING_METHODS,
+)
+
+
+@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test")
+@pytest.mark.parametrize(
+    "dtype,moe_backend,quant_algo,seq_len,model_config,routing_method_cls,"
+    "swiglu_alpha,swiglu_beta,swiglu_limit",
+    BASE_TEST_PARAMS,
+)
+def test_ConfigurableMoE_single_gpu(
+    dtype: torch.dtype,
+    moe_backend: str,
+    quant_algo: Optional[QuantAlgo],
+    seq_len: int,
+    model_config: MoeModelConfig,
+    routing_method_cls,
+    swiglu_alpha: float,
+    swiglu_beta: float,
+    swiglu_limit: float,
+):
+    """
+    Single-GPU test for ConfigurableMoE module.
+
+    This test verifies:
+    1. MoE create_moe -> load_weights -> forward produces correct results
+    2. Various backend + quantization combinations work correctly
+    3. Autotune captures and replays all tactics properly
+    4. swiglu_gptoss_style (SwiGLU with custom parameters) works correctly
+    """
+    # DeepSeekV3 routing requires float32 routing_logits for TRTLLM backend
+    # See: cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp:70-72
+    dtype_routing_logits = None
+    if (
+        moe_backend == MoeBackendType.TRTLLM.value
+        and routing_method_cls == DeepSeekV3MoeRoutingMethod
+    ):
+        dtype_routing_logits = torch.float32
+
+    _test_moe_worker(
+        moe_backend=moe_backend,
+        dtype=dtype,
+        quant_algo=quant_algo,
+        model_config=model_config,
+        seq_len=seq_len,
+        enable_autotune=True,
+        routing_method_cls=routing_method_cls,
+        dtype_routing_logits=dtype_routing_logits,
+        swiglu_alpha=swiglu_alpha,
+        swiglu_beta=swiglu_beta,
+        swiglu_limit=swiglu_limit,
+    )
+
+
+# ============================================================================
+# MoE Multi-GPU Tests
+# ============================================================================
+# Pre-generate multi-GPU test parameters at module load time
+MULTI_GPU_TEST_PARAMS = generate_multi_gpu_test_params(
+    parallel_modes=PARALLEL_MODES,
+    comm_methods=COMM_METHODS,
+    swiglu_combos=MULTI_GPU_SWIGLU_COMBOS,
+    model_configs=MOE_MODEL_CONFIGS,
+    seq_lens=SEQ_LENS,
+    dtypes=DTYPES,
+    backend_types=BACKEND_TYPES,
+    quant_algos=QUANT_ALGOS,
+    routing_methods=MULTI_GPU_ROUTING_METHODS,
+)
+
+
+@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test")
 @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test")
 @pytest.mark.parametrize(
-    "quant_algo",
-    [
-        None,
-        QuantAlgo.NVFP4,
-    ],
-    ids=lambda val: f"quant_algo={val}",
+    "parallel_mode,comm_method_type,dtype,moe_backend,quant_algo,seq_len,model_config,"
+    "routing_method_cls,swiglu_alpha,swiglu_beta,swiglu_limit",
+    MULTI_GPU_TEST_PARAMS,
 )
-@pytest.mark.parametrize(
-    "moe_backend",
-    [
-        "CUTLASS",
-        "TRTLLM",
-    ],
-    ids=lambda val: f"moe_backend={val}",
-)
-@pytest.mark.parametrize(
-    "comm_method_type",
-    [
-        "NVLINK_ONE_SIDED",
-        "NVLINK_TWO_SIDED",
-    ],
-    ids=lambda val: f"comm_method_type={val}",
-)
-def test_moe_multi_gpu(comm_method_type, moe_backend, quant_algo):
-    _skip_helper(quant_algo)
+def test_ConfigurableMoE_multi_gpu(
+    parallel_mode,
+    comm_method_type,
+    dtype,
+    moe_backend,
+    quant_algo,
+    seq_len,
+    model_config,
+    routing_method_cls,
+    swiglu_alpha,
+    swiglu_beta,
+    swiglu_limit,
+):
+    # DeepSeekV3 routing requires float32 routing_logits for TRTLLM backend
+    # See: cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp:70-72
+    dtype_routing_logits = None
+    if (
+        moe_backend == MoeBackendType.TRTLLM.value
+        and routing_method_cls == DeepSeekV3MoeRoutingMethod
+    ):
+        dtype_routing_logits = torch.float32
 
-    dtype = torch.bfloat16
-    ep_size = 4
     world_size = 4
     _test_moe_multi_gpu(
         comm_method_type,
         moe_backend,
         quant_algo,
         dtype=dtype,
-        ep_size=ep_size,
         world_size=world_size,
+        parallel_mode=parallel_mode,
+        model_config=model_config,
+        seq_len=seq_len,
+        routing_method_cls=routing_method_cls,
+        dtype_routing_logits=dtype_routing_logits,
+        swiglu_alpha=swiglu_alpha,
+        swiglu_beta=swiglu_beta,
+        swiglu_limit=swiglu_limit,
     )
 
 
+# ============================================================================
+# MoE Multi-GPU EPLB Tests
+# ============================================================================
+# EPLB-specific configuration
+EPLB_PARALLEL_MODES = ["DEP"]  # EPLB only works with DEP mode (use_dp=True)
+EPLB_COMM_METHODS = [
+    "NVLINK_ONE_SIDED",
+    "NVLINK_TWO_SIDED",
+]  # Communication methods for EPLB
+EPLB_ROUTING_METHODS = [RenormalizeMoeRoutingMethod]  # Common routing methods
+EPLB_MODEL_CONFIGS = [MoeModelConfig(8, 2, 512, 512)]  # Model configs for EPLB
+EPLB_NUM_SLOTS_LIST = [16]  # Must be > num_experts (8) to be effective
+
+
+def _get_fused_moe_method_class(quant_algo, backend_type):
+    """
+    Get the FusedMoEMethod class based on quant_algo and backend_type.
+
+    This mirrors the logic in each backend's _get_quant_method() method.
+
+    Returns:
+        FusedMoEMethod class or None if not found
+    """
+    backend_str = backend_type.value if hasattr(backend_type, "value") else str(backend_type)
+
+    if quant_algo is None:
+        # Unquantized - only CUTLASS supports it
+        if backend_str == "CUTLASS":
+            return UnquantizedFusedMoEMethod
+        return None
+
+    # CUTLASS backend
+    # Mapping based on CutlassFusedMoE._get_quant_method() logic
+    if backend_str == "CUTLASS":
+        method_map = {
+            QuantAlgo.FP8: FP8QDQFusedMoEMethod,
+            QuantAlgo.FP8_BLOCK_SCALES: DeepSeekFP8BlockScalesFusedMoEMethod,
+            QuantAlgo.NVFP4: NVFP4CutlassFusedMoEMethod,
+            # W4A8_AWQ uses is_int4_weight_only_per_group() -> WInt4AFP8FusedMoEMethod
+            QuantAlgo.W4A8_AWQ: WInt4AFP8FusedMoEMethod,
+            QuantAlgo.W8A16: INT8WoqPerChannelFusedMoEMethod,
+            QuantAlgo.W4A16_MXFP4: WFP4A16FusedMoEMethod,
+            QuantAlgo.W4A8_MXFP4_FP8: W4A8MXFP4FP8CutlassFusedMoEMethod,
+            QuantAlgo.W4A8_MXFP4_MXFP8: W4A8MXFP4MXFP8CutlassFusedMoEMethod,
+            # Note: W4A8_NVFP4_FP8 is NOT supported by CUTLASS backend
+        }
+        return method_map.get(quant_algo)
+
+    # TRTLLM backend
+    if backend_str == "TRTLLM":
+        method_map = {
+            QuantAlgo.FP8_BLOCK_SCALES: DeepSeekFP8BlockScalesFusedMoEMethod,
+            QuantAlgo.NVFP4: NVFP4TRTLLMGenFusedMoEMethod,
+            QuantAlgo.W4A16_MXFP4: W4A16MXFP4TRTLLMGenFusedMoEMethod,
+            QuantAlgo.W4A8_NVFP4_FP8: W4A8NVFP4FP8TRTLLMGenFusedMoEMethod,
+            QuantAlgo.W4A8_MXFP4_FP8: W4A8MXFP4FP8TRTLLMGenFusedMoEMethod,
+            QuantAlgo.W4A8_MXFP4_MXFP8: W4A8MXFP4MXFP8TRTLLMGenFusedMoEMethod,
+        }
+        return method_map.get(quant_algo)
+
+    # CUTEDSL backend uses same methods as CUTLASS for quantization
+    if backend_str == "CUTEDSL":
+        method_map = {
+            QuantAlgo.NVFP4: NVFP4CutlassFusedMoEMethod,
+        }
+        return method_map.get(quant_algo)
+
+    # DEEPGEMM backend
+    if backend_str == "DEEPGEMM":
+        method_map = {
+            QuantAlgo.FP8_BLOCK_SCALES: DeepSeekFP8BlockScalesFusedMoEMethod,
+        }
+        return method_map.get(quant_algo)
+
+    return None
+
+
+def _should_skip_EPLB(quant_algo, backend_type, num_slots, num_experts):
+    """
+    Check if EPLB test should be skipped based on quant_algo, backend_type, and slot configuration.
+
+    Returns:
+        str or None: Skip reason if should skip, None otherwise
+    """
+    # Check num_slots > num_experts requirement
+    if num_slots <= num_experts:
+        return f"EPLB requires num_slots ({num_slots}) > num_experts ({num_experts})"
+
+    # Get the FusedMoEMethod class for this quant_algo + backend combination
+    method_class = _get_fused_moe_method_class(quant_algo, backend_type)
+
+    if method_class is None:
+        # Cannot determine the method class, skip the test
+        return (
+            f"Cannot determine FusedMoEMethod for quant_algo={quant_algo}, backend={backend_type}"
+        )
+
+    # Query the method class directly for EPLB support
+    if not method_class.supports_online_eplb():
+        return f"EPLB not supported for {method_class.__name__} (supports_online_eplb=False)"
+
+    return None
+
+
+def generate_eplb_test_params(
+    parallel_modes,
+    comm_methods,
+    model_configs,
+    num_slots_list,
+    dtypes,
+    backend_types,
+    quant_algos,
+    routing_methods,
+) -> List:
+    """
+    Generate test parameter combinations for EPLB tests.
+
+    EPLB requires num_slots > num_experts to be effective.
+
+    Args:
+        parallel_modes: List of parallel modes (only EP modes: DEP, TEP)
+        comm_methods: List of communication methods
+        model_configs: List of MoeModelConfig
+        num_slots_list: List of EPLB slots (must be > num_experts)
+        dtypes: List of data types
+        backend_types: List of backend types
+        quant_algos: List of quantization algorithms
+        routing_methods: List of routing method classes
+
+    Returns:
+        List of pytest.param objects with appropriate skip marks
+    """
+    params: List = []
+
+    for (
+        parallel_mode,
+        comm_method,
+        model_config,
+        num_slots,
+        dtype,
+        backend_type,
+        quant_algo,
+        routing_method_cls,
+    ) in product(
+        parallel_modes,
+        comm_methods,
+        model_configs,
+        num_slots_list,
+        dtypes,
+        backend_types,
+        quant_algos,
+        routing_methods,
+    ):
+        # Get skip reason using existing logic
+        skip_reason = get_quick_skip_reason(
+            backend_type, quant_algo, dtype, model_config, routing_method_cls
+        )
+
+        # Check EPLB-specific skip conditions
+        if not skip_reason:
+            skip_reason = _should_skip_EPLB(
+                quant_algo, backend_type, num_slots, model_config.num_experts
+            )
+
+        routing_name = routing_method_cls.__name__.replace("MoeRoutingMethod", "")
+        test_id = (
+            f"parallel={parallel_mode}-comm={comm_method}-{model_config}-slots={num_slots}-"
+            f"dtype={dtype}-backend={backend_type.value}-quant={quant_algo}-routing={routing_name}"
+        )
+
+        param_values = (
+            parallel_mode,
+            comm_method,
+            dtype,
+            backend_type.value,
+            quant_algo,
+            model_config,
+            num_slots,
+            routing_method_cls,
+        )
+        params.append(create_test_param(param_values, test_id, skip_reason))
+
+    return params
+
+
+# Pre-generate EPLB test parameters at module load time
+EPLB_TEST_PARAMS = generate_eplb_test_params(
+    parallel_modes=EPLB_PARALLEL_MODES,
+    comm_methods=EPLB_COMM_METHODS,
+    model_configs=EPLB_MODEL_CONFIGS,
+    num_slots_list=EPLB_NUM_SLOTS_LIST,
+    dtypes=DTYPES,
+    backend_types=BACKEND_TYPES,
+    quant_algos=QUANT_ALGOS,
+    routing_methods=EPLB_ROUTING_METHODS,
+)
+
+
+@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test")
 @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test")
 @pytest.mark.skipif(
     not _tbr.is_host_accessible_device_memory_supported(),
     reason="needs support of host accessible device memory",
 )
 @pytest.mark.parametrize(
-    "quant_algo",
-    [
-        None,
-        QuantAlgo.NVFP4,
-    ],
-    ids=lambda val: f"quant_algo={val}",
+    "parallel_mode,comm_method_type,dtype,moe_backend,quant_algo,model_config,num_slots,routing_method_cls",
+    EPLB_TEST_PARAMS,
 )
-@pytest.mark.parametrize(
-    "moe_backend",
-    [
-        "CUTLASS",
-    ],
-    ids=lambda val: f"moe_backend={val}",
-)
-@pytest.mark.parametrize(
-    "comm_method_type",
-    [
-        "NVLINK_ONE_SIDED",
-    ],
-    ids=lambda val: f"comm_method_type={val}",
-)
-@pytest.mark.parametrize(
-    "num_slots",
-    [
-        16,
-    ],
-    ids=lambda val: f"num_slots={val}",
-)
-@pytest.mark.parametrize(
-    "layer_updates_per_iter",
-    [
-        1,
-    ],
-    ids=lambda val: f"layer_updates_per_iter={val}",
-)
-def test_moe_multi_gpu_eplb(
-    layer_updates_per_iter, num_slots, comm_method_type, moe_backend, quant_algo
+def test_ConfigurableMoE_multi_gpu_eplb(
+    parallel_mode,
+    comm_method_type,
+    dtype,
+    moe_backend,
+    quant_algo,
+    model_config,
+    num_slots,
+    routing_method_cls,
 ):
-    _skip_helper(quant_algo)
-
-    dtype = torch.bfloat16
-    ep_size = 4
     world_size = 4
     _test_moe_multi_gpu(
         comm_method_type,
         moe_backend,
         quant_algo,
         dtype=dtype,
-        ep_size=ep_size,
         world_size=world_size,
+        parallel_mode=parallel_mode,
         enable_eplb=True,
-        layer_updates_per_iter=layer_updates_per_iter,
+        layer_updates_per_iter=1,
         num_slots=num_slots,
+        model_config=model_config,
+        routing_method_cls=routing_method_cls,
     )