diff --git a/tensorrt_llm/_torch/models/modeling_minimaxm2.py b/tensorrt_llm/_torch/models/modeling_minimaxm2.py
index b3eef52aae..d2a225e3f2 100644
--- a/tensorrt_llm/_torch/models/modeling_minimaxm2.py
+++ b/tensorrt_llm/_torch/models/modeling_minimaxm2.py
@@ -71,6 +71,7 @@ class MiniMaxM2MoE(nn.Module):
                 num_experts=self.num_experts,
                 callable_e_score_correction_bias=lambda: self.e_score_correction_bias,
             ),
+            num_experts=self.num_experts,
             aux_stream_dict={AuxStreamType.MoeChunkingOverlap: aux_stream},
             reduce_results=reduce_results,
             model_config=model_config,
@@ -115,24 +116,13 @@ class MiniMaxM2Attention(Attention):
         self,
         *,
         model_config: ModelConfig[PretrainedConfig],
-        skip_rope: bool = False,
         fuse_qk_norm_rope: bool = False,
         layer_idx: Optional[int] = None,
-        is_qk_norm: bool = True,
     ):
         config = model_config.pretrained_config
         self.pretrained_config = config
         self.fuse_qk_norm_rope = fuse_qk_norm_rope
-        self.skip_rope = skip_rope
-
-        # If fuse_qk_norm_rope is true, do not apply fused RoPE in attention OP, and self.rotary_emb
-        # will be skipped in the overridden apply_rope.
-        rope_fusion = not self.fuse_qk_norm_rope and not skip_rope
-        self.is_qk_norm = is_qk_norm
-        assert not (fuse_qk_norm_rope and skip_rope), (
-            "Fusing qk norm and skipping rope is not supported"
-        )
 
         super().__init__(
             hidden_size=config.hidden_size,
@@ -144,7 +134,7 @@ class MiniMaxM2Attention(Attention):
                 type=PositionEmbeddingType.rope_gpt_neox,
                 rope=RopeParams.from_config(config),
             ),
-            rope_fusion=rope_fusion,
+            rope_fusion=True,
             layer_idx=layer_idx,
             dtype=config.torch_dtype,
             config=model_config,
@@ -160,8 +150,6 @@ class MiniMaxM2Attention(Attention):
             eps=config.rms_norm_eps,
             dtype=config.torch_dtype,
         )
-        self.aux_stream = torch.cuda.Stream()
-        self.ln_events = [torch.cuda.Event(), torch.cuda.Event()]
 
     def apply_qk_norm(self, q, k):
         if self.qkv_proj.mapping.tp_size > 1:
@@ -201,10 +189,7 @@ class MiniMaxM2Attention(Attention):
         )
         q, k, v = self.split_qkv(q, k, v)
         q, k = self.apply_qk_norm(q, k)
-        if not self.skip_rope:
-            return super().apply_rope(q, k, v, position_ids)
-        else:
-            return q, k, v
+        return super().apply_rope(q, k, v, position_ids)
 
 
 class MiniMaxM2DecoderLayer(DecoderLayer):
diff --git a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml
index 6b8183f4b0..ffe7843cdf 100644
--- a/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml
+++ b/tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml
@@ -72,7 +72,7 @@ l0_gb200_multi_gpus:
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
   - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
-  - accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4]
+  - accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] TIMEOUT (90)
   - accuracy/test_llm_api_pytorch_multimodal.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm] TIMEOUT (90)
 - condition:
     ranges: