From 8d5a7ea5b3c8a326aff5f32b23a9d4d46f81a0f4 Mon Sep 17 00:00:00 2001
From: Jiagan Cheng
Date: Sun, 31 Aug 2025 18:28:09 -0700
Subject: [PATCH] [https://nvbugs/5443053][fix] Disable finalize fusion when Lora is used

Signed-off-by: Jiagan Cheng
---
 .../plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp    | 7 ++++---
 tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py | 3 ++-
 tests/unittest/trt/functional/test_moe.py                  | 2 --
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp
index 59d92e6429..53cef2b4b0 100644
--- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp
+++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp
@@ -334,8 +334,9 @@ void MixtureOfExpertsPlugin::init()
         static_cast<int>(mType), static_cast<int>(mWeightType), static_cast<int>(mOutputType));
     }
 
+    // Finalize fusion should be disabled if Lora is used.
     mMOERunner->use_fused_finalize_
-        = (mExpertsPerToken < 3 || !mUseDeterministicKernels) && !getEnvMOEDisableFinalizeFusion();
+        = (mExpertsPerToken < 3 || !mUseDeterministicKernels) && !getEnvMOEDisableFinalizeFusion() && !hasLora();
 
     mGemmId1 = GemmIDMoe{1, mNumExperts, mExpertsPerToken, mParallelismConfig, mExpertHiddenSize, mExpertInterSize,
         mGroupSize, mActivationType, mType, mWeightType, mQuantMode, !mMOERunner->use_fused_finalize_};
@@ -535,9 +536,9 @@ void MixtureOfExpertsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc c
     }
 
     mGemmId1 = GemmIDMoe{1, mNumExperts, mExpertsPerToken, mParallelismConfig, mExpertHiddenSize, mExpertInterSize,
-        mGroupSize, mActivationType, mType, mWeightType, mQuantMode};
+        mGroupSize, mActivationType, mType, mWeightType, mQuantMode, !mMOERunner->use_fused_finalize_};
     mGemmId2 = GemmIDMoe{2, mNumExperts, mExpertsPerToken, mParallelismConfig, mExpertHiddenSize, mExpertInterSize,
-        mGroupSize, mActivationType, mType, mWeightType, mQuantMode};
+        mGroupSize, mActivationType, mType, mWeightType, mQuantMode, !mMOERunner->use_fused_finalize_};
 
     if (hasLora())
     {
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
index da90df16bd..a30cef3fc1 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -150,7 +150,8 @@ class CutlassFusedMoE(MoE):
         # If True, the router weight will be multiplied on the input rather than at the end of FC2
         self.apply_router_weight_on_input = apply_router_weight_on_input
 
-        self.use_fused_finalize = not model_config.moe_disable_finalize_fusion
+        # Finalize fusion should be disabled if Lora is used.
+        self.use_fused_finalize = not model_config.moe_disable_finalize_fusion and model_config.lora_config is None
 
         self._weights_created = False
         if not model_config.skip_create_weights_in_init:
diff --git a/tests/unittest/trt/functional/test_moe.py b/tests/unittest/trt/functional/test_moe.py
index e5dcefcbaa..dedf21fda8 100644
--- a/tests/unittest/trt/functional/test_moe.py
+++ b/tests/unittest/trt/functional/test_moe.py
@@ -1020,8 +1020,6 @@ class TestMoE(unittest.TestCase):
                 product(["float16", "bfloat16", "int4", "int8"],
                         ["gelu", "geglu"], [True], [32, 64])),
         name_func=unittest_name_func)
-    @pytest.mark.skip(
-        "https://nvbugswb.nvidia.com/NVBugs5/redir.aspx?url=/5443053")
     def test_mlp_lora_comparison(self, dtype_str, actfn, use_plugin, lora_rank):
         """This test uses one expert and compares the result to a plain MLP"""
         torch.random.manual_seed(42)