From 8d5a7ea5b3c8a326aff5f32b23a9d4d46f81a0f4 Mon Sep 17 00:00:00 2001
From: Jiagan Cheng
Date: Sun, 31 Aug 2025 18:28:09 -0700
Subject: [PATCH] [https://nvbugs/5443053][fix] Disable finalize fusion when Lora is used

Signed-off-by: Jiagan Cheng
---
 .../plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp    | 7 ++++---
 tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py | 3 ++-
 tests/unittest/trt/functional/test_moe.py                  | 2 --
 3 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp
index 59d92e6429..53cef2b4b0 100644
--- a/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp
+++ b/cpp/tensorrt_llm/plugins/mixtureOfExperts/mixtureOfExpertsPlugin.cpp
@@ -334,8 +334,9 @@ void MixtureOfExpertsPlugin::init()
         static_cast<int>(mType), static_cast<int>(mWeightType), static_cast<int>(mOutputType));
     }
 
+    // Finalize fusion should be disabled if Lora is used.
     mMOERunner->use_fused_finalize_
-        = (mExpertsPerToken < 3 || !mUseDeterministicKernels) && !getEnvMOEDisableFinalizeFusion();
+        = (mExpertsPerToken < 3 || !mUseDeterministicKernels) && !getEnvMOEDisableFinalizeFusion() && !hasLora();
 
     mGemmId1 = GemmIDMoe{1, mNumExperts, mExpertsPerToken, mParallelismConfig, mExpertHiddenSize, mExpertInterSize,
         mGroupSize, mActivationType, mType, mWeightType, mQuantMode, !mMOERunner->use_fused_finalize_};
@@ -535,9 +536,9 @@ void MixtureOfExpertsPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc c
     }
 
     mGemmId1 = GemmIDMoe{1, mNumExperts, mExpertsPerToken, mParallelismConfig, mExpertHiddenSize, mExpertInterSize,
-        mGroupSize, mActivationType, mType, mWeightType, mQuantMode};
+        mGroupSize, mActivationType, mType, mWeightType, mQuantMode, !mMOERunner->use_fused_finalize_};
     mGemmId2 = GemmIDMoe{2, mNumExperts, mExpertsPerToken, mParallelismConfig, mExpertHiddenSize, mExpertInterSize,
-        mGroupSize, mActivationType, mType, mWeightType, mQuantMode};
+        mGroupSize, mActivationType, mType, mWeightType, mQuantMode, !mMOERunner->use_fused_finalize_};
 
     if (hasLora())
     {
diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
index da90df16bd..a30cef3fc1 100755
--- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
+++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
@@ -150,7 +150,8 @@ class CutlassFusedMoE(MoE):
         # If True, the router weight will be multiplied on the input rather than at the end of FC2
         self.apply_router_weight_on_input = apply_router_weight_on_input
 
-        self.use_fused_finalize = not model_config.moe_disable_finalize_fusion
+        # Finalize fusion should be disabled if Lora is used.
+        self.use_fused_finalize = not model_config.moe_disable_finalize_fusion and model_config.lora_config is None
 
         self._weights_created = False
         if not model_config.skip_create_weights_in_init:
diff --git a/tests/unittest/trt/functional/test_moe.py b/tests/unittest/trt/functional/test_moe.py
index e5dcefcbaa..dedf21fda8 100644
--- a/tests/unittest/trt/functional/test_moe.py
+++ b/tests/unittest/trt/functional/test_moe.py
@@ -1020,8 +1020,6 @@ class TestMoE(unittest.TestCase):
                 product(["float16", "bfloat16", "int4", "int8"],
                         ["gelu", "geglu"], [True], [32, 64])),
         name_func=unittest_name_func)
-    @pytest.mark.skip(
-        "https://nvbugswb.nvidia.com/NVBugs5/redir.aspx?url=/5443053")
     def test_mlp_lora_comparison(self, dtype_str, actfn, use_plugin, lora_rank):
         """This test uses one expert and compares the result to a plain MLP"""
         torch.random.manual_seed(42)