[fix] WAR to fix the illegal memory access issue in moe gemm on SM120 (#5636)

Signed-off-by: peaceh <103117813+peaceh-nv@users.noreply.github.com>
This commit is contained in:
peaceh-nv 2025-07-10 09:20:30 +08:00 committed by GitHub
parent 3209b31665
commit 76c3a12bcb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 6 additions and 1 deletion

View File

@@ -341,13 +341,14 @@ using SafeBF16 = void;
using AtomThrShape = decltype(shape_div(ClusterShape{}, Shape<AtomClusterDiv, _1, _1>{})); \
using EpilogueTileShapeSm100 = decltype(shape_div(TileShape{}, AtomThrShape{})); \
using EpilogueTileShape = std::conditional_t<IsBlackwell, EpilogueTileShapeSm100, EpilogueTileShapeSm90>; \
using EpilogueElementC = std::conditional_t<IsSM120, ElementCSafe, ElementC>; \
/* Epilogue For Default Finalize */ \
using CollectiveEpilogueDefault = typename cutlass::epilogue::collective::CollectiveBuilder</**/ \
Arch, cutlass::arch::OpClassTensorOp, /**/ \
EpilogueTileShape, ClusterShape, /**/ \
cutlass::epilogue::collective::EpilogueTileAuto, /**/ \
ElementAccumulator, ElementAccumulator, /**/ \
ElementC, LayoutC*, AlignmentC, /**/ \
EpilogueElementC, LayoutC*, AlignmentC, /**/ \
ElementD, LayoutD*, AlignmentD, /**/ \
EpilogueSchedule>::CollectiveOp; \
\

View File

@@ -17,6 +17,8 @@ l0_gb202:
# ------------- PyTorch tests ---------------
- unittest/_torch/modeling -k "modeling_mllama"
- unittest/_torch/modeling -k "modeling_out_of_tree"
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0]
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
# - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
- unittest/_torch/test_attention_mla.py
- test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]

View File

@@ -18,6 +18,8 @@ l0_rtx_pro_6000:
- unittest/_torch/modeling -k "modeling_out_of_tree"
# - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573
- unittest/_torch/test_attention_mla.py
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype0]
- unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[dtype1]
- test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
- test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]