[https://nvbugs/5848377][fix] fix deepeplowlatency with trtllm moe backend running fp8 DS_R1 (#11266)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
Signed-off-by: Leslie Fang <leslief@nvidia.com>
Co-authored-by: Tailing Yuan <yuantailing@gmail.com>
Leslie Fang 2026-02-10 20:09:00 +08:00 committed by GitHub
parent cf02456613
commit d6e49542bd
3 changed files with 12 additions and 1 deletion


@@ -283,6 +283,10 @@ class DeepEPLowLatency(Communication):
self.expert_size_per_partition, num_tokens_per_expert, self.hidden_size
)
if deep_ep_topk_weights.dtype != torch.float32:
# DeepEP low latency combine requires fp32 weights
deep_ep_topk_weights = deep_ep_topk_weights.to(torch.float32)
if self.use_low_precision_combine:
if self._has_nvfp4():
precision = "nvfp4"
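
A minimal standalone sketch of the dtype guard added in this hunk, assuming the routing weights can arrive as bfloat16 when the TRTLLM MoE backend runs an fp8 model; ensure_fp32_topk_weights is a hypothetical helper name, not part of the patched DeepEPLowLatency class:

    import torch

    def ensure_fp32_topk_weights(topk_weights: torch.Tensor) -> torch.Tensor:
        # The DeepEP low-latency combine expects float32 top-k weights,
        # so coerce anything else (e.g. bfloat16) up front.
        if topk_weights.dtype != torch.float32:
            topk_weights = topk_weights.to(torch.float32)
        return topk_weights

    # Example: bfloat16 weights become float32 before the combine call.
    w = torch.rand(4, 8, dtype=torch.bfloat16)
    assert ensure_fp32_topk_weights(w).dtype == torch.float32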


@@ -756,7 +756,10 @@ class ConfigurableMoE(MoE):
if self.enable_dummy_allreduce:
self.dummy_allreduce()
# Use unified combine interface (reads dispatch state from strategy)
final_hidden_states = self.comm.combine(final_hidden_states)
all_rank_max_num_tokens = max(all_rank_num_tokens)
final_hidden_states = self.comm.combine(
final_hidden_states, all_rank_max_num_tokens=all_rank_max_num_tokens
)
else:
# For the non-comm case, it should be attention TP or a single rank.
# only check if allreduce is needed
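
A sketch of the call pattern this hunk introduces: compute the maximum per-rank token count and pass it to the communication backend's combine(). Here comm stands in for the DeepEP low-latency communication object; this is an illustrative wrapper, not the ConfigurableMoE code itself:

    from typing import List
    import torch

    def combine_with_max_tokens(comm, final_hidden_states: torch.Tensor,
                                all_rank_num_tokens: List[int]) -> torch.Tensor:
        # Derive the worst-case token count across ranks and forward it to combine().
        all_rank_max_num_tokens = max(all_rank_num_tokens)
        return comm.combine(final_hidden_states,
                            all_rank_max_num_tokens=all_rank_max_num_tokens)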


@@ -531,6 +531,10 @@ class TRTLLMGenFusedMoE(MoE):
routing_bias = routing_bias if router_logits is not None else None
if token_selected_experts is not None:
# For cases like DeepEP low latency where a fake top_k=1 might be used
top_k = token_selected_experts.shape[-1]
# Ensure x_sf is 2D before flattening
if x_sf is not None:
assert len(
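
A small sketch of the top_k override shown in this hunk: when the dispatcher hands back token_selected_experts (e.g. DeepEP low latency using a fake top_k=1), the effective top_k is taken from that tensor's last dimension instead of the configured router value. effective_top_k is a hypothetical helper for illustration, not the TRTLLMGenFusedMoE code:

    import torch

    def effective_top_k(token_selected_experts, default_top_k: int) -> int:
        # Trust the dispatcher-provided expert indices when present; their last
        # dimension is the number of experts actually selected per token.
        if token_selected_experts is not None:
            return token_selected_experts.shape[-1]
        return default_top_k

    # Example: a [num_tokens, 1] expert-index tensor implies an effective top_k of 1.
    ids = torch.zeros(16, 1, dtype=torch.int32)
    assert effective_top_k(ids, default_top_k=8) == 1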