[nvbugs/5368410][fix] Disable moe allreduce for multi node (#5918)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Yi Zhang 2025-07-14 10:06:29 +08:00 committed by Zhenhuan Chen
parent c66941036f
commit eb7d0f84b5
2 changed files with 4 additions and 1 deletion

@@ -38,6 +38,7 @@ from torch import nn
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
+from tensorrt_llm._ipc_utils import can_access_peer
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
@@ -602,6 +603,7 @@ class DeepseekV3DecoderLayer(DecoderLayer):
         self.enable_attention_dp = mapping.enable_attention_dp
         self.mlp_tp_size = mapping.tp_size
+        self.is_p2p_supported = can_access_peer(mapping)
 
         self.fusion_config = EagerFusionConfig()
         self.enable_fusion = os.environ.get(
@@ -796,7 +798,7 @@ class DeepseekV3DecoderLayer(DecoderLayer):
             not (hidden_states.shape[0] <= self.moe_allreduce.max_token
                  and self.fusion_config.POST_MOE_FUSION
                  and self.model_config.moe_backend == "TRTLLM"
-                 and self.mlp.experts.has_nvfp4))
+                 and self.mlp.experts.has_nvfp4 and self.is_p2p_supported))
         hidden_states = _run_MoE(hidden_states,
                                  hidden_states_fp4=None,
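
For context, the fused MoE allreduce relies on direct peer-to-peer (P2P) access between GPUs, which ranks on different nodes cannot have, so the new `is_p2p_supported` flag forces multi-node runs onto the regular allreduce path. Below is a minimal sketch of this gating, not the repo's actual implementation: the real check lives in tensorrt_llm._ipc_utils.can_access_peer, and the `world_size`/`gpus_per_node` mapping fields used here are assumptions for illustration.

# Minimal sketch of the P2P gate added by this commit (assumed field
# names; torch.cuda.can_device_access_peer is the real PyTorch API).
import torch

def can_access_peer_sketch(mapping) -> bool:
    # Ranks on different nodes can never share device memory over P2P,
    # so any mapping that spans nodes fails immediately.
    if mapping.world_size > mapping.gpus_per_node:
        return False
    devices = range(min(mapping.world_size, mapping.gpus_per_node))
    # Within one node, require P2P access between every pair of GPUs.
    return all(
        torch.cuda.can_device_access_peer(i, j)
        for i in devices for j in devices if i != j)

# The decoder layer then folds this flag into the existing fusion
# condition, so multi-node runs skip the fused MoE allreduce:
#   use_fused_moe_allreduce = (
#       num_tokens <= moe_allreduce.max_token
#       and fusion_config.POST_MOE_FUSION
#       and moe_backend == "TRTLLM"
#       and experts.has_nvfp4
#       and is_p2p_supported)   # new condition from this commit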

@@ -15,5 +15,6 @@ l0_gb200_multi_nodes:
 tests:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)