[nvbugs/5368410][fix] Disable moe allreduce for multi node (#5918)

Signed-off-by: Yi Zhang <187001205+yizhang-nv@users.noreply.github.com>
Yi Zhang 2025-07-14 10:06:29 +08:00 committed by Zhenhuan Chen
parent c66941036f
commit eb7d0f84b5
2 changed files with 4 additions and 1 deletion

@@ -38,6 +38,7 @@ from torch import nn
 from tqdm import tqdm
 from transformers import PretrainedConfig
 
+from tensorrt_llm._ipc_utils import can_access_peer
 from tensorrt_llm._utils import get_sm_version
 from tensorrt_llm.functional import PositionEmbeddingType
 from tensorrt_llm.llmapi.utils import enable_llm_debug
@@ -602,6 +603,7 @@ class DeepseekV3DecoderLayer(DecoderLayer):
         self.enable_attention_dp = mapping.enable_attention_dp
         self.mlp_tp_size = mapping.tp_size
+        self.is_p2p_supported = can_access_peer(mapping)
 
         self.fusion_config = EagerFusionConfig()
         self.enable_fusion = os.environ.get(
@@ -796,7 +798,7 @@ class DeepseekV3DecoderLayer(DecoderLayer):
             not (hidden_states.shape[0] <= self.moe_allreduce.max_token
                  and self.fusion_config.POST_MOE_FUSION
                  and self.model_config.moe_backend == "TRTLLM"
-                 and self.mlp.experts.has_nvfp4))
+                 and self.mlp.experts.has_nvfp4 and self.is_p2p_supported))
         hidden_states = _run_MoE(hidden_states,
                                  hidden_states_fp4=None,
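
For context, the fused MoE allreduce relies on direct peer-to-peer (P2P) access between GPUs, which ranks on different nodes cannot have, so the new `is_p2p_supported` flag forces multi-node runs onto the regular allreduce path. Below is a minimal sketch of this gating, not the repo's actual implementation: the real check lives in tensorrt_llm._ipc_utils.can_access_peer, and the `world_size`/`gpus_per_node` mapping fields used here are assumptions for illustration.

# Minimal sketch of the P2P gate added by this commit (assumed field
# names; torch.cuda.can_device_access_peer is the real PyTorch API).
import torch

def can_access_peer_sketch(mapping) -> bool:
    # Ranks on different nodes can never share device memory over P2P,
    # so any mapping that spans nodes fails immediately.
    if mapping.world_size > mapping.gpus_per_node:
        return False
    devices = range(min(mapping.world_size, mapping.gpus_per_node))
    # Within one node, require P2P access between every pair of GPUs.
    return all(
        torch.cuda.can_device_access_peer(i, j)
        for i in devices for j in devices if i != j)

# The decoder layer then folds this flag into the existing fusion
# condition, so multi-node runs skip the fused MoE allreduce:
#   use_fused_moe_allreduce = (
#       num_tokens <= moe_allreduce.max_token
#       and fusion_config.POST_MOE_FUSION
#       and moe_backend == "TRTLLM"
#       and experts.has_nvfp4
#       and is_p2p_supported)   # new condition from this commit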

@@ -15,5 +15,6 @@ l0_gb200_multi_nodes:
 tests:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] TIMEOUT (180)
+- accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass] TIMEOUT (180)
 - accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm] TIMEOUT (180)