From cbca6505ff26eb41c923f074fa2ff1dcda0fa31f Mon Sep 17 00:00:00 2001
From: Yukun He <23156053+hyukn@users.noreply.github.com>
Date: Tue, 13 May 2025 17:17:25 +0800
Subject: [PATCH] [nvbugs/5268808][fix] Fix the list-out-of-range access
 issue of AllReduce workspace on multi-node. (#4159)

This issue was found for tp=ep=8 on multi-node machines due to
inconsistent PP sizes.

* Rework the workspace allocation implementation to avoid the
  list-out-of-range access.
* Disable min_latency_mode in the multi-node scenario to avoid an
  illegal memory access.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
---
 tensorrt_llm/_torch/distributed/ops.py            | 9 +++++----
 tensorrt_llm/_torch/models/modeling_deepseekv3.py | 4 ++--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py
index 8b8b705f35..070fc76497 100644
--- a/tensorrt_llm/_torch/distributed/ops.py
+++ b/tensorrt_llm/_torch/distributed/ops.py
@@ -12,10 +12,11 @@ _thread_local = threading.local()
 
 
 def get_allreduce_workspace(mapping: Mapping) -> torch.LongTensor:
-    if not hasattr(_thread_local, 'allreduce_workspaces'):
-        _thread_local.allreduce_workspaces = [{}
-                                              for _ in range(mapping.pp_size)]
-    allreduce_workspaces = _thread_local.allreduce_workspaces[mapping.pp_rank]
+    if not hasattr(_thread_local, f'allreduce_workspaces_{mapping.pp_rank}'):
+        setattr(_thread_local, f'allreduce_workspaces_{mapping.pp_rank}', {})
+
+    allreduce_workspaces = getattr(_thread_local,
+                                   f'allreduce_workspaces_{mapping.pp_rank}')
     if mapping not in allreduce_workspaces:
         ipc_buffers, workspace = CustomAllReduceHelper.allocate_allreduce_fusion_workspace(
             mapping,
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 39699e0059..67324488d9 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -647,8 +647,8 @@ class DeepseekV3DecoderLayer(DecoderLayer):
 
     def _enable_min_latency_mode(self, num_tokens: int):
         return (num_tokens <= 128 and self.fusion_config.POST_MOE_FUSION
-                and self.is_nvfp4
-                and self.model_config.moe_backend == 'CUTLASS')
+                and self.is_nvfp4 and self.model_config.moe_backend == 'CUTLASS'
+                and not self.mapping.is_multi_node())
 
     def forward(
         self,
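
The minimal sketch below is not part of the patch; names such as FakeMapping and the
get_workspace_* helpers are illustrative stand-ins for tensorrt_llm's Mapping and
get_allreduce_workspace. Under that assumption, it shows why indexing a thread-local
list by pp_rank can go out of range once mappings with different pp_size values share
a thread, and the per-rank attribute pattern the fix switches to.

    # Sketch only: reproduces the list-out-of-range failure mode and the
    # per-rank fix pattern with stand-in names, not the TensorRT-LLM code.
    import threading
    from dataclasses import dataclass

    _thread_local = threading.local()


    @dataclass
    class FakeMapping:
        pp_size: int
        pp_rank: int


    def get_workspace_listbased(mapping: FakeMapping) -> dict:
        # Old pattern: the list is sized from the first mapping seen on this
        # thread; a later mapping with a larger pp_rank raises IndexError.
        if not hasattr(_thread_local, 'workspaces'):
            _thread_local.workspaces = [{} for _ in range(mapping.pp_size)]
        return _thread_local.workspaces[mapping.pp_rank]


    def get_workspace_per_rank(mapping: FakeMapping) -> dict:
        # Fixed pattern: one thread-local attribute per pp_rank, so no cached
        # list length has to agree across mappings with different pp_size.
        attr = f'workspaces_{mapping.pp_rank}'
        if not hasattr(_thread_local, attr):
            setattr(_thread_local, attr, {})
        return getattr(_thread_local, attr)


    if __name__ == '__main__':
        get_workspace_listbased(FakeMapping(pp_size=1, pp_rank=0))  # sizes list to 1
        try:
            get_workspace_listbased(FakeMapping(pp_size=4, pp_rank=3))
        except IndexError as exc:
            print('list-based lookup failed:', exc)
        print('per-rank lookup ok:', get_workspace_per_rank(FakeMapping(pp_size=4, pp_rank=3)))

Keying the cache per pp_rank removes the assumption that every mapping seen on a thread
agrees on pp_size, which is what breaks when the PP sizes are inconsistent across nodes.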