diff --git a/tensorrt_llm/_torch/distributed/ops.py b/tensorrt_llm/_torch/distributed/ops.py
index 8b8b705f35..070fc76497 100644
--- a/tensorrt_llm/_torch/distributed/ops.py
+++ b/tensorrt_llm/_torch/distributed/ops.py
@@ -12,10 +12,11 @@ _thread_local = threading.local()
 
 
 def get_allreduce_workspace(mapping: Mapping) -> torch.LongTensor:
-    if not hasattr(_thread_local, 'allreduce_workspaces'):
-        _thread_local.allreduce_workspaces = [{}
-                                              for _ in range(mapping.pp_size)]
-    allreduce_workspaces = _thread_local.allreduce_workspaces[mapping.pp_rank]
+    if not hasattr(_thread_local, f'allreduce_workspaces_{mapping.pp_rank}'):
+        setattr(_thread_local, f'allreduce_workspaces_{mapping.pp_rank}', {})
+
+    allreduce_workspaces = getattr(_thread_local,
+                                   f'allreduce_workspaces_{mapping.pp_rank}')
     if mapping not in allreduce_workspaces:
         ipc_buffers, workspace = CustomAllReduceHelper.allocate_allreduce_fusion_workspace(
             mapping,
diff --git a/tensorrt_llm/_torch/models/modeling_deepseekv3.py b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
index 39699e0059..67324488d9 100644
--- a/tensorrt_llm/_torch/models/modeling_deepseekv3.py
+++ b/tensorrt_llm/_torch/models/modeling_deepseekv3.py
@@ -647,8 +647,8 @@ class DeepseekV3DecoderLayer(DecoderLayer):
 
     def _enable_min_latency_mode(self, num_tokens: int):
         return (num_tokens <= 128 and self.fusion_config.POST_MOE_FUSION
-                and self.is_nvfp4
-                and self.model_config.moe_backend == 'CUTLASS')
+                and self.is_nvfp4 and self.model_config.moe_backend == 'CUTLASS'
+                and not self.mapping.is_multi_node())
 
     def forward(
         self,
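Note on the first hunk: the old code sized a single thread-local list by the `pp_size` of whichever `Mapping` reached `get_allreduce_workspace` first, then indexed it by `mapping.pp_rank`; a later call whose rank fell outside that original size would presumably index past the end of the list. Keying one thread-local attribute per `pp_rank` avoids any dependence on a previously seen `pp_size`. A minimal, self-contained sketch of the pattern (the bare `get_workspace`/`pp_rank` names are illustrative stand-ins, not the TensorRT-LLM API):

```python
import threading

_thread_local = threading.local()


def get_workspace(pp_rank: int) -> dict:
    # One thread-local attribute per pipeline-parallel rank, created
    # lazily on first use; nothing is pre-sized to a particular pp_size.
    attr = f'allreduce_workspaces_{pp_rank}'
    if not hasattr(_thread_local, attr):
        setattr(_thread_local, attr, {})
    return getattr(_thread_local, attr)


if __name__ == '__main__':
    ws = get_workspace(0)
    ws['mapping'] = 'cached workspace'
    assert get_workspace(0) is ws  # same rank, same thread: cache hit
    assert get_workspace(3) == {}  # any rank gets a fresh cache; a list
                                   # first sized for a smaller pp_size
                                   # would raise IndexError here instead
```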
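Note on the second hunk: min-latency mode is now additionally gated on the mapping being single-node. Restated as a hypothetical free function (the parameter names are stand-ins for the attributes the real method reads) so the full condition is easy to scan:

```python
def min_latency_mode_enabled(num_tokens: int, post_moe_fusion: bool,
                             is_nvfp4: bool, moe_backend: str,
                             is_multi_node: bool) -> bool:
    # Mirrors _enable_min_latency_mode after this change: small batches,
    # post-MoE fusion, NVFP4 weights, the CUTLASS MoE backend, and
    # (newly) a single-node mapping are all required.
    return (num_tokens <= 128 and post_moe_fusion and is_nvfp4
            and moe_backend == 'CUTLASS' and not is_multi_node)


# The new guard short-circuits multi-node runs even when every other
# condition holds:
assert not min_latency_mode_enabled(64, True, True, 'CUTLASS',
                                    is_multi_node=True)
```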