Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
[nvbugs/5268808][fix] Fix the list-out-of-range access issue of AllReduce workspace on multi-node. (#4159)
This issue was found for tp=ep=8 on multi-node machines due to inconsistent PP sizes.

* Rework the workspace allocation implementation to avoid the list-out-of-range issue.
* Disable min_latency_mode in multi-node scenarios to avoid an illegal-memory-access issue.

Signed-off-by: Yukun He <23156053+hyukn@users.noreply.github.com>
This commit is contained in:
parent 8f68d56cc1
commit cbca6505ff
@@ -12,10 +12,11 @@ _thread_local = threading.local()
 
 
 def get_allreduce_workspace(mapping: Mapping) -> torch.LongTensor:
-    if not hasattr(_thread_local, 'allreduce_workspaces'):
-        _thread_local.allreduce_workspaces = [{}
-                                              for _ in range(mapping.pp_size)]
-    allreduce_workspaces = _thread_local.allreduce_workspaces[mapping.pp_rank]
+    if not hasattr(_thread_local, f'allreduce_workspaces_{mapping.pp_rank}'):
+        setattr(_thread_local, f'allreduce_workspaces_{mapping.pp_rank}', {})
+
+    allreduce_workspaces = getattr(_thread_local,
+                                   f'allreduce_workspaces_{mapping.pp_rank}')
     if mapping not in allreduce_workspaces:
         ipc_buffers, workspace = CustomAllReduceHelper.allocate_allreduce_fusion_workspace(
             mapping,
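For context, here is a minimal, self-contained sketch of the failure mode this hunk fixes. It is not the TensorRT-LLM source; FakeMapping, get_workspace_old, and get_workspace_new are invented names for illustration. The old list-based thread-local layout is sized by whichever Mapping arrives first, so a later Mapping with an inconsistent pp_size can index past the end of the list; keying the thread-local storage by pp_rank sidesteps that.

import threading
from dataclasses import dataclass

_thread_local = threading.local()


@dataclass(frozen=True)
class FakeMapping:
    # Stand-in for tensorrt_llm's Mapping, illustration only.
    pp_size: int
    pp_rank: int


def get_workspace_old(mapping):
    # Old layout: one list entry per pipeline rank, sized by whichever
    # Mapping reaches this code first in the thread.
    if not hasattr(_thread_local, 'allreduce_workspaces'):
        _thread_local.allreduce_workspaces = [{} for _ in range(mapping.pp_size)]
    # A later Mapping whose pp_rank >= that first pp_size indexes past the
    # end of the list: IndexError ("list index out of range").
    return _thread_local.allreduce_workspaces[mapping.pp_rank]


def get_workspace_new(mapping):
    # New layout: a lazily created dict per pp_rank, so no list length has
    # to agree with any other Mapping's pp_size.
    key = f'allreduce_workspaces_{mapping.pp_rank}'
    if not hasattr(_thread_local, key):
        setattr(_thread_local, key, {})
    return getattr(_thread_local, key)


get_workspace_old(FakeMapping(pp_size=1, pp_rank=0))    # creates a list of length 1
get_workspace_new(FakeMapping(pp_size=4, pp_rank=3))    # fine: per-rank dict
# get_workspace_old(FakeMapping(pp_size=4, pp_rank=3))  # would raise IndexError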
@@ -647,8 +647,8 @@ class DeepseekV3DecoderLayer(DecoderLayer):
 
     def _enable_min_latency_mode(self, num_tokens: int):
         return (num_tokens <= 128 and self.fusion_config.POST_MOE_FUSION
-                and self.is_nvfp4
-                and self.model_config.moe_backend == 'CUTLASS')
+                and self.is_nvfp4 and self.model_config.moe_backend == 'CUTLASS'
+                and not self.mapping.is_multi_node())
 
     def forward(
         self,
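The second hunk only extends the gating condition, but the new single-node requirement is easy to miss in the boolean chain. Below is a hedged sketch of that gate written as free functions; the names world_size, gpus_per_node, and is_multi_node are assumptions for illustration, not necessarily the Mapping API.

def is_multi_node(world_size: int, gpus_per_node: int) -> bool:
    # Assumption for illustration: more ranks than GPUs on one node means
    # the job spans multiple nodes.
    return world_size > gpus_per_node


def enable_min_latency_mode(num_tokens: int, post_moe_fusion: bool,
                            is_nvfp4: bool, moe_backend: str,
                            world_size: int, gpus_per_node: int) -> bool:
    # Mirrors the gating in the hunk above: the min-latency MoE path is only
    # taken for small batches, with post-MoE fusion, NVFP4 weights, the
    # CUTLASS backend, and a single-node job.
    return (num_tokens <= 128 and post_moe_fusion and is_nvfp4
            and moe_backend == 'CUTLASS'
            and not is_multi_node(world_size, gpus_per_node))


print(enable_min_latency_mode(64, True, True, 'CUTLASS', 8, 8))    # True: one 8-GPU node
print(enable_min_latency_mode(64, True, True, 'CUTLASS', 16, 8))   # False: spans two nodes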