From d3f4fbb742d553967a82f877a1354a3f7723f52b Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 14 Jan 2026 09:39:36 +0800
Subject: [PATCH] [None][fix] Avoid write-write race for async pp send. (#10488)

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tensorrt_llm/_torch/distributed/communicator.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py
index 20401b5a24..d6e54a6411 100644
--- a/tensorrt_llm/_torch/distributed/communicator.py
+++ b/tensorrt_llm/_torch/distributed/communicator.py
@@ -839,9 +839,13 @@ class PPCommNCCL:
             self.nccl_comm.send(tensor, dest)
             return
 
-        self.tensor_ready_event.record()
+        # If the tensor is allocated from a non-default memory pool,
+        # like userbuffers, its underlying memory may be reused
+        # before the send operation completes.
+        # We clone the tensor to avoid write-write conflicts.
+        tensor = tensor.clone()
+        self.send_stream.wait_stream(torch.cuda.current_stream())
         with torch.cuda.stream(self.send_stream):
-            self.tensor_ready_event.wait()
             self.nccl_comm.send(tensor, dest)
 
     def recv(self, tensor: torch.Tensor, src: Optional[int] = None):
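
The pattern the patch introduces is: snapshot the payload with clone(), make the send stream wait on the producing stream, then enqueue the NCCL send on the send stream, so later writes to the original (possibly userbuffers-backed) buffer cannot race with the in-flight send. Below is a minimal, self-contained sketch of that stream-ordering pattern under stated assumptions: the function name async_send_sketch is made up for illustration, a device-to-device copy stands in for nccl_comm.send(tensor, dest) so the example runs on a single GPU, and a CUDA-capable PyTorch build is assumed.

# Minimal sketch of the clone + wait_stream pattern, assuming one CUDA GPU.
# A device-to-device copy stands in for the real nccl_comm.send issued on
# the side stream in the patched code.
import torch


def async_send_sketch(tensor: torch.Tensor, send_stream: torch.cuda.Stream,
                      out: torch.Tensor) -> None:
    # Snapshot the payload so the caller may reuse or overwrite `tensor`
    # (e.g. a userbuffers-backed allocation) without racing the send.
    payload = tensor.clone()
    # Order the side stream after everything queued so far on the current
    # stream, so the clone has finished before the "send" starts.
    send_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(send_stream):
        out.copy_(payload)  # stand-in for nccl_comm.send(payload, dest)
        # Tell the caching allocator that `payload` is consumed on
        # send_stream, so its memory is not recycled before the copy ends.
        payload.record_stream(send_stream)


if __name__ == "__main__":
    assert torch.cuda.is_available()
    side_stream = torch.cuda.Stream()
    x = torch.arange(8, device="cuda", dtype=torch.float32)
    dst = torch.empty_like(x)
    async_send_sketch(x, side_stream, dst)
    x.fill_(-1.0)  # safe: the "send" reads the cloned payload, not x
    torch.cuda.synchronize()
    print(dst)  # prints 0..7, not -1

In this reading, send_stream.wait_stream(...) plays the same ordering role as the removed tensor_ready_event record/wait pair; the essential fix is the clone(), which decouples the send from the original buffer whose pool may hand the memory out again before the send on the side stream has drained.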