From d3f4fbb742d553967a82f877a1354a3f7723f52b Mon Sep 17 00:00:00 2001
From: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
Date: Wed, 14 Jan 2026 09:39:36 +0800
Subject: [PATCH] [None][fix] Avoid write-write race for async pp send. (#10488)

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
---
 tensorrt_llm/_torch/distributed/communicator.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/tensorrt_llm/_torch/distributed/communicator.py b/tensorrt_llm/_torch/distributed/communicator.py
index 20401b5a24..d6e54a6411 100644
--- a/tensorrt_llm/_torch/distributed/communicator.py
+++ b/tensorrt_llm/_torch/distributed/communicator.py
@@ -839,9 +839,13 @@ class PPCommNCCL:
             self.nccl_comm.send(tensor, dest)
             return
 
-        self.tensor_ready_event.record()
+        # If the tensor is allocated from a non-default memory pool,
+        # like userbuffers, its underlying memory may be reused
+        # before the send operation completes.
+        # We clone the tensor to avoid write-write conflicts.
+        tensor = tensor.clone()
+        self.send_stream.wait_stream(torch.cuda.current_stream())
         with torch.cuda.stream(self.send_stream):
-            self.tensor_ready_event.wait()
             self.nccl_comm.send(tensor, dest)
 
     def recv(self, tensor: torch.Tensor, src: Optional[int] = None):
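
The pattern the patch introduces is: snapshot the payload with clone(), make the send stream wait on the producing stream, then enqueue the NCCL send on the send stream, so later writes to the original (possibly userbuffers-backed) buffer cannot race with the in-flight send. Below is a minimal, self-contained sketch of that stream-ordering pattern under stated assumptions: the function name async_send_sketch is made up for illustration, a device-to-device copy stands in for nccl_comm.send(tensor, dest) so the example runs on a single GPU, and a CUDA-capable PyTorch build is assumed.

# Minimal sketch of the clone + wait_stream pattern, assuming one CUDA GPU.
# A device-to-device copy stands in for the real nccl_comm.send issued on
# the side stream in the patched code.
import torch


def async_send_sketch(tensor: torch.Tensor, send_stream: torch.cuda.Stream,
                      out: torch.Tensor) -> None:
    # Snapshot the payload so the caller may reuse or overwrite `tensor`
    # (e.g. a userbuffers-backed allocation) without racing the send.
    payload = tensor.clone()
    # Order the side stream after everything queued so far on the current
    # stream, so the clone has finished before the "send" starts.
    send_stream.wait_stream(torch.cuda.current_stream())
    with torch.cuda.stream(send_stream):
        out.copy_(payload)  # stand-in for nccl_comm.send(payload, dest)
        # Tell the caching allocator that `payload` is consumed on
        # send_stream, so its memory is not recycled before the copy ends.
        payload.record_stream(send_stream)


if __name__ == "__main__":
    assert torch.cuda.is_available()
    side_stream = torch.cuda.Stream()
    x = torch.arange(8, device="cuda", dtype=torch.float32)
    dst = torch.empty_like(x)
    async_send_sketch(x, side_stream, dst)
    x.fill_(-1.0)  # safe: the "send" reads the cloned payload, not x
    torch.cuda.synchronize()
    print(dst)  # prints 0..7, not -1

In this reading, send_stream.wait_stream(...) plays the same ordering role as the removed tensor_ready_event record/wait pair; the essential fix is the clone(), which decouples the send from the original buffer whose pool may hand the memory out again before the send on the side stream has drained.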