[None][fix] Avoid write-write race for async pp send. (#10488)

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
2026-02-04 10:11:47 +08:00 · 2026-01-14 09:39:36 +08:00 · 2026-01-14 09:39:36 +08:00 · d3f4fbb742
commit d3f4fbb742
parent 2acd03030a
1 changed files with 6 additions and 2 deletions
--- a/tensorrt_llm/_torch/distributed/communicator.py
+++ b/tensorrt_llm/_torch/distributed/communicator.py
@@ -839,9 +839,13 @@ class PPCommNCCL:
            self.nccl_comm.send(tensor, dest)
            return

-        self.tensor_ready_event.record()
+        # If the tensor is allocated from a non-default memory pool
+        # such as userbuffers, its underlying memory may be reused
+        # before the send operation has completed.
+        # We clone the tensor to avoid write-write conflicts.
+        tensor = tensor.clone()
+        self.send_stream.wait_stream(torch.cuda.current_stream())
        with torch.cuda.stream(self.send_stream):
-            self.tensor_ready_event.wait()
            self.nccl_comm.send(tensor, dest)

    def recv(self, tensor: torch.Tensor, src: Optional[int] = None):