[None][fix] Avoid write-write race for async pp send. (#10488)

Signed-off-by: Yuxian Qiu <142763828+yuxianq@users.noreply.github.com>
This commit is contained in:
Yuxian Qiu 2026-01-14 09:39:36 +08:00 committed by GitHub
parent 2acd03030a
commit d3f4fbb742
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -839,9 +839,13 @@ class PPCommNCCL:
self.nccl_comm.send(tensor, dest)
return
self.tensor_ready_event.record()
# If the tensor is allocated from a non-default memory pool
# such as userbuffers, its underlying memory may be reused
# before the send operation has completed.
# Clone the tensor so the send uses its own copy, avoiding write-write conflicts.
tensor = tensor.clone()
self.send_stream.wait_stream(torch.cuda.current_stream())
with torch.cuda.stream(self.send_stream):
self.tensor_ready_event.wait()
self.nccl_comm.send(tensor, dest)
def recv(self, tensor: torch.Tensor, src: Optional[int] = None):