[https://nvbugs/5791830][fix] fix pp loop hang caused by i-sending new requests (#10665)

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
This commit is contained in:
Lizhi Zhou 2026-01-15 16:33:55 +08:00 committed by GitHub
parent cd55fb4551
commit ff277b591e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -1,6 +1,7 @@
import dataclasses
import datetime
import heapq
import os
import queue
import threading
import time
@ -611,12 +612,20 @@ class ExecutorRequestQueue:
with nvtx_range("recv_requests_from_prev_pp"):
payloads = self.dist.recv_object(self.dist.prev_pp_rank, tag)
# Sending new requests with isend may cause a deadlock. When CUDA_LAUNCH_BLOCKING=1 or PP microbatches can't overlap,
# the deadlock happens deterministically:
# 1. rank1 waits on nccl.send(rank2) without invoking mpi.wait(isend-handle).
# 2. rank2 waits on mpi.recv(rank1) but never receives the new requests.
# 3. rank1 hangs on nccl.send because rank2 never reaches nccl.recv(rank1).
pp_send_func = self.dist.isend_object if os.environ.get(
"TRTLLM_PP_REQ_SEND_ASYNC", "0") == "1" else self.dist.send_object
if not self.dist.is_last_pp_rank:
if self.send_requests_handler is not None:
with nvtx_range("wait_prev_send_requests_handler"):
self.send_requests_handler.wait()
with nvtx_range("send_requests_to_next_pp"):
self.send_requests_handler = self.dist.isend_object(
self.send_requests_handler = pp_send_func(
payloads, self.dist.next_pp_rank, tag)
return payloads