From ff277b591e56fcbeb80772de9b711d093580a9b7 Mon Sep 17 00:00:00 2001
From: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
Date: Thu, 15 Jan 2026 16:33:55 +0800
Subject: [PATCH] [https://nvbugs/5791830][fix] fix pp loop hang caused by
 isend-ing new requests (#10665)

Signed-off-by: Lizhi Zhou <1432185+reasonsolo@users.noreply.github.com>
---
 .../_torch/pyexecutor/executor_request_queue.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
index cb42186520..080a615e75 100644
--- a/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
+++ b/tensorrt_llm/_torch/pyexecutor/executor_request_queue.py
@@ -1,6 +1,7 @@
 import dataclasses
 import datetime
 import heapq
+import os
 import queue
 import threading
 import time
@@ -611,12 +612,20 @@ class ExecutorRequestQueue:
         with nvtx_range("recv_requests_from_prev_pp"):
             payloads = self.dist.recv_object(self.dist.prev_pp_rank, tag)

+        # isend-ing new requests may cause a deadlock. When CUDA_LAUNCH_BLOCKING=1
+        # or PP microbatches cannot overlap, the deadlock happens deterministically:
+        # 1. rank1 waits on nccl.send(rank2) without invoking mpi.wait(isend-handle).
+        # 2. rank2 waits on mpi.recv(rank1) but never receives the new requests.
+        # 3. rank1 hangs on nccl.send because rank2 never reaches nccl.recv(rank1).
+        pp_send_func = self.dist.isend_object if os.environ.get(
+            "TRTLLM_PP_REQ_SEND_ASYNC", "0") == "1" else self.dist.send_object
+
         if not self.dist.is_last_pp_rank:
             if self.send_requests_handler is not None:
                 with nvtx_range("wait_prev_send_requests_handler"):
                     self.send_requests_handler.wait()
             with nvtx_range("send_requests_to_next_pp"):
-                self.send_requests_handler = self.dist.isend_object(
+                self.send_requests_handler = pp_send_func(
                     payloads, self.dist.next_pp_rank, tag)

         return payloads
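
For readers outside the TensorRT-LLM codebase, the following is a minimal, standalone sketch of the pattern the patch introduces: an environment variable selecting between a blocking and a non-blocking object send between pipeline-parallel ranks. It assumes mpi4py point-to-point semantics; the function name forward_new_requests and the direct use of MPI.COMM_WORLD are illustrative stand-ins for ExecutorRequestQueue and self.dist.send_object / self.dist.isend_object, not the actual implementation.

    import os

    from mpi4py import MPI

    comm = MPI.COMM_WORLD


    def forward_new_requests(payloads, next_rank, tag, prev_handle=None):
        """Forward new requests to the next PP rank; blocking unless async send is enabled."""
        # Complete any previous in-flight isend before issuing a new one.
        if prev_handle is not None:
            prev_handle.wait()
        if os.environ.get("TRTLLM_PP_REQ_SEND_ASYNC", "0") == "1":
            # Non-blocking path: returns a request handle that must be waited on later.
            # If this rank then blocks inside a NCCL send before anything waits on the
            # handle, the next rank may never receive the payload -- the hang fixed here.
            return comm.isend(payloads, dest=next_rank, tag=tag)
        # Blocking path (default after the patch): the transfer is progressed in this
        # call, so the receiver can match it immediately and no handle is left dangling.
        comm.send(payloads, dest=next_rank, tag=tag)
        return None

With TRTLLM_PP_REQ_SEND_ASYNC left at its default of "0", the send blocks until the payload has been handed off, so no isend handle can be left pending while the rank enters a NCCL call; setting it to "1" restores the previous overlapped isend path.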