From 5553391c5e9ba4372eb1c2cc4fea317a27394d99 Mon Sep 17 00:00:00 2001 From: Tailing Yuan Date: Tue, 27 Jan 2026 13:18:34 +0800 Subject: [PATCH] [TRTLLM-10560][fix] Fix the time of pause() for overlap scheduler (#10943) Signed-off-by: Tailing Yuan --- tensorrt_llm/_torch/pyexecutor/py_executor.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 39f62285e1..59ce223d0d 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -244,7 +244,7 @@ class PyExecutor: max_num_sequences: int, drafter: Optional[Drafter] = None, disable_overlap_scheduler: bool = False, - max_input_len: int = 2048, + max_input_len: int = 0x7fffffff, max_batch_size: int = 8, max_beam_width: int = 1, max_draft_len: int = 0, @@ -1503,6 +1503,7 @@ class PyExecutor: if scheduled_batch is None: break + self._terminate_requests(scheduled_batch.paused_requests) self._pause_requests(scheduled_batch.paused_requests) finished_requests = [] @@ -1722,7 +1723,7 @@ class PyExecutor: else: can_forward = True - self._pause_requests(scheduled_batch.paused_requests) + self._terminate_requests(scheduled_batch.paused_requests) can_queue, can_queue_this_rank = self._can_queue( scheduled_batch) @@ -1819,6 +1820,8 @@ class PyExecutor: # Cleanup previous draft resources used in the draft model self.drafter.cleanup_previous_draft_resources() + self._pause_requests(scheduled_batch.paused_requests) + if can_queue: guided_decoder_failed_requests = None if self.guided_decoder is not None: @@ -2871,14 +2874,16 @@ class PyExecutor: self.responses.pop(id) return response - def _pause_requests(self, requests_to_pause): + def _terminate_requests(self, requests_to_pause): # todo: support work with self.inflight_req_ids. # Currently, self.inflight_req_ids is not. - max_input_len = self.max_input_len for req in requests_to_pause: - req.pause(max_input_len) self._terminate_request(req) + def _pause_requests(self, requests_to_pause): + for req in requests_to_pause: + req.pause(self.max_input_len) + def _add_inflight_ids(self, scheduled_requests): """Add request IDs of current requests to self.inflight_req_ids.