[TRTLLM-10560][fix] Fix the time of pause() for overlap scheduler (#10943)

Signed-off-by: Tailing Yuan <yuantailing@gmail.com>
This commit is contained in:
Tailing Yuan 2026-01-27 13:18:34 +08:00 committed by GitHub
parent 4a206351bb
commit 5553391c5e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -244,7 +244,7 @@ class PyExecutor:
max_num_sequences: int,
drafter: Optional[Drafter] = None,
disable_overlap_scheduler: bool = False,
max_input_len: int = 2048,
max_input_len: int = 0x7fffffff,
max_batch_size: int = 8,
max_beam_width: int = 1,
max_draft_len: int = 0,
@ -1503,6 +1503,7 @@ class PyExecutor:
if scheduled_batch is None:
break
self._terminate_requests(scheduled_batch.paused_requests)
self._pause_requests(scheduled_batch.paused_requests)
finished_requests = []
@ -1722,7 +1723,7 @@ class PyExecutor:
else:
can_forward = True
self._pause_requests(scheduled_batch.paused_requests)
self._terminate_requests(scheduled_batch.paused_requests)
can_queue, can_queue_this_rank = self._can_queue(
scheduled_batch)
@ -1819,6 +1820,8 @@ class PyExecutor:
# Cleanup previous draft resources used in the draft model
self.drafter.cleanup_previous_draft_resources()
self._pause_requests(scheduled_batch.paused_requests)
if can_queue:
guided_decoder_failed_requests = None
if self.guided_decoder is not None:
@ -2871,14 +2874,16 @@ class PyExecutor:
self.responses.pop(id)
return response
def _pause_requests(self, requests_to_pause):
def _terminate_requests(self, requests_to_pause):
# todo: support work with self.inflight_req_ids.
# Currently, self.inflight_req_ids is not.
max_input_len = self.max_input_len
for req in requests_to_pause:
req.pause(max_input_len)
self._terminate_request(req)
def _pause_requests(self, requests_to_pause):
for req in requests_to_pause:
req.pause(self.max_input_len)
def _add_inflight_ids(self, scheduled_requests):
"""Add request IDs of current requests to self.inflight_req_ids.