diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 4129973363..042a002395 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -2789,9 +2789,23 @@ class PyExecutor: def _pause_requests(self, requests_to_pause): # todo: support work with self.inflight_req_ids. # Currently, self.inflight_req_ids is not. + MAX_PAUSES_PER_STEP = 8 max_input_len = self.max_input_len + pauses_remaining = MAX_PAUSES_PER_STEP + for req in requests_to_pause: + if pauses_remaining <= 0: + break + + if getattr(req, "_paused", False): + continue + + if req.request_id in self.inflight_req_ids: + continue + req.pause(max_input_len) + req._paused = True + pauses_remaining -= 1 self._terminate_request(req) def _add_inflight_ids(self, scheduled_requests):