Add sleep function for disagg gen-only benchmarking (#5398)

Signed-off-by: Xianjie <5410381+qiaoxj07@users.noreply.github.com>
2026-01-14 06:27:45 +08:00 · 2025-06-26 07:32:16 +08:00 · 2025-06-26 07:32:16 +08:00 · 1e4fa13d33
commit 1e4fa13d33
parent feaf789342
1 changed files with 9 additions and 0 deletions
--- a/tensorrt_llm/_torch/pyexecutor/py_executor.py
+++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -236,6 +236,8 @@ class PyExecutor:
        self.ctx_in_transmission_requests = []
        self.previous_batch: Optional[BatchState] = None
        self.num_scheduled_requests: int = 0
+        self.benchmark_req_queues_size = int(
+            os.environ.get("TLLM_BENCHMARK_REQ_QUEUES_SIZE", 0))

        # list of requests in each PP micro batch
        self.num_micro_batches = self.dist.pp_size
@@ -996,6 +998,13 @@ class PyExecutor:

    def _executor_loop_overlap(self):
        torch.cuda.set_device(self.device_id)
+        if self.dist.rank == 0 and not self.is_warmup and self.benchmark_req_queues_size > 0 and self.kv_cache_transceiver:
+            while self.request_queue.qsize() < self.benchmark_req_queues_size:
+                logger.info(
+                    f"sleep 5 seconds, num_request_queue: {self.request_queue.qsize()}"
+                )
+                time.sleep(5)
+
        with self._profiler() as profile_step:
            iter_start_time = time.time()
            iter_stats = None