From 87ab794aa2d999b552c937e836865c3909eed84b Mon Sep 17 00:00:00 2001 From: Yan Chunwei <328693+Superjomn@users.noreply.github.com> Date: Thu, 27 Mar 2025 18:45:43 +0800 Subject: [PATCH] fix: fix hang in mgmn with trtllm-llmapi-launch command (#3119) * init Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> * restore Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --------- Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com> --- tensorrt_llm/executor/worker.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py index 2504f6e5cb..36eb4bf0a8 100644 --- a/tensorrt_llm/executor/worker.py +++ b/tensorrt_llm/executor/worker.py @@ -435,13 +435,14 @@ class ExecutorBindingsWorker(GenerationExecutor): self._client_id_to_request_id.pop(client_id, None) def shutdown(self): - print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow") if self.doing_shutdown: return else: self.doing_shutdown = True + print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow") + if self.engine is not None: if self.engine.can_enqueue_requests(): @@ -500,6 +501,10 @@ def worker_main( is_llm_executor: Optional[ bool] = True, # whether it's the main executor instance ) -> None: + mpi_comm().barrier() + print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n", + "green") + pid = os.getpid() cpus = os.sched_getaffinity(pid) if cpus: @@ -608,6 +613,10 @@ def worker_main( # and handled by future.done_callback, that will propagate the # error to the error_queue in the main thread. + mpi_comm().barrier() + print_colored_debug(f"Worker {mpi_rank()} ready to setup backend...\n", + "green") + try: worker: ExecutorBindingsWorker = worker_cls( engine,