Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
fix: fix hang in mgmn with trtllm-llmapi-launch command (#3119)
* init

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

* restore

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

---------

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
parent 0976360204
commit 87ab794aa2
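The hang addressed here shows up in multi-GPU multi-node (MGMN) runs launched through trtllm-llmapi-launch. As the hunks below show, the fix adds mpi_comm().barrier() calls in worker_main so every MPI rank reaches the same point before any of them continues. As a minimal, hedged sketch of that rendezvous pattern (this uses mpi4py directly, which is an assumption; the actual code goes through tensorrt_llm's mpi_comm()/mpi_rank() wrappers):

# Minimal rendezvous sketch using mpi4py (an assumption; the diff below uses
# tensorrt_llm's mpi_comm()/mpi_rank() helpers over the same communicator).
from mpi4py import MPI

comm = MPI.COMM_WORLD
rank = comm.Get_rank()

# ... per-rank setup that can take a different amount of time on each rank ...

# Every rank blocks here until all ranks have arrived, so no rank races ahead
# into later collective calls that its peers are not ready for.
comm.Barrier()

print(f"rank {rank}: all ranks reached the barrier, continuing")

Run with something like `mpirun -n 2 python sketch.py`. Without the rendezvous, a rank that finishes its setup early can enter a later collective alone and wait indefinitely for peers that never arrive, which is the kind of hang the commit title describes.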
@@ -435,13 +435,14 @@ class ExecutorBindingsWorker(GenerationExecutor):
         self._client_id_to_request_id.pop(client_id, None)
 
     def shutdown(self):
-        print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow")
 
         if self.doing_shutdown:
             return
         else:
             self.doing_shutdown = True
 
+        print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow")
+
         if self.engine is not None:
             if self.engine.can_enqueue_requests():
 
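This hunk also moves the shutdown debug message below the doing_shutdown guard, so it is emitted at most once per worker even if shutdown() is called again while a shutdown is in progress. A minimal sketch of that guard pattern (the Worker class and plain print are simplified stand-ins, not the repo's ExecutorBindingsWorker or print_colored_debug):

# Idempotent-shutdown sketch; names are simplified stand-ins.
class Worker:
    def __init__(self):
        self.doing_shutdown = False

    def shutdown(self):
        # Bail out if a shutdown is already in progress, so the message and
        # the teardown below run at most once.
        if self.doing_shutdown:
            return
        else:
            self.doing_shutdown = True

        print("worker shutdown...")
        # ... release the engine, queues, and background threads here ...

w = Worker()
w.shutdown()
w.shutdown()  # no-op: the guard returns immediately, nothing runs twice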
@@ -500,6 +501,10 @@ def worker_main(
         is_llm_executor: Optional[
             bool] = True,  # whether it's the main executor instance
 ) -> None:
+    mpi_comm().barrier()
+    print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n",
+                        "green")
+
     pid = os.getpid()
     cpus = os.sched_getaffinity(pid)
     if cpus:
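The context lines of this hunk show the CPU-affinity check that runs right after the new barrier. For reference, a small standalone sketch of that probe (os.sched_getaffinity is a Linux-only API and is not available on macOS or Windows):

# Standalone sketch of the affinity probe seen above (Linux-only).
import os

pid = os.getpid()
cpus = os.sched_getaffinity(pid)  # set of CPU ids this process may run on
if cpus:
    print(f"pid {pid} may run on CPUs {sorted(cpus)}")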
@@ -608,6 +613,10 @@ def worker_main(
     # and handled by future.done_callback, that will propagate the
     # error to the error_queue in the main thread.
 
+    mpi_comm().barrier()
+    print_colored_debug(f"Worker {mpi_rank()} ready to setup backend...\n",
+                        "green")
+
     try:
         worker: ExecutorBindingsWorker = worker_cls(
             engine,
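The comment kept in this hunk describes how a failure during backend setup reaches the main thread: the exception is captured by a future's done-callback and pushed onto an error queue. A generic sketch of that pattern (the queue, executor, and function names here are illustrative, not the repo's actual error_queue plumbing):

# Generic sketch of the "propagate the error to the error_queue in the main
# thread" pattern mentioned in the comment above; all names are illustrative.
import queue
from concurrent.futures import ThreadPoolExecutor

error_queue: "queue.Queue[BaseException]" = queue.Queue()

def setup_backend():
    raise RuntimeError("backend setup failed")  # hypothetical failure

def on_done(future):
    # Runs when the future finishes; forward any exception to the main thread.
    exc = future.exception()
    if exc is not None:
        error_queue.put(exc)

with ThreadPoolExecutor(max_workers=1) as pool:
    fut = pool.submit(setup_backend)
    fut.add_done_callback(on_done)

# Main thread picks the error up from the queue.
print("propagated:", error_queue.get())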