fix: fix hang in mgmn with trtllm-llmapi-launch command (#3119)

* init

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

* restore

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>

---------

Signed-off-by: Superjomn <328693+Superjomn@users.noreply.github.com>
This commit is contained in:
Yan Chunwei 2025-03-27 18:45:43 +08:00 committed by GitHub
parent 0976360204
commit 87ab794aa2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@@ -435,13 +435,14 @@ class ExecutorBindingsWorker(GenerationExecutor):
self._client_id_to_request_id.pop(client_id, None)
def shutdown(self):
print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow")
if self.doing_shutdown:
return
else:
self.doing_shutdown = True
print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow")
if self.engine is not None:
if self.engine.can_enqueue_requests():
@@ -500,6 +501,10 @@ def worker_main(
is_llm_executor: Optional[
bool] = True, # whether it's the main executor instance
) -> None:
mpi_comm().barrier()
print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n",
"green")
pid = os.getpid()
cpus = os.sched_getaffinity(pid)
if cpus:
@@ -608,6 +613,10 @@ def worker_main(
# and handled by future.done_callback, that will propagate the
# error to the error_queue in the main thread.
mpi_comm().barrier()
print_colored_debug(f"Worker {mpi_rank()} ready to setup backend...\n",
"green")
try:
worker: ExecutorBindingsWorker = worker_cls(
engine,