diff --git a/tensorrt_llm/executor/worker.py b/tensorrt_llm/executor/worker.py
index 2504f6e5cb..36eb4bf0a8 100644
--- a/tensorrt_llm/executor/worker.py
+++ b/tensorrt_llm/executor/worker.py
@@ -435,13 +435,14 @@ class ExecutorBindingsWorker(GenerationExecutor):
         self._client_id_to_request_id.pop(client_id, None)
 
     def shutdown(self):
-        print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow")
 
         if self.doing_shutdown:
             return
         else:
             self.doing_shutdown = True
 
+        print_colored_debug(f'Worker {mpi_rank()} shutdown...\n', "yellow")
+
         if self.engine is not None:
             if self.engine.can_enqueue_requests():
 
@@ -500,6 +501,10 @@ def worker_main(
         is_llm_executor: Optional[
             bool] = True,  # whether it's the main executor instance
 ) -> None:
+    mpi_comm().barrier()
+    print_colored_debug(f"Worker {mpi_rank()} entering worker_main...\n",
+                        "green")
+
     pid = os.getpid()
     cpus = os.sched_getaffinity(pid)
     if cpus:
@@ -608,6 +613,10 @@ def worker_main(
     #         and handled by future.done_callback, that will propagate the
     #         error to the error_queue in the main thread.
 
+    mpi_comm().barrier()
+    print_colored_debug(f"Worker {mpi_rank()} ready to setup backend...\n",
+                        "green")
+
     try:
         worker: ExecutorBindingsWorker = worker_cls(
             engine,