From 0e872ef0b0fdfc0d16489f587b0227dd82ab4b1a Mon Sep 17 00:00:00 2001
From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Date: Fri, 16 May 2025 12:35:25 -0400
Subject: [PATCH] [AutoDeploy] fix: proper process group clean up (#4373)

[AutoDeploy] proper process group clean up

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
---
 .../_torch/auto_deploy/distributed/common.py    | 17 +++++++++++++++++
 tensorrt_llm/_torch/auto_deploy/shim/demollm.py |  7 +++++++
 2 files changed, 24 insertions(+)

diff --git a/tensorrt_llm/_torch/auto_deploy/distributed/common.py b/tensorrt_llm/_torch/auto_deploy/distributed/common.py
index d680a9fa4c..d9c698b30b 100644
--- a/tensorrt_llm/_torch/auto_deploy/distributed/common.py
+++ b/tensorrt_llm/_torch/auto_deploy/distributed/common.py
@@ -103,6 +103,13 @@ def is_torchelastic():
     return "TORCHELASTIC_RUN_ID" in os.environ
 
 
+def cleanup():
+    """Destroy process group when the program exits."""
+    if dist.is_initialized():
+        ad_logger.info("Destroying process group")
+        dist.destroy_process_group()
+
+
 def initialize(rank: int = 0, world_size: int = 1, port: Optional[int] = None) -> Tuple[int, int]:
     if is_ompi():
         lib = "OMPI"
@@ -136,6 +143,9 @@ def initialize(rank: int = 0, world_size: int = 1, port: Optional[int] = None) -
     # We use nccl backend
     dist.init_process_group("nccl", world_size=world_size, rank=local_rank)
 
+    # Register cleanup function to be called at exit
+    atexit.register(cleanup)
+
     # set a manual seed for reproducibility
     torch.manual_seed(1111)
 
@@ -153,6 +163,9 @@ def init_and_run_process(job, rank, size, port, **kwargs):
                 kwargs[q].put(None)
                 kwargs[q].close()
         raise e
+    finally:
+        # Make sure to clean up even if an exception occurs
+        cleanup()
 
 
 def _start_multiprocess_job(
@@ -275,3 +288,7 @@ class MultiProcessExecutor:
             q.join_thread()
         self.output_queue.close()
         self.output_queue.join_thread()
+
+        # Make sure all process groups are cleaned up
+        if dist.is_initialized():
+            dist.destroy_process_group()
diff --git a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py
index 3de91df6f3..37d0c35440 100644
--- a/tensorrt_llm/_torch/auto_deploy/shim/demollm.py
+++ b/tensorrt_llm/_torch/auto_deploy/shim/demollm.py
@@ -391,6 +391,13 @@ class DemoLLM(LLM):
             device="cuda",
         )
 
+    def __del__(self):
+        """Ensure proper cleanup of distributed resources."""
+        if hasattr(self, "_executor") and self._executor is not None:
+            self._executor.shutdown()
+        # Call cleanup to ensure process group is properly destroyed
+        dist_ad.cleanup()
+
     @staticmethod
     def _handle_response(request_output: RequestOutput, response: List[CompletionOutput]):
         request_output._done = True