From fe4c690b6cc4512433c98071f8deba99f3148c59 Mon Sep 17 00:00:00 2001
From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Date: Mon, 9 Feb 2026 18:01:12 -0500
Subject: [PATCH] [https://nvbugs/5855540][fix] AutoDeploy: thread cleanup of
 eagle test (#11289)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
---
 .../examples/test_ad_speculative_decoding.py | 61 ++++++++++---------
 tests/integration/test_lists/waives.txt      |  1 -
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/tests/integration/defs/examples/test_ad_speculative_decoding.py b/tests/integration/defs/examples/test_ad_speculative_decoding.py
index 2228c4a6f5..0c316bfbb1 100644
--- a/tests/integration/defs/examples/test_ad_speculative_decoding.py
+++ b/tests/integration/defs/examples/test_ad_speculative_decoding.py
@@ -244,7 +244,8 @@ def test_autodeploy_eagle3_acceptance_rate():
     # We directly instantiate the LLM class instead of using the main() function
     # so that we can stream the outputs to see acceptance rates without needing to
     # collect them in the executor.
-    llm = LLM(
+    # Use context manager to ensure proper cleanup and avoid thread leaks.
+    with LLM(
         model=base_model,
         skip_loading_weights=False,
         runtime="trtllm",
@@ -253,43 +254,43 @@
         speculative_config=speculative_config,
         disable_overlap_scheduler=True,
         max_num_tokens=64,
-    )
+    ) as llm:
+        # Tokenize 2 prompts to test multiple sequential requests
+        batch_tok_ids = [llm.tokenizer.encode(p) for p in prompts[:2]]
 
-    # Tokenize 2 prompts to test multiple sequential requests
-    batch_tok_ids = [llm.tokenizer.encode(p) for p in prompts[:2]]
+        sampling_params = SamplingParams(max_tokens=128, temperature=0, seed=42)
 
-    sampling_params = SamplingParams(max_tokens=128, temperature=0, seed=42)
+        print("\nRunning Eagle3 speculative decoding with streaming...")
 
-    print("\nRunning Eagle3 speculative decoding with streaming...")
+        # Process each request sequentially and verify acceptance rate
+        for i in range(len(batch_tok_ids)):
+            num_tokens = 0
+            num_drafted = 0
+            num_accepted = 0
 
-    # Process each request sequentially and verify acceptance rate
-    for i in range(len(batch_tok_ids)):
-        num_tokens = 0
-        num_drafted = 0
-        num_accepted = 0
+            for output in llm.generate_async(batch_tok_ids[i], sampling_params, streaming=True):
+                new_tokens = output.outputs[0].token_ids
+                num_drafted += max_draft_len
+                num_accepted += len(new_tokens) - num_tokens - 1
+                num_tokens = len(new_tokens)
 
-        for output in llm.generate_async(batch_tok_ids[i], sampling_params, streaming=True):
-            new_tokens = output.outputs[0].token_ids
-            num_drafted += max_draft_len
-            num_accepted += len(new_tokens) - num_tokens - 1
-            num_tokens = len(new_tokens)
+            accept_rate = num_accepted / num_drafted
 
-        accept_rate = num_accepted / num_drafted
+            print(f"\nRequest {i + 1} Acceptance Rate Statistics:")
+            print(f"  Total tokens drafted: {num_drafted}")
+            print(f"  Total tokens accepted: {num_accepted}")
+            print(f"  Acceptance rate: {accept_rate:.2%}")
 
-        print(f"\nRequest {i + 1} Acceptance Rate Statistics:")
-        print(f"  Total tokens drafted: {num_drafted}")
-        print(f"  Total tokens accepted: {num_accepted}")
-        print(f"  Acceptance rate: {accept_rate:.2%}")
+            # Verify acceptance rate is above minimum threshold (10%)
+            min_acceptance_rate = 0.10
+            assert accept_rate > min_acceptance_rate, (
+                f"Request {i + 1}: Acceptance rate {accept_rate:.2%} is below minimum threshold "
+                f"{min_acceptance_rate:.0%}"
+            )
 
-        # Verify acceptance rate is above minimum threshold (10%)
-        min_acceptance_rate = 0.10
-        assert accept_rate > min_acceptance_rate, (
-            f"Request {i + 1}: Acceptance rate {accept_rate:.2%} is below minimum threshold {min_acceptance_rate:.0%}"
-        )
-
-    print("\n" + "=" * 80)
-    print("SUCCESS! All requests passed acceptance rate threshold")
-    print("=" * 80)
+        print("\n" + "=" * 80)
+        print("SUCCESS! All requests passed acceptance rate threshold")
+        print("=" * 80)
 
 
 def load_weights(model_path: Path, model: torch.nn.Module):
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 8eb322af48..9690e60fdb 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -334,7 +334,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] S
 accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
-examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate SKIP (https://nvbugs/5855540)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_autotune_fp8_fp4[RoutingDSlite-384-1024-1] SKIP (https://nvbugs/5859881)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865)
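
Note on the acceptance-rate accounting in the patched test: each streaming step drafts `max_draft_len` tokens and always emits exactly one token that was not drafted, so the per-step update counts `len(new_tokens) - num_tokens - 1` accepted drafts. A minimal, self-contained sketch of that bookkeeping follows; the helper name and the numbers are illustrative only, not part of the test or the TensorRT-LLM API.

    # Sketch of the per-step acceptance-rate bookkeeping performed in the test
    # above. Assumes every streaming step drafts `max_draft_len` tokens and the
    # step's output always contains exactly one token that was not drafted.
    def update_acceptance_stats(prev_len, new_len, max_draft_len, num_drafted, num_accepted):
        """Return updated (prev_len, num_drafted, num_accepted) after one streaming step."""
        num_drafted += max_draft_len            # drafts proposed this step
        num_accepted += new_len - prev_len - 1  # exclude the one non-drafted token
        return new_len, num_drafted, num_accepted

    # Illustrative numbers only (not measured): cumulative output lengths after
    # three streaming steps with max_draft_len = 4.
    prev_len = num_drafted = num_accepted = 0
    for total_len in (5, 8, 13):
        prev_len, num_drafted, num_accepted = update_acceptance_stats(
            prev_len, total_len, 4, num_drafted, num_accepted)
    print(f"acceptance rate: {num_accepted / num_drafted:.2%}")  # 10/12 -> 83.33%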