From fe4c690b6cc4512433c98071f8deba99f3148c59 Mon Sep 17 00:00:00 2001
From: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
Date: Mon, 9 Feb 2026 18:01:12 -0500
Subject: [PATCH] [https://nvbugs/5855540][fix] AutoDeploy: thread cleanup of
 eagle test (#11289)

Signed-off-by: Lucas Liebenwein <11156568+lucaslie@users.noreply.github.com>
---
 .../examples/test_ad_speculative_decoding.py | 61 ++++++++++---------
 tests/integration/test_lists/waives.txt      |  1 -
 2 files changed, 31 insertions(+), 31 deletions(-)

diff --git a/tests/integration/defs/examples/test_ad_speculative_decoding.py b/tests/integration/defs/examples/test_ad_speculative_decoding.py
index 2228c4a6f5..0c316bfbb1 100644
--- a/tests/integration/defs/examples/test_ad_speculative_decoding.py
+++ b/tests/integration/defs/examples/test_ad_speculative_decoding.py
@@ -244,7 +244,8 @@ def test_autodeploy_eagle3_acceptance_rate():
     # We directly instantiate the LLM class instead of using the main() function
     # so that we can stream the outputs to see acceptance rates without needing to
     # collect them in the executor.
-    llm = LLM(
+    # Use context manager to ensure proper cleanup and avoid thread leaks.
+    with LLM(
         model=base_model,
         skip_loading_weights=False,
         runtime="trtllm",
@@ -253,43 +254,43 @@
         speculative_config=speculative_config,
         disable_overlap_scheduler=True,
         max_num_tokens=64,
-    )
+    ) as llm:
+        # Tokenize 2 prompts to test multiple sequential requests
+        batch_tok_ids = [llm.tokenizer.encode(p) for p in prompts[:2]]
 
-    # Tokenize 2 prompts to test multiple sequential requests
-    batch_tok_ids = [llm.tokenizer.encode(p) for p in prompts[:2]]
+        sampling_params = SamplingParams(max_tokens=128, temperature=0, seed=42)
 
-    sampling_params = SamplingParams(max_tokens=128, temperature=0, seed=42)
+        print("\nRunning Eagle3 speculative decoding with streaming...")
 
-    print("\nRunning Eagle3 speculative decoding with streaming...")
+        # Process each request sequentially and verify acceptance rate
+        for i in range(len(batch_tok_ids)):
+            num_tokens = 0
+            num_drafted = 0
+            num_accepted = 0
 
-    # Process each request sequentially and verify acceptance rate
-    for i in range(len(batch_tok_ids)):
-        num_tokens = 0
-        num_drafted = 0
-        num_accepted = 0
+            for output in llm.generate_async(batch_tok_ids[i], sampling_params, streaming=True):
+                new_tokens = output.outputs[0].token_ids
+                num_drafted += max_draft_len
+                num_accepted += len(new_tokens) - num_tokens - 1
+                num_tokens = len(new_tokens)
 
-        for output in llm.generate_async(batch_tok_ids[i], sampling_params, streaming=True):
-            new_tokens = output.outputs[0].token_ids
-            num_drafted += max_draft_len
-            num_accepted += len(new_tokens) - num_tokens - 1
-            num_tokens = len(new_tokens)
+            accept_rate = num_accepted / num_drafted
 
-        accept_rate = num_accepted / num_drafted
+            print(f"\nRequest {i + 1} Acceptance Rate Statistics:")
+            print(f"  Total tokens drafted: {num_drafted}")
+            print(f"  Total tokens accepted: {num_accepted}")
+            print(f"  Acceptance rate: {accept_rate:.2%}")
 
-        print(f"\nRequest {i + 1} Acceptance Rate Statistics:")
-        print(f"  Total tokens drafted: {num_drafted}")
-        print(f"  Total tokens accepted: {num_accepted}")
-        print(f"  Acceptance rate: {accept_rate:.2%}")
+            # Verify acceptance rate is above minimum threshold (10%)
+            min_acceptance_rate = 0.10
+            assert accept_rate > min_acceptance_rate, (
+                f"Request {i + 1}: Acceptance rate {accept_rate:.2%} is below minimum threshold "
+                f"{min_acceptance_rate:.0%}"
+            )
 
-        # Verify acceptance rate is above minimum threshold (10%)
-        min_acceptance_rate = 0.10
-        assert accept_rate > min_acceptance_rate, (
-            f"Request {i + 1}: Acceptance rate {accept_rate:.2%} is below minimum threshold {min_acceptance_rate:.0%}"
-        )
-
-    print("\n" + "=" * 80)
-    print("SUCCESS! All requests passed acceptance rate threshold")
-    print("=" * 80)
+        print("\n" + "=" * 80)
+        print("SUCCESS! All requests passed acceptance rate threshold")
+        print("=" * 80)
 
 
 def load_weights(model_path: Path, model: torch.nn.Module):
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 8eb322af48..9690e60fdb 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -334,7 +334,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[latency] S
 accuracy/test_disaggregated_serving.py::TestDeepSeekV32Exp::test_auto_dtype[False] SKIP (https://nvbugs/5847284)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v2_kv_cache-dp4-trtllm-fp8] SKIP (https://nvbugs/5850183)
-examples/test_ad_speculative_decoding.py::test_autodeploy_eagle3_acceptance_rate SKIP (https://nvbugs/5855540)
 unittest/_torch/thop/serial/test_moe.py::TestMoeFp4::test_autotune_fp8_fp4[RoutingDSlite-384-1024-1] SKIP (https://nvbugs/5859881)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV32::test_fp8_blockscale[disable_skip_indexer] SKIP (https://nvbugs/5859886)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[v1_kv_cache-tp4-cutlass-fp8] SKIP (https://nvbugs/5651865)
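
Note on the acceptance-rate accounting in the patched test: each streaming step drafts `max_draft_len` tokens and always emits exactly one token that was not drafted, so the per-step update counts `len(new_tokens) - num_tokens - 1` accepted drafts. A minimal, self-contained sketch of that bookkeeping follows; the helper name and the numbers are illustrative only, not part of the test or the TensorRT-LLM API.

    # Sketch of the per-step acceptance-rate bookkeeping performed in the test
    # above. Assumes every streaming step drafts `max_draft_len` tokens and the
    # step's output always contains exactly one token that was not drafted.
    def update_acceptance_stats(prev_len, new_len, max_draft_len, num_drafted, num_accepted):
        """Return updated (prev_len, num_drafted, num_accepted) after one streaming step."""
        num_drafted += max_draft_len            # drafts proposed this step
        num_accepted += new_len - prev_len - 1  # exclude the one non-drafted token
        return new_len, num_drafted, num_accepted

    # Illustrative numbers only (not measured): cumulative output lengths after
    # three streaming steps with max_draft_len = 4.
    prev_len = num_drafted = num_accepted = 0
    for total_len in (5, 8, 13):
        prev_len, num_drafted, num_accepted = update_acceptance_stats(
            prev_len, total_len, 4, num_drafted, num_accepted)
    print(f"acceptance rate: {num_accepted / num_drafted:.2%}")  # 10/12 -> 83.33%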