Fix thread leak for test_draft_len_schedule. Enhance stability for test_spec_gate.

Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
This commit is contained in:
Zheyu Fu 2025-12-19 02:01:38 +00:00
parent 8922ca839f
commit 5ab0d1edec
4 changed files with 41 additions and 46 deletions

View File

@@ -15,7 +15,7 @@ import time
import traceback
import warnings
import weakref
from functools import cache, wraps
from functools import wraps
from pathlib import Path
from queue import Queue
from typing import (Any, Callable, Iterable, List, Optional, Tuple, Type,
@@ -353,7 +353,6 @@ def enable_llmapi_debug() -> bool:
return _enable_llmapi_debug_
@cache
def enable_worker_single_process_for_tp1() -> bool:
''' Tell whether to make worker use single process for TP1.
This is helpful for return-logits performance and debugging. '''

View File

@@ -361,7 +361,6 @@ accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] S
unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5683039)
full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911)
test_e2e.py::test_openai_responses SKIP (https://nvbugs/5635153)
accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype SKIP (https://nvbugs/5612438)
accuracy/test_llm_api_autodeploy.py::TestNemotronH::test_auto_dtype[True] SKIP (https://nvbugs/5688721)

View File

@@ -13,23 +13,18 @@ from utils.util import similar
# # ============================================================================
# # Fixture: Force single-worker mode for all tests in this module
# # Fixture: Force single-worker mode (only for tests that use mocking)
# # ============================================================================
@pytest.fixture(scope="module", autouse=True)
def enforce_single_worker():
"""Force single-worker mode for all tests in this module."""
import os
os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"] = "1"
@pytest.fixture(scope="function")
def enforce_single_worker(monkeypatch):
"""Mock functions don't work with multiple processes, so we enforce single worker."""
monkeypatch.setenv("TLLM_WORKER_USE_SINGLE_PROCESS", "1")
yield
if "TLLM_WORKER_USE_SINGLE_PROCESS" in os.environ:
del os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"]
# # ============================================================================
# # test 1: Generation correctness check
# # ============================================================================
@pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5680911")
@pytest.mark.parametrize(
"drafter_type,schedule",
[
@@ -151,8 +146,9 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict):
],
)
@pytest.mark.high_cuda_memory
@pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5680911")
def test_draft_len_schedule_functionality(drafter_type: str, draft_schedule: dict):
def test_draft_len_schedule_functionality(
enforce_single_worker, drafter_type: str, draft_schedule: dict
):
if not torch.cuda.is_available():
pytest.skip("CUDA not available")

View File

@@ -101,42 +101,43 @@ def test_spec_gate_e2e(enforce_single_worker):
llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
with patch.object(SpeculationGate, 'record_avg_decoded',
mock_record_avg_decoded):
llm_spec.generate(prompts, sampling_params)
try:
with patch.object(SpeculationGate, 'record_avg_decoded',
mock_record_avg_decoded):
llm_spec.generate(prompts, sampling_params)
# Verify the mock was called (requests completed)
assert len(gate_state["record_calls"]
) > 0, "record_avg_decoded should have been called"
# Verify the mock was called (requests completed)
assert len(gate_state["record_calls"]
) > 0, "record_avg_decoded should have been called"
# Verify the gate was disabled after enough requests with low acceptance
assert gate_state["gate_disabled"], \
f"Gate should have been disabled with simulated low acceptance. Calls: {gate_state['record_calls']}"
# Verify the gate was disabled after enough requests with low acceptance
assert gate_state["gate_disabled"], \
f"Gate should have been disabled with simulated low acceptance. Calls: {gate_state['record_calls']}"
# Verify the gate triggered at the right time (after window is filled)
# The gate should trigger on the `acceptance_window`-th call (index = window - 1)
disable_indices = [
i for i, call in enumerate(gate_state["record_calls"])
if call["disabled_now"]
]
assert len(disable_indices) == 1, \
f"Gate should have triggered exactly once, but triggered at indices: {disable_indices}"
assert disable_indices[0] >= acceptance_window - 1, \
f"Gate should trigger after window ({acceptance_window}) is filled, but triggered at index {disable_indices[0]}"
# Verify the gate triggered at the right time (after window is filled)
# The gate should trigger on the `acceptance_window`-th call (index = window - 1)
disable_indices = [
i for i, call in enumerate(gate_state["record_calls"])
if call["disabled_now"]
]
assert len(disable_indices) == 1, \
f"Gate should have triggered exactly once, but triggered at indices: {disable_indices}"
assert disable_indices[0] >= acceptance_window - 1, \
f"Gate should trigger after window ({acceptance_window}) is filled, but triggered at index {disable_indices[0]}"
# Verify the average acceptance was below threshold when disabled
disable_call = gate_state["record_calls"][disable_indices[0]]
assert disable_call["avg_accept"] is not None
assert disable_call["avg_accept"] < acceptance_threshold, \
f"Avg acceptance ({disable_call['avg_accept']}) should be below threshold ({acceptance_threshold})"
# Verify the average acceptance was below threshold when disabled
disable_call = gate_state["record_calls"][disable_indices[0]]
assert disable_call["avg_accept"] is not None
assert disable_call["avg_accept"] < acceptance_threshold, \
f"Avg acceptance ({disable_call['avg_accept']}) should be below threshold ({acceptance_threshold})"
logger.debug(
f"Gate correctly triggered after {disable_indices[0] + 1} requests")
logger.debug(
f"Final avg acceptance: {disable_call['avg_accept']:.3f} < threshold {acceptance_threshold}"
)
llm_spec.shutdown()
logger.debug(
f"Gate correctly triggered after {disable_indices[0] + 1} requests")
logger.debug(
f"Final avg acceptance: {disable_call['avg_accept']:.3f} < threshold {acceptance_threshold}"
)
finally:
llm_spec.shutdown()
def test_returns_none_until_window_and_enabled_when_above_threshold():