[https://nvbugs/5680911][fix] Remove @cache decorator to enhance CI stability for unit tests using single process mode (#10730)

Signed-off-by: Zheyu Fu <zheyuf@NVIDIA.com>
Signed-off-by: Wangshanshan <30051912+dominicshanshan@users.noreply.github.com>
Zheyu Fu 2026-01-22 22:47:51 -08:00 committed by Yanchao Lu
parent 7e5e5b90b9
commit d31482686c
5 changed files with 97 additions and 45 deletions

tensorrt_llm/llmapi/utils.py

@@ -15,7 +15,7 @@ import time
 import traceback
 import warnings
 import weakref
-from functools import cache, wraps
+from functools import wraps
 from pathlib import Path
 from queue import Queue
 from typing import (Any, Callable, Iterable, List, Optional, Tuple, Type,
@@ -355,7 +355,6 @@ def enable_llmapi_debug() -> bool:
     return _enable_llmapi_debug_
 
 
-@cache
 def enable_worker_single_process_for_tp1() -> bool:
     ''' Tell whether to make worker use single process for TP1.
    This is helpful for return-logits performance and debugging. '''
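
Why removing @cache fixes the flakiness: functools.cache memoizes the first return value for the lifetime of the process, so once enable_worker_single_process_for_tp1() has been called, later changes to the TLLM_WORKER_USE_SINGLE_PROCESS environment variable (such as the per-test fixtures below) are never observed. A minimal sketch of the failure mode, assuming the function consults that environment variable; use_single_process here is a hypothetical stand-in, not the real implementation:

import os
from functools import cache

@cache
def use_single_process() -> bool:  # hypothetical stand-in for the cached check
    # The first call memoizes this result for the life of the process.
    return os.environ.get("TLLM_WORKER_USE_SINGLE_PROCESS") == "1"

print(use_single_process())  # False: the variable is not set yet
os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"] = "1"
print(use_single_process())  # still False: the memoized value is returned

Without @cache, every call re-reads the environment, so a fixture that sets the variable right before a test actually takes effect.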

tests/integration/test_lists/waives.txt

@@ -197,7 +197,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[t
 unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] SKIP (https://nvbugs/5808500)
 unittest/_torch/auto_deploy/unit/multigpu/test_ad_build_small_multi.py::test_build_ad[meta-llama/Meta-Llama-3.1-8B-Instruct-llm_extra_args0-2] SKIP (https://nvbugs/5680755)
 full:H100_PCIe/unittest/llmapi/test_llm_pytorch.py::test_llama_7b_multi_lora_evict_and_reload_lora_gpu_cache SKIP (https://nvbugs/5682551)
-unittest/_torch/speculative/test_draft_len_schedule.py::test_correctness_across_batch_sizes[model_drafter-schedule1] SKIP (https://nvbugs/5680911)
 test_e2e.py::test_openai_completions_example[trt] SKIP (https://nvbugs/5701450)
 triton_server/test_triton_llm.py::test_llmapi_backend[4-0-disableDecoupleMode-tensorrt_llm] SKIP (https://nvbugs/5701480)
 unittest/_torch/modules/tests_lora_modules/test_lora_attention_pytorch_flow_vs_trt.py::TestLoraAttentionPytorchFlowVsTRT::test_lora_attention SKIP (https://nvbugs/5701421)
@@ -207,7 +206,6 @@ accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/570
 accuracy/test_cli_flow.py::TestTinyLlama1_1BChat::test_weight_only_int8_kv_cache[int8] SKIP (https://nvbugs/5666826)
 disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
-unittest/_torch/speculative/test_spec_gate.py::test_spec_gate_e2e SKIP (https://nvbugs/5710045)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629)
 full:RTXPro6000D/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=0] SKIP (https://nvbugs/5748600)
 full:RTXPro6000D/accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5748600)
@@ -225,6 +223,7 @@ triton_server/test_triton.py::test_opt[opt] SKIP (https://nvbugs/5739981)
 cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
 examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
 examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
+unittest/_torch/speculative/test_dynamic_spec_decode.py::test_dynamic_spec_decode SKIP (https://nvbugs/5758449)
 triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
 triton_server/test_triton.py::test_gpt_speculative_decoding[gpt-speculative-decoding] SKIP (https://nvbugs/5762854)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B_Instruct_RocketKV::test_auto_dtype SKIP (https://nvbugs/5762822)

tests/unittest/_torch/speculative/test_draft_len_schedule.py

@@ -13,23 +13,18 @@ from utils.util import similar
 # ============================================================================
-# Fixture: Force single-worker mode for all tests in this module
+# Fixture: Force single-worker mode (only for tests that use mocking)
 # ============================================================================
-@pytest.fixture(scope="module", autouse=True)
-def enforce_single_worker():
-    """Force single-worker mode for all tests in this module."""
-    import os
-    os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"] = "1"
+@pytest.fixture(scope="function")
+def enforce_single_worker(monkeypatch):
+    """Mock functions don't work with multiple processes, so we enforce single worker."""
+    monkeypatch.setenv("TLLM_WORKER_USE_SINGLE_PROCESS", "1")
     yield
-    if "TLLM_WORKER_USE_SINGLE_PROCESS" in os.environ:
-        del os.environ["TLLM_WORKER_USE_SINGLE_PROCESS"]
 
 
 # ============================================================================
 # test 1: Generation correctness check
 # ============================================================================
-@pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5680911")
 @pytest.mark.parametrize(
     "drafter_type,schedule",
     [
@@ -151,8 +146,9 @@ def test_correctness_across_batch_sizes(drafter_type: str, schedule: dict):
     ],
 )
 @pytest.mark.high_cuda_memory
-@pytest.mark.skip("https://nvbugspro.nvidia.com/bug/5680911")
-def test_draft_len_schedule_functionality(drafter_type: str, draft_schedule: dict):
+def test_draft_len_schedule_functionality(
+    enforce_single_worker, drafter_type: str, draft_schedule: dict
+):
     if not torch.cuda.is_available():
         pytest.skip("CUDA not available")

tests/unittest/_torch/speculative/test_dynamic_spec_decode.py

@@ -21,6 +21,7 @@ def enforce_single_worker(monkeypatch):
     yield
 
 
+@pytest.mark.skip("https://nvbugs/5758449")
 @pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
 @pytest.mark.high_cuda_memory
 def test_dynamic_spec_decode(enforce_single_worker,

tests/unittest/_torch/speculative/test_spec_gate.py

@@ -1,28 +1,34 @@
 import os
 import sys
 import unittest
+from unittest.mock import patch
 
 import pytest
 import torch
 from utils.llm_data import llm_models_root
 from utils.util import similar, skip_blackwell
 
 from tensorrt_llm import LLM, SamplingParams
+from tensorrt_llm._torch.speculative.speculation_gate import SpeculationGate
 from tensorrt_llm.llmapi import (CudaGraphConfig, Eagle3DecodingConfig,
                                  KvCacheConfig)
+from tensorrt_llm.logger import logger
 
 sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
 
 
-# It tests the end-to-end functionality of the SpeculationGate,
-# which will turn off spec decode when the average acceptance length is below the threshold.
-# It is set with acceptance window and acceptance threshold in spec_config.
-# This test set the max_concurrency to a large value to prevent spec decode turned off due to number of effective requests > max_concurrency,
-# So that we can only focus on the turning off effect from the SpeculationGate.
-@skip_blackwell  # TODO: Remove after fixing TRTLLM-GEN FMHA segfault on Blackwell. NVBugs: https://nvbugspro.nvidia.com/bug/5698292
+@pytest.fixture(scope="function")
+def enforce_single_worker(monkeypatch):
+    """Mock functions don't work with multiple processes, so we enforce single worker."""
+    monkeypatch.setenv("TLLM_WORKER_USE_SINGLE_PROCESS", "1")
+    yield
+
+
+# Tests that the SpeculationGate correctly disables speculative decoding
+# when the average acceptance rate drops below the threshold.
+# This test uses a mock to simulate low acceptance rates and verifies
+# that the spec gate triggers and disables speculation.
 @pytest.mark.high_cuda_memory
-def test_spec_gate_e2e():
+def test_spec_gate_e2e(enforce_single_worker):
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 35:
         pytest.skip("Not enough memory to load target + draft model")
@@ -32,6 +38,8 @@ def test_spec_gate_e2e():
     max_batch_size = 2
     max_draft_len = 4
+    acceptance_window = 3
+    acceptance_threshold = 0.6
 
     kv_cache_config = KvCacheConfig(enable_block_reuse=True, max_tokens=8192)
     cuda_graph_config = CudaGraphConfig(batch_sizes=[1])
@@ -48,39 +56,88 @@ def test_spec_gate_e2e():
     spec_config = Eagle3DecodingConfig(
         max_draft_len=max_draft_len,
         speculative_model=eagle_model_dir,
         # Llama 3 does not support one model eagle.
         eagle3_one_model=False,
         max_concurrency=10000,
-        acceptance_window=5,
-        acceptance_length_threshold=0.6,
+        acceptance_window=acceptance_window,
+        acceptance_length_threshold=acceptance_threshold,
     )
 
-    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
-
-    # Output tests
     prompts = [
         "The capital of France is",
         "The president of the United States is",
+        "What is the capital of Australia?",
+        "Explain in one sentence why the sky is blue.",
+        "Who wrote the book 'Pride and Prejudice'?",
+        "List three U.S. national holidays in the year 2025.",
+        "What is the currency of Japan?",
+        "How many players are on a basketball court for one team?",
+        "List three primary colors.",
     ]
-    sampling_params = SamplingParams(max_tokens=32, temperature=0)
+    sampling_params = SamplingParams(max_tokens=20, temperature=0)
 
-    results_spec = llm_spec.generate(prompts, sampling_params)
-    generated_text_spec = [result.outputs[0].text for result in results_spec]
-    llm_spec.shutdown()
+    # Track calls to record_avg_decoded and the disabled state
+    gate_state = {"record_calls": [], "gate_disabled": False}
 
-    llm_ref = LLM(**llm_common_config)
-    results_ref = llm_ref.generate(prompts, sampling_params)
-    generated_text_ref = [result.outputs[0].text for result in results_ref]
-    llm_ref.shutdown()
+    original_record_avg_decoded = SpeculationGate.record_avg_decoded
 
-    for text_spec, text_ref in zip(generated_text_spec, generated_text_ref):
-        assert similar(text_spec, text_ref)
+    def mock_record_avg_decoded(self,
+                                avg_decoded_tokens_per_iter,
+                                request_id=None):
+        """
+        Mock that simulates low acceptance rate (1.2 tokens/iter = 0.2 accepted).
+        This is below the threshold of 0.6, so the gate should trigger after the window fills.
+        """
+        # Simulate low acceptance: avg_decoded = 1.2 means accepted_len = 0.2
+        # This is below threshold (0.6), so gate should trigger
+        simulated_low_avg = 1.2
+        disabled_now, avg = original_record_avg_decoded(self, simulated_low_avg,
+                                                        request_id)
+        gate_state["record_calls"].append({
+            "original_avg": avg_decoded_tokens_per_iter,
+            "simulated_avg": simulated_low_avg,
+            "disabled_now": disabled_now,
+            "avg_accept": avg,
+            "request_id": request_id,
+        })
+        if disabled_now:
+            gate_state["gate_disabled"] = True
+        return disabled_now, avg
+
+    llm_spec = LLM(**llm_common_config, speculative_config=spec_config)
+    try:
+        with patch.object(SpeculationGate, 'record_avg_decoded',
+                          mock_record_avg_decoded):
+            llm_spec.generate(prompts, sampling_params)
+
+        # Verify the mock was called (requests completed)
+        assert len(gate_state["record_calls"]
+                   ) > 0, "record_avg_decoded should have been called"
+
+        # Verify the gate was disabled after enough requests with low acceptance
+        assert gate_state["gate_disabled"], \
+            f"Gate should have been disabled with simulated low acceptance. Calls: {gate_state['record_calls']}"
+
+        # Verify the gate triggered at the right time (after window is filled)
+        # The gate should trigger on the `acceptance_window`-th call (index = window - 1)
+        disable_indices = [
+            i for i, call in enumerate(gate_state["record_calls"])
+            if call["disabled_now"]
+        ]
+        assert len(disable_indices) == 1, \
+            f"Gate should have triggered exactly once, but triggered at indices: {disable_indices}"
+        assert disable_indices[0] >= acceptance_window - 1, \
+            f"Gate should trigger after window ({acceptance_window}) is filled, but triggered at index {disable_indices[0]}"
+
+        # Verify the average acceptance was below threshold when disabled
+        disable_call = gate_state["record_calls"][disable_indices[0]]
+        assert disable_call["avg_accept"] is not None
+        assert disable_call["avg_accept"] < acceptance_threshold, \
+            f"Avg acceptance ({disable_call['avg_accept']}) should be below threshold ({acceptance_threshold})"
+
+        logger.debug(
+            f"Gate correctly triggered after {disable_indices[0] + 1} requests")
+        logger.debug(
+            f"Final avg acceptance: {disable_call['avg_accept']:.3f} < threshold {acceptance_threshold}"
+        )
+    finally:
+        llm_spec.shutdown()
 
 
 def test_returns_none_until_window_and_enabled_when_above_threshold():
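
For readers checking the numbers in the mock: avg_decoded_tokens_per_iter counts the target model's own token plus any accepted draft tokens, so the simulated 1.2 tokens/iter corresponds to an acceptance length of 0.2, well below the 0.6 threshold. With acceptance_window = 3, the gate should report nothing until three requests have been recorded, then disable speculation exactly once. A minimal sketch of that windowed gating logic under those assumptions; WindowedGate is illustrative, and the real SpeculationGate API may differ:

from typing import List, Optional, Tuple


class WindowedGate:
    """Toy acceptance gate: disable speculation once the rolling average
    acceptance length over `window` requests falls below `threshold`."""

    def __init__(self, window: int, threshold: float):
        self.window = window
        self.threshold = threshold
        self.samples: List[float] = []
        self.disabled = False

    def record_avg_decoded(
            self, avg_decoded: float) -> Tuple[bool, Optional[float]]:
        if self.disabled:
            return False, None  # already off; never triggers twice
        # avg_decoded includes the target token, so acceptance = avg_decoded - 1.
        self.samples.append(avg_decoded - 1.0)
        if len(self.samples) < self.window:
            return False, None  # window not filled yet
        avg_accept = sum(self.samples[-self.window:]) / self.window
        if avg_accept < self.threshold:
            self.disabled = True
            return True, avg_accept  # disabled on exactly this call
        return False, avg_accept


gate = WindowedGate(window=3, threshold=0.6)
results = [gate.record_avg_decoded(1.2) for _ in range(4)]
print(results)  # [(False, None), (False, None), (True, 0.2...), (False, None)]

This mirrors what the test asserts: None until the window fills, a single disable event at index window - 1 or later, and a below-threshold average at the moment of disabling.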