From e308eb50f430efd930f46dcfca804c4fc074e0fa Mon Sep 17 00:00:00 2001
From: Anish Shanbhag
Date: Mon, 2 Feb 2026 21:58:15 -0800
Subject: [PATCH] [TRTLLM-10803][fix] Fix mocking of HuggingFace downloads in
 `with_mocked_hf_download` (#11200)

Signed-off-by: Anish Shanbhag
---
 tests/test_common/llm_data.py                 | 19 +++++++++++++++++--
 .../singlegpu/test_ad_speculative_decoding.py |  6 +++---
 .../_torch/speculative/test_eagle3.py         |  4 ++--
 3 files changed, 22 insertions(+), 7 deletions(-)

diff --git a/tests/test_common/llm_data.py b/tests/test_common/llm_data.py
index 74fd310366..cc8d939e04 100644
--- a/tests/test_common/llm_data.py
+++ b/tests/test_common/llm_data.py
@@ -99,16 +99,31 @@ def mock_snapshot_download(repo_id: str, **kwargs) -> str:
     return local_path
 
 
-def with_mocked_hf_download(func):
+def with_mocked_hf_download_for_single_gpu(func):
     """Decorator to mock huggingface_hub.snapshot_download for tests.
 
     When applied, any calls to snapshot_download will be redirected to use local model paths
     from LLM_MODELS_ROOT instead of downloading from HuggingFace.
+
+    NOTE: We must patch snapshot_download at the location where it's actually imported
+    with 'from huggingface_hub import snapshot_download', since that creates a
+    local binding that won't be affected by patching huggingface_hub.snapshot_download.
+
+    Additionally sets HF_HUB_OFFLINE=1 to ensure no network requests are made to
+    HuggingFace.
+
+    WARNING: This decorator only works for single-GPU tests. For multi-GPU tests, the
+    mock won't be applied in MPI worker processes.
     """
 
     @wraps(func)
     def wrapper(*args, **kwargs):
-        with patch("huggingface_hub.snapshot_download", side_effect=mock_snapshot_download):
+        with (
+            patch.dict(os.environ, {"HF_HUB_OFFLINE": "1"}),
+            patch(
+                "tensorrt_llm.llmapi.utils.snapshot_download", side_effect=mock_snapshot_download
+            ),
+        ):
             return func(*args, **kwargs)
 
     return wrapper
diff --git a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py
index 3d4e8a8794..2fdf5100a8 100644
--- a/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py
+++ b/tests/unittest/_torch/auto_deploy/unit/singlegpu/test_ad_speculative_decoding.py
@@ -16,7 +16,7 @@
 import pytest
 from _model_test_utils import get_small_model_config
 from build_and_run_ad import ExperimentConfig, main
-from test_common.llm_data import with_mocked_hf_download
+from test_common.llm_data import with_mocked_hf_download_for_single_gpu
 
 from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig
 
@@ -24,8 +24,8 @@ from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig
 @pytest.mark.skip(
     reason="OOM on A30 GPUs on CI - speculative model loading does not support model_kwargs reduction"
 )
-@pytest.mark.parametrize("use_hf_speculative_model", [False])
-@with_mocked_hf_download
+@pytest.mark.parametrize("use_hf_speculative_model", [False, True])
+@with_mocked_hf_download_for_single_gpu
 def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool):
     """Test speculative decoding with AutoDeploy using the build_and_run_ad main()."""
 
diff --git a/tests/unittest/_torch/speculative/test_eagle3.py b/tests/unittest/_torch/speculative/test_eagle3.py
index 71befcfef8..c8ede3ed8c 100644
--- a/tests/unittest/_torch/speculative/test_eagle3.py
+++ b/tests/unittest/_torch/speculative/test_eagle3.py
@@ -8,7 +8,7 @@ from unittest.mock import MagicMock
 
 import pytest
 import torch
-from test_common.llm_data import with_mocked_hf_download
+from test_common.llm_data import with_mocked_hf_download_for_single_gpu
 from utils.llm_data import llm_models_root
 
 from tensorrt_llm import LLM, SamplingParams
@@ -150,7 +150,7 @@ def test_kv_lens_runtime_with_eagle3_one_model():
     [False, "TRTLLM", True, False, False, False, True, False, False, True],
 ])
 @pytest.mark.high_cuda_memory
-@with_mocked_hf_download
+@with_mocked_hf_download_for_single_gpu
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str, disable_overlap_scheduler: bool,
                       enable_block_reuse: bool, use_one_model: bool, enable_chunked_prefill: bool,