Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-04 18:21:52 +08:00)
[TRTLLM-10803][fix] Fix mocking of HuggingFace downloads in with_mocked_hf_download (#11200)
Signed-off-by: Anish Shanbhag <ashanbhag@nvidia.com>
Commit e308eb50f4 (parent 304dc6f3c0)
@@ -99,16 +99,31 @@ def mock_snapshot_download(repo_id: str, **kwargs) -> str:
     return local_path


-def with_mocked_hf_download(func):
+def with_mocked_hf_download_for_single_gpu(func):
     """Decorator to mock huggingface_hub.snapshot_download for tests.

     When applied, any calls to snapshot_download will be redirected to use
     local model paths from LLM_MODELS_ROOT instead of downloading from HuggingFace.
+
+    NOTE: We must patch snapshot_download at the location where it's actually imported
+    with 'from huggingface_hub import snapshot_download', since that creates a
+    local binding that won't be affected by patching huggingface_hub.snapshot_download.
+
+    Additionally sets HF_HUB_OFFLINE=1 to ensure no network requests are made to
+    HuggingFace.
+
+    WARNING: This decorator only works for single-GPU tests. For multi-GPU tests, the
+    mock won't be applied in MPI worker processes.
     """

     @wraps(func)
     def wrapper(*args, **kwargs):
-        with patch("huggingface_hub.snapshot_download", side_effect=mock_snapshot_download):
+        with (
+            patch.dict(os.environ, {"HF_HUB_OFFLINE": "1"}),
+            patch(
+                "tensorrt_llm.llmapi.utils.snapshot_download", side_effect=mock_snapshot_download
+            ),
+        ):
             return func(*args, **kwargs)

     return wrapper
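A minimal, self-contained sketch (not code from this PR) of the "where to patch" rule the new docstring describes. Here "consumer" is a hypothetical stand-in for a module such as tensorrt_llm.llmapi.utils that does `from huggingface_hub import snapshot_download` at import time; it only assumes standard unittest.mock behavior and that huggingface_hub is installed.

    import sys
    import types
    from unittest.mock import patch

    # Build a throwaway "consumer" module that binds snapshot_download at import time.
    consumer = types.ModuleType("consumer")
    sys.modules["consumer"] = consumer
    exec(
        "from huggingface_hub import snapshot_download\n"
        "\n"
        "def fetch(repo_id):\n"
        "    return snapshot_download(repo_id)\n",
        consumer.__dict__,
    )

    import huggingface_hub

    # Patching the attribute on huggingface_hub does NOT touch consumer's local
    # binding, so consumer.fetch() would still call the real function:
    with patch("huggingface_hub.snapshot_download", return_value="/fake/local/path"):
        assert consumer.snapshot_download is not huggingface_hub.snapshot_download

    # Patching the name where it is looked up is what actually redirects the call:
    with patch("consumer.snapshot_download", return_value="/fake/local/path"):
        assert consumer.fetch("org/repo") == "/fake/local/path"

This is why the decorator targets "tensorrt_llm.llmapi.utils.snapshot_download" rather than "huggingface_hub.snapshot_download".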
@@ -16,7 +16,7 @@
 import pytest
 from _model_test_utils import get_small_model_config
 from build_and_run_ad import ExperimentConfig, main
-from test_common.llm_data import with_mocked_hf_download
+from test_common.llm_data import with_mocked_hf_download_for_single_gpu

 from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig

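The decorator also exports HF_HUB_OFFLINE=1 only for the duration of the wrapped test. A tiny sketch of that standard patch.dict behavior (hypothetical values, not code from this change):

    import os
    from unittest.mock import patch

    before = os.environ.get("HF_HUB_OFFLINE")

    with patch.dict(os.environ, {"HF_HUB_OFFLINE": "1"}):
        # Offline mode is in effect only inside the block, i.e. for the wrapped test.
        assert os.environ["HF_HUB_OFFLINE"] == "1"

    # On exit the original value (or absence) is restored, so other tests are unaffected.
    assert os.environ.get("HF_HUB_OFFLINE") == before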
@@ -24,8 +24,8 @@ from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig
 @pytest.mark.skip(
     reason="OOM on A30 GPUs on CI - speculative model loading does not support model_kwargs reduction"
 )
-@pytest.mark.parametrize("use_hf_speculative_model", [False])
-@with_mocked_hf_download
+@pytest.mark.parametrize("use_hf_speculative_model", [False, True])
+@with_mocked_hf_download_for_single_gpu
 def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool):
     """Test speculative decoding with AutoDeploy using the build_and_run_ad main()."""

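A hedged usage sketch (hypothetical test, not part of this diff) of how the renamed decorator composes with pytest parametrization on a single-GPU test; keeping the mock decorator closest to the function means it wraps the test body itself, while the pytest marks above it only attach metadata.

    import pytest
    from test_common.llm_data import with_mocked_hf_download_for_single_gpu


    @pytest.mark.parametrize("use_hf_speculative_model", [False, True])
    @with_mocked_hf_download_for_single_gpu
    def test_example_smoke(use_hf_speculative_model: bool):
        # snapshot_download calls made here resolve to LLM_MODELS_ROOT paths, and
        # HF_HUB_OFFLINE=1 guards against accidental network access.
        ...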
@@ -8,7 +8,7 @@ from unittest.mock import MagicMock

 import pytest
 import torch
-from test_common.llm_data import with_mocked_hf_download
+from test_common.llm_data import with_mocked_hf_download_for_single_gpu
 from utils.llm_data import llm_models_root

 from tensorrt_llm import LLM, SamplingParams
@@ -150,7 +150,7 @@ def test_kv_lens_runtime_with_eagle3_one_model():
     [False, "TRTLLM", True, False, False, False, True, False, False, True],
 ])
 @pytest.mark.high_cuda_memory
-@with_mocked_hf_download
+@with_mocked_hf_download_for_single_gpu
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool, enable_chunked_prefill: bool,
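The single-GPU restriction behind the new name comes from mock patches being process-local. A minimal sketch (assuming "spawn"-style workers that start a fresh Python interpreter, as MPI workers typically do; json is just a stand-in module, not the repo's setup) of why an in-process mock.patch never reaches worker processes:

    import json
    import multiprocessing as mp
    from unittest.mock import patch


    def worker(queue):
        # A freshly spawned worker re-imports its modules, so it sees the real
        # json.dumps rather than the parent's patched version.
        queue.put(json.dumps({"patched": False}))


    if __name__ == "__main__":
        ctx = mp.get_context("spawn")
        queue = ctx.Queue()
        with patch("json.dumps", return_value="patched!"):
            print(json.dumps({}))        # "patched!" inside the parent process
            proc = ctx.Process(target=worker, args=(queue,))
            proc.start()
            result = queue.get()         # real JSON from the worker: {"patched": false}
            proc.join()
        print(result)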