Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-13 14:33:52 +08:00
[TRTLLM-10803][fix] Fix mocking of HuggingFace downloads in with_mocked_hf_download (#11200)
Signed-off-by: Anish Shanbhag <ashanbhag@nvidia.com>
parent 304dc6f3c0
commit e308eb50f4
@@ -99,16 +99,31 @@ def mock_snapshot_download(repo_id: str, **kwargs) -> str:
     return local_path


-def with_mocked_hf_download(func):
+def with_mocked_hf_download_for_single_gpu(func):
     """Decorator to mock huggingface_hub.snapshot_download for tests.

     When applied, any calls to snapshot_download will be redirected to use
     local model paths from LLM_MODELS_ROOT instead of downloading from HuggingFace.
+
+    NOTE: We must patch snapshot_download at the location where it's actually imported
+    with 'from huggingface_hub import snapshot_download', since that creates a
+    local binding that won't be affected by patching huggingface_hub.snapshot_download.
+
+    Additionally sets HF_HUB_OFFLINE=1 to ensure no network requests are made to
+    HuggingFace.
+
+    WARNING: This decorator only works for single-GPU tests. For multi-GPU tests, the
+    mock won't be applied in MPI worker processes.
     """

     @wraps(func)
     def wrapper(*args, **kwargs):
-        with patch("huggingface_hub.snapshot_download", side_effect=mock_snapshot_download):
+        with (
+            patch.dict(os.environ, {"HF_HUB_OFFLINE": "1"}),
+            patch(
+                "tensorrt_llm.llmapi.utils.snapshot_download", side_effect=mock_snapshot_download
+            ),
+        ):
             return func(*args, **kwargs)

     return wrapper
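Why the patch target moved: with unittest.mock, a name must be patched where it is looked up, not where it is defined. Below is a minimal, self-contained sketch of the import-binding gotcha described in the NOTE above; the huggingface_hub stand-in and the "consumer" module are hypothetical, built inline so the example runs without the real package:

import sys
import types
from unittest.mock import patch

# Stand-in for huggingface_hub so this sketch runs anywhere.
hub = types.ModuleType("huggingface_hub")
hub.snapshot_download = lambda repo_id, **kw: f"real download of {repo_id}"
sys.modules["huggingface_hub"] = hub

# A consumer that does 'from huggingface_hub import snapshot_download',
# mirroring how a 'from' import creates a local binding in the importer.
consumer = types.ModuleType("consumer")
exec("from huggingface_hub import snapshot_download", consumer.__dict__)
sys.modules["consumer"] = consumer

# Patching the origin module does NOT affect the consumer's local binding.
with patch("huggingface_hub.snapshot_download", return_value="mocked"):
    print(consumer.snapshot_download("org/model"))  # -> real download of org/model

# Patching at the point of use does take effect.
with patch("consumer.snapshot_download", return_value="mocked"):
    print(consumer.snapshot_download("org/model"))  # -> mocked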
@@ -16,7 +16,7 @@
 import pytest
 from _model_test_utils import get_small_model_config
 from build_and_run_ad import ExperimentConfig, main
-from test_common.llm_data import with_mocked_hf_download
+from test_common.llm_data import with_mocked_hf_download_for_single_gpu

 from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig

@@ -24,8 +24,8 @@ from tensorrt_llm.llmapi import DraftTargetDecodingConfig, KvCacheConfig
 @pytest.mark.skip(
     reason="OOM on A30 GPUs on CI - speculative model loading does not support model_kwargs reduction"
 )
-@pytest.mark.parametrize("use_hf_speculative_model", [False])
-@with_mocked_hf_download
+@pytest.mark.parametrize("use_hf_speculative_model", [False, True])
+@with_mocked_hf_download_for_single_gpu
 def test_ad_speculative_decoding_smoke(use_hf_speculative_model: bool):
     """Test speculative decoding with AutoDeploy using the build_and_run_ad main()."""

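A side note on the environment handling: patch.dict scopes HF_HUB_OFFLINE to the decorated test and restores the prior environment on exit, which is presumably why the use_hf_speculative_model=True case can be enabled above without risking real Hub traffic. A hedged sketch of that standard unittest.mock behavior (not code from this diff):

import os
from unittest.mock import patch

before = os.environ.get("HF_HUB_OFFLINE")

with patch.dict(os.environ, {"HF_HUB_OFFLINE": "1"}):
    # Inside the decorated test body, huggingface_hub sees offline mode
    # and will not attempt any network requests.
    assert os.environ["HF_HUB_OFFLINE"] == "1"

# patch.dict restores the previous environment on exit, so offline mode
# does not leak into other tests in the same process.
assert os.environ.get("HF_HUB_OFFLINE") == before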
@@ -8,7 +8,7 @@ from unittest.mock import MagicMock

 import pytest
 import torch
-from test_common.llm_data import with_mocked_hf_download
+from test_common.llm_data import with_mocked_hf_download_for_single_gpu
 from utils.llm_data import llm_models_root

 from tensorrt_llm import LLM, SamplingParams
@@ -150,7 +150,7 @@ def test_kv_lens_runtime_with_eagle3_one_model():
     [False, "TRTLLM", True, False, False, False, True, False, False, True],
 ])
 @pytest.mark.high_cuda_memory
-@with_mocked_hf_download
+@with_mocked_hf_download_for_single_gpu
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool, enable_chunked_prefill: bool,
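The single-GPU limitation in the decorator's WARNING follows from how unittest.mock works: a patch mutates module objects in the current process only, so MPI workers launched for multi-GPU runs import their own unpatched copies. A standalone sketch using multiprocessing's spawn start method as a stand-in for MPI workers (json is an arbitrary demo target, not part of this change):

import json  # arbitrary patch target for the demo
import multiprocessing as mp
from unittest.mock import patch


def worker(q):
    # A spawned worker re-imports its modules from scratch, so the
    # parent's in-memory patch of json.dumps never reaches it.
    q.put(json.dumps({}))


if __name__ == "__main__":
    mp.set_start_method("spawn")
    with patch("json.dumps", return_value="MOCKED"):
        print(json.dumps({}))  # -> MOCKED: patched in this process
        queue = mp.Queue()
        proc = mp.Process(target=worker, args=(queue,))
        proc.start()
        print(queue.get())  # -> {}: the real json.dumps ran in the worker
        proc.join()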