TensorRT-LLM/tests/integration/defs/triton_server/conftest.py

# -*- coding: utf-8 -*-
import datetime
import os
import re
import shutil
import tempfile

import pytest

from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
                               handle_corrections)
from .trt_test_alternative import (SessionDataWriter, check_call, check_output,
                                   print_info)
def llm_models_root() -> str:
    '''Return the LLM_MODELS_ROOT path if it is set in the environment.
    Assert if it is set but is not a valid path.
    '''
    LLM_MODELS_ROOT = os.environ.get("LLM_MODELS_ROOT", None)
    if LLM_MODELS_ROOT is not None:
        assert os.path.isabs(
            LLM_MODELS_ROOT), "LLM_MODELS_ROOT must be an absolute path"
        assert os.path.exists(
            LLM_MODELS_ROOT), "LLM_MODELS_ROOT must exist when it is specified"
    return LLM_MODELS_ROOT
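# Illustrative only: the tests expect LLM_MODELS_ROOT to point at the shared
# model mirror; the path below is a placeholder, not a required location.
#   export LLM_MODELS_ROOT=/scratch/llm-models
#   pytest tests/integration/defs/triton_server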
def venv_check_call(venv, cmd):
def _war_check_call(*args, **kwargs):
kwargs["cwd"] = venv.get_working_directory()
return check_call(*args, **kwargs)
venv.run_cmd(cmd, caller=_war_check_call, print_script=False)
def venv_check_output(venv, cmd):
def _war_check_output(*args, **kwargs):
kwargs["cwd"] = venv.get_working_directory()
output = check_output(*args, **kwargs)
return output
return venv.run_cmd(cmd, caller=_war_check_output, print_script=False)
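# Usage sketch (illustrative): both helpers run `cmd` from the venv's working
# directory. venv_check_call is expected to raise on a non-zero exit code
# (subprocess.check_call semantics), while venv_check_output also returns the
# captured stdout. The script path below is hypothetical.
#   venv_check_call(llm_backend_venv, ["python3", "some_tool.py", "--help"])
#   version = venv_check_output(llm_backend_venv, ["python3", "--version"])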
@pytest.fixture(scope="session")
def trt_performance_cache_name():
return "performance.cache"
@pytest.fixture(scope="session")
def trt_performance_cache_fpath(trt_config, trt_performance_cache_name):
fpath = os.path.join(trt_config["workspace"], trt_performance_cache_name)
return fpath
# Get the executing test case name
@pytest.fixture(autouse=True)
def test_case_name(request):
return request.node.nodeid
@pytest.fixture(scope="session")
def output_dir(request):
return request.config._trt_config["output_dir"]
@pytest.fixture(scope="session")
def llm_backend_root():
return os.path.join(os.environ["LLM_ROOT"], "triton_backend")
@pytest.fixture(scope="session")
def llm_session_data_writer(trt_config, trt_gpu_clock_lock,
versions_from_infer_device, output_dir):
"""
    Fixture for the SessionDataWriter, used to write session data to the output directory.
"""
# Attempt to see if we can run infer_device to get the necessary tags for perf_runner
perf_tag_data = trt_config["perf_trt_tag"]
if versions_from_infer_device:
for k, v in versions_from_infer_device.items():
if k not in perf_tag_data or perf_tag_data[k] is None:
perf_tag_data[k] = v
session_data_writer = SessionDataWriter(
perf_trt_tag=perf_tag_data,
log_output_directory=output_dir,
output_formats=trt_config["perf_log_formats"],
gpu_clock_lock=trt_gpu_clock_lock,
)
yield session_data_writer
session_data_writer.teardown()
@pytest.fixture(scope="session")
def custom_user_workspace(request):
return request.config.getoption("--workspace")
@pytest.fixture(scope="session")
def llm_backend_venv(custom_user_workspace):
workspace_dir = custom_user_workspace
subdir = datetime.datetime.now().strftime("ws-%Y-%m-%d-%H-%M-%S")
if workspace_dir is None:
workspace_dir = "triton-backend-test-workspace"
workspace_dir = os.path.join(workspace_dir, subdir)
from defs.local_venv import PythonVenvRunnerImpl
return PythonVenvRunnerImpl("", "", "python3",
os.path.join(os.getcwd(), workspace_dir))
@pytest.fixture(scope="session")
def llm_backend_gpt_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "tools", "gpt")
@pytest.fixture(scope="session")
def llm_backend_all_models_root(llm_backend_root):
return os.path.join(llm_backend_root, "all_models")
@pytest.fixture(scope="session")
def llm_backend_whisper_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "tools", "whisper")
@pytest.fixture(scope="session")
def llm_backend_multimodal_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "tools", "multimodal")
@pytest.fixture(scope="session")
def llm_backend_llmapi_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "tools", "llmapi")
@pytest.fixture(scope="session")
def llm_backend_inflight_batcher_llm_root(llm_backend_root):
return os.path.join(llm_backend_root, "tools", "inflight_batcher_llm")
@pytest.fixture(scope="session")
def llm_backend_dataset_root(llm_backend_root):
return os.path.join(llm_backend_root, "tools", "dataset")
@pytest.fixture(scope="session")
def tensorrt_llm_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples")
@pytest.fixture(scope="session")
def tensorrt_llm_gpt_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/gpt")
@pytest.fixture(scope="session")
def tensorrt_llm_gptj_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/contrib/gptj")
@pytest.fixture(scope="session")
def tensorrt_llm_multimodal_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/multimodal")
@pytest.fixture(scope="session")
def tensorrt_llm_opt_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/contrib/opt")
@pytest.fixture(scope="session")
def tensorrt_llm_medusa_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/medusa")
@pytest.fixture(scope="session")
def tensorrt_llm_eagle_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/eagle")
@pytest.fixture(scope="session")
def tensorrt_llm_enc_dec_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/enc_dec")
@pytest.fixture(scope="session")
def tensorrt_llm_whisper_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/whisper")
@pytest.fixture(scope="session")
def tensorrt_llm_llama_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/llama")
@pytest.fixture(scope="session")
def tensorrt_llm_qwen_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/qwen")
@pytest.fixture(scope="session")
def tensorrt_llm_mllama_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/mllama")
@pytest.fixture(scope="session")
def tensorrt_llm_mixtral_example_root(llm_backend_root):
return os.path.join(llm_backend_root, "../examples/models/core/mixtral")
@pytest.fixture(scope="session")
def inflight_batcher_llm_client_root(llm_backend_root):
inflight_batcher_llm_client_root = os.path.join(llm_backend_root,
"inflight_batcher_llm",
"client")
assert os.path.exists(
inflight_batcher_llm_client_root
    ), f"{inflight_batcher_llm_client_root} does not exist."
return inflight_batcher_llm_client_root
@pytest.fixture(autouse=True)
def skip_by_device_count(request):
if request.node.get_closest_marker('skip_less_device'):
device_count = get_device_count()
expected_count = request.node.get_closest_marker(
'skip_less_device').args[0]
if expected_count > int(device_count):
pytest.skip(
f'Device count {device_count} is less than {expected_count}')
def get_device_count():
output = check_output("nvidia-smi -L", shell=True, cwd="/tmp")
device_count = len(output.strip().split('\n'))
return device_count
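# Example (illustrative): a test opts into the device-count guard above via the
# custom marker; the test name and threshold are hypothetical.
#   @pytest.mark.skip_less_device(2)
#   def test_needs_two_gpus():
#       ...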
@pytest.fixture(autouse=True)
def skip_by_device_memory(request):
    "Fixture to skip tests that require more device memory than is available."
if request.node.get_closest_marker('skip_less_device_memory'):
device_memory = get_device_memory()
expected_memory = request.node.get_closest_marker(
'skip_less_device_memory').args[0]
if expected_memory > int(device_memory):
pytest.skip(
f'Device memory {device_memory} is less than {expected_memory}')
def get_device_memory():
    "Get the total memory (MiB) reported for the first GPU."
memory = 0
with tempfile.TemporaryDirectory() as temp_dirname:
cmd = " ".join(
["nvidia-smi", "--query-gpu=memory.total", "--format=csv,noheader"])
output = check_output(cmd, shell=True, cwd=temp_dirname)
memory = int(output.strip().split()[0])
return memory
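# Example (illustrative): skip a test on GPUs with less than ~80 GB of memory.
# The threshold is in MiB, matching nvidia-smi's memory.total output, and
# get_device_memory() reads only the first GPU's value.
#   @pytest.mark.skip_less_device_memory(80000)
#   def test_needs_large_gpu():
#       ...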
@pytest.fixture(scope="session")
def models_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
return models_root
@pytest.fixture(scope="session")
def llama_v2_tokenizer_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
llama_v2_tokenizer_model_root = os.path.join(models_root, "llama-models-v2")
assert os.path.exists(
llama_v2_tokenizer_model_root
), f"{llama_v2_tokenizer_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return llama_v2_tokenizer_model_root
@pytest.fixture(scope="session")
def mistral_v1_tokenizer_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
mistral_v1_tokenizer_model_root = os.path.join(models_root,
"mistral-7b-v0.1")
assert os.path.exists(
mistral_v1_tokenizer_model_root
), f"{mistral_v1_tokenizer_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return mistral_v1_tokenizer_model_root
@pytest.fixture(scope="session")
def gpt_tokenizer_model_root(llm_backend_venv):
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
gpt_tokenizer_model_root = os.path.join(models_root, "gpt2")
assert os.path.exists(
gpt_tokenizer_model_root
), f"{gpt_tokenizer_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return gpt_tokenizer_model_root
@pytest.fixture(scope="session")
def gptj_tokenizer_model_root(llm_backend_venv):
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
gptj_tokenizer_model_root = os.path.join(models_root, "gpt-j-6b")
assert os.path.exists(
gptj_tokenizer_model_root
), f"{gptj_tokenizer_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return gptj_tokenizer_model_root
@pytest.fixture(scope="session")
def gpt2_medium_tokenizer_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
gpt_tokenizer_model_root = os.path.join(models_root, "gpt2-medium")
assert os.path.exists(
gpt_tokenizer_model_root
), f"{gpt_tokenizer_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return gpt_tokenizer_model_root
@pytest.fixture(scope="session")
def gpt_next_ptuning_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
gpt_next_ptuning_model_root = os.path.join(models_root, "email_composition")
assert os.path.exists(
gpt_next_ptuning_model_root
), f"{gpt_next_ptuning_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return gpt_next_ptuning_model_root
@pytest.fixture(scope="session")
def gpt_2b_lora_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
gpt_2b_lora_model_root = os.path.join(models_root, "lora", "gpt-next-2b")
assert os.path.exists(
gpt_2b_lora_model_root
), f"{gpt_2b_lora_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return gpt_2b_lora_model_root
@pytest.fixture(scope="session")
def blip2_opt_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
blip2_opt_model_root = os.path.join(models_root, "blip2-opt-2.7b")
assert os.path.exists(
blip2_opt_model_root
), f"{blip2_opt_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return blip2_opt_model_root
@pytest.fixture(scope="session")
def llava_onevision_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
llava_onevision_model_root = os.path.join(models_root,
"llava-onevision-qwen2-7b-ov-hf")
assert os.path.exists(
llava_onevision_model_root
), f"{llava_onevision_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return llava_onevision_model_root
@pytest.fixture(scope="session")
def test_video_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
test_video = os.path.join(models_root, "video-neva", "test_video")
assert os.path.exists(
test_video
), f"{test_video} does not exist under NFS LLM_MODELS_ROOT dir"
return test_video
@pytest.fixture(scope="session")
def llava_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
llava_model_root = os.path.join(models_root, "llava-1.5-7b-hf")
assert os.path.exists(
llava_model_root
), f"{llava_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return llava_model_root
@pytest.fixture(scope="session")
def mllama_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
mllama_model_root = os.path.join(models_root, "llama-3.2-models",
"Llama-3.2-11B-Vision")
assert os.path.exists(
mllama_model_root
), f"{mllama_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return mllama_model_root
@pytest.fixture(scope="session")
def llama_v3_8b_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
llama_model_root = os.path.join(models_root, "llama-models-v3",
"llama-v3-8b-instruct-hf")
assert os.path.exists(
llama_model_root
), f"{llama_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return llama_model_root
@pytest.fixture(scope="session")
def llama3_v1_8b_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
llama_model_root = os.path.join(models_root, "llama-3.1-model",
"Meta-Llama-3.1-8B")
assert os.path.exists(
llama_model_root
), f"{llama_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return llama_model_root
@pytest.fixture(scope="session")
def mixtral_8x7b_v0_1_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
mixtral_8x7b_v0_1_model_root = os.path.join(models_root,
"Mixtral-8x7B-v0.1")
assert os.path.exists(
mixtral_8x7b_v0_1_model_root
), f"{mixtral_8x7b_v0_1_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return mixtral_8x7b_v0_1_model_root
@pytest.fixture(scope="session")
def llama_v3_70b_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
llama_model_root = os.path.join(models_root, "llama-models-v3",
"Llama-3-70B-Instruct-Gradient-1048k")
assert os.path.exists(
llama_model_root
), f"{llama_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return llama_model_root
@pytest.fixture(scope="session")
def vicuna_7b_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
vicuna_7b_model_root = os.path.join(models_root, "vicuna-7b-v1.3")
assert os.path.exists(
vicuna_7b_model_root
), f"{vicuna_7b_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return vicuna_7b_model_root
@pytest.fixture(scope="session")
def medusa_vicuna_7b_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
medusa_vicuna_7b_model_root = os.path.join(models_root,
"medusa-vicuna-7b-v1.3")
assert os.path.exists(
medusa_vicuna_7b_model_root
), f"{medusa_vicuna_7b_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return medusa_vicuna_7b_model_root
@pytest.fixture(scope="session")
def eagle_vicuna_7b_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
eagle_vicuna_7b_model_root = os.path.join(models_root,
"EAGLE-Vicuna-7B-v1.3")
assert os.path.exists(
eagle_vicuna_7b_model_root
), f"{eagle_vicuna_7b_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return eagle_vicuna_7b_model_root
@pytest.fixture(scope="session")
def t5_small_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
t5_small_model_root = os.path.join(models_root, "t5-small")
assert os.path.exists(
t5_small_model_root
), f"{t5_small_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return t5_small_model_root
@pytest.fixture(scope="session")
def whisper_large_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
whisper_large_model_root = os.path.join(models_root, "whisper-models",
"large-v3")
assert os.path.exists(
whisper_large_model_root
), f"{whisper_large_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return whisper_large_model_root
@pytest.fixture(scope="session")
def tiny_llama_model_root():
models_root = llm_models_root()
assert models_root, "Did you set LLM_MODELS_ROOT?"
tiny_llama_model_root = os.path.join(models_root, "llama-models-v2",
"TinyLlama-1.1B-Chat-v1.0")
assert os.path.exists(
tiny_llama_model_root
), f"{tiny_llama_model_root} does not exist under NFS LLM_MODELS_ROOT dir"
return tiny_llama_model_root
# Returns a list with the total memory (MiB) of each available device
@pytest.fixture(scope="session")
def total_gpu_memory_mib():
    output = check_output("nvidia-smi --query-gpu=memory.total --format=csv",
shell=True,
cwd="/tmp")
lines = [l.strip() for l in output.strip().split("\n")]
lines = lines[1:] # skip header
lines = [l[:-4] for l in lines] # remove MiB suffix
lines = [int(l) for l in lines]
return lines
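# The fixture above parses nvidia-smi CSV output of the following shape
# (values are illustrative): a header line followed by one "<total> MiB" line
# per GPU.
#   memory.total [MiB]
#   81559 MiB
#   81559 MiB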
# The pytest cache mechanism can be used to store and retrieve data across test runs.
@pytest.fixture(scope="session", autouse=True)
def setup_cache_data(request, tensorrt_llm_example_root):
# This variable will be used in hook function: pytest_runtest_teardown since
# fixtures cannot be directly used in hooks.
request.config.cache.set('example_root', tensorrt_llm_example_root)
def cleanup_engine_outputs(output_dir_root):
for dirpath, dirnames, _ in os.walk(output_dir_root, topdown=False):
for dirname in dirnames:
if "engine_dir" in dirname or "model_dir" in dirname or "ckpt_dir" in dirname:
folder_path = os.path.join(dirpath, dirname)
try:
shutil.rmtree(folder_path)
print_info(f"Deleted folder: {folder_path}")
except Exception as e:
print_info(f"Error deleting {folder_path}: {e}")
# Teardown hook to clean up engine outputs after each group of test cases finishes
@pytest.hookimpl(hookwrapper=True)
def pytest_runtest_teardown(item, nextitem):
current_test_basename = item.name.split(
"[")[0] if '[' in item.name else item.name
if nextitem:
next_test_basename = nextitem.name.split(
"[")[0] if '[' in nextitem.name else nextitem.name
else:
next_test_basename = None
    # Users can set SKIP_CLEANUP_ENGINES=true to skip cleaning up engines.
skip_cleanup_engines = os.getenv("SKIP_CLEANUP_ENGINES", "false")
if skip_cleanup_engines.lower() != "true":
if next_test_basename != current_test_basename:
print_info(
"SKIP_CLEANUP_ENGINES is not set to True. Cleaning up engine outputs:"
)
engine_outputs_root = item.config.cache.get('example_root', None)
cleanup_engine_outputs(engine_outputs_root)
else:
print_info(
"SKIP_CLEANUP_ENGINES is set to True, will not clean up engines.")
yield
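# Example (illustrative): keep built engines around for debugging by exporting
# the environment variable checked in the teardown hook above.
#   SKIP_CLEANUP_ENGINES=true pytest tests/integration/defs/triton_server -k gpt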
@pytest.fixture(autouse=True)
def install_root_requirements(llm_backend_root):
"""
Fixture that automatically runs at the beginning of each test to ensure root requirements.txt is installed.
"""
requirements_file = os.path.join(llm_backend_root, "requirements.txt")
if os.path.exists(requirements_file):
install_requirement_cmd = "pip3 install -r requirements.txt"
check_call(install_requirement_cmd, shell=True, cwd=llm_backend_root)
else:
print_info(
f"Warning: requirements.txt not found at {requirements_file}")
@pytest.fixture(scope="session")
def output_dir(request):
    # Note: this definition overrides the `output_dir` fixture declared earlier
    # in this file. Prefer the --output-dir command-line option when provided
    # and fall back to the workspace configuration otherwise.
    output_dir_option = request.config.getoption("--output-dir", default=None)
    if output_dir_option:
        return output_dir_option
    return request.config._trt_config["output_dir"]
def deselect_by_regex(regexp, items, test_prefix, config):
    """Filter out tests based on the given regular expression.
    A test is kept if the expression matches its node id; all other tests are deselected."""
compiled_regexes = []
regex_list = []
r = re.compile(regexp)
compiled_regexes.append(r)
regex_list.append(regexp)
selected = []
deselected = []
corrections = get_test_name_corrections_v2(set(regex_list),
set(it.nodeid for it in items),
CorrectionMode.REGEX)
handle_corrections(corrections, test_prefix)
for item in items:
found = False
for regex in compiled_regexes:
if regex.search(item.nodeid):
found = True
break
if found:
selected.append(item)
else:
deselected.append(item)
if deselected:
config.hook.pytest_deselected(items=deselected)
items[:] = selected
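# Sketch (illustrative): deselect_by_regex is meant to be called from a
# collection hook. The "--test-regex" option name is an assumption and would
# need to be registered via pytest_addoption elsewhere.
#   def pytest_collection_modifyitems(config, items):
#       regexp = config.getoption("--test-regex", default=None)
#       if regexp:
#           deselect_by_regex(regexp, items, test_prefix="", config=config)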