# TensorRT-LLM/tests/integration/defs/test_cpp.py

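"""Integration tests that build the TensorRT-LLM C++ targets and drive the
Google Test suites (unit, per-model, multi-GPU) and benchmarks via ctest."""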

import glob
import logging as _logger
import os as _os
import pathlib as _pl
import platform
import shutil
import sys as _sys
from typing import List

import pytest

build_script_dir = _pl.Path(
    __file__).parent.resolve().parent.parent.parent / 'scripts'
assert build_script_dir.is_dir()
_sys.path.append(str(build_script_dir))

from build_wheel import main as build_trt_llm
from defs.conftest import llm_models_root
from defs.cpp_common import (default_test_parallel, default_test_timeout,
                             find_build_dir, find_root_dir,
                             generate_excluded_model_tests, parallel_run_ctest,
                             prepare_model_tests, prepare_multi_gpu_model_tests,
                             run_benchmarks, run_command, run_multi_gpu_tests,
                             run_single_gpu_tests)


@pytest.fixture(scope="session")
def build_dir():
    return find_build_dir()


@pytest.fixture(scope="session")
def cpp_resources_dir():
    return _pl.Path("cpp") / "tests" / "resources"


@pytest.fixture(scope="session")
def model_cache():
    return llm_models_root()


@pytest.fixture(scope="session")
def model_cache_arg(model_cache):
    return ["--model_cache", model_cache] if model_cache else []


@pytest.fixture(scope="session")
def python_exe():
    return _sys.executable


@pytest.fixture(scope="session")
def root_dir():
    return find_root_dir()
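

# Generates the synthetic LoRA weight/config files (TP1, TP2, 128-adapter
# multi-LoRA, GPT-2-sized, and prefetch-task layouts) that the C++ LoRA tests
# read from cpp/tests/resources/data.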
@pytest.fixture(scope="session")
def lora_setup(root_dir, cpp_resources_dir, python_exe):
    cpp_script_dir = cpp_resources_dir / "scripts"
    cpp_data_dir = cpp_resources_dir / "data"

    generate_lora_data_args_tp1 = [
        python_exe, f"{cpp_script_dir}/generate_test_lora_weights.py",
        f"--out-dir={cpp_data_dir}/lora-test-weights-tp1", "--tp-size=1"
    ]

    generate_lora_data_args_tp2 = [
        python_exe, f"{cpp_script_dir}/generate_test_lora_weights.py",
        f"--out-dir={cpp_data_dir}/lora-test-weights-tp2", "--tp-size=2"
    ]

    generate_multi_lora_tp2_args = [
        python_exe,
        f"{cpp_script_dir}/generate_test_lora_weights.py",
        f"--out-dir={cpp_data_dir}/multi_lora",
        "--tp-size=2",
        "--num-loras=128",
    ]

    generate_gpt2_lora_data_args_tp1 = [
        python_exe, f"{cpp_script_dir}/generate_test_lora_weights.py",
        f"--out-dir={cpp_data_dir}/lora-test-weights-gpt2-tp1", "--tp-size=1",
        "--hidden-size=768", "--num-layers=12", "--config-ids-filter=0",
        "--no-generate-cache-pages"
    ]

    generate_lora_data_args_prefetch_task_3 = [
        python_exe, f"{cpp_script_dir}/generate_test_lora_weights.py",
        f"--out-dir={cpp_data_dir}/lora_prefetch/3",
        "--target-file-name=model.lora_weights.npy",
        "--config-file-name=model.lora_config.npy"
    ]

    generate_lora_data_args_prefetch_task_5 = [
        python_exe, f"{cpp_script_dir}/generate_test_lora_weights.py",
        f"--out-dir={cpp_data_dir}/lora_prefetch/5",
        "--target-file-name=model.lora_weights.npy",
        "--config-file-name=model.lora_config.npy"
    ]

    run_command(generate_lora_data_args_tp1, cwd=root_dir, timeout=100)
    run_command(generate_lora_data_args_tp2, cwd=root_dir, timeout=100)
    run_command(generate_multi_lora_tp2_args, cwd=root_dir, timeout=100)
    run_command(generate_gpt2_lora_data_args_tp1, cwd=root_dir, timeout=100)
    run_command(generate_lora_data_args_prefetch_task_3,
                cwd=root_dir,
                timeout=100)
    run_command(generate_lora_data_args_prefetch_task_5,
                cwd=root_dir,
                timeout=100)
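

# A few models need extra Python packages on top of the base requirements;
# install them on demand before preparing that model's test data.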
@pytest.fixture(scope="session")
def install_additional_requirements(python_exe, root_dir):

    def _install(model_name: str):
        if model_name == "mamba":
            run_command(
                [python_exe, "-m", "pip", "install", "transformers>=4.39.0"],
                cwd=root_dir,
                env=_os.environ,
                timeout=300)
        elif model_name == "recurrentgemma":
            run_command([
                python_exe, "-m", "pip", "install", "-r",
                "examples/recurrentgemma/requirements.txt"
            ],
                        cwd=root_dir,
                        env=_os.environ,
                        timeout=300)

    return _install
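

# Indirectly parametrized with a CUDA architecture (e.g. "90"); rebuilds the
# wheel for that arch, then builds the google-tests and modelSpec CMake
# targets before initializing the model-spec module used by the test scripts.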
@pytest.fixture(scope="session")
def build_google_tests(request, build_dir):
    cuda_arch = f"{request.param}-real"
    print(f"Using CUDA arch: {cuda_arch}")
    build_trt_llm(cuda_architectures=cuda_arch,
                  job_count=12,
                  use_ccache=True,
                  clean=True,
                  trt_root="/usr/local/tensorrt")

    make_google_tests = [
        "cmake", "--build", ".", "--config", "Release", "-j", "--target",
        "google-tests"
    ]

    # The engine-build and output-generation scripts need the modelSpec target.
    make_modelSpec = [
        "cmake", "--build", ".", "--config", "Release", "-j", "--target",
        "modelSpec"
    ]

    run_command(make_google_tests, cwd=build_dir, timeout=300)
    run_command(make_modelSpec, cwd=build_dir, timeout=300)

    script_dir = _pl.Path(__file__).parent.resolve(
    ).parent.parent.parent / 'cpp' / 'tests' / 'resources' / 'scripts'
    assert script_dir.is_dir()
    _sys.path.append(str(script_dir))
    from build_engines_utils import init_model_spec_module
    init_model_spec_module(force_init_trtllm_bindings=False)
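

# Benchmarks reuse the google-tests build, so this fixture depends on it.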
@pytest.fixture(scope="session")
def build_benchmarks(build_google_tests, build_dir):
    make_benchmarks = [
        "cmake", "--build", ".", "--config", "Release", "-j", "--target",
        "benchmarks"
    ]
    run_command(make_benchmarks, cwd=build_dir, timeout=300)
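

# Multi-GPU test data is only prepared on non-Windows platforms.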
@pytest.fixture(scope="session")
def models_multi_gpu(python_exe, root_dir, cpp_resources_dir, model_cache):
    if platform.system() != "Windows":
        prepare_multi_gpu_model_tests(python_exe=python_exe,
                                      root_dir=root_dir,
                                      resources_dir=cpp_resources_dir,
                                      model_cache=model_cache)


@pytest.fixture(scope="session")
def prepare_model(root_dir, cpp_resources_dir, python_exe, model_cache_arg,
                  install_additional_requirements):

    def _prepare(model_name: str, run_fp8=False):
        install_additional_requirements(model_name)

        prepare_model_tests(model_name=model_name,
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=cpp_resources_dir,
                            model_cache_arg=model_cache_arg)

        if model_name == "gptj" and run_fp8:
            only_fp8_arg = ["--only_fp8"]
            prepare_model_tests(model_name=model_name,
                                python_exe=python_exe,
                                root_dir=root_dir,
                                resources_dir=cpp_resources_dir,
                                model_cache_arg=model_cache_arg,
                                only_fp8_arg=only_fp8_arg)

    return _prepare
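

# Thin wrapper so each parametrized test case runs the single-GPU suite for
# exactly one model, with the shared default timeout.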
@pytest.fixture(scope="session")
def run_model_tests(build_dir, lora_setup):

    def _run(model_name: str, run_fp8: bool):
        run_single_gpu_tests(build_dir=build_dir,
                             test_list=[model_name],
                             timeout=default_test_timeout,
                             run_fp8=run_fp8)

    return _run


@pytest.fixture(scope="session")
def run_model_benchmarks(root_dir, build_dir, cpp_resources_dir, python_exe,
                         model_cache):

    def _run(model_name: str, test_gpt_session_benchmark: bool,
             batching_types: List[str], api_types: List[str]):
        run_benchmarks(model_name=model_name,
                       python_exe=python_exe,
                       root_dir=root_dir,
                       build_dir=build_dir,
                       resources_dir=cpp_resources_dir,
                       model_cache=model_cache,
                       test_gpt_session_benchmark=test_gpt_session_benchmark,
                       batching_types=batching_types,
                       api_types=api_types)

    return _run
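

# Example invocation (hypothetical; pick the CUDA arch that matches your GPU,
# which pytest matches against the parametrized test ID):
#   pytest tests/integration/defs/test_cpp.py::test_unit_tests -k "90"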
# Unit tests
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
def test_unit_tests(build_google_tests, build_dir, lora_setup):
    # Discover and run the actual gtests.
    ctest_command = [
        "ctest", "--output-on-failure", "--output-junit",
        "results-unit-tests.xml"
    ]

    excluded_tests = list(generate_excluded_model_tests())
    ctest_command.extend(["-E", "|".join(excluded_tests)])

    parallel = default_test_parallel
    if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE", None):
        parallel = int(parallel_override)
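    # e.g. exporting LLM_TEST_PARALLEL_OVERRIDE=4 caps ctest at four parallel jobs.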
    cpp_env = {**_os.environ}
    parallel_run_ctest(ctest_command,
                       cwd=build_dir,
                       env=cpp_env,
                       timeout=2700,
                       parallel=parallel)


# Model tests
@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
@pytest.mark.parametrize("model", [
    "bart", "chatglm", "eagle", "encoder", "enc_dec_language_adapter", "gpt",
    "gptj", "llama", "mamba", "medusa", "recurrentgemma", "redrafter", "t5"
])
@pytest.mark.parametrize("run_fp8", [False, True], ids=["", "fp8"])
def test_model(build_google_tests, model, prepare_model, run_model_tests,
               run_fp8):
    prepare_model(model, run_fp8)
    run_model_tests(model, run_fp8)


@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
@pytest.mark.parametrize("model", ["bart", "gpt", "t5"])
def test_benchmarks(build_benchmarks, model, prepare_model,
                    run_model_benchmarks):
    prepare_model(model)

    test_gpt_session_benchmark = (model == "gpt")
    batching_types = ["IFB", "V1"] if model == "gpt" else ["IFB"]
    api_types = ["executor"]

    run_model_benchmarks(model_name=model,
                         test_gpt_session_benchmark=test_gpt_session_benchmark,
                         batching_types=batching_types,
                         api_types=api_types)


@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
def test_multi_gpu(models_multi_gpu, build_google_tests, build_dir):
    if platform.system() != "Windows":
        run_multi_gpu_tests(build_dir=build_dir, timeout=default_test_timeout)
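

# Runs automatically around every test in this module (autouse, function scope).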
@pytest.fixture(scope="function", autouse=True)
def keep_log_files(llm_root):
    """Back up previous C++ test results so repeated ctest runs don't overwrite them."""
    results_dir = f"{llm_root}/cpp/build"
    yield
    backup_dir = f"{llm_root}/cpp/build_backup"
    _os.makedirs(backup_dir, exist_ok=True)

    # Copy XML result files to the backup directory.
    xml_files = glob.glob(f"{results_dir}/*.xml")
    if xml_files:
        for xml_file in xml_files:
            try:
                shutil.copy(xml_file, backup_dir)
                _logger.info(f"Copied {xml_file} to {backup_dir}")
            except Exception as e:
                _logger.error(f"Error copying {xml_file}: {str(e)}")
    else:
        _logger.info("No XML files found in the build directory.")