# TensorRT-LLM/tests/integration/defs/cpp/test_e2e.py

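"""End-to-end C++ test and benchmark drivers for single-GPU models.

Invoked through pytest; shared helpers (test filtering, ctest wrappers and
mpirun command construction) come from defs.cpp.cpp_common.
"""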
import copy
import logging as _logger
import os as _os
import pathlib as _pl
from typing import List

import defs.cpp.cpp_common as _cpp
import pytest


def run_single_gpu_tests(build_dir: _pl.Path,
                         test_list: List[str],
                         run_fp8=False,
                         timeout=3600):
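    """Run the C++ tests for the models in ``test_list`` via ctest.

    FP8 variants are excluded unless ``run_fp8`` is set. For GPT, the
    disaggregated-executor test and the speculative-decoding tests are run
    afterwards under mpirun.
    """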

    cpp_env = {**_os.environ}

    included_tests = list(_cpp.generate_included_model_tests(test_list))
    fname_list = list(
        _cpp.generate_result_file_name(test_list, run_fp8=run_fp8))
    resultFileName = "-".join(fname_list) + ".xml"

    excluded_tests = ["FP8"] if not run_fp8 else []
    excluded_tests.extend(list(_cpp.generate_excluded_test_list(test_list)))

    ctest = ["ctest", "--output-on-failure", "--output-junit", resultFileName]
    if included_tests:
        ctest.extend(["-R", "|".join(included_tests)])
    if excluded_tests:
        ctest.extend(["-E", "|".join(excluded_tests)])

    parallel = _cpp.default_test_parallel
    if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
                                            None):
        parallel = int(parallel_override)

    _cpp.parallel_run_ctest(ctest,
                            cwd=build_dir,
                            env=cpp_env,
                            timeout=timeout,
                            parallel=parallel)
if "gpt" in test_list:
xml_output_file = build_dir / "results-single-gpu-disagg-executor_gpt.xml"
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
trt_model_test = _cpp.produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=2,
local_commands=[
"tests/executor/disaggExecutorTest",
"--gtest_filter=*GptSingleDeviceDisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
_cpp.run_command(trt_model_test,
cwd=build_dir,
env=new_env,
timeout=timeout)
run_spec_dec_tests(build_dir=build_dir)


def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path,
                   build_dir: _pl.Path, resources_dir: _pl.Path,
                   model_cache: str, test_gpt_session_benchmark: bool,
                   batching_types: list[str], api_types: list[str]):
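    """Benchmark a prepared engine with gptManagerBenchmark (and, for GPT,
    optionally gptSessionBenchmark).

    Supports "gpt", "bart" and "t5"; bart and t5 are routed through the
    enc_dec path. Datasets are tokenized with benchmarks/cpp/prepare_dataset.py
    before the benchmark binaries are invoked.
    """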

    benchmark_exe_dir = build_dir / "benchmarks"
    if model_name == "gpt":
        model_engine_dir = resources_dir / "models" / "rt_engine" / "gpt2"
        tokenizer_dir = resources_dir / "models" / "gpt2"
    elif model_name in ('bart', 't5'):
        if model_name == "t5":
            hf_repo_name = "t5-small"
        elif model_name == "bart":
            hf_repo_name = "bart-large-cnn"
        model_engine_dir = resources_dir / "models" / "enc_dec" / "trt_engines" / hf_repo_name
        tokenizer_dir = model_cache + "/" + hf_repo_name
        model_engine_path = model_engine_dir / "1-gpu" / "float16" / "decoder"
        encoder_model_engine_path = model_engine_dir / "1-gpu" / "float16" / "encoder"
        model_name = "enc_dec"
    else:
        _logger.info(
            f"run_benchmark test does not support {model_name}. Skipping benchmarks"
        )
        return NotImplementedError

    if test_gpt_session_benchmark:
        if model_name == "gpt":
            # WAR: Currently importing the bindings here causes a segfault in pybind 11 during shutdown
            # As this just builds a path we hard-code for now to obviate the need for import of bindings
            # model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
            # model_spec_obj.set_kv_cache_type(_tb.KVCacheType.CONTINUOUS)
            # model_spec_obj.use_gpt_plugin()
            # model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
            # ) / "tp1-pp1-cp1-gpu"
            model_engine_path = model_engine_dir / "fp16_plugin_continuous" / "tp1-pp1-cp1-gpu"
        else:
            _logger.info(
                f"gptSessionBenchmark test does not support {model_name}. Skipping benchmarks"
            )
            return NotImplementedError

        benchmark = [
            str(benchmark_exe_dir / "gptSessionBenchmark"), "--engine_dir",
            str(model_engine_path), "--batch_size", "8", "--input_output_len",
            "10,20", "--duration", "10"
        ]
        _cpp.run_command(benchmark, cwd=root_dir, timeout=600)
    prompt_datasets_args = [{
        '--dataset-name': "cnn_dailymail",
        '--dataset-config-name': "3.0.0",
        '--dataset-split': "validation",
        '--dataset-input-key': "article",
        '--dataset-prompt': "Summarize the following article:",
        '--dataset-output-key': "highlights"
    }, {
        '--dataset-name': "Open-Orca/1million-gpt-4",
        '--dataset-split': "train",
        '--dataset-input-key': "question",
        '--dataset-prompt-key': "system_prompt",
        '--dataset-output-key': "response"
    }]
    token_files = [
        "prepared_" + s['--dataset-name'].replace('/', '_')
        for s in prompt_datasets_args
    ]
    max_input_lens = ["256", "20"]
    num_reqs = ["50", "10"]

    if model_name == "gpt":
        model_engine_path = model_engine_dir / "fp16_plugin_packed_paged" / "tp1-pp1-cp1-gpu"
        # WAR: Currently importing the bindings here causes a segfault in pybind 11 during shutdown
        # As this just builds a path we hard-code for now to obviate the need for import of bindings
        # model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
        # model_spec_obj.set_kv_cache_type(_tb.KVCacheType.PAGED)
        # model_spec_obj.use_gpt_plugin()
        # model_spec_obj.use_packed_input()
        # model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
        # ) / "tp1-pp1-cp1-gpu"
    for prompt_ds_args, tokens_f, max_input_len, num_req in zip(
            prompt_datasets_args, token_files, max_input_lens, num_reqs):

        benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
        data_dir = resources_dir / "data"
        prepare_dataset = [
            python_exe,
            str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
            str(tokenizer_dir), "--output",
            str(data_dir / tokens_f), "dataset", "--max-input-len",
            max_input_len, "--num-requests", num_req
        ]
        for k, v in prompt_ds_args.items():
            prepare_dataset += [k, v]
        # https://nvbugs/4658787
        # WAR before the prepare dataset can use offline cached dataset
        _cpp.run_command(prepare_dataset,
                         cwd=root_dir,
                         timeout=300,
                         env={'HF_DATASETS_OFFLINE': '0'})

        for batching_type in batching_types:
            for api_type in api_types:
                benchmark = [
                    str(benchmark_exe_dir / "gptManagerBenchmark"),
                    "--engine_dir",
                    str(model_engine_path), "--type",
                    str(batching_type), "--api",
                    str(api_type), "--dataset",
                    str(data_dir / tokens_f)
                ]
                if model_name == "enc_dec":
                    benchmark += [
                        "--encoder_engine_dir",
                        str(encoder_model_engine_path)
                    ]
                _cpp.run_command(benchmark, cwd=root_dir, timeout=600)

                req_rate_benchmark = benchmark + [
                    "--request_rate", "100", "--enable_exp_delays"
                ]
                _cpp.run_command(req_rate_benchmark, cwd=root_dir, timeout=600)

                concurrency_benchmark = benchmark + ["--concurrency", "30"]
                _cpp.run_command(concurrency_benchmark,
                                 cwd=root_dir,
                                 timeout=600)

            if "IFB" in batching_type and "executor" in api_types:
                # executor streaming test
                benchmark = [
                    str(benchmark_exe_dir / "gptManagerBenchmark"),
                    "--engine_dir",
                    str(model_engine_path), "--type", "IFB", "--dataset",
                    str(data_dir / tokens_f), "--api", "executor", "--streaming"
                ]
                if model_name == "enc_dec":
                    benchmark += [
                        "--encoder_engine_dir",
                        str(encoder_model_engine_path)
                    ]
                _cpp.run_command(benchmark, cwd=root_dir, timeout=600)


def run_spec_dec_tests(build_dir: _pl.Path):
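    """Run the speculative-decoding fast-logits executor tests on 3 ranks under mpirun."""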
    xml_output_file = build_dir / "results-spec-dec-fast-logits.xml"
    cpp_env = {**_os.environ}
    tests_dir = build_dir / "tests"
    trt_model_test = _cpp.produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=3,
        local_commands=[
            "executor/executorTest", "--gtest_filter=*SpecDecFastLogits*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    _cpp.run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)


@pytest.fixture(scope="session")
def run_model_tests(build_dir, lora_setup):
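    """Session-scoped fixture returning a callable that runs the single-GPU C++ tests for one model."""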

    def _run(model_name: str, run_fp8: bool):
        run_single_gpu_tests(
            build_dir=build_dir,
            test_list=[model_name],
            timeout=_cpp.default_test_timeout,
            run_fp8=run_fp8,
        )

    return _run


@pytest.fixture(scope="session")
def run_model_benchmarks(root_dir, build_dir, cpp_resources_dir, python_exe,
                         model_cache):
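    """Session-scoped fixture returning a callable that benchmarks one model's prepared engine."""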

    def _run(
        model_name: str,
        test_gpt_session_benchmark: bool,
        batching_types: List[str],
        api_types: List[str],
    ):
        run_benchmarks(
            model_name=model_name,
            python_exe=python_exe,
            root_dir=root_dir,
            build_dir=build_dir,
            resources_dir=cpp_resources_dir,
            model_cache=model_cache,
            test_gpt_session_benchmark=test_gpt_session_benchmark,
            batching_types=batching_types,
            api_types=api_types,
        )

    return _run


@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
@pytest.mark.parametrize("model", [
    "bart", "chatglm", "eagle", "encoder", "enc_dec_language_adapter", "gpt",
    "gpt_executor", "gpt_session", "gpt_tests", "llama", "mamba", "medusa",
    "recurrentgemma", "redrafter", "t5"
])
@pytest.mark.parametrize("run_fp8", [False, True], ids=["", "fp8"])
def test_model(build_google_tests, model, prepare_model, run_model_tests,
               run_fp8):
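    """Prepare the requested model's engines and run its single-GPU C++ tests."""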
    if model == "recurrentgemma":
        pytest.skip(
            "TODO: fix recurrentgemma OOM with newest version of transformers")

    prepare_model(model, run_fp8)
    run_model_tests(model, run_fp8)


@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
@pytest.mark.parametrize("model", ["bart", "gpt", "t5"])
def test_benchmarks(build_benchmarks, model, prepare_model,
                    run_model_benchmarks):
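    """Prepare the model's engines and run the C++ benchmark binaries against them."""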
    prepare_model(model)

    test_gpt_session_benchmark = model == "gpt"
    batching_types = ["IFB", "V1"] if model == "gpt" else ["IFB"]
    api_types = ["executor"]

    run_model_benchmarks(
        model_name=model,
        test_gpt_session_benchmark=test_gpt_session_benchmark,
        batching_types=batching_types,
        api_types=api_types,
    )