import copy
import logging as _logger
import os as _os
import pathlib as _pl
from dataclasses import dataclass
from typing import List, Optional

import defs.cpp.cpp_common as _cpp
import pytest


@dataclass(frozen=True)
class DatasetConfig:
    """Configuration for a benchmark dataset."""
    name: str
    local_path: str
    split: str
    input_key: str
    output_key: str
    max_input_len: str
    num_requests: str
    config_name: Optional[str] = None
    prompt: Optional[str] = None
    prompt_key: Optional[str] = None

    @property
    def token_file(self) -> str:
        return "prepared_" + self.name.replace('/', '_')

    def get_dataset_args(self) -> dict[str, str]:
        """Build the dataset args dict for prepare_dataset.py."""
        args = {
            '--dataset-local-path': self.local_path,
            '--dataset-split': self.split,
            '--dataset-input-key': self.input_key,
            '--dataset-output-key': self.output_key,
        }
        if self.config_name:
            args['--dataset-config-name'] = self.config_name
        if self.prompt:
            args['--dataset-prompt'] = self.prompt
        if self.prompt_key:
            args['--dataset-prompt-key'] = self.prompt_key
        return args
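
# Illustration only (hypothetical names, nothing here is executed): a config such as
#
#   DatasetConfig(name="my-org/my-dataset",
#                 local_path="/model_cache/datasets/my-org/my-dataset",
#                 split="train",
#                 input_key="text",
#                 output_key="summary",
#                 max_input_len="128",
#                 num_requests="20")
#
# has token_file == "prepared_my-org_my-dataset", and its get_dataset_args()
# returns
#   {'--dataset-local-path': '/model_cache/datasets/my-org/my-dataset',
#    '--dataset-split': 'train',
#    '--dataset-input-key': 'text',
#    '--dataset-output-key': 'summary'},
# with --dataset-config-name / --dataset-prompt / --dataset-prompt-key added
# only when the corresponding optional fields are set.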


def get_benchmark_dataset_configs(model_cache: str) -> List[DatasetConfig]:
    """Define dataset configurations for benchmark tests.

    To add a new dataset, add a new DatasetConfig entry to this list.
    """
    datasets_dir = _pl.Path(model_cache) / "datasets"

    return [
        DatasetConfig(
            name="ccdv/cnn_dailymail",
            local_path=str(datasets_dir / "ccdv" / "cnn_dailymail"),
            config_name="3.0.0",
            split="validation",
            input_key="article",
            prompt="Summarize the following article:",
            output_key="highlights",
            max_input_len="256",
            num_requests="50",
        ),
        DatasetConfig(
            name="Open-Orca/1million-gpt-4",
            local_path=str(datasets_dir / "Open-Orca" / "1million-gpt-4" /
                           "1M-GPT4-Augmented.parquet"),
            split="train",
            input_key="question",
            prompt_key="system_prompt",
            output_key="response",
            max_input_len="20",
            num_requests="10",
        ),
    ]
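
# Sketch only (hypothetical dataset, nothing here is executed): a further dataset
# would be registered by appending one more entry to the list returned above, e.g.
#
#   DatasetConfig(
#       name="example-org/example-dataset",
#       local_path=str(datasets_dir / "example-org" / "example-dataset"),
#       split="test",
#       input_key="text",
#       output_key="summary",
#       max_input_len="128",
#       num_requests="20",
#   ),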


def run_single_gpu_tests(build_dir: _pl.Path,
                         test_list: List[str],
                         run_fp8=False,
                         timeout=3600):

    cpp_env = {**_os.environ}
    tests_dir = build_dir / "tests" / "e2e_tests"

    included_tests = list(_cpp.generate_included_model_tests(test_list))

    fname_list = list(_cpp.generate_result_file_name(test_list,
                                                     run_fp8=run_fp8))
    resultFileName = "-".join(fname_list) + ".xml"

    excluded_tests = ["FP8"] if not run_fp8 else []

    excluded_tests.extend(list(_cpp.generate_excluded_test_list(test_list)))

    ctest = ["ctest", "--output-on-failure", "--output-junit", resultFileName]

    if included_tests:
        ctest.extend(["-R", "|".join(included_tests)])
    if excluded_tests:
        ctest.extend(["-E", "|".join(excluded_tests)])

    parallel = _cpp.default_test_parallel
    if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
                                            None):
        parallel = int(parallel_override)

    _cpp.parallel_run_ctest(ctest,
                            cwd=tests_dir,
                            env=cpp_env,
                            timeout=timeout,
                            parallel=parallel)

    # For "gpt", additionally run the disaggregated executor gtest on two MPI
    # ranks with TRTLLM_USE_MPI_KVCACHE enabled.
    if "gpt" in test_list:
        xml_output_file = build_dir / "results-single-gpu-disagg-executor_gpt.xml"
        new_env = copy.copy(cpp_env)
        new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
        trt_model_test = _cpp.produce_mpirun_command(
            global_commands=["mpirun", "--allow-run-as-root"],
            nranks=2,
            local_commands=[
                "executor/disaggExecutorTest",
                "--gtest_filter=*GptSingleDeviceDisaggSymmetricExecutorTest*"
            ],
            leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
        _cpp.run_command(trt_model_test,
                         cwd=tests_dir,
                         env=new_env,
                         timeout=timeout)

    run_spec_dec_tests(build_dir=build_dir)
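
# Illustration only (nothing here is executed): for test_list=["gpt"] and
# run_fp8=False, run_single_gpu_tests assembles a ctest invocation of roughly
# this shape:
#
#   ctest --output-on-failure --output-junit <results>.xml \
#       -R <included model tests joined with "|"> \
#       -E <"FP8" and the excluded tests joined with "|">
#
# The result file name and the include/exclude lists come from the
# defs.cpp.cpp_common helpers (generate_result_file_name,
# generate_included_model_tests, generate_excluded_test_list), so the exact
# values are not reproduced here; LLM_TEST_PARALLEL_OVERRIDE overrides the
# default ctest parallelism.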


def run_benchmarks(
    model_name: str,
    python_exe: str,
    root_dir: _pl.Path,
    build_dir: _pl.Path,
    resources_dir: _pl.Path,
    model_cache: str,
    batching_types: list[str],
    api_types: list[str],
):
    benchmark_exe_dir = build_dir / "benchmarks"
    if model_name == "gpt":
        model_engine_dir = resources_dir / "models" / "rt_engine" / "gpt2"
        tokenizer_dir = resources_dir / "models" / "gpt2"
    elif model_name in ('bart', 't5'):
        if model_name == "t5":
            hf_repo_name = "t5-small"
        elif model_name == "bart":
            hf_repo_name = "bart-large-cnn"
        model_engine_dir = resources_dir / "models" / "enc_dec" / "trt_engines" / hf_repo_name
        tokenizer_dir = model_cache + "/" + hf_repo_name
        model_engine_path = model_engine_dir / "1-gpu" / "float16" / "decoder"
        encoder_model_engine_path = model_engine_dir / "1-gpu" / "float16" / "encoder"
        model_name = "enc_dec"
    else:
        _logger.info(
            f"run_benchmark test does not support {model_name}. Skipping benchmarks"
        )
        return

    if model_name == "gpt":
        model_engine_path = model_engine_dir / "fp16_plugin_packed_paged" / "tp1-pp1-cp1-gpu"

    # WAR: Currently, importing the bindings here causes a segfault in pybind11 during shutdown.
    # As this only builds a path, we hard-code it for now to obviate the need to import the bindings.

    # model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
    # model_spec_obj.set_kv_cache_type(_tb.KVCacheType.PAGED)
    # model_spec_obj.use_gpt_plugin()
    # model_spec_obj.use_packed_input()
    # model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
    # ) / "tp1-pp1-cp1-gpu"

    for config in get_benchmark_dataset_configs(model_cache):
        benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
        data_dir = resources_dir / "data"
        prepare_dataset = [
            python_exe,
            str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
            str(tokenizer_dir), "--output",
            str(data_dir / config.token_file), "dataset", "--max-input-len",
            config.max_input_len, "--num-requests", config.num_requests
        ]
        for k, v in config.get_dataset_args().items():
            prepare_dataset += [k, v]

        # Use an environment variable to force HuggingFace to use the offline cached dataset.
        offline_env = {**_os.environ, 'HF_DATASETS_OFFLINE': '1'}
        _cpp.run_command(prepare_dataset,
                         cwd=root_dir,
                         timeout=300,
                         env=offline_env)

        for batching_type in batching_types:
            for api_type in api_types:
                benchmark = [
                    str(benchmark_exe_dir / "gptManagerBenchmark"),
                    "--engine_dir",
                    str(model_engine_path), "--type",
                    str(batching_type), "--api",
                    str(api_type), "--dataset",
                    str(data_dir / config.token_file)
                ]
                if model_name == "enc_dec":
                    benchmark += [
                        "--encoder_engine_dir",
                        str(encoder_model_engine_path)
                    ]

                _cpp.run_command(benchmark, cwd=root_dir, timeout=600)
                req_rate_benchmark = benchmark + [
                    "--request_rate", "100", "--enable_exp_delays"
                ]
                _cpp.run_command(req_rate_benchmark, cwd=root_dir, timeout=600)
                concurrency_benchmark = benchmark + ["--concurrency", "30"]
                _cpp.run_command(concurrency_benchmark,
                                 cwd=root_dir,
                                 timeout=600)

        if "IFB" in batching_types and "executor" in api_types:
            # Executor streaming test.
            benchmark = [
                str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
                str(model_engine_path), "--type", "IFB", "--dataset",
                str(data_dir / config.token_file), "--api", "executor",
                "--streaming"
            ]
            if model_name == "enc_dec":
                benchmark += [
                    "--encoder_engine_dir",
                    str(encoder_model_engine_path)
                ]
            _cpp.run_command(benchmark, cwd=root_dir, timeout=600)
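
# Illustration only (paths are placeholders, nothing here is executed): for
# model_name="gpt" and the ccdv/cnn_dailymail config above, the loop builds
# commands roughly like
#
#   <python_exe> benchmarks/cpp/prepare_dataset.py \
#       --tokenizer <resources_dir>/models/gpt2 \
#       --output <resources_dir>/data/prepared_ccdv_cnn_dailymail \
#       dataset --max-input-len 256 --num-requests 50 \
#       --dataset-local-path <model_cache>/datasets/ccdv/cnn_dailymail \
#       --dataset-split validation --dataset-input-key article \
#       --dataset-output-key highlights --dataset-config-name 3.0.0 \
#       --dataset-prompt "Summarize the following article:"
#
#   <build_dir>/benchmarks/gptManagerBenchmark --engine_dir <model_engine_path> \
#       --type IFB --api executor --dataset <resources_dir>/data/prepared_ccdv_cnn_dailymail
#
# and then reruns the benchmark with "--request_rate 100 --enable_exp_delays",
# with "--concurrency 30", and (for IFB + executor) with "--streaming".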


def run_spec_dec_tests(build_dir: _pl.Path):
    xml_output_file = build_dir / "results-spec-dec-fast-logits.xml"
    cpp_env = {**_os.environ}
    tests_dir = build_dir / "tests" / "e2e_tests"
    trt_model_test = _cpp.produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=3,
        local_commands=[
            "executor/executorTest", "--gtest_filter=*SpecDecFastLogits*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    _cpp.run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)


@pytest.fixture(scope="session")
def run_model_tests(build_dir, lora_setup):

    def _run(model_name: str, run_fp8: bool):
        run_single_gpu_tests(
            build_dir=build_dir,
            test_list=[model_name],
            timeout=_cpp.default_test_timeout,
            run_fp8=run_fp8,
        )

    return _run


@pytest.fixture(scope="session")
def run_model_benchmarks(root_dir, build_dir, cpp_resources_dir, python_exe,
                         model_cache):

    def _run(
        model_name: str,
        batching_types: List[str],
        api_types: List[str],
    ):

        run_benchmarks(
            model_name=model_name,
            python_exe=python_exe,
            root_dir=root_dir,
            build_dir=build_dir,
            resources_dir=cpp_resources_dir,
            model_cache=model_cache,
            batching_types=batching_types,
            api_types=api_types,
        )

    return _run


@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
@pytest.mark.parametrize("model", [
    "bart", "chatglm", "eagle", "encoder", "enc_dec_language_adapter", "gpt",
    "gpt_executor", "gpt_tests", "llama", "mamba", "medusa", "recurrentgemma",
    "redrafter", "t5"
])
@pytest.mark.parametrize("run_fp8", [False, True], ids=["", "fp8"])
def test_model(build_google_tests, model, prepare_model, run_model_tests,
               run_fp8):

    if model == "recurrentgemma":
        pytest.skip(
            "TODO: fix recurrentgemma OOM with newest version of transformers")
        return

    prepare_model(model, run_fp8)

    run_model_tests(model, run_fp8)


@pytest.mark.parametrize("build_google_tests", ["80", "86", "89", "90"],
                         indirect=True)
@pytest.mark.parametrize("model", ["bart", "gpt", "t5"])
def test_benchmarks(build_benchmarks, model, prepare_model,
                    run_model_benchmarks):

    prepare_model(model)

    batching_types = ["IFB"]
    api_types = ["executor"]

    run_model_benchmarks(
        model_name=model,
        batching_types=batching_types,
        api_types=api_types,
    )
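
# Illustration only: an individual benchmark test can be selected with pytest's
# standard keyword filtering, e.g. (the file path is a placeholder)
#
#   pytest <path/to/this/file> -k "test_benchmarks and gpt"
#
# which resolves the build_benchmarks and prepare_model fixtures and then runs
# the IFB/executor benchmark path defined above.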