#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
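
"""Build TensorRT-LLM and run the C++ test suites.

The script reuses scripts/build_wheel.py for the build, prepares model engines
and expected outputs with the helper scripts under cpp/tests/resources/scripts,
and then runs the ctest / google-test based unit, single-GPU, multi-GPU and
benchmark tests.
"""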

import argparse as _arg
import copy
import glob
import logging as _log
import os as _os
import pathlib as _pl
import platform
import subprocess as _sp
import sys as _sys
import typing as _tp

build_script_dir = _pl.Path(
    __file__).parent.resolve().parent.parent.parent.parent / 'scripts'
assert build_script_dir.is_dir()
_sys.path.append(str(build_script_dir))

from build_wheel import add_arguments as add_build_arguments
from build_wheel import get_build_dir
from build_wheel import main as build_trt_llm


def find_dir_containing(files: _tp.Sequence[str],
                        start_dir: _tp.Optional[_pl.Path] = None) -> _pl.Path:
    if start_dir is None:
        start_dir = _pl.Path.cwd().absolute()

    assert isinstance(start_dir, _pl.Path)
    assert start_dir.is_dir()

    if set(files).issubset({f.name for f in start_dir.iterdir()}):
        return start_dir
    elif start_dir.parent != start_dir:
        return find_dir_containing(files, start_dir.parent)
    else:
        raise FileNotFoundError(files)


def find_root_dir(start_dir: _tp.Optional[_pl.Path] = None) -> _pl.Path:
    return find_dir_containing(("scripts", "examples", "cpp"), start_dir)


def run_command(command: _tp.Sequence[str],
                cwd: _pl.Path,
                *,
                shell=False,
                env=None,
                timeout=None) -> None:
    _log.info("Running: cd %s && %s", str(cwd), " ".join(command))
    override_timeout = int(_os.environ.get("CPP_TEST_TIMEOUT_OVERRIDDEN", "-1"))
    if override_timeout > 0 and (timeout is None or override_timeout > timeout):
        _log.info("Overriding the command timeout: %s (before) and %s (after)",
                  timeout, override_timeout)
        timeout = override_timeout
    _sp.check_call(command, cwd=cwd, shell=shell, env=env, timeout=timeout)


def merge_report(parallel, retry, output):
    """Merge the parallel-run JUnit report with the serial retry report.

    Test cases that were rerun serially replace their parallel-run entries;
    the failure count is taken from the retry report and the run times are
    summed.
    """
    import xml.etree.ElementTree as ElementTree
    base = ElementTree.parse(parallel)
    extra = ElementTree.parse(retry)

    base_suite = base.getroot()
    extra_suite = extra.getroot()

    base_suite.attrib['failures'] = extra_suite.attrib['failures']
    base_suite.attrib['time'] = str(
        int(base_suite.attrib['time']) + int(extra_suite.attrib['time']))

    case_names = {element.attrib['name'] for element in extra_suite}
    base_suite[:] = [
        element
        for element in base_suite if element.attrib['name'] not in case_names
    ] + list(extra_suite)

    base.write(output, encoding="UTF-8", xml_declaration=True)


def add_parallel_info(report, parallel):
    """Record the ctest parallelism level in the JUnit report, if it exists."""
    import xml.etree.ElementTree as ElementTree
    try:
        document = ElementTree.parse(report)
    except FileNotFoundError:
        return
    root = document.getroot()
    root.attrib['parallel'] = str(parallel)
    document.write(report, encoding="UTF-8", xml_declaration=True)


default_test_parallel = 2


def parallel_run_ctest(
    command: _tp.Sequence[str],
    cwd: _pl.Path,
    *,
    shell=False,
    env=None,
    timeout=None,
    parallel=default_test_parallel,
) -> None:
    if parallel == 1:
        return run_command(command,
                           cwd=cwd,
                           shell=shell,
                           env=env,
                           timeout=timeout)

    env = {} if env is None else env
    env['CTEST_PARALLEL_LEVEL'] = str(parallel)

    def get_report():
        reports = glob.glob("results-*.xml", root_dir=cwd)
        if not reports:
            return ''

        return reports[0]

    report = None
    try:
        run_command(command, cwd=cwd, shell=shell, env=env, timeout=timeout)
    except _sp.CalledProcessError:
        report = get_report()
        if report == '':
            # A catastrophic failure occurred and no report was generated.
            raise

        parallel_report = 'parallel-' + report
        _os.rename(cwd / report, cwd / parallel_report)

        try:
            _log.info("Parallel test run failed; retrying failed tests serially")
            del env['CTEST_PARALLEL_LEVEL']
            command = [*command, "--rerun-failed"]
            run_command(command, cwd=cwd, shell=shell, env=env, timeout=timeout)
        finally:
            if not _os.path.exists(cwd / report):
                # A catastrophic failure occurred and no report was generated
                # by the retry. Use the parallel result as the final report.
                _os.rename(cwd / parallel_report, cwd / report)
            else:
                retry_report = 'retry-' + report
                _os.rename(cwd / report, cwd / retry_report)
                merge_report(cwd / parallel_report, cwd / retry_report,
                             cwd / report)
    finally:
        if report is None:
            report = get_report()
        if report:
            add_parallel_info(cwd / report, parallel)
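
# Report files produced by parallel_run_ctest() when the parallel pass fails:
#   results-*.xml           final JUnit report consumed by CI (merged if a retry ran)
#   parallel-results-*.xml  raw report of the parallel pass
#   retry-results-*.xml     report of the serial --rerun-failed pass
# add_parallel_info() stamps the final report with the parallelism level used.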


def run_tests(build_dir: _pl.Path,
              model_cache: _tp.Optional[str] = None,
              skip_unit_tests=False,
              run_gpt=False,
              run_gptj=False,
              run_llama=False,
              run_chatglm=False,
              run_medusa=False,
              run_mamba=False,
              run_recurrentgemma=False,
              run_encoder=False,
              run_bart=False,
              run_t5=False,
              run_redrafter=False,
              run_fp8=False,
              only_multi_gpu=False,
              build_only=False,
              test_timeout=3600) -> None:
    root_dir = find_root_dir()
    _log.info("Using root directory: %s", str(root_dir))

    python_exe = _sys.executable

    if run_mamba:
        run_command(
            [python_exe, "-m", "pip", "install", "transformers>=4.39.0"],
            cwd=root_dir,
            env=_os.environ,
            timeout=300)

    if run_recurrentgemma:
        run_command([
            "git", "clone",
            "https://github.com/google-deepmind/recurrentgemma.git"
        ],
                    cwd=root_dir,
                    env=_os.environ,
                    timeout=300)
        run_command(
            [python_exe, "-m", "pip", "install", "./recurrentgemma[full]"],
            cwd=root_dir,
            env=_os.environ,
            timeout=300)

    build_dir = build_dir if build_dir.is_absolute() else root_dir / build_dir
    resources_dir = _pl.Path("cpp") / "tests" / "resources"

    generate_lora_data_args_tp1 = [
        python_exe,
        str(resources_dir / "scripts" / "generate_test_lora_weights.py"),
        "--out-dir=cpp/tests/resources/data/lora-test-weights-tp1",
        "--tp-size=1"
    ]

    generate_lora_data_args_tp2 = [
        python_exe,
        str(resources_dir / "scripts" / "generate_test_lora_weights.py"),
        "--out-dir=cpp/tests/resources/data/lora-test-weights-tp2",
        "--tp-size=2"
    ]

    generate_multi_lora_tp2_args = [
        python_exe,
        str(resources_dir / "scripts" / "generate_test_lora_weights.py"),
        "--out-dir=cpp/tests/resources/data/multi_lora",
        "--tp-size=2",
        "--num-loras=128",
    ]

    generate_gpt2_lora_data_args_tp1 = [
        python_exe,
        str(resources_dir / "scripts" / "generate_test_lora_weights.py"),
        "--out-dir=cpp/tests/resources/data/lora-test-weights-gpt2-tp1",
        "--tp-size=1", "--hidden-size=768", "--num-layers=12",
        "--config-ids-filter=0", "--no-generate-cache-pages"
    ]

    run_command(generate_lora_data_args_tp1, cwd=root_dir, timeout=100)
    run_command(generate_lora_data_args_tp2, cwd=root_dir, timeout=100)
    run_command(generate_multi_lora_tp2_args, cwd=root_dir, timeout=100)
    run_command(generate_gpt2_lora_data_args_tp1, cwd=root_dir, timeout=100)

    if not skip_unit_tests:
        run_unit_tests(build_dir=build_dir, timeout=test_timeout)
    else:
        _log.info("Skipping unit tests")

    if not only_multi_gpu:
        prepare_all_model_tests(python_exe=python_exe,
                                root_dir=root_dir,
                                resources_dir=resources_dir,
                                model_cache=model_cache,
                                run_gpt=run_gpt,
                                run_gptj=run_gptj,
                                run_llama=run_llama,
                                run_chatglm=run_chatglm,
                                run_medusa=run_medusa,
                                run_mamba=run_mamba,
                                run_recurrentgemma=run_recurrentgemma,
                                run_encoder=run_encoder,
                                run_bart=run_bart,
                                run_t5=run_t5,
                                run_redrafter=run_redrafter,
                                run_fp8=run_fp8)

        if build_only:
            return

        run_single_gpu_tests(build_dir=build_dir,
                             run_gpt=run_gpt,
                             run_gptj=run_gptj,
                             run_llama=run_llama,
                             run_chatglm=run_chatglm,
                             run_medusa=run_medusa,
                             run_mamba=run_mamba,
                             run_recurrentgemma=run_recurrentgemma,
                             run_encoder=run_encoder,
                             run_bart=run_bart,
                             run_t5=run_t5,
                             run_redrafter=run_redrafter,
                             run_fp8=run_fp8,
                             timeout=test_timeout)

        if run_gpt:
            run_benchmarks(model_name="gpt",
                           python_exe=python_exe,
                           root_dir=root_dir,
                           build_dir=build_dir,
                           resources_dir=resources_dir,
                           model_cache=model_cache,
                           test_gpt_session_benchmark=True,
                           batching_types=["IFB", "V1"],
                           api_types=["gptManager", "executor"])
        elif run_t5:
            run_benchmarks(model_name="t5",
                           python_exe=python_exe,
                           root_dir=root_dir,
                           build_dir=build_dir,
                           resources_dir=resources_dir,
                           model_cache=model_cache,
                           test_gpt_session_benchmark=False,
                           batching_types=["IFB"],
                           api_types=["executor"])
        elif run_bart:
            run_benchmarks(model_name="bart",
                           python_exe=python_exe,
                           root_dir=root_dir,
                           build_dir=build_dir,
                           resources_dir=resources_dir,
                           model_cache=model_cache,
                           test_gpt_session_benchmark=False,
                           batching_types=["IFB"],
                           api_types=["executor"])
        else:
            _log.info("Skipping benchmarks")

    elif platform.system() != "Windows":
        prepare_multi_gpu_model_tests(python_exe=python_exe,
                                      root_dir=root_dir,
                                      resources_dir=resources_dir,
                                      model_cache=model_cache)

        if build_only:
            return

        run_multi_gpu_tests(build_dir=build_dir, timeout=test_timeout)


def prepare_all_model_tests(python_exe: str,
                            root_dir: _pl.Path,
                            resources_dir: _pl.Path,
                            model_cache: _tp.Optional[str] = None,
                            run_gpt=False,
                            run_gptj=False,
                            run_llama=False,
                            run_chatglm=False,
                            run_medusa=False,
                            run_mamba=False,
                            run_recurrentgemma=False,
                            run_encoder=False,
                            run_bart=False,
                            run_t5=False,
                            run_redrafter=False,
                            run_fp8=False):
    model_cache_arg = ["--model_cache", model_cache] if model_cache else []

    if run_gpt:
        prepare_model_tests(model_name="gpt",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping GPT tests")

    if run_gptj:
        prepare_model_tests(model_name="gptj",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
        if run_fp8:
            only_fp8_arg = ["--only_fp8"]
            prepare_model_tests(model_name="gptj",
                                python_exe=python_exe,
                                root_dir=root_dir,
                                resources_dir=resources_dir,
                                model_cache_arg=model_cache_arg,
                                only_fp8_arg=only_fp8_arg)
    else:
        _log.info("Skipping GPT-J tests")

    if run_llama:
        prepare_model_tests(model_name="llama",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping Llama tests")

    if run_chatglm:
        prepare_model_tests(model_name="chatglm",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping ChatGLM tests")

    if run_medusa:
        prepare_model_tests(model_name="medusa",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping Medusa tests")

    if run_mamba:
        prepare_model_tests(model_name="mamba",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping Mamba tests")

    if run_recurrentgemma:
        prepare_model_tests(model_name="recurrentgemma",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping RecurrentGemma tests")

    if run_encoder:
        prepare_model_tests(model_name="enc_dec",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping encoder tests")

    if run_bart:
        prepare_model_tests(model_name="bart",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping BART tests")

    if run_t5:
        prepare_model_tests(model_name="t5",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping T5 tests")

    if run_redrafter:
        prepare_model_tests(model_name="redrafter",
                            python_exe=python_exe,
                            root_dir=root_dir,
                            resources_dir=resources_dir,
                            model_cache_arg=model_cache_arg)
    else:
        _log.info("Skipping ReDrafter tests")


def prepare_multi_gpu_model_tests(python_exe: str,
                                  root_dir: _pl.Path,
                                  resources_dir: _pl.Path,
                                  model_cache: _tp.Optional[str] = None):
    model_cache_arg = ["--model_cache", model_cache] if model_cache else []
    only_multi_gpu_arg = ["--only_multi_gpu"]

    prepare_model_tests(model_name="llama",
                        python_exe=python_exe,
                        root_dir=root_dir,
                        resources_dir=resources_dir,
                        model_cache_arg=model_cache_arg,
                        only_multi_gpu_arg=only_multi_gpu_arg)

    prepare_model_tests(model_name="t5",
                        python_exe=python_exe,
                        root_dir=root_dir,
                        resources_dir=resources_dir,
                        model_cache_arg=model_cache_arg,
                        only_multi_gpu_arg=['--tp', '4', '--pp', '1'])

    prepare_model_tests(model_name="gpt",
                        python_exe=python_exe,
                        root_dir=root_dir,
                        resources_dir=resources_dir,
                        model_cache_arg=model_cache_arg)

    prepare_model_tests(model_name="chatglm",
                        python_exe=python_exe,
                        root_dir=root_dir,
                        resources_dir=resources_dir,
                        model_cache_arg=model_cache_arg)


def prepare_model_tests(model_name: str,
                        python_exe: str,
                        root_dir: _pl.Path,
                        resources_dir: _pl.Path,
                        model_cache_arg=[],
                        only_fp8_arg=[],
                        only_multi_gpu_arg=[]):
    scripts_dir = resources_dir / "scripts"

    model_env = {**_os.environ, "PYTHONPATH": f"examples/{model_name}"}
    enc_dec_model_name_arg = []
    beams_arg = []
    if model_name in ('bart', 't5'):
        enc_dec_model_name_arg = [
            '--hf_repo_name',
            'facebook/bart-large-cnn' if model_name == 'bart' else 't5-small'
        ]
        if model_name == 't5' and (not only_multi_gpu_arg):
            beams_arg = ['--beams', '1,2']
        model_name = 'enc_dec'

    build_engines = [
        python_exe,
        str(scripts_dir / f"build_{model_name}_engines.py")
    ] + model_cache_arg + only_fp8_arg + only_multi_gpu_arg + enc_dec_model_name_arg + beams_arg

    if model_name in ['gpt']:
        build_engines += ['--clean']
    run_command(build_engines, cwd=root_dir, env=model_env, timeout=1800)

    model_env["PYTHONPATH"] = "examples"
    generate_expected_output = [
        python_exe,
        str(scripts_dir / f"generate_expected_{model_name}_output.py")
    ] + only_fp8_arg + only_multi_gpu_arg + enc_dec_model_name_arg
    if "enc_dec" in model_name:
        generate_expected_output += model_cache_arg
        generate_expected_output += beams_arg

    if model_name in ['gpt']:
        generate_expected_output += ['--clean']

    if only_multi_gpu_arg and model_name != 'enc_dec':
        for world_size in (2, 4):
            generate_command = [
                "mpirun", "-n",
                str(world_size), "--allow-run-as-root", "--timeout", "600"
            ] + generate_expected_output
            run_command(generate_command,
                        cwd=root_dir,
                        env=model_env,
                        timeout=600)
    else:
        run_command(generate_expected_output,
                    cwd=root_dir,
                    env=model_env,
                    timeout=600)
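
# prepare_model_tests() drives two helper scripts per model under
# cpp/tests/resources/scripts:
#   * build_<model>_engines.py builds the TRT engines, optionally using
#     checkpoints from --model_cache;
#   * generate_expected_<model>_output.py produces the expected outputs that
#     the C++ tests compare against.
# For multi-GPU preparation the output generation is launched through mpirun
# with world sizes 2 and 4.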


def build_tests(build_dir: _pl.Path):
    make_google_tests = [
        "cmake", "--build", ".", "--config", "Release", "-j", "--target",
        "google-tests"
    ]
    run_command(make_google_tests, cwd=build_dir, timeout=300)


def run_unit_tests(build_dir: _pl.Path, timeout=1800):
    build_tests(build_dir=build_dir)

    cpp_env = {**_os.environ}
    ctest = [
        "ctest", "--output-on-failure", "--output-junit",
        "results-unit-tests.xml"
    ]
    excluded_tests = []
    excluded_tests.append("Gpt[^j]")
    excluded_tests.append("Gptj")
    excluded_tests.append("Llama")
    excluded_tests.append("ChatGlm")
    excluded_tests.append("Medusa")
    excluded_tests.append("ExplicitDraftTokensDecoding")
    excluded_tests.append("Mamba")
    excluded_tests.append("RecurrentGemma")
    excluded_tests.append("Encoder")
    excluded_tests.append("EncDec")
    ctest.extend(["-E", "|".join(excluded_tests)])

    parallel = default_test_parallel
    if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE", None):
        parallel = int(parallel_override)

    parallel_run_ctest(ctest,
                       cwd=build_dir,
                       env=cpp_env,
                       timeout=timeout,
                       parallel=parallel)


def run_single_gpu_tests(build_dir: _pl.Path,
                         run_gpt,
                         run_gptj,
                         run_llama,
                         run_chatglm,
                         run_medusa,
                         run_mamba,
                         run_recurrentgemma,
                         run_encoder,
                         run_bart,
                         run_t5,
                         run_redrafter,
                         run_fp8,
                         timeout=3600):
    build_tests(build_dir=build_dir)

    cpp_env = {**_os.environ}
    ctest = [
        "ctest", "--output-on-failure", "--output-junit",
        "results-single-gpu.xml"
    ]

    included_tests = []
    if run_gpt:
        included_tests.append("Gpt[^j]")
    if run_gptj:
        included_tests.append("Gptj")
    if run_llama:
        included_tests.append("Llama")
    if run_chatglm:
        included_tests.append("ChatGlm")
    if run_medusa:
        included_tests.append("Medusa")
    if run_mamba:
        included_tests.append("Mamba")
    if run_recurrentgemma:
        included_tests.append("RecurrentGemma")
    if run_encoder:
        included_tests.append("EncoderModelTestSingleGPU")
    if run_bart:
        included_tests.append("BartBasicTest")
    if run_t5:
        included_tests.append("T5BasicTest")
        included_tests.append("T5Beam2Test")
    if run_redrafter:
        included_tests.append("ExplicitDraftTokens")

    excluded_tests = []
    if not run_fp8:
        excluded_tests.append("FP8")

    if included_tests:
        ctest.extend(["-R", "|".join(included_tests)])
        if excluded_tests:
            ctest.extend(["-E", "|".join(excluded_tests)])

        parallel = default_test_parallel
        if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
                                                None):
            parallel = int(parallel_override)

        parallel_run_ctest(ctest,
                           cwd=build_dir,
                           env=cpp_env,
                           timeout=timeout,
                           parallel=parallel)
    if run_gpt:
        xml_output_file = build_dir / "results-single-gpu-disagg-executor_gpt.xml"
        trt_model_test = produce_mpirun_command(
            global_commands=["mpirun", "--allow-run-as-root"],
            nranks=2,
            local_commands=[
                "tests/executor/executorTest",
                "--gtest_filter=*GptSingleDeviceDisaggSymmetricExecutorTest*"
            ],
            leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
        run_command(trt_model_test, cwd=build_dir, env=cpp_env, timeout=timeout)


def produce_mpirun_command(*, global_commands, nranks, local_commands,
                           leader_commands):
    # Copy so the caller's list is not mutated while assembling the command.
    cmd = list(global_commands)
    for rank in range(nranks):
        cmd += ["-n", "1"] + local_commands + (leader_commands
                                               if rank == 0 else []) + [":"]
    return cmd[:-1]
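
# Example (illustrative):
#   produce_mpirun_command(
#       global_commands=["mpirun", "--allow-run-as-root"], nranks=2,
#       local_commands=["tests/fooTest"],
#       leader_commands=["--gtest_output=xml:out.xml"])
# returns
#   ["mpirun", "--allow-run-as-root",
#    "-n", "1", "tests/fooTest", "--gtest_output=xml:out.xml", ":",
#    "-n", "1", "tests/fooTest"]
# i.e. an MPMD-style mpirun command where only rank 0 writes the XML report.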


def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
    build_tests(build_dir=build_dir)

    tests_dir = build_dir / "tests"
    cpp_env = {**_os.environ}

    # Utils tests
    mpi_utils_test = [
        "mpirun",
        "-n",
        "4",
        "--allow-run-as-root",
        "mpiUtilsTest",
    ]
    run_command(mpi_utils_test, cwd=tests_dir, env=cpp_env, timeout=300)

    # Cache transceiver tests
    cache_trans_test = [
        "mpirun",
        "-n",
        "2",
        "--allow-run-as-root",
        "batch_manager/cacheTransceiverTest",
    ]
    run_command(cache_trans_test, cwd=tests_dir, env=cpp_env, timeout=300)

    # UCX transceiver tests; the test may not be built if ENABLE_UCX is 0
    if _os.path.exists(
            _os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
        ucx_trans_test = [
            "mpirun",
            "-n",
            "2",
            "--allow-run-as-root",
            "batch_manager/ucxDataTransceiverTest",
        ]
        run_command(ucx_trans_test, cwd=tests_dir, env=cpp_env, timeout=300)

    xml_output_file = build_dir / "results-multi-gpu-real-decoder.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=4,
        local_commands=[
            "batch_manager/trtGptModelRealDecoderTest",
            "--gtest_filter=*TP*:*PP*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
                timeout=timeout)  # expecting ~ 1200s

    # Executor test in leader mode
    new_env = copy.copy(cpp_env)
    xml_output_file = build_dir / "results-multi-gpu-llama-exec-leader-mode.xml"
    new_env["RUN_LLAMA_MULTI_GPU"] = "true"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=4,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=*LlamaExecutorTest*LeaderMode*:*LlamaMultiExecutorTest*LeaderMode*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Executor test in orchestrator mode
    # https://nvbugs/4690328 - Disabled BW2 tests because of spurious failure
    xml_output_file = build_dir / "results-multi-gpu-llama-exec-orch-mode.xml"
    trt_model_test = [
        "mpirun", "-n", "1", "--allow-run-as-root", "executor/executorTest",
        "--gtest_filter=*LlamaExecutorTest*OrchMode*:-*BW2*",
        f"--gtest_output=xml:{xml_output_file}"
    ]
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # EncDec test in leader mode
    new_env = copy.copy(cpp_env)
    xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=4,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"],
    )
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Logits processor test in leader mode
    new_env = copy.copy(cpp_env)
    xml_output_file = build_dir / "results-multi-gpu-logits-proc.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=4,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=LlamaExecutorTest/LogitsProcParamsTest*tp2_pp2*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Disaggregated symmetric executor tests, 2 processes
    new_env = copy.copy(cpp_env)
    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=2,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=*DisaggSymmetricExecutorTest*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Disaggregated symmetric executor tests, 4 processes
    new_env = copy.copy(cpp_env)
    new_env["RUN_LLAMA_MULTI_GPU"] = "true"
    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=4,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=*DisaggSymmetricExecutorTest*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Disaggregated symmetric executor tests, 8 processes
    new_env = copy.copy(cpp_env)
    new_env["RUN_LLAMA_MULTI_GPU"] = "true"
    xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=8,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Disaggregated asymmetric executor tests, 4 processes
    new_env = copy.copy(cpp_env)
    new_env["RUN_LLAMA_MULTI_GPU"] = "true"
    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=4,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)

    # Disaggregated asymmetric executor tests, 6 processes
    new_env = copy.copy(cpp_env)
    new_env["RUN_LLAMA_MULTI_GPU"] = "true"
    xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
    trt_model_test = produce_mpirun_command(
        global_commands=["mpirun", "--allow-run-as-root"],
        nranks=6,
        local_commands=[
            "executor/executorTest",
            "--gtest_filter=*DisaggAsymmetricExecutorTest*"
        ],
        leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)


def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path,
                   build_dir: _pl.Path, resources_dir: _pl.Path,
                   model_cache: str, test_gpt_session_benchmark: bool,
                   batching_types: list[str], api_types: list[str]):

    # At this point the CI environment may not have installed tensorrt_llm yet,
    # so import the module from the source tree instead of site-packages.
    import pathlib
    import sys

    import model_spec
    src_root_dir = pathlib.Path(
        __file__).parent.resolve().parent.parent.parent.parent

    sys.path.insert(0, str(src_root_dir))
    import tensorrt_llm.bindings as _tb

    make_benchmarks = [
        "cmake", "--build", ".", "--config", "Release", "-j", "--target",
        "benchmarks"
    ]
    run_command(make_benchmarks, cwd=build_dir, timeout=300)

    benchmark_exe_dir = build_dir / "benchmarks"
    if model_name == "gpt":
        model_engine_dir = resources_dir / "models" / "rt_engine" / "gpt2"
        tokenizer_dir = resources_dir / "models" / "gpt2"
    elif model_name in ('bart', 't5'):
        if model_name == "t5":
            hf_repo_name = "t5-small"
        elif model_name == "bart":
            hf_repo_name = "bart-large-cnn"
        model_engine_dir = resources_dir / "models" / "enc_dec" / "trt_engines" / hf_repo_name
        tokenizer_dir = model_cache + "/" + hf_repo_name
        model_engine_path = model_engine_dir / "1-gpu" / "float16" / "decoder"
        encoder_model_engine_path = model_engine_dir / "1-gpu" / "float16" / "encoder"
        model_name = "enc_dec"
    else:
        _log.info(
            f"run_benchmark test does not support {model_name}. Skipping benchmarks"
        )
        return NotImplementedError

    if test_gpt_session_benchmark:
        if model_name == "gpt":
            input_file = 'input_tokens.npy'
            model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
            model_spec_obj.set_kv_cache_type(_tb.KVCacheType.CONTINUOUS)
            model_spec_obj.use_gpt_plugin()
            model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
            ) / "tp1-pp1-gpu"
        else:
            _log.info(
                f"gptSessionBenchmark test does not support {model_name}. Skipping benchmarks"
            )
            return NotImplementedError

        benchmark = [
            str(benchmark_exe_dir / "gptSessionBenchmark"), "--engine_dir",
            str(model_engine_path), "--batch_size", "8", "--input_output_len",
            "10,20", "--duration", "10"
        ]
        run_command(benchmark, cwd=root_dir, timeout=600)

    prompt_datasets_args = [{
        '--dataset-name': "cnn_dailymail",
        '--dataset-config-name': "3.0.0",
        '--dataset-split': "validation",
        '--dataset-input-key': "article",
        '--dataset-prompt': "Summarize the following article:",
        '--dataset-output-key': "highlights"
    }, {
        '--dataset-name': "Open-Orca/1million-gpt-4",
        '--dataset-split': "train",
        '--dataset-input-key': "question",
        '--dataset-prompt-key': "system_prompt",
        '--dataset-output-key': "response"
    }]
    token_files = [
        "prepared_" + s['--dataset-name'].replace('/', '_')
        for s in prompt_datasets_args
    ]
    max_input_lens = ["256", "20"]
    num_reqs = ["50", "10"]

    if model_name == "gpt":
        input_file = 'input_tokens.npy'
        model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
        model_spec_obj.set_kv_cache_type(_tb.KVCacheType.PAGED)
        model_spec_obj.use_gpt_plugin()
        model_spec_obj.use_packed_input()
        model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
        ) / "tp1-pp1-gpu"

    for prompt_ds_args, tokens_f, max_len, num_req in zip(
            prompt_datasets_args, token_files, max_input_lens, num_reqs):

        benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
        data_dir = resources_dir / "data"
        prepare_dataset = [
            python_exe,
            str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
            str(tokenizer_dir), "--output",
            str(data_dir / tokens_f), "dataset", "--max-input-len", max_len,
            "--num-requests", num_req
        ]
        for k, v in prompt_ds_args.items():
            prepare_dataset += [k, v]
        # https://nvbugs/4658787
        # WAR until prepare_dataset.py can use an offline cached dataset
        run_command(prepare_dataset,
                    cwd=root_dir,
                    timeout=300,
                    env={'HF_DATASETS_OFFLINE': '0'})

        for batching_type in batching_types:
            for api_type in api_types:
                benchmark = [
                    str(benchmark_exe_dir / "gptManagerBenchmark"),
                    "--engine_dir",
                    str(model_engine_path), "--type",
                    str(batching_type), "--api",
                    str(api_type), "--dataset",
                    str(data_dir / tokens_f)
                ]
                if model_name == "enc_dec":
                    benchmark += [
                        "--encoder_engine_dir",
                        str(encoder_model_engine_path)
                    ]

                run_command(benchmark, cwd=root_dir, timeout=600)
                req_rate_benchmark = benchmark + ["--request_rate", "100"]
                run_command(req_rate_benchmark, cwd=root_dir, timeout=600)
                concurrency_benchmark = benchmark + ["--concurrency", "30"]
                run_command(concurrency_benchmark, cwd=root_dir, timeout=600)

        if "IFB" in batching_types and "executor" in api_types:
            # executor streaming test
            benchmark = [
                str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
                str(model_engine_path), "--type", "IFB", "--dataset",
                str(data_dir / tokens_f), "--api", "executor", "--streaming"
            ]
            if model_name == "enc_dec":
                benchmark += [
                    "--encoder_engine_dir",
                    str(encoder_model_engine_path)
                ]
            run_command(benchmark, cwd=root_dir, timeout=600)

        if "IFB" in batching_types and "gptManager" in api_types:
            # gptManager streaming test
            benchmark = [
                str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
                str(model_engine_path), "--type", "IFB", "--dataset",
                str(data_dir / tokens_f), "--api", "gptManager", "--streaming"
            ]
            if model_name == "enc_dec":
                benchmark += [
                    "--encoder_engine_dir",
                    str(encoder_model_engine_path)
                ]
            run_command(benchmark, cwd=root_dir, timeout=600)

            # gptManager streaming test with delay
            benchmark = [
                str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
                str(model_engine_path), "--type", "IFB", "--dataset",
                str(data_dir / tokens_f), "--api", "gptManager", "--streaming",
                "--request_rate", "100", "--enable_exp_delays"
            ]
            if model_name == "enc_dec":
                benchmark += [
                    "--encoder_engine_dir",
                    str(encoder_model_engine_path)
                ]
            run_command(benchmark, cwd=root_dir, timeout=600)
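
# Example invocation (illustrative; adjust flags and paths to your setup):
#   python3 <this script> --run_gpt --model_cache /path/to/model_cache --test_timeout 7200
# Build-related options (including the build_dir and build_type settings used
# below) are added by scripts/build_wheel.py via add_build_arguments().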


if __name__ == "__main__":
    _log.basicConfig(level=_log.INFO)
    parser = _arg.ArgumentParser()

    build_config_group = "Build config"
    build_config_parser = parser.add_argument_group(
        build_config_group, "Configure TensorRT-LLM build")
    add_build_arguments(build_config_parser)
    build_config_parser.set_defaults(install=True, skip_building_wheel=True)

    test_config_group = "Tests config"
    tests_config_parser = parser.add_argument_group(test_config_group,
                                                    "Configure tests")

    tests_config_parser.add_argument("--model_cache",
                                     type=str,
                                     help="Directory where models are stored")
    tests_config_parser.add_argument(
        "--build_only",
        action="store_true",
        help=
        "Only build engines and generate expected outputs, do not run tests.")
    tests_config_parser.add_argument(
        "--skip_unit_tests",
        action="store_true",
        help="Skip unit tests. Only run model tests.")
    tests_config_parser.add_argument("--run_all_models",
                                     action="store_true",
                                     help="Run the tests for all models")
    tests_config_parser.add_argument("--run_gpt",
                                     action="store_true",
                                     help="Run the tests for GPT")
    tests_config_parser.add_argument("--run_gptj",
                                     action="store_true",
                                     help="Run the tests for GPT-J")
    tests_config_parser.add_argument("--run_llama",
                                     action="store_true",
                                     help="Run the tests for Llama")
    tests_config_parser.add_argument("--run_chatglm",
                                     action="store_true",
                                     help="Run the tests for ChatGLM")
    tests_config_parser.add_argument("--run_medusa",
                                     action="store_true",
                                     help="Run the tests for Medusa")
    tests_config_parser.add_argument("--run_mamba",
                                     action="store_true",
                                     help="Run the tests for Mamba")
    tests_config_parser.add_argument("--run_recurrentgemma",
                                     action="store_true",
                                     help="Run the tests for RecurrentGemma")
    tests_config_parser.add_argument("--run_encoder",
                                     action="store_true",
                                     help="Run the tests for BART encoder")
    tests_config_parser.add_argument("--run_bart",
                                     action="store_true",
                                     help="Run the tests for BART")
    tests_config_parser.add_argument("--run_t5",
                                     action="store_true",
                                     help="Run the tests for T5")
    tests_config_parser.add_argument("--run_redrafter",
                                     action="store_true",
                                     help="Run the tests for ReDrafter")
    tests_config_parser.add_argument(
        "--run_fp8",
        action="store_true",
        help="Additionally run FP8 tests. Implemented for H100 runners.")
    tests_config_parser.add_argument(
        "--only_multi_gpu",
        action="store_true",
        help="Run only multi-GPU tests. Implemented for 4 GPUs.")
    tests_config_parser.add_argument("--test_timeout",
                                     type=int,
                                     help="Timeout for tests.")

    args = parser.parse_args()

    arg_groups = {}
    for group in parser._action_groups:
        group_dict = {
            a.dest: getattr(args, a.dest, None)
            for a in group._group_actions
        }
        arg_groups[group.title] = _arg.Namespace(**group_dict)

    build_args = arg_groups[build_config_group]
    build_trt_llm(**vars(build_args))

    test_args = arg_groups[test_config_group]
    test_args.build_dir = get_build_dir(build_args.build_dir,
                                        build_args.build_type)
    # Build the modelSpec module since the engine-build and output-generation
    # scripts need it.
    make_modelSpec = [
        "cmake", "--build",
        str(test_args.build_dir), "--config", build_args.build_type, "-j",
        "--target", "modelSpec"
    ]
    run_command(make_modelSpec, cwd=build_args.build_dir, timeout=300)

    from build_engines_utils import init_model_spec_module

    init_model_spec_module(force_init_trtllm_bindings=False)

    if test_args.run_all_models:
        test_args.run_gpt = True
        test_args.run_gptj = True
        test_args.run_llama = True
        test_args.run_chatglm = True
        test_args.run_mamba = True
        test_args.run_recurrentgemma = True
        test_args.run_encoder = True
        test_args.run_bart = True
        test_args.run_t5 = True
        test_args.run_medusa = True
        test_args.run_redrafter = True

    del test_args.run_all_models

    run_tests(**vars(test_args))