# TensorRT-LLM/tests/integration/defs/cpp_common.py
import copy
import datetime
import glob
import logging as _logger
import os as _os
import pathlib as _pl
import subprocess as _sp
import sys as _sys
from typing import Generator, List, Optional, Sequence
build_script_dir = _pl.Path(
__file__).parent.resolve().parent.parent.parent / "scripts"
assert build_script_dir.is_dir()
_sys.path.append(str(build_script_dir))
from build_wheel import get_build_dir as get_trt_llm_build_dir
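# Defaults for running the C++ test suite; the effective timeout can be raised
# at runtime via CPP_TEST_TIMEOUT_OVERRIDDEN and the ctest parallelism via
# LLM_TEST_PARALLEL_OVERRIDE (both are handled below).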
default_test_parallel = 2
default_test_timeout = 3600
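# gtest name patterns selected for each model keyword on the command line;
# run_single_gpu_tests joins them with "|" into a ctest -R filter.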
include_test_map = {
"gpt": ("Gpt[^j]", ),
"gptj": ("Gptj", ),
"llama": ("Llama", ),
"chatglm": ("ChatGlm", ),
"medusa": ("Medusa", ),
"eagle": ("Eagle", ),
"mamba": ("Mamba", ),
"recurrentgemma": ("RecurrentGemma", ),
"encoder": ("EncoderModelTestSingleGPU", ),
"bart": ("BartBasicTest", ),
"t5": (
"T5BasicTest",
"T5Beam2Test",
),
"enc_dec_language_adapter": ("LanguageAdapterBasicTest", ),
"redrafter": ("ExplicitDraftTokens", )
}
def generate_excluded_model_tests() -> Generator[str, None, None]:
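    """Yield gtest name patterns for every model family's tests, intended for
    exclusion filters (e.g. a "|"-joined ctest -E pattern)."""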
yield "Gpt[^j]"
yield "Gptj"
yield "Llama"
yield "ChatGlm"
yield "Medusa"
yield "Eagle"
yield "ExplicitDraftTokensDecoding"
yield "Mamba"
yield "RecurrentGemma"
yield "Encoder"
yield "EncDec"
yield "SpeculativeDecoding"
def generate_included_model_tests(
test_list: List[str]) -> Generator[str, None, None]:
yield from (item for model in test_list for item in include_test_map[model])
def generate_result_file_name(test_list: List[str],
run_fp8=False) -> Generator[str, None, None]:
yield "results-single-gpu"
yield from test_list
if run_fp8:
yield "fp8"
def find_dir_containing(files: Sequence[str],
start_dir: Optional[_pl.Path] = None) -> _pl.Path:
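    """Walk upward from start_dir (default: the current working directory) and
    return the first directory containing all of the given names; raise
    FileNotFoundError when the filesystem root is reached without a match."""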
if start_dir is None:
start_dir = _pl.Path.cwd().absolute()
assert isinstance(start_dir, _pl.Path)
assert start_dir.is_dir()
if set(files).issubset({f.name for f in start_dir.iterdir()}):
return start_dir
elif start_dir.parent is not start_dir:
return find_dir_containing(files, start_dir.parent)
else:
raise FileNotFoundError(files)
def find_root_dir(start_dir: Optional[_pl.Path] = None) -> _pl.Path:
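    """Find the TensorRT-LLM repository root: the nearest ancestor directory
    containing "scripts", "examples" and "cpp"."""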
return find_dir_containing(("scripts", "examples", "cpp"), start_dir)
def find_build_dir():
    root_dir = find_root_dir()
    build_dir = get_trt_llm_build_dir(None, "Release")
    return build_dir if build_dir.is_absolute() else root_dir / build_dir
def run_command(command: Sequence[str],
cwd: _pl.Path,
*,
shell=False,
env=None,
timeout=None) -> None:
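    """Log and run a command via subprocess.check_call. The
    CPP_TEST_TIMEOUT_OVERRIDDEN environment variable replaces the timeout when
    it exceeds the requested one (or when none was requested)."""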
_logger.info("Running: cd %s && %s", str(cwd), " ".join(command))
override_timeout = int(_os.environ.get("CPP_TEST_TIMEOUT_OVERRIDDEN", "-1"))
if override_timeout > 0 and (timeout is None or override_timeout > timeout):
_logger.info(
"Overriding the command timeout: %s (before) and %s (after)",
timeout, override_timeout)
timeout = override_timeout
_sp.check_call(command, cwd=cwd, shell=shell, env=env, timeout=timeout)
def merge_report(parallel, retry, output):
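    """Merge the JUnit report of a serial retry into the report of the original
    parallel run: retried cases replace their parallel results, the failure
    count is taken from the retry, and the elapsed times are summed."""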
import xml.etree.ElementTree as ElementTree
base = ElementTree.parse(parallel)
extra = ElementTree.parse(retry)
base_suite = base.getroot()
extra_suite = extra.getroot()
base_suite.attrib['failures'] = extra_suite.attrib['failures']
base_suite.attrib['time'] = str(
int(base_suite.attrib['time']) + int(extra_suite.attrib['time']))
case_names = {element.attrib['name'] for element in extra_suite}
base_suite[:] = [
element
for element in base_suite if element.attrib['name'] not in case_names
] + list(extra_suite)
base.write(output, encoding="UTF-8", xml_declaration=True)
def add_parallel_info(report, parallel):
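    """Stamp the ctest parallelism level onto the JUnit report's root element
    as a "parallel" attribute; do nothing if the report does not exist."""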
import xml.etree.ElementTree as ElementTree
try:
document = ElementTree.parse(report)
except FileNotFoundError:
return
root = document.getroot()
root.attrib['parallel'] = str(parallel)
document.write(report, encoding="UTF-8", xml_declaration=True)
def run_ctest(command: Sequence[str],
cwd: _pl.Path,
*,
shell=False,
env=None,
timeout=None) -> None:
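    """Run ctest, translating the timeout into a "--stop-time" wall-clock
    deadline. A failure occurring within ~2s of that deadline is re-raised as
    subprocess.TimeoutExpired so callers can distinguish timeouts from real
    test failures."""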
override_timeout = int(_os.environ.get("CPP_TEST_TIMEOUT_OVERRIDDEN", "-1"))
if override_timeout > 0 and (timeout is None or override_timeout > timeout):
_logger.info(
"Overriding the command timeout: %s (before) and %s (after)",
timeout, override_timeout)
timeout = override_timeout
deadline = None
if timeout is not None:
deadline = datetime.datetime.now() + datetime.timedelta(seconds=timeout)
command = list(command)
command += ["--stop-time", deadline.strftime("%H:%M:%S")]
try:
_logger.info("Running: cd %s && %s", str(cwd), " ".join(command))
_sp.check_call(command, cwd=cwd, shell=shell, env=env)
except _sp.CalledProcessError as e:
fuzz = datetime.timedelta(seconds=2)
if deadline is not None and deadline - fuzz < datetime.datetime.now():
# Detect timeout
raise _sp.TimeoutExpired(e.cmd, timeout, e.output, e.stderr)
raise
def parallel_run_ctest(
command: Sequence[str],
cwd: _pl.Path,
*,
shell=False,
env=None,
timeout=None,
parallel=default_test_parallel,
) -> None:
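    """Run ctest with CTEST_PARALLEL_LEVEL set to `parallel`. If any test
    fails, rerun only the failed tests serially and merge both JUnit reports
    into one; timeouts are deliberately not retried. The final report is
    annotated with the parallelism level."""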
if parallel == 1:
return run_ctest(command,
cwd=cwd,
shell=shell,
env=env,
timeout=timeout)
env = {} if env is None else env
env['CTEST_PARALLEL_LEVEL'] = str(parallel)
def get_report():
reports = glob.glob("results-*.xml", root_dir=cwd)
if not reports:
return ''
return reports[0]
report = None
try:
run_ctest(command, cwd=cwd, shell=shell, env=env, timeout=timeout)
    # _sp.TimeoutExpired is deliberately not caught here: we do not want to
    # retry on timeout.
except _sp.CalledProcessError:
report = get_report()
if report == '':
# Some catastrophic fail happened that there's no report generated
raise
# Avoid .xml extension to prevent CI from reading failures from it
parallel_report = 'parallel-' + report + ".intermediate"
_os.rename(cwd / report, cwd / parallel_report)
try:
_logger.info("Parallel test failed, retry serial on failed tests")
del env['CTEST_PARALLEL_LEVEL']
command = [*command, "--rerun-failed"]
run_ctest(command, cwd=cwd, shell=shell, env=env, timeout=timeout)
finally:
if not _os.path.exists(cwd / report):
# Some catastrophic fail happened that there's no report generated
# Use parallel result as final report
_os.rename(cwd / parallel_report, cwd / report)
else:
retry_report = 'retry-' + report + ".intermediate"
_os.rename(cwd / report, cwd / retry_report)
merge_report(cwd / parallel_report, cwd / retry_report,
cwd / report)
finally:
if report is None:
report = get_report()
if report:
add_parallel_info(cwd / report, parallel)
def produce_mpirun_command(*, global_commands, nranks, local_commands,
leader_commands):
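    """Build a multi-program mpirun command line: one "-n 1 <local_commands>"
    section per rank, separated by ":", with leader_commands appended only to
    rank 0 (typically the gtest XML output flag, so that a single rank writes
    the report). For example, global_commands=["mpirun"], nranks=2,
    local_commands=["test"] and leader_commands=["--xml"] yields:

        ["mpirun", "-n", "1", "test", "--xml", ":", "-n", "1", "test"]
    """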
    cmd = list(global_commands)  # copy so the caller's list is not mutated
    for rank in range(nranks):
        cmd += ["-n", "1"] + local_commands + (leader_commands
                                               if rank == 0 else []) + [":"]
    return cmd[:-1]  # drop the trailing ":" separator
def run_simple_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
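    """Run mpiUtilsTest on 4 ranks, then cacheTransceiverTest on 2 and 8 ranks,
    first with the MPI KV-cache backend (TRTLLM_USE_MPI_KVCACHE=1) and again
    with the UCX backend (TRTLLM_USE_UCX_KVCACHE=1)."""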
tests_dir = build_dir / "tests"
cpp_env = {**_os.environ}
# Utils tests
mpi_utils_test = [
"mpirun",
"-n",
"4",
"--allow-run-as-root",
"mpiUtilsTest",
]
run_command(mpi_utils_test, cwd=tests_dir, env=cpp_env, timeout=300)
# Cache transceiver tests
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
cache_trans_test = [
"mpirun",
"-n",
"2",
"--allow-run-as-root",
"batch_manager/cacheTransceiverTest",
]
run_command(cache_trans_test, cwd=tests_dir, env=new_env, timeout=300)
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
    # Cache transceiver tests with 8 ranks
cache_trans_test_8_proc = [
"mpirun",
"-n",
"8",
"--allow-run-as-root",
"batch_manager/cacheTransceiverTest",
]
run_command(cache_trans_test_8_proc,
cwd=tests_dir,
env=new_env,
timeout=600)
# Cache transceiver tests with UCX
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
cache_trans_test = [
"mpirun",
"-n",
"2",
"--allow-run-as-root",
"batch_manager/cacheTransceiverTest",
]
run_command(cache_trans_test, cwd=tests_dir, env=new_env, timeout=300)
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
    # Cache transceiver tests with UCX and 8 ranks
cache_trans_test_8_proc = [
"mpirun",
"-n",
"8",
"--allow-run-as-root",
"batch_manager/cacheTransceiverTest",
]
run_command(cache_trans_test_8_proc,
cwd=tests_dir,
env=new_env,
timeout=600)
def run_llama_executor_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
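    """Run the multi-GPU Llama executor gtests on 4 ranks: leader mode
    (currently disabled, see nvbug 5026255), orchestrator mode, and the
    logits-processor / guided-decoding parameter sweeps over tp/pp splits."""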
tests_dir = build_dir / "tests"
cpp_env = {**_os.environ}
mgpu_env = copy.copy(cpp_env)
mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
    # Executor test in leader mode
xml_output_file = build_dir / "results-multi-gpu-llama-exec-leader-mode.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/executorTest",
"--gtest_filter=*LlamaExecutorTest*LeaderMode*:*LlamaMultiExecutorTest*LeaderMode*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    # Disabled for now due to https://nvbugspro.nvidia.com/bug/5026255.
if False:
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
    # Executor test in orchestrator mode
xml_output_file = build_dir / "results-multi-gpu-llama-exec-orch-mode.xml"
trt_model_test = [
"mpirun", "-n", "1", "--allow-run-as-root", "executor/executorTest",
"--gtest_filter=*LlamaExecutorTest*OrchMode*",
f"--gtest_output=xml:{xml_output_file}"
]
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
    # Logits processor and guided decoding test in leader mode
xml_output_file = build_dir / "results-multi-gpu-logits-proc.xml"
tp_pp_sizes = [(4, 1), (2, 2), (1, 4)]
gtest_filter = [
f"LlamaExecutorTest/LogitsProcParamsTest*tp{tp}_pp{pp}*"
for tp, pp in tp_pp_sizes
]
gtest_filter.extend([
f"LlamaExecutorGuidedDecodingTest/GuidedDecodingParamsTest*tp{tp}_pp{pp}*"
for tp, pp in tp_pp_sizes
])
gtest_filter = ":".join(gtest_filter)
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/executorTest", f"--gtest_filter={gtest_filter}"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
def run_t5_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
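    """Run the T5 encoder-decoder executor forward test on 4 ranks in leader
    mode."""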
tests_dir = build_dir / "tests"
cpp_env = {**_os.environ}
    # EncDec test in leader mode
xml_output_file = build_dir / "results-multi-gpu-t5-exec-leader-mode.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/encDecTest",
"--gtest_filter=T5MultiGPUTest/EncDecParamsTest.Forward*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"],
)
run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)
def run_trt_gpt_model_real_decoder_multi_gpu_tests(build_dir: _pl.Path,
timeout=1500):
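    """Run trtGptModelRealDecoderTest on 4 ranks over its TP and PP cases
    (expected to take roughly 1200s)."""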
tests_dir = build_dir / "tests"
cpp_env = {**_os.environ}
xml_output_file = build_dir / "results-multi-gpu-real-decoder.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"batch_manager/trtGptModelRealDecoderTest",
"--gtest_filter=*TP*:*PP*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
timeout=timeout) # expecting ~ 1200s
def run_disagg_multi_gpu_tests(build_dir: _pl.Path):
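    """Run the disaggregated executor gtests (symmetric, asymmetric,
    orchestrator and spawn-orchestrator variants, on 2 to 8 ranks), first with
    the MPI KV-cache backend and then with UCX. The UCX binaries may not exist
    when the build used ENABLE_UCX=0."""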
tests_dir = build_dir / "tests"
cpp_env = {**_os.environ}
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=2,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
mgpu_env = copy.copy(cpp_env)
mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
mgpu_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    # See https://nvbugspro.nvidia.com/bug/5026255 for the history of these tests.
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=8,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=6,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=8,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-orchestrator-executor-7-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=7,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggOrchestratorParamsTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
    # UCX transceiver tests; these binaries may not be built when ENABLE_UCX=0
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
xml_output_file = build_dir / "results-multi-gpu-disagg-executor-2-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=2,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
mgpu_env = copy.copy(cpp_env)
mgpu_env["RUN_LLAMA_MULTI_GPU"] = "true"
mgpu_env["TRTLLM_USE_UCX_KVCACHE"] = "1"
xml_output_file = build_dir / "results-multi-gpu-disagg-executor-4-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
    # See https://nvbugspro.nvidia.com/bug/5026255 for the history of these tests.
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-executor-8-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=8,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*LlamaTP2PP2DisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-4-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=4,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-6-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=6,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=8,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-orchestrator-executor-7-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=7,
local_commands=[
"executor/disaggExecutorTest",
"--gtest_filter=*DisaggOrchestratorParamsTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=mgpu_env, timeout=1500)
    xml_output_file = build_dir / "results-multi-gpu-disagg-spawn-asymmetric-orchestrator-executor-1-process.xml"
    comms = [
        "executor/disaggExecutorTest",
        "--gtest_filter=*DisaaggSpawnOrchestrator*",
        f"--gtest_output=xml:{xml_output_file}"
    ]
    run_command(comms, cwd=tests_dir, env=mgpu_env, timeout=1500)
def run_spec_dec_tests(build_dir: _pl.Path):
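    """Run the speculative-decoding fast-logits executor test on 3 ranks."""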
xml_output_file = build_dir / "results-spec-dec-fast-logits.xml"
cpp_env = {**_os.environ}
tests_dir = build_dir / "tests"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=3,
local_commands=[
"executor/executorTest", "--gtest_filter=*SpecDecFastLogits*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=cpp_env, timeout=1500)
def prepare_model_tests(model_name: str,
python_exe: str,
root_dir: _pl.Path,
resources_dir: _pl.Path,
model_cache_arg=[],
only_fp8_arg=[],
only_multi_gpu_arg=[]):
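    """Build a model's engines and golden outputs for the C++ tests using the
    per-model scripts under the resources "scripts" directory. bart, t5 and
    enc_dec_language_adapter share the generic enc_dec scripts; gpt and llama
    additionally generate xgrammar tokenizer info (presumably for the
    guided-decoding tests)."""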
scripts_dir = resources_dir / "scripts"
model_env = {**_os.environ, "PYTHONPATH": f"examples/{model_name}"}
enc_dec_model_name_arg = []
beams_arg = []
if model_name in ('bart', 't5', 'enc_dec_language_adapter'):
enc_dec_repo_name_dict = {
'bart': 'facebook/bart-large-cnn',
't5': 't5-small',
'enc_dec_language_adapter':
'language_adapter-enc_dec_language_adapter'
}
enc_dec_model_name_arg = [
'--hf_repo_name', enc_dec_repo_name_dict[model_name]
]
if model_name == 't5' and (not only_multi_gpu_arg):
beams_arg = ['--beams', '1,2']
model_name = 'enc_dec'
build_engines = [
python_exe,
str(scripts_dir / f"build_{model_name}_engines.py")
] + model_cache_arg + only_fp8_arg + only_multi_gpu_arg + enc_dec_model_name_arg + beams_arg
if model_name in ['gpt']:
build_engines += ['--clean']
run_command(build_engines, cwd=root_dir, env=model_env, timeout=1800)
model_env["PYTHONPATH"] = "examples"
generate_expected_output = [
python_exe,
str(scripts_dir / f"generate_expected_{model_name}_output.py")
] + only_fp8_arg + only_multi_gpu_arg + enc_dec_model_name_arg
if "enc_dec" in model_name:
generate_expected_output += model_cache_arg
generate_expected_output += beams_arg
if model_name in ['gpt']:
generate_expected_output += ['--clean']
if only_multi_gpu_arg and model_name != 'enc_dec':
for world_size in (2, 4):
generate_command = [
"mpirun", "-n",
str(world_size), "--allow-run-as-root", "--timeout", "600"
] + generate_expected_output
run_command(generate_command,
cwd=root_dir,
env=model_env,
timeout=600)
else:
run_command(generate_expected_output,
cwd=root_dir,
env=model_env,
timeout=600)
if model_name in ['gpt', 'llama']:
if model_name == 'gpt':
script_model_name = 'gpt2'
elif model_name == 'llama':
script_model_name = 'llama-7b-hf'
generate_tokenizer_info = [
python_exe, "examples/generate_xgrammar_tokenizer_info.py",
f"--model_dir={str(resources_dir / 'models' / script_model_name)}",
f"--output_dir={str(resources_dir / 'data' / script_model_name)}"
]
run_command(generate_tokenizer_info,
cwd=root_dir,
env=model_env,
timeout=600)
def prepare_multi_gpu_model_tests(test_list: List[str],
python_exe: str,
root_dir: _pl.Path,
resources_dir: _pl.Path,
model_cache: Optional[str] = None):
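    """Prepare engines and reference outputs for the multi-GPU llama and t5
    tests; t5 is built with a fixed tp=4, pp=1 layout."""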
model_cache_arg = ["--model_cache", model_cache] if model_cache else []
if "llama" in test_list:
prepare_model_tests(model_name="llama",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg,
only_multi_gpu_arg=["--only_multi_gpu"])
if "t5" in test_list:
prepare_model_tests(model_name="t5",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg,
only_multi_gpu_arg=['--tp', '4', '--pp', '1'])
def run_single_gpu_tests(build_dir: _pl.Path,
test_list: List[str],
run_fp8=False,
timeout=3600):
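    """Run the single-GPU ctest suite restricted to test_list (FP8 cases are
    excluded unless run_fp8 is set), honoring LLM_TEST_PARALLEL_OVERRIDE; when
    "gpt" is selected, also run the 2-rank single-device disaggregated executor
    test and the speculative-decoding fast-logits test."""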
cpp_env = {**_os.environ}
included_tests = list(generate_included_model_tests(test_list))
fname_list = list(generate_result_file_name(test_list, run_fp8=run_fp8))
resultFileName = "-".join(fname_list) + ".xml"
excluded_tests = ["FP8"] if not run_fp8 else []
ctest = ["ctest", "--output-on-failure", "--output-junit", resultFileName]
if included_tests:
ctest.extend(["-R", "|".join(included_tests)])
if excluded_tests:
ctest.extend(["-E", "|".join(excluded_tests)])
parallel = default_test_parallel
if parallel_override := _os.environ.get("LLM_TEST_PARALLEL_OVERRIDE",
None):
parallel = int(parallel_override)
parallel_run_ctest(ctest,
cwd=build_dir,
env=cpp_env,
timeout=timeout,
parallel=parallel)
if "gpt" in test_list:
xml_output_file = build_dir / "results-single-gpu-disagg-executor_gpt.xml"
new_env = copy.copy(cpp_env)
new_env["TRTLLM_USE_MPI_KVCACHE"] = "1"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=2,
local_commands=[
"tests/executor/disaggExecutorTest",
"--gtest_filter=*GptSingleDeviceDisaggSymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=build_dir, env=new_env, timeout=timeout)
run_spec_dec_tests(build_dir=build_dir)
def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path,
build_dir: _pl.Path, resources_dir: _pl.Path,
model_cache: str, test_gpt_session_benchmark: bool,
batching_types: list[str], api_types: list[str]):
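    """Benchmark a gpt or enc-dec (bart/t5) engine: optionally
    gptSessionBenchmark (gpt only), then gptManagerBenchmark over datasets
    prepared by prepare_dataset.py, sweeping the requested batching and API
    types plus request-rate, concurrency and streaming variants."""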
benchmark_exe_dir = build_dir / "benchmarks"
if model_name == "gpt":
model_engine_dir = resources_dir / "models" / "rt_engine" / "gpt2"
tokenizer_dir = resources_dir / "models" / "gpt2"
elif model_name in ('bart', 't5'):
if model_name == "t5":
hf_repo_name = "t5-small"
elif model_name == "bart":
hf_repo_name = "bart-large-cnn"
model_engine_dir = resources_dir / "models" / "enc_dec" / "trt_engines" / hf_repo_name
tokenizer_dir = model_cache + "/" + hf_repo_name
model_engine_path = model_engine_dir / "1-gpu" / "float16" / "decoder"
encoder_model_engine_path = model_engine_dir / "1-gpu" / "float16" / "encoder"
model_name = "enc_dec"
else:
        _logger.info(
            f"run_benchmark test does not support {model_name}. Skipping benchmarks"
        )
        return
if test_gpt_session_benchmark:
if model_name == "gpt":
pass
# WAR: Currently importing the bindings here causes a segfault in pybind 11 during shutdown
# As this just builds a path we hard-code for now to obviate the need for import of bindings
# model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
# model_spec_obj.set_kv_cache_type(_tb.KVCacheType.CONTINUOUS)
# model_spec_obj.use_gpt_plugin()
# model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
# ) / "tp1-pp1-cp1-gpu"
model_engine_path = model_engine_dir / "fp16_plugin_continuous" / "tp1-pp1-cp1-gpu"
else:
            _logger.info(
                f"gptSessionBenchmark test does not support {model_name}. Skipping benchmarks"
            )
            return
benchmark = [
str(benchmark_exe_dir / "gptSessionBenchmark"), "--engine_dir",
str(model_engine_path), "--batch_size", "8", "--input_output_len",
"10,20", "--duration", "10"
]
run_command(benchmark, cwd=root_dir, timeout=600)
prompt_datasets_args = [{
'--dataset-name': "cnn_dailymail",
'--dataset-config-name': "3.0.0",
'--dataset-split': "validation",
'--dataset-input-key': "article",
'--dataset-prompt': "Summarize the following article:",
'--dataset-output-key': "highlights"
}, {
'--dataset-name': "Open-Orca/1million-gpt-4",
'--dataset-split': "train",
'--dataset-input-key': "question",
'--dataset-prompt-key': "system_prompt",
'--dataset-output-key': "response"
}]
token_files = [
"prepared_" + s['--dataset-name'].replace('/', '_')
for s in prompt_datasets_args
]
max_input_lens = ["256", "20"]
num_reqs = ["50", "10"]
if model_name == "gpt":
model_engine_path = model_engine_dir / "fp16_plugin_packed_paged" / "tp1-pp1-cp1-gpu"
# WAR: Currently importing the bindings here causes a segfault in pybind 11 during shutdown
# As this just builds a path we hard-code for now to obviate the need for import of bindings
# model_spec_obj = model_spec.ModelSpec(input_file, _tb.DataType.HALF)
# model_spec_obj.set_kv_cache_type(_tb.KVCacheType.PAGED)
# model_spec_obj.use_gpt_plugin()
# model_spec_obj.use_packed_input()
# model_engine_path = model_engine_dir / model_spec_obj.get_model_path(
# ) / "tp1-pp1-cp1-gpu"
    for prompt_ds_args, tokens_f, max_input_len, num_req in zip(
            prompt_datasets_args, token_files, max_input_lens, num_reqs):
        benchmark_src_dir = _pl.Path("benchmarks") / "cpp"
        data_dir = resources_dir / "data"
        prepare_dataset = [
            python_exe,
            str(benchmark_src_dir / "prepare_dataset.py"), "--tokenizer",
            str(tokenizer_dir), "--output",
            str(data_dir / tokens_f), "dataset", "--max-input-len",
            max_input_len, "--num-requests", num_req
        ]
for k, v in prompt_ds_args.items():
prepare_dataset += [k, v]
# https://nvbugs/4658787
# WAR before the prepare dataset can use offline cached dataset
run_command(prepare_dataset,
cwd=root_dir,
timeout=300,
env={'HF_DATASETS_OFFLINE': '0'})
for batching_type in batching_types:
for api_type in api_types:
benchmark = [
str(benchmark_exe_dir / "gptManagerBenchmark"),
"--engine_dir",
str(model_engine_path), "--type",
str(batching_type), "--api",
str(api_type), "--dataset",
str(data_dir / tokens_f)
]
if model_name == "enc_dec":
benchmark += [
"--encoder_engine_dir",
str(encoder_model_engine_path)
]
run_command(benchmark, cwd=root_dir, timeout=600)
req_rate_benchmark = benchmark + [
"--request_rate", "100", "--enable_exp_delays"
]
run_command(req_rate_benchmark, cwd=root_dir, timeout=600)
concurrency_benchmark = benchmark + ["--concurrency", "30"]
run_command(concurrency_benchmark, cwd=root_dir, timeout=600)
if "IFB" in batching_type and "executor" in api_types:
# executor streaming test
benchmark = [
str(benchmark_exe_dir / "gptManagerBenchmark"), "--engine_dir",
str(model_engine_path), "--type", "IFB", "--dataset",
str(data_dir / tokens_f), "--api", "executor", "--streaming"
]
if model_name == "enc_dec":
benchmark += [
"--encoder_engine_dir",
str(encoder_model_engine_path)
]
run_command(benchmark, cwd=root_dir, timeout=600)