mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
202 lines · 7.6 KiB · Python
#!/usr/bin/env python3
import os
import subprocess  # nosec B404
import sys
import tempfile
from pathlib import Path
from typing import Optional

import click

from tensorrt_llm.hlapi import BuildConfig
from tensorrt_llm.hlapi._perf_evaluator import LLMPerfEvaluator
from tensorrt_llm.hlapi.llm import ModelLoader
from tensorrt_llm.hlapi.llm_utils import _ModelFormatKind
from tensorrt_llm.hlapi.utils import print_colored

try:
    from .grid_searcher import GridSearcher
except ImportError:
    from grid_searcher import GridSearcher
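
# This module wires up a small click CLI with two sub-commands, "benchmark"
# and "gridsearch"; both are registered on the `cli` group in the __main__
# block at the bottom of the file.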


@click.group()
def cli():
    pass


@click.command("benchmark")
@click.option("--model-path", type=str, required=True)
@click.option("--samples-path", type=str, required=True)
@click.option("--report-path-prefix", type=str, required=True)
@click.option("--num-samples", type=int, default=None, show_default=True)
@click.option("--tp-size", type=int, default=1, show_default=True)
@click.option("--streaming/--no-streaming",
              type=bool,
              default=False,
              show_default=True)
@click.option("--warmup", type=int, default=2, show_default=True)
@click.option("--concurrency", type=int, default=None, show_default=True)
@click.option("--max-num-tokens", type=int, default=2048, show_default=True)
@click.option("--max-input-length", type=int, required=True, default=200)
@click.option("--max-seq-length", type=int, required=True, default=400)
@click.option("--max-batch-size", type=int, default=128)
@click.option("--engine-output-dir", type=str, default="")
@click.option(
    "--cpp-executable",
    type=str,
    default=None,
    help="Path to the cpp executable, set it if you want to run the cpp benchmark"
)
def benchmark_main(model_path: str,
                   samples_path: str,
                   report_path_prefix: str,
                   num_samples: Optional[int] = None,
                   tp_size: int = 1,
                   streaming: bool = False,
                   warmup: int = 2,
                   concurrency: Optional[int] = None,
                   max_num_tokens: int = 2048,
                   max_input_length: int = 200,
                   max_seq_length: int = 400,
                   max_batch_size: int = 128,
                   engine_output_dir: str = "",
                   cpp_executable: Optional[str] = None):
    ''' Run the benchmark on HLAPI.

    If `cpp_executable` is provided, it will run the cpp benchmark as well.
    '''
    model_path = Path(model_path)
    samples_path = Path(samples_path)
    if not model_path.exists():
        raise FileNotFoundError(f"Model path {model_path} not found")
    if not samples_path.exists():
        raise FileNotFoundError(f"Samples path {samples_path} not found")
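
    # The cpp gptManagerBenchmark needs a serialized engine on disk, so resolve
    # an engine directory up front: honor an explicit --engine-output-dir,
    # reuse model_path when it already points at a TRT-LLM engine, and
    # otherwise cache the freshly built engine in a temporary directory.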
    engine_output_dir = engine_output_dir or None
    temp_dir = None
    if engine_output_dir:
        engine_output_dir = Path(engine_output_dir)
    elif cpp_executable:
        if ModelLoader.get_model_format(
                model_path) is _ModelFormatKind.TLLM_ENGINE:
            engine_output_dir = model_path
        else:
            temp_dir = tempfile.TemporaryDirectory()
            engine_output_dir = Path(temp_dir.name)

    def run_hlapi():
        print_colored("Running HLAPI benchmark ...\n",
                      "bold_green",
                      writer=sys.stdout)

        build_config = BuildConfig(max_num_tokens=max_num_tokens,
                                   max_input_len=max_input_length,
                                   max_seq_len=max_seq_length,
                                   max_batch_size=max_batch_size)

        evaluator = LLMPerfEvaluator.create(
            model=model_path,
            samples_path=samples_path,
            num_samples=num_samples,
            streaming=streaming,
            warmup=warmup,
            concurrency=concurrency,
            engine_cache_path=engine_output_dir,
            # The options should be identical to the cpp benchmark
            tensor_parallel_size=tp_size,
            build_config=build_config)
        assert evaluator
        report = evaluator.run()
        report.display()

        report_path = Path(f"{report_path_prefix}.json")
        i = 0
        while report_path.exists():
            report_path = Path(f"{report_path_prefix}{i}.json")
            i += 1
        report.save_json(report_path)
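
    # Shell out to the C++ gptManagerBenchmark binary with the same engine,
    # dataset, warmup and concurrency settings, so its numbers are directly
    # comparable to the HLAPI run above.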
    def run_gpt_manager_benchmark():
        print_colored("Running gptManagerBenchmark ...\n",
                      "bold_green",
                      writer=sys.stdout)
        if os.path.isfile(cpp_executable):
            cpp_executable_path = cpp_executable
        else:
            cpp_executable_path = os.path.join(
                os.path.dirname(__file__),
                "../../cpp/build/benchmarks/gptManagerBenchmark")

        command = f"{cpp_executable_path} --engine_dir {engine_output_dir} --type IFB --dataset {samples_path} --warm_up {warmup} --output_csv {report_path_prefix}.cpp.csv --api executor"
        if streaming:
            command = f"{command} --streaming"
        if concurrency:
            command = f"{command} --concurrency {concurrency}"
        if tp_size > 1:
            command = f"mpirun -n {tp_size} {command}"
        print_colored(f'cpp benchmark command: {command}\n',
                      "grey",
                      writer=sys.stdout)
        output = subprocess.run(command,
                                check=True,
                                universal_newlines=True,
                                shell=True,
                                capture_output=True,
                                env=os.environ)  # nosec B603
        print_colored(f'cpp benchmark output: {output.stdout}',
                      "grey",
                      writer=sys.stdout)
        if output.stderr:
            print_colored(f'cpp benchmark error: {output.stderr}',
                          "red",
                          writer=sys.stdout)

    run_hlapi()
    if cpp_executable:
        run_gpt_manager_benchmark()
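

# Example invocation of the "benchmark" sub-command. A minimal sketch only;
# the script name `benchmark.py` and all paths below are placeholders:
#
#   python benchmark.py benchmark \
#       --model-path /path/to/model \
#       --samples-path /path/to/samples \
#       --report-path-prefix reports/run0 \
#       --tp-size 2 --streaming \
#       --cpp-executable /path/to/gptManagerBenchmark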
@click.command("gridsearch")
|
|
@click.option("--model-path", type=str, required=True)
|
|
@click.option("--samples-path", type=str, required=True)
|
|
@click.option("--reports-root", type=str, required=True)
|
|
@click.option("--prune-space-for-debug",
|
|
type=int,
|
|
default=1e8,
|
|
help="Specify the first N cases to test")
|
|
@click.option("--max-input-len", type=int, default=1024)
|
|
@click.option("--max-seq-len", type=int, default=2048)
|
|
@click.option("--max-num-tokens", type=int, default=4096)
|
|
@click.option("--tp-size", type=int, default=1)
|
|
@click.option("--num-samples", type=int, default=200)
|
|
def grid_searcher_main(model_path,
|
|
samples_path,
|
|
reports_root,
|
|
prune_space_for_debug: int,
|
|
max_input_len: int,
|
|
max_seq_len: int,
|
|
max_num_tokens: int,
|
|
tp_size: int = 1,
|
|
num_samples: int = 200):
|
|
reports_root = Path(reports_root)
|
|
|
|
grid_searcher = GridSearcher(prune_space_for_debug=prune_space_for_debug)
|
|
|
|
build_config = BuildConfig(max_seq_len=max_seq_len,
|
|
max_input_len=max_input_len,
|
|
max_num_tokens=max_num_tokens)
|
|
|
|
grid_searcher.evaluate(model=model_path,
|
|
samples_path=samples_path,
|
|
report_dir=reports_root,
|
|
memory_monitor_interval=1,
|
|
num_samples=num_samples,
|
|
tensor_parallel_size=tp_size,
|
|
build_config=build_config)
|
|
|
|
|
|
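

# Example invocation of the "gridsearch" sub-command (script name and paths
# are placeholders, as above):
#
#   python benchmark.py gridsearch \
#       --model-path /path/to/model \
#       --samples-path /path/to/samples \
#       --reports-root reports/gridsearch \
#       --num-samples 100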


if __name__ == '__main__':
    cli.add_command(benchmark_main)
    cli.add_command(grid_searcher_main)
    cli()