mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
202 lines · 7.6 KiB · Python
#!/usr/bin/env python3
import os
import subprocess  # nosec B404
import sys
import tempfile
from pathlib import Path
from typing import Optional

import click

from tensorrt_llm.hlapi import BuildConfig
from tensorrt_llm.hlapi._perf_evaluator import LLMPerfEvaluator
from tensorrt_llm.hlapi.llm import ModelLoader
from tensorrt_llm.hlapi.llm_utils import _ModelFormatKind
from tensorrt_llm.hlapi.utils import print_colored

try:
    from .grid_searcher import GridSearcher
except ImportError:
    from grid_searcher import GridSearcher
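
# This module wires up a small click CLI with two sub-commands, "benchmark"
# and "gridsearch"; both are registered on the `cli` group in the __main__
# block at the bottom of the file.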


@click.group()
def cli():
    pass


@click.command("benchmark")
@click.option("--model-path", type=str, required=True)
@click.option("--samples-path", type=str, required=True)
@click.option("--report-path-prefix", type=str, required=True)
@click.option("--num-samples", type=int, default=None, show_default=True)
@click.option("--tp-size", type=int, default=1, show_default=True)
@click.option("--streaming/--no-streaming",
              type=bool,
              default=False,
              show_default=True)
@click.option("--warmup", type=int, default=2, show_default=True)
@click.option("--concurrency", type=int, default=None, show_default=True)
@click.option("--max-num-tokens", type=int, default=2048, show_default=True)
@click.option("--max-input-length", type=int, required=True, default=200)
@click.option("--max-seq-length", type=int, required=True, default=400)
@click.option("--max-batch-size", type=int, default=128)
@click.option("--engine-output-dir", type=str, default="")
@click.option(
    "--cpp-executable",
    type=str,
    default=None,
    help="Path to the cpp executable, set it if you want to run the cpp benchmark"
)
def benchmark_main(model_path: str,
                   samples_path: str,
                   report_path_prefix: str,
                   num_samples: Optional[int] = None,
                   tp_size: int = 1,
                   streaming: bool = False,
                   warmup: int = 2,
                   concurrency: Optional[int] = None,
                   max_num_tokens: int = 2048,
                   max_input_length: int = 200,
                   max_seq_length: int = 400,
                   max_batch_size: int = 128,
                   engine_output_dir: str = "",
                   cpp_executable: Optional[str] = None):
    ''' Run the benchmark on HLAPI.

    If `cpp_executable` is provided, it will run the cpp benchmark as well.
    '''
    model_path = Path(model_path)
    samples_path = Path(samples_path)
    if not model_path.exists():
        raise FileNotFoundError(f"Model path {model_path} not found")
    if not samples_path.exists():
        raise FileNotFoundError(f"Samples path {samples_path} not found")
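
    # The cpp gptManagerBenchmark needs a serialized engine on disk, so resolve
    # an engine directory up front: honor an explicit --engine-output-dir,
    # reuse model_path when it already points at a TRT-LLM engine, and
    # otherwise cache the freshly built engine in a temporary directory.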
    engine_output_dir = engine_output_dir or None
    temp_dir = None
    if engine_output_dir:
        engine_output_dir = Path(engine_output_dir)
    elif cpp_executable:
        if ModelLoader.get_model_format(
                model_path) is _ModelFormatKind.TLLM_ENGINE:
            engine_output_dir = model_path
        else:
            temp_dir = tempfile.TemporaryDirectory()
            engine_output_dir = Path(temp_dir.name)

    def run_hlapi():
        print_colored("Running HLAPI benchmark ...\n",
                      "bold_green",
                      writer=sys.stdout)

        build_config = BuildConfig(max_num_tokens=max_num_tokens,
                                   max_input_len=max_input_length,
                                   max_seq_len=max_seq_length,
                                   max_batch_size=max_batch_size)

        evaluator = LLMPerfEvaluator.create(
            model=model_path,
            samples_path=samples_path,
            num_samples=num_samples,
            streaming=streaming,
            warmup=warmup,
            concurrency=concurrency,
            engine_cache_path=engine_output_dir,
            # The options should be identical to the cpp benchmark
            tensor_parallel_size=tp_size,
            build_config=build_config)
        assert evaluator
        report = evaluator.run()
        report.display()

        report_path = Path(f"{report_path_prefix}.json")
        i = 0
        while report_path.exists():
            report_path = Path(f"{report_path_prefix}{i}.json")
            i += 1
        report.save_json(report_path)
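
    # Shell out to the C++ gptManagerBenchmark binary with the same engine,
    # dataset, warmup and concurrency settings, so its numbers are directly
    # comparable to the HLAPI run above.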
    def run_gpt_manager_benchmark():
        print_colored("Running gptManagerBenchmark ...\n",
                      "bold_green",
                      writer=sys.stdout)
        if os.path.isfile(cpp_executable):
            cpp_executable_path = cpp_executable
        else:
            cpp_executable_path = os.path.join(
                os.path.dirname(__file__),
                "../../cpp/build/benchmarks/gptManagerBenchmark")

        command = f"{cpp_executable_path} --engine_dir {engine_output_dir} --type IFB --dataset {samples_path} --warm_up {warmup} --output_csv {report_path_prefix}.cpp.csv --api executor"
        if streaming:
            command = f"{command} --streaming"
        if concurrency:
            command = f"{command} --concurrency {concurrency}"
        if tp_size > 1:
            command = f"mpirun -n {tp_size} {command}"
        print_colored(f'cpp benchmark command: {command}\n',
                      "grey",
                      writer=sys.stdout)
        output = subprocess.run(command,
                                check=True,
                                universal_newlines=True,
                                shell=True,
                                capture_output=True,
                                env=os.environ)  # nosec B603
        print_colored(f'cpp benchmark output: {output.stdout}',
                      "grey",
                      writer=sys.stdout)
        if output.stderr:
            print_colored(f'cpp benchmark error: {output.stderr}',
                          "red",
                          writer=sys.stdout)

    run_hlapi()
    if cpp_executable:
        run_gpt_manager_benchmark()
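

# Example invocation of the "benchmark" sub-command. A minimal sketch only;
# the script name `benchmark.py` and all paths below are placeholders:
#
#   python benchmark.py benchmark \
#       --model-path /path/to/model \
#       --samples-path /path/to/samples \
#       --report-path-prefix reports/run0 \
#       --tp-size 2 --streaming \
#       --cpp-executable /path/to/gptManagerBenchmark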
@click.command("gridsearch")
|
|
@click.option("--model-path", type=str, required=True)
|
|
@click.option("--samples-path", type=str, required=True)
|
|
@click.option("--reports-root", type=str, required=True)
|
|
@click.option("--prune-space-for-debug",
|
|
type=int,
|
|
default=1e8,
|
|
help="Specify the first N cases to test")
|
|
@click.option("--max-input-len", type=int, default=1024)
|
|
@click.option("--max-seq-len", type=int, default=2048)
|
|
@click.option("--max-num-tokens", type=int, default=4096)
|
|
@click.option("--tp-size", type=int, default=1)
|
|
@click.option("--num-samples", type=int, default=200)
|
|
def grid_searcher_main(model_path,
|
|
samples_path,
|
|
reports_root,
|
|
prune_space_for_debug: int,
|
|
max_input_len: int,
|
|
max_seq_len: int,
|
|
max_num_tokens: int,
|
|
tp_size: int = 1,
|
|
num_samples: int = 200):
|
|
reports_root = Path(reports_root)
|
|
|
|
grid_searcher = GridSearcher(prune_space_for_debug=prune_space_for_debug)
|
|
|
|
build_config = BuildConfig(max_seq_len=max_seq_len,
|
|
max_input_len=max_input_len,
|
|
max_num_tokens=max_num_tokens)
|
|
|
|
grid_searcher.evaluate(model=model_path,
|
|
samples_path=samples_path,
|
|
report_dir=reports_root,
|
|
memory_monitor_interval=1,
|
|
num_samples=num_samples,
|
|
tensor_parallel_size=tp_size,
|
|
build_config=build_config)
|
|
|
|
|
|
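

# Example invocation of the "gridsearch" sub-command (script name and paths
# are placeholders, as above):
#
#   python benchmark.py gridsearch \
#       --model-path /path/to/model \
#       --samples-path /path/to/samples \
#       --reports-root reports/gridsearch \
#       --num-samples 100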


if __name__ == '__main__':
    cli.add_command(benchmark_main)
    cli.add_command(grid_searcher_main)
    cli()