# TensorRT-LLM: tensorrt_llm/bench/benchmark/__init__.py

import json
from pathlib import Path
from typing import Callable, Dict, Optional

from pydantic import AliasChoices, BaseModel, Field

from tensorrt_llm import LLM as PyTorchLLM
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm._torch.auto_deploy import LLM as AutoDeployLLM
from tensorrt_llm.bench.benchmark.utils.processes import IterationWriter
from tensorrt_llm.bench.build.build import get_model_config
from tensorrt_llm.bench.dataclasses.configuration import RuntimeConfig
from tensorrt_llm.bench.dataclasses.general import BenchmarkEnvironment
from tensorrt_llm.logger import logger


class GeneralExecSettings(BaseModel):
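    """General execution settings parsed from benchmark command-line parameters."""
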
model_config = {
"extra": "ignore"
} # Ignore extra fields not defined in the model
backend: str = Field(
default="pytorch",
description="The backend to use when running benchmarking")
beam_width: int = Field(default=1, description="Number of search beams")
model_path: Optional[Path] = Field(default=None,
description="Path to model checkpoint")
concurrency: int = Field(
default=-1, description="Desired concurrency rate, <=0 for no limit")
dataset_path: Optional[Path] = Field(default=None,
validation_alias=AliasChoices(
"dataset_path", "dataset"),
description="Path to dataset file")
engine_dir: Optional[Path] = Field(
default=None, description="Path to a serialized TRT-LLM engine")
eos_id: int = Field(
default=-1, description="End-of-sequence token ID, -1 to disable EOS")
iteration_log: Optional[Path] = Field(
default=None, description="Path where iteration logging is written")
    kv_cache_percent: float = Field(
        default=0.90,
        validation_alias=AliasChoices("kv_cache_percent",
                                      "kv_cache_free_gpu_mem_fraction"),
        description="Fraction of free GPU memory for KV cache after model load")
max_input_len: int = Field(default=4096,
description="Maximum input sequence length")
max_seq_len: Optional[int] = Field(default=None,
description="Maximum sequence length")
modality: Optional[str] = Field(
default=None, description="Modality of multimodal requests")
model: Optional[str] = Field(default=None, description="Model name or path")
    num_requests: int = Field(
        default=0,
        description="Number of requests at which to cap the benchmark run")
output_json: Optional[Path] = Field(
default=None, description="Path where output should be written")
report_json: Optional[Path] = Field(
default=None, description="Path where report should be written")
request_json: Optional[Path] = Field(
default=None,
description="Path where per request information is written")
streaming: bool = Field(default=False,
description="Whether to use streaming mode")
    warmup: int = Field(
        default=2,
        description="Number of warmup requests to run before benchmarking")

@property
def iteration_writer(self) -> IterationWriter:
        return IterationWriter(self.iteration_log)

@property
def model_type(self) -> str:
        return get_model_config(self.model, self.checkpoint_path).model_type

@property
def checkpoint_path(self) -> Path:
        return self.model_path or self.model


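# Illustrative sketch (not part of the original module): GeneralExecSettings can
# be built straight from a CLI-style dict, with AliasChoices mapping alternate
# option names onto the same field. All values below are made up.
#
#     settings = GeneralExecSettings(model="/models/llama",
#                                    dataset="/data/tokens.jsonl",
#                                    kv_cache_free_gpu_mem_fraction=0.85)
#     settings.dataset_path      # Path("/data/tokens.jsonl")
#     settings.kv_cache_percent  # 0.85
#     settings.checkpoint_path   # "/models/llama" (falls back to `model` when
#                                # `model_path` is unset)

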
def ignore_trt_only_args(kwargs: dict, backend: str):
"""Ignore TensorRT-only arguments for non-TensorRT backends.
Args:
kwargs: Dictionary of keyword arguments to be passed to the LLM constructor.
backend: The backend type (e.g., "pytorch", "_autodeploy").
"""
trt_only_args = [
"batching_type",
"normalize_log_probs",
"extended_runtime_perf_knob_config",
]
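    # pop() with a default removes the key unconditionally; the warning fires
    # only when a truthy value was actually set.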
    for arg in trt_only_args:
        if kwargs.pop(arg, None):
            logger.warning(f"Ignoring {arg} for {backend} backend.")


def get_llm(runtime_config: RuntimeConfig, kwargs: dict):
"""Create and return an appropriate LLM instance based on the backend configuration.
Args:
runtime_config: Runtime configuration containing backend selection and settings.
kwargs: Additional keyword arguments to pass to the LLM constructor.
Returns:
An instance of the appropriate LLM class for the specified backend.
"""
llm_cls = LLM
if runtime_config.backend != "tensorrt":
ignore_trt_only_args(kwargs, runtime_config.backend)
    if runtime_config.backend == "pytorch":
llm_cls = PyTorchLLM
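        # Per-iteration perf stats are required to populate the iteration log.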
if runtime_config.iteration_log is not None:
kwargs["enable_iter_perf_stats"] = True
elif runtime_config.backend == "_autodeploy":
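        # The AutoDeploy LLM takes `world_size` in place of `tensor_parallel_size`.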
kwargs["world_size"] = kwargs.pop("tensor_parallel_size", None)
llm_cls = AutoDeployLLM
llm = llm_cls(**kwargs)
    return llm


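# Illustrative sketch (not part of the original module): `runtime_config` is
# assumed to be a RuntimeConfig with backend="pytorch" and a non-None
# iteration_log; the kwargs shown are hypothetical LLM constructor arguments.
#
#     kwargs = {"model": options.checkpoint_path, "max_seq_len": 4096}
#     llm = get_llm(runtime_config, kwargs)
#     # -> a PyTorchLLM; get_llm also set kwargs["enable_iter_perf_stats"] = True
#     #    because an iteration log was requested.

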
def get_general_cli_options(
params: Dict, bench_env: BenchmarkEnvironment) -> GeneralExecSettings:
"""Get general execution settings from command line parameters.
Args:
params: Dictionary of command line parameters.
bench_env: Benchmark environment containing model and checkpoint information.
Returns:
An instance of GeneralExecSettings containing general execution settings.
"""
# Create a copy of params to avoid modifying the original
settings_dict = params.copy()
# Add derived values that need to be computed from bench_env
model_path = bench_env.checkpoint_path
model = bench_env.model
# Override/add the computed values
settings_dict.update({
"model_path": model_path,
"model": model,
})
# Create and return the settings object, ignoring any extra fields
    return GeneralExecSettings(**settings_dict)


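# Illustrative sketch (not part of the original module): `params` would
# typically come from the benchmark CLI layer, and `bench_env` carries the
# resolved model name and checkpoint path.
#
#     options = get_general_cli_options(params, bench_env)
#     options.model        # bench_env.model
#     options.model_path   # bench_env.checkpoint_path

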
def generate_json_report(report_path: Optional[Path], func: Callable):
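    """Serialize the result of `func` as JSON and write it to `report_path`.

    Args:
        report_path: Destination file, or None to skip report generation.
        func: Zero-argument callable returning a JSON-serializable object.
    """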
if report_path is None:
logger.debug("No report path provided, skipping report generation.")
else:
logger.info(f"Writing report information to {report_path}...")
with open(report_path, "w") as f:
f.write(json.dumps(func(), indent=4))
logger.info(f"Report information written to {report_path}.")