from __future__ import annotations

import json
from pathlib import Path
from typing import Dict, List, Tuple, Union

import tensorrt_llm.bindings.executor as trtllm
from tensorrt_llm.bench.dataclasses.general import InferenceRequest


def get_executor_requests(
    requests: List[InferenceRequest],
    streaming: bool,
    eos_id: int,
    pad_id: int,
) -> List[trtllm.Request]:
    """Generate a list of TRT-LLM Executor requests.

    Args:
        requests (List[InferenceRequest]): A list of inference requests for
            processing. The list is drained in place as it is converted.
        streaming (bool): Enable streaming for the generated requests.
        eos_id (int): End-of-sequence token identifier.
        pad_id (int): Padding token identifier.

    Returns:
        List[trtllm.Request]: A list of TRT-LLM Executor request instances.
    """
    executor_requests = []
    # Pop each request off the input list so the source objects can be
    # garbage collected as soon as their executor counterparts are built.
    while requests:
        request = requests.pop()
        executor_requests.append(
            get_executor_request(request,
                                 pad_id=pad_id,
                                 eos_id=eos_id,
                                 streaming=streaming))
        del request

    return executor_requests
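

# Illustrative usage sketch (not part of the upstream module). The eos_id and
# pad_id values below are assumptions; in practice they come from the model's
# tokenizer:
#
#   executor_requests = get_executor_requests(dataset_requests,
#                                             streaming=False,
#                                             eos_id=2,
#                                             pad_id=0)
#
# Because the helper drains its input via pop(), pass a copy
# (list(dataset_requests)) if the original requests are still needed.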


def get_executor_request(request: InferenceRequest,
                         pad_id: int,
                         eos_id: int,
                         streaming: bool = False) -> trtllm.Request:
    """Generate a TRT-LLM Executor request.

    Args:
        request (InferenceRequest): An inference request for processing.
        pad_id (int): Padding token identifier.
        eos_id (int): End-of-sequence token identifier.
        streaming (bool, optional): Enable streaming for this request.
            Defaults to False.

    Returns:
        trtllm.Request: A TRT-LLM Executor request instance.
    """
    return trtllm.Request(
        input_token_ids=request.input_ids,
        max_tokens=request.output_tokens,
        stop_words=[],
        bad_words=[],
        streaming=streaming,
        # Return only generated tokens; do not echo the prompt in the output.
        output_config=trtllm.OutputConfig(exclude_input_from_output=True),
        pad_id=pad_id,
        end_id=eos_id,
    )
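

# Illustrative sketch of how a request built above might be submitted; the
# engine_dir variable and token IDs are assumptions, and error handling is
# omitted:
#
#   executor = trtllm.Executor(engine_dir,
#                              trtllm.ModelType.DECODER_ONLY,
#                              trtllm.ExecutorConfig())
#   request_id = executor.enqueue_request(
#       get_executor_request(request, pad_id=0, eos_id=2, streaming=False))
#   responses = executor.await_responses(request_id)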


def get_settings_from_engine(
    engine_path: Path
) -> Tuple[Dict[str, Union[str, int]], Dict[str, Union[str, int]]]:
    """Retrieve basic engine information.

    Args:
        engine_path (Path): Path to a TRT-LLM engine directory.

    Returns:
        Tuple[Dict[str, Union[str, int]], Dict[str, Union[str, int]]]: The
            runtime configuration and the raw build configuration parsed
            from the engine at engine_path.
    """
    config_path = engine_path / "config.json"
    runtime_config = {}

    with open(config_path, "r") as config_json:
        config = json.load(config_json)

    engine_world_map = config["pretrained_config"]["mapping"]
    engine_build_cfg = config["build_config"]
    engine_parallel_map = engine_build_cfg["auto_parallel_config"]

    # Parallelism layout the engine was built with.
    world_config = {
        "pp_size": engine_world_map["pp_size"],
        "tp_size": engine_world_map["tp_size"],
        "world_size": engine_world_map["world_size"],
        "gpus_per_node": engine_parallel_map["gpus_per_node"],
    }

    # Scheduling limits baked into the engine at build time.
    executor_settings = {
        "max_batch_size": engine_build_cfg["max_batch_size"],
        "max_num_tokens": engine_build_cfg["max_num_tokens"],
    }

    runtime_config.update({
        "sw_version": config["version"],
        "engine_dir": str(engine_path.absolute()),
        "settings_config": executor_settings,
        "world_config": world_config,
    })

    runtime_config["performance_options"] = {}
    runtime_config["decoding_config"] = {
        "decoding_mode": engine_build_cfg["speculative_decoding_mode"]
    }
    return runtime_config, engine_build_cfg
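

# For reference, a minimal config.json shape this parser expects; the keys
# mirror the lookups above, while every value is illustrative:
#
#   {
#       "version": "0.x.y",
#       "pretrained_config": {
#           "mapping": {"pp_size": 1, "tp_size": 1, "world_size": 1}
#       },
#       "build_config": {
#           "max_batch_size": 2048,
#           "max_num_tokens": 8192,
#           "speculative_decoding_mode": 1,
#           "auto_parallel_config": {"gpus_per_node": 8}
#       }
#   }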