import typing as tp
from pathlib import Path

import pytest
import torch
from bindings.binding_test_utils import *
from transformers import AutoTokenizer
from utils.cpp_paths import *
from utils.llm_data import llm_models_root

from tensorrt_llm.runtime.model_runner_cpp import ModelRunnerCpp
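
# The star imports are expected to supply the test helpers used below
# (prepare_model_tests) and the pytest fixtures llm_root, resource_path,
# results_data_path and model_path.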

@pytest.fixture
def model_files(llm_root: Path, resource_path: Path, results_data_path: Path):
    # Model engines and expected outputs need to be generated.
    print(results_data_path)
    if not results_data_path.exists():
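        # Engines and reference outputs are built only when missing, so
        # repeated test runs reuse the cached artifacts.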
        model_cache = llm_models_root()
        model_cache_arg = (["--model_cache", str(model_cache)]
                           if model_cache is not None else [])
        prepare_model_tests(llm_root, resource_path, "gpt", model_cache_arg)


def test_logits_post_processor(model_files, model_path):
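
    # A logits post-processor runs inside the decoding loop and can edit the
    # logits in place before sampling: pushing everything to -inf and then
    # restoring index 42 to 0 leaves token 42 as the only finite candidate,
    # so it is selected at every step. stream_ptr carries the runtime's CUDA
    # stream; wrapping it in torch.cuda.ExternalStream keeps the edits
    # ordered with the engine's own kernels.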
    # Define the logits post-processor callback
    def logits_post_processor(req_id: int, logits: torch.Tensor,
                              ids: tp.List[tp.List[int]], stream_ptr: int,
                              client_id: tp.Optional[int]):
        with torch.cuda.stream(torch.cuda.ExternalStream(stream_ptr)):
            logits[:] = float("-inf")
            logits[..., 42] = 0

    # Create ModelRunnerCpp
    logits_processor_map = {"my_logits_pp": logits_post_processor}
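    # Callbacks are registered by name at construction time; a generate()
    # call opts in by passing the matching name via logits_processor_names.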
    runner = ModelRunnerCpp.from_dir(model_path,
                                     logits_processor_map=logits_processor_map)

    model_root = llm_models_root(check=True)
    hf_model_dir = Path(model_root, "gpt2")
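
    # Left-side padding/truncation keeps the end of the prompt adjacent to
    # the generated tokens, as decoder-only models such as GPT-2 expect.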
    tokenizer = AutoTokenizer.from_pretrained(hf_model_dir,
                                              padding_side="left",
                                              truncation_side="left",
                                              trust_remote_code=True,
                                              use_fast=True)

    input_text = "Born in north-east France, Soyer trained as a"
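    # The runner takes a list of 1-D token tensors, one per input sequence;
    # int32 is the dtype used throughout this test.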
    batch_input_ids = [
        torch.tensor(tokenizer.encode(input_text,
                                      add_special_tokens=True,
                                      truncation=True),
                     dtype=torch.int32)
    ]
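
    # GPT-2's tokenizer defines no pad token, so fall back to the EOS token.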
    pad_token_id = tokenizer.pad_token_id
    if tokenizer.pad_token_id is None:
        pad_token_id = tokenizer.eos_token_id

    # Create the request
    max_new_tokens = 5
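    # logits_processor_names selects the registered "my_logits_pp" callback
    # for this request.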
    with torch.no_grad():
        outputs = runner.generate(batch_input_ids=batch_input_ids,
                                  max_new_tokens=max_new_tokens,
                                  end_id=tokenizer.eos_token_id,
                                  pad_id=pad_token_id,
                                  output_sequence_lengths=True,
                                  return_dict=True,
                                  logits_processor_names={"my_logits_pp"})
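
    # Wait for all device work, including the stream-ordered logit edits,
    # to finish before reading the outputs.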
    torch.cuda.synchronize()

    # Get the new tokens
    tokens = outputs['output_ids']
    sequence_lengths = outputs['sequence_lengths']

    output_begin = len(batch_input_ids[0])
    output_end = sequence_lengths[0][0]
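
    # output_ids is indexed [batch, beam, token] and echoes the prompt before
    # the new tokens, so the generated ids sit in [output_begin, output_end).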
    # Check that all output tokens are 42
    expected = [42] * max_new_tokens
    assert tokens[0][0][output_begin:output_end].tolist() == expected