# TensorRT-LLMs/tests/model_api/test_model_level_api.py

import asyncio
import os
import sys
import tempfile
from contextlib import contextmanager

from profile_utils import profile
from transformers import AutoTokenizer

import tensorrt_llm
from tensorrt_llm.builder import BuildConfig, build
from tensorrt_llm.executor import GenerationExecutor, SamplingParams
from tensorrt_llm.models import LLaMAForCausalLM
from tensorrt_llm.models.llama.config import LLaMAConfig

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.llm_data import llm_models_root
from utils.util import force_ampere

# Run every test in this module with verbose TensorRT-LLM logging.
tensorrt_llm.logger.set_level('verbose')

# Prompts fed to the engine by the generation tests in this module.
batch_input_text = [
    "Born in north-east France, Soyer trained as a",
    "What is large language model?"
]
# Expected continuations, index-aligned with batch_input_text. The tests
# check that the decoded generation ends with the matching entry (endswith).
batch_output_text_expected = [
    "chef in Paris and London before moving to New York",
    "\nLarge language model is a model that is"
]


@contextmanager
def workspace(suffix, prefix="./trtllm_workspace"):
    '''Yield a directory path to be used as a scratch workspace.

    When the ``TRTLLM_KEEP`` environment variable is unset (or empty), a
    temporary directory whose name ends with ``suffix`` is created and it is
    removed as soon as the ``with`` block exits, including when the block
    raises. Otherwise the directory ``{prefix}/{suffix}`` is created if
    needed and left on disk afterwards.

    Args:
        suffix: Name suffix of the temporary directory, or the sub-directory
            name under ``prefix`` when the workspace is kept.
        prefix: Parent directory used only when ``TRTLLM_KEEP`` is set.

    Yields:
        The path of the workspace directory as a string.
    '''
    # NOTE: any non-empty value of TRTLLM_KEEP (even "0") enables keeping.
    keep_workspace = os.environ.get("TRTLLM_KEEP", False)
    if not keep_workspace:
        # Enter the TemporaryDirectory so cleanup is explicit and immediate
        # rather than left to the object's finalizer.
        with tempfile.TemporaryDirectory(suffix) as temp_dir:
            yield temp_dir
    else:
        kept_dir = f"{prefix}/{suffix}"
        os.makedirs(kept_dir, exist_ok=True)
        yield kept_dir


# 233s on ipp1-1197: loading weights 37s, network/engine 27s, save engine: 35s, load engine (14GB) about 100s
@profile("save-and-load")
@force_ampere
def test_save_load():
    '''Build a LLaMA engine, save it to disk, then load it back and generate.

    The engine is written with ``engine.save(engine_dir)`` and read back by
    ``GenerationExecutor.create(engine_dir)``. Every decoded generation must
    end with the matching entry of ``batch_output_text_expected``.
    '''
    max_batch_size, max_isl, max_osl = 8, 256, 256
    hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf")

    with workspace("llama-save-load") as engine_dir:
        # build and run by one llama object
        llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir)
        build_config = BuildConfig(max_batch_size=max_batch_size,
                                   max_input_len=max_isl,
                                   max_seq_len=max_osl + max_isl,
                                   plugin_config=llama.default_plugin_config())
        build_config.plugin_config.gemm_plugin = 'auto'  # faster build
        engine = build(llama, build_config)
        engine.save(engine_dir)

        tokenizer = AutoTokenizer.from_pretrained(hf_model_dir)
        # Tokenization does not need the executor, so do it up front.
        batch_input_ids = [tokenizer.encode(inp) for inp in batch_input_text]

        # use context manager to make sure the __exit__ can release the resources immediately
        with GenerationExecutor.create(engine_dir) as executor:
            outputs = executor.generate(
                batch_input_ids, sampling_params=SamplingParams(max_tokens=10))

            for idx, output in enumerate(outputs):
                expected = batch_output_text_expected[idx]
                tensorrt_llm.logger.info(f"Input: {batch_input_text[idx]}")
                output_text = tokenizer.decode(output.outputs[0].token_ids)
                tensorrt_llm.logger.info(f'Output: {output_text}')
                # Only the suffix is compared, so any text preceding the
                # expected continuation is ignored.
                assert output_text.endswith(expected), (
                    f"Expecting: {expected!r} Got: {output_text!r}")


@profile(tag="fake-weights")
@force_ampere
def test_high_level_fake_weights():
    '''Sanity check that the config -> model -> build flow runs end to end.

    The model is constructed from the Hugging Face config only (fake
    weights), and the engine is neither saved nor loaded, which keeps this
    check fast.
    '''
    # Build limits: batch size plus input / output sequence lengths.
    max_batch_size = 8
    max_isl = max_osl = 256
    hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf")

    # Only the architecture description is read from the checkpoint folder.
    model = LLaMAForCausalLM(LLaMAConfig.from_hugging_face(hf_model_dir))

    build_config = BuildConfig(
        max_batch_size=max_batch_size,
        max_input_len=max_isl,
        max_seq_len=max_osl + max_isl,
        plugin_config=model.default_plugin_config(),
    )
    build_config.plugin_config.gemm_plugin = 'auto'  # faster build

    # The resulting engine is intentionally discarded.
    build(model, build_config)


@force_ampere
def test_async_io():
    '''Build and save a LLaMA engine, then drive it through the async API.

    One task per prompt in ``batch_input_text`` is submitted concurrently.
    Each task first runs a non-streaming request and checks that the decoded
    text ends with the matching entry of ``batch_output_text_expected``, then
    runs the same prompt in streaming mode and logs each streamed result.
    '''
    max_batch_size, max_isl, max_osl = 8, 256, 256
    hf_model_dir = str(llm_models_root() / "llama-models/llama-7b-hf")

    llama = LLaMAForCausalLM.from_hugging_face(hf_model_dir)
    build_config = BuildConfig(max_batch_size=max_batch_size,
                               max_input_len=max_isl,
                               max_seq_len=max_osl + max_isl)
    build_config.plugin_config.gemm_plugin = 'auto'  # faster build
    engine = build(llama, build_config)

    tokenizer = AutoTokenizer.from_pretrained(hf_model_dir)

    async def main(engine_dir):
        with GenerationExecutor.create(engine_dir) as async_engine:

            async def generate_and_print(idx, inp):
                # Non-streaming request: wait for the full result, then check it.
                result = async_engine.generate_async(
                    tokenizer.encode(inp),
                    sampling_params=SamplingParams(max_tokens=10),
                    streaming=False)
                await result.aresult()
                output_text = tokenizer.decode(result.outputs[0].token_ids)
                tensorrt_llm.logger.info(output_text)
                assert output_text.endswith(batch_output_text_expected[idx])

                # Streaming request: only logged, no assertion on the content.
                async for stream in async_engine.generate_async(
                        tokenizer.encode(inp),
                        sampling_params=SamplingParams(max_tokens=10),
                        streaming=True):
                    output_text = tokenizer.decode(stream.outputs[0].token_ids)
                    tensorrt_llm.logger.info(
                        f"prompt: {inp!r}, generation: {output_text!r}")

            # submit many request concurrently
            tasks = [
                asyncio.create_task(generate_and_print(idx, inp))
                for idx, inp in enumerate(batch_input_text)
            ]

            # wait all task done
            await asyncio.gather(*tasks)

    # The context manager removes the saved engine even when the test fails;
    # the executor inside main() is closed before the directory is deleted.
    with tempfile.TemporaryDirectory("llama-ifb") as engine_dir:
        engine.save(engine_dir)
        asyncio.run(main(engine_dir))


if __name__ == "__main__":
    # Allow running the module directly, without pytest; the tests run in
    # the order listed here.
    for _test_fn in (test_save_load, test_async_io,
                     test_high_level_fake_weights):
        _test_fn()