TensorRT-LLMs/examples/high-level-api/llm_examples.py

#!/usr/bin/env python3
import asyncio
import inspect
import sys
from argparse import ArgumentParser
from typing import List, Optional

from tensorrt_llm import LLM, ModelConfig

# NOTE, Currently, the following examples are only available for LLaMA models.


def run_llm_from_huggingface_model(prompts: List[str],
                                   llama_model_dir: str,
                                   dump_engine_dir: Optional[str] = None):
    ''' Loading a HuggingFace model. '''
    # Load the model from a local HuggingFace model
    config = ModelConfig(model_dir=llama_model_dir)
    llm = LLM(config)
    if dump_engine_dir:
        llm.save(dump_engine_dir)

    for output in llm.generate(prompts):
        print(output)


def run_llm_from_tllm_engine(prompts: List[str], llama_engine_dir: str):
    ''' Loading a built TensorRT-LLM engine.  '''
    config = ModelConfig(model_dir=llama_engine_dir)
    llm = LLM(config)

    for output in llm.generate(prompts):
        print(output)


def run_llm_on_tensor_parallel(prompts: List[str], llama_model_dir: str):
    ''' Running LLM with Tensor Parallel on multiple GPUs. '''
    config = ModelConfig(model_dir=llama_model_dir)
    config.parallel_config.tp_size = 2  # 2 GPUs

    llm = LLM(config)

    for output in llm.generate(prompts):
        print(output)


def run_llm_generate_async_example(prompts: List[str],
                                   llama_model_dir: str,
                                   streaming: bool = False):
    ''' Running LLM generation asynchronously. '''
    config = ModelConfig(model_dir=llama_model_dir)

    llm = LLM(config, async_mode=True)

    async def task(prompt: str):
        outputs = []
        async for output in llm.generate_async(prompt, streaming=streaming):
            outputs.append(output.text)
        print(' '.join(outputs))

    async def main():
        tasks = [task(prompt) for prompt in prompts]
        await asyncio.gather(*tasks)

    asyncio.run(main())


def run_llm_with_quantization(prompts: List[str],
                              llama_model_dir: str,
                              engine_dump_dir: Optional[str] = None,
                              quant_type: str = 'int4_awq'):
    ''' Running LLM with quantization.
    quant_type could be 'int4_awq' or 'fp8'.
    '''

    config = ModelConfig(model_dir=llama_model_dir)
    if quant_type == 'int4_awq':
        config.quant_config.init_from_description(quantize_weights=True,
                                                  use_int4_weights=True,
                                                  per_group=True)
    else:
        config.quant_config.set_fp8_qdq()
        config.quant_config.set_fp8_kv_cache()

    llm = LLM(config)

    for output in llm.generate(prompts):
        print(output)


def _parse_arguments():
    parser = ArgumentParser()
    parser.add_argument('--task', type=str, choices=_get_functions())
    parser.add_argument('--hf_model_dir',
                        type=str,
                        help='The directory of the model.')
    parser.add_argument('--dump_engine_dir',
                        type=str,
                        help='The directory to dump the engine.',
                        default=None)
    parser.add_argument('--quant_type', type=str, choices=['int4_awq', 'fp8'])
    parser.add_argument('--prompt', type=str, required=True)
    return parser.parse_args()


def _get_functions():
    cur_module = sys.modules[__name__]
    function_names = [
        name for name, _ in inspect.getmembers(cur_module, inspect.isfunction)
        if not name.startswith('_')
    ]
    return function_names


if __name__ == '__main__':
    args = _parse_arguments()

    tasks = dict(
        run_llm_from_huggingface_model=lambda: run_llm_from_huggingface_model(
            [args.prompt], args.hf_model_dir, args.dump_engine_dir),
        run_llm_from_tllm_engine=lambda: run_llm_from_tllm_engine(
            [args.prompt], args.dump_engine_dir),
        run_llm_on_tensor_parallel=lambda: run_llm_on_tensor_parallel(
            [args.prompt], args.hf_model_dir),
        run_llm_generate_async_example=lambda: run_llm_generate_async_example(
            [args.prompt], args.hf_model_dir),
        run_llm_with_quantization=lambda: run_llm_with_quantization([
            args.prompt
        ], args.hf_model_dir, args.dump_engine_dir, args.quant_type),
    )

    print(f'Running {args.task} ...')

    tasks[args.task]()