TensorRT-LLM/examples/high-level-api/run_examples.py
#!/usr/bin/env python
import subprocess
import sys

import click

from llm_examples import *
from tensorrt_llm.hlapi.utils import print_colored


@click.group()
def cli():
    pass


@click.command('run_single_gpu')
@click.option('--prompt', type=str, default="What is LLM?")
@click.option('--model_dir', type=str, help='The directory of the model.')
@click.option('--examples_root',
              type=str,
              help='The root directory of the examples.')
@click.option('--llm_examples',
              type=str,
              help='The path to llm_examples.py.',
              default='llm_examples.py')
@click.option('--engine_dir',
              type=str,
              help='The directory of the engine.',
              default="/tmp/hlapi.engine.example")
def run_single_gpu(
    prompt: str,
    model_dir: str,
    examples_root: str,
    llm_examples: str,
    engine_dir: str,
):
    run_example(
        "Running LLM from HuggingFace model",
        f"{sys.executable} {llm_examples} run_llm_generate --prompt=\"{prompt}\" --model_dir={model_dir} --engine_dir={engine_dir}"
    )
    run_example(
        "Running LLM from built engine with streaming enabled",
        f"{sys.executable} {llm_examples} run_llm_generate_async_example --prompt=\"{prompt}\" --model_dir={engine_dir} --streaming"
    )
    run_example(
        "Running LLM with async future",
        f"{sys.executable} {llm_examples} run_llm_with_async_future --prompt=\"{prompt}\" --model_dir={engine_dir}"
    )


@click.command("run_multi_gpu")
@click.option('--prompt', type=str, default="What is LLM?")
@click.option('--model_dir', type=str, help='The directory of the model.')
@click.option('--examples_root',
              type=str,
              help='The root directory of the examples.')
@click.option('--llm_examples',
              type=str,
              help='The path to llm_examples.py.',
              default='llm_examples.py')
@click.option('--engine_dir',
              type=str,
              help='The directory of the engine.',
              default="/tmp/hlapi.engine.example")
@click.option('--run_autopp',
              type=bool,
              help='Whether to run with auto parallel.',
              default=True)
def run_multi_gpu(
    prompt: str,
    model_dir: str,
    examples_root: str,
    llm_examples: str,
    engine_dir: str,
    run_autopp: bool = True,
):
    run_example(
        "Running LLM from HuggingFace model with TP enabled",
        f"{sys.executable} {llm_examples} run_llm_generate --prompt=\"{prompt}\" --model_dir={model_dir} --tp_size=2 --engine_dir={engine_dir}.tp2"
    )
    run_example(
        "Running LLM from built engine with streaming enabled and TP=2",
        f"{sys.executable} {llm_examples} run_llm_generate_async_example --prompt=\"{prompt}\" --model_dir={engine_dir}.tp2 --streaming"
    )  # Loading the engine with TP=2.
    if run_autopp:
        run_example(
            "Running LLM with auto parallel",
            f"{sys.executable} {llm_examples} run_llm_with_auto_parallel --prompt=\"{prompt}\" --model_dir={model_dir} --world_size=2"
        )


@click.command("run_quant")
@click.option('--prompt', type=str, default="What is LLM?")
@click.option('--model_dir', type=str, help='The directory of the model.')
@click.option('--examples_root',
              type=str,
              help='The root directory of the examples.')
@click.option('--llm_examples',
              type=str,
              help='The path to llm_examples.py.',
              default='llm_examples.py')
def run_quant(
    prompt: str,
    model_dir: str,
    examples_root: str,
    llm_examples: str,
):
    run_example(
        "Running LLM with quantization",
        f"{sys.executable} {llm_examples} run_llm_with_quantization --quant_type=int4_awq --prompt=\"{prompt}\" --model_dir={model_dir}"
    )


def run_example(hint: str, command: str):
    # Print the hint, echo the command, then run it; fail fast on a non-zero exit.
    print_colored(hint + "\n", "bold_green")
    print(command)
    subprocess.run(command, shell=True, check=True)


if __name__ == '__main__':
    cli.add_command(run_single_gpu)
    cli.add_command(run_multi_gpu)
    cli.add_command(run_quant)
    cli()
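
# Example invocations of the commands registered above (illustrative only; the
# model path below is a placeholder, not part of the original script):
#   python run_examples.py run_single_gpu --model_dir=/path/to/hf_model
#   python run_examples.py run_multi_gpu --model_dir=/path/to/hf_model --run_autopp=True
#   python run_examples.py run_quant --model_dir=/path/to/hf_model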