mirror of https://github.com/NVIDIA/TensorRT-LLM.git
import argparse

import tensorrt_llm.bindings.executor as trtllm
# This example shows how to use the Python bindings to create an executor,
# enqueue a request, and get the generated tokens.

# First, follow the steps in README.md to generate the engines.
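# A typical invocation (both the script name and the engine directory below
# are hypothetical placeholders, not part of the original example):
#   python example_basic.py --model_path <engine_dir>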
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Executor Bindings Example")
    parser.add_argument("--model_path",
                        type=str,
                        required=True,
                        help="Directory containing model engine")
    args = parser.parse_args()

    # Create the executor. The ExecutorConfig argument is the maximum beam
    # width; 1 means no beam search.
    executor = trtllm.Executor(args.model_path, trtllm.ModelType.DECODER_ONLY,
                               trtllm.ExecutorConfig(1))
    if executor.can_enqueue_requests():
        # Create the request.
        request = trtllm.Request(input_token_ids=[1, 2, 3, 4],
                                 max_new_tokens=10)

        # Enqueue the request.
        request_id = executor.enqueue_request(request)

        # Wait for the new tokens.
        responses = executor.await_responses(request_id)
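        # Error check, added here as a sketch on top of the original example:
        # it assumes the executor bindings expose Response.has_error() and
        # Response.error_msg, as in recent releases (verify against your
        # TensorRT-LLM version).
        if responses[0].has_error():
            raise RuntimeError(responses[0].error_msg)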
        # output_token_ids holds one token list per beam.
        output_tokens = responses[0].result.output_token_ids

        # Print tokens.
        print(output_tokens)
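        # A streaming variant, included as a sketch rather than as part of the
        # original example: Request is assumed to accept streaming=True and
        # each partial Response's result to carry is_final, as in recent
        # TensorRT-LLM releases (names may differ in your version).
        streaming_request = trtllm.Request(input_token_ids=[1, 2, 3, 4],
                                           max_new_tokens=10,
                                           streaming=True)
        streaming_id = executor.enqueue_request(streaming_request)
        finished = False
        while not finished:
            # Each call blocks until at least one response arrives; the last
            # response for this request has result.is_final set.
            for response in executor.await_responses(streaming_id):
                result = response.result
                finished = result.is_final
                print(result.output_token_ids)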