mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
* Update TensorRT-LLM --------- Co-authored-by: Sherlock Xu <65327072+Sherlock113@users.noreply.github.com>
68 lines
2.4 KiB
Python
68 lines
2.4 KiB
Python
import argparse
import pathlib as pl

import numpy as np

import tensorrt_llm.bindings.executor as trtllm

# This example shows how to use the python bindings to create an executor,
# enqueue a request, and get the generated tokens.

# First, follow the steps in README.md to generate the engines.

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Executor Bindings Example")
    parser.add_argument("--model_path",
                        type=str,
                        required=True,
                        help="Directory containing model engine")
    parser.add_argument("--dump_tensors",
                        action="store_true",
                        help="Dump debug tensors to files")
    args = parser.parse_args()

    # Number of tokens to generate for the single example request.
    max_tokens = 2

    # Select which tensors should be kept or dumped.
    # NOTE(review): debug_tensors_max_iterations=0 appears to mean "dump to
    # files" (read back below from /tmp/tllm_debug), while a positive value
    # keeps that many iterations in memory — confirm against the
    # trtllm.DebugConfig documentation.
    debug_config = trtllm.DebugConfig(
        debug_tensor_names=["sequence_length"],
        debug_tensors_max_iterations=0 if args.dump_tensors else max_tokens)

    # Create the executor.
    executor = trtllm.Executor(
        args.model_path, trtllm.ModelType.DECODER_ONLY,
        trtllm.ExecutorConfig(1, debug_config=debug_config))

    if executor.can_enqueue_requests():
        # Create the request.
        request = trtllm.Request(input_token_ids=[1, 2, 3, 4],
                                 max_tokens=max_tokens)

        # Enqueue the request.
        request_id = executor.enqueue_request(request)

        # Wait for the new tokens.
        responses = executor.await_responses(request_id)
        output_tokens = responses[0].result.output_token_ids

        # Print tokens.
        print(output_tokens)

        if args.dump_tensors:
            # Debug tensors were written to disk; read them back with numpy.
            print("debug tensors from files:")
            debug_dir = pl.Path("/tmp/tllm_debug/PP_1/TP_1")
            if debug_dir.is_dir():
                # One subdirectory per generation iteration.
                for iter_dir in [x for x in debug_dir.iterdir() if x.is_dir()]:
                    print(iter_dir.name)
                    for file in [x for x in iter_dir.iterdir() if x.is_file()]:
                        print(file.name, np.load(file))
            else:
                print("debug dir not found")
        else:
            # Debug tensors were kept in memory; fetch them from the executor.
            print("debug tensors from queue:")
            debug_tensors = executor.get_latest_debug_tensors()
            for debug_iter in debug_tensors:
                print(f"iteration {debug_iter.iter}")
                for name, tensor in debug_iter.debug_tensors.items():
                    print(name, tensor)