TensorRT-LLMs/examples/server/test_executor.py

from pathlib import Path

from tensorrt_llm.executor import GenerationExecutor, GenerationRequest

def test_sync_generation():
    model_path = Path(
        "cpp/tests/resources/models/rt_engine/gpt2/fp16-plugin-packed-paged/tp1-pp1-gpu/"
    )
    tokenizer = "gpt2"
    prompt = "deep learning"
    max_new_tokens = 4
    executor = GenerationExecutor(model_path, tokenizer)
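    # The engine above is a prebuilt fp16 GPT-2 TensorRT engine (TP=1, PP=1,
    # paged KV cache, per the directory name); "gpt2" presumably names the
    # matching Hugging Face tokenizer.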

    # Simple generations (synchronous)
    result = executor.generate(prompt, max_new_tokens=max_new_tokens)
    print(result.text)
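
    # A minimal sanity check one might add here (a sketch, assuming
    # result.text is the decoded output as a plain string):
    assert isinstance(result.text, str)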

    results = executor.generate(
        [prompt, prompt], max_new_tokens=[max_new_tokens, 2 * max_new_tokens])
    for result in results:
        print(result.text)
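
    # The list-valued max_new_tokens pairs positionally with the prompts:
    # the first request may generate up to 4 new tokens, the second up to 8.
    # A sketch restating the call above to make the pairing explicit
    # (assuming results preserve input order):
    batched = executor.generate(
        [prompt, prompt], max_new_tokens=[max_new_tokens, 2 * max_new_tokens])
    for req_len, result in zip([max_new_tokens, 2 * max_new_tokens], batched):
        print(f"requested up to {req_len} new tokens ->", result.text)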

    # Simple generations (asynchronous)
    #
    # Iterate the partial results when streaming
    future = executor.generate_async(prompt,
                                     streaming=True,
                                     max_new_tokens=max_new_tokens)
    for partial_result in future:
        print(partial_result.text)
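
    # Sketch of a common streaming pattern, assuming each partial_result.text
    # holds the text decoded so far: keep only the last partial result as the
    # final output.
    final_text = None
    stream = executor.generate_async(prompt,
                                     streaming=True,
                                     max_new_tokens=max_new_tokens)
    for partial_result in stream:
        final_text = partial_result.text
    print("Final streamed text:", final_text)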

    # Streaming results in a nested loop
    futures = executor.generate_async(
        [prompt, prompt],
        streaming=True,
        max_new_tokens=[max_new_tokens, 2 * max_new_tokens])
    for future in futures:
        for partial_result in future:
            print(partial_result.text)
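
    # Each future above corresponds to one input prompt; the second stream
    # may yield roughly twice as many partial results, since it was given
    # twice the token budget (assuming one partial result per new token).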

    # Low-level API with .submit
    # Submit a batch of requests
    futures = []
    for _ in range(5):
        futures.append(
            executor.submit(
                GenerationRequest(prompt, max_new_tokens=[max_new_tokens])))
    # (id() here is just each future's Python object identity, used as a
    # human-readable label for the request)
    print("Sent requests:", [id(f) for f in futures])

    for future in executor.wait_first_completed(futures):
        print(
            f"Request {id(future)} has finished: {future.wait_completion(timeout=0).text}"
        )
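

# Convenience guard so the test can also be run directly as a script
# (pytest would discover test_sync_generation() without it):
if __name__ == "__main__":
    test_sync_generation()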