Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-02-04 10:11:47 +08:00
Update TensorRT-LLM (Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>)
56 lines · 1.8 KiB · Python
from pathlib import Path

from tensorrt_llm.executor import GenerationExecutor, GenerationRequest

def test_sync_generation():
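    """Exercise GenerationExecutor end to end: synchronous generate(),
    streaming generate_async(), and the low-level submit() interface,
    against a prebuilt GPT-2 engine from the C++ test resources."""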
    model_path = Path(
        "cpp/tests/resources/models/rt_engine/gpt2/fp16-plugin-packed-paged/tp1-pp1-gpu/"
    )
    tokenizer = "gpt2"
    prompt = "deep learning"
    max_new_tokens = 4
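    # Build the executor from the engine directory and the "gpt2" tokenizer name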
    executor = GenerationExecutor(model_path, tokenizer)

    # Simple generations (synchronous)
    result = executor.generate(prompt, max_new_tokens=max_new_tokens)
    print(result.text)

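    # Batched call: one entry in max_new_tokens per prompt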
    results = executor.generate(
        [prompt, prompt], max_new_tokens=[max_new_tokens, 2 * max_new_tokens])
    for result in results:
        print(result.text)

    # Simple generations (asynchronous)
    #
    # Iterate the partial results when streaming
    future = executor.generate_async(prompt,
                                     streaming=True,
                                     max_new_tokens=max_new_tokens)
    for partial_result in future:
        print(partial_result.text)

    # Streaming results in a nested loop
    futures = executor.generate_async(
        [prompt, prompt],
        streaming=True,
        max_new_tokens=[max_new_tokens, 2 * max_new_tokens])
    for future in futures:
        for partial_result in future:
            print(partial_result.text)

    # Low-level API with .submit
    # Submit a batch of requests
    futures = []
    for _ in range(5):
        futures.append(
            executor.submit(
                GenerationRequest(prompt, max_new_tokens=[max_new_tokens])))

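    # Handle responses in completion order rather than submission order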
print("We have sent the requests: ", [id(f) for f in futures])
|
|
for future in executor.wait_first_completed(futures):
|
|
print(
|
|
f"Request {id(future)} has finished: {future.wait_completion(timeout=0).text}"
|
|
)
|
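

# A minimal sketch for running the test as a standalone script, assuming the
# GPT-2 engine directory above has already been built by the repository's
# test-resource scripts:
if __name__ == "__main__":
    test_sync_generation()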