Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-26 21:53:30 +08:00
* Use updateDecoderBuffers in python decoder.
* Fix synchronize in trtllm decoder.
* Enable by default.
* Use guided_decoder to set up seq slots and free them.
* Always use decode_async and update_requests.
* Update decoder buffers.
* Fix speculative decoding tests.
* Send new_tensors_host instead of assuming dict.
* Make default False in enable_trtllm_decoder.
* Partially fix MTP, partially fix py_executor.
* Update request states before sending disagg ctx cache.
* Fix disagg test for torch decoder.
* Add isend_tensor_list and recv_tensor_list for sending tensors_host.
* Formatting.
* Fix rebase.
* Add disagg serving case to guided decoder.
* Get overlap scheduling to work.
* Update cutlass to main.
* Update after rebasing.
* Formatting.
* Update to use decode_async and update_requests.
* Properly pass information to update_requests.
* Formatting.
* Make disaggregated serving a step closer to working.
* Fix rebase.
* Fix rebase and format.
* Copy new device tokens more pythonically.
* Restore MTP add dummy reqs.
* Add OrderedDict import to py_executor.
* Formatting.
* Add seq slot manager and a test.
* Use single-tensor transmission except when a list of tensors is received.
* Add TRTLLMDecoder allocation to estimate max KV cache tokens.
* Add stream synchronization.
* Formatting.
* Make the decoder memory calculation adapt to the chosen decoder. Recognize the decoder option passed in ExecutorConfig. Make the overlap scheduler test run on TinyLlama.
* Format.
* Add decoder creation to estimate max KV.
* Formatting.
* Update UCXX submodule in line with main.

Signed-off-by: Daniel Campora <961215+dcampora@users.noreply.github.com>
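Several commits above introduce isend_tensor_list and recv_tensor_list helpers for shipping new_tensors_host between ranks instead of assuming a dict. A minimal sketch of that idea using plain torch.distributed point-to-point calls; the helper names mirror the commit messages, but the repository's actual implementation is not shown here and may differ:

import torch.distributed as dist

def isend_tensor_list(tensors, dst):
    # Post one non-blocking send per host tensor; the caller waits on the
    # returned work handles before reusing the buffers.
    return [dist.isend(tensor, dst=dst) for tensor in tensors]

def recv_tensor_list(tensors, src):
    # Receive into preallocated tensors of matching shape and dtype.
    for tensor in tensors:
        dist.recv(tensor, src=src)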
83 lines | 2.7 KiB | Python
import json
from pathlib import Path

import pytest
from utils.llm_data import llm_models_root
from utils.util import similar

from tensorrt_llm import SamplingParams
from tensorrt_llm._torch import LLM
from tensorrt_llm._torch.pyexecutor.config import PyTorchConfig
from tensorrt_llm.llmapi import KvCacheConfig as TRT_KvCacheConfig


# A test case of mmlu_llama from lm_eval
@pytest.fixture(scope="module")
def test_case():
    with open(Path(__file__).parent / "test_overlap_scheduler_input.json") as f:
        return json.load(f)


@pytest.fixture(scope="module")
def model_path():
    return llm_models_root() / "llama-models-v2/TinyLlama-1.1B-Chat-v1.0"

def create_llm(model_dir):
    """Create an LLM configured with CUDA graphs and the TRTLLM decoder."""
    pytorch_config = PyTorchConfig(use_cuda_graph=True,
                                   enable_trtllm_decoder=True)

    trt_kv_cache_config = TRT_KvCacheConfig(enable_block_reuse=False)

    return LLM(
        model=str(model_dir),
        tensor_parallel_size=1,
        trust_remote_code=True,
        enable_chunked_prefill=True,
        pytorch_backend_config=pytorch_config,
        kv_cache_config=trt_kv_cache_config,
        # A single request longer than max_num_tokens is enough to exercise
        # chunked prefill.
        max_num_tokens=128,
    )

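# Note (not part of the original file): per the commit list above,
# enable_trtllm_decoder defaults to False, i.e. the Python decoder; the
# config in create_llm() explicitly opts into the TRTLLM decoder path that
# this change introduces.
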
def test_trtllm_decoder(model_path, test_case):
    prompts = [
        "Magellan and Elcano lead the first",
        "The capital of France is",
        "The capital of Bolivia is",
    ]

    expected_outputs = [["circumnavigation of the world."], ["Paris."],
                        ["La Paz."]]

    # Test configuration
    max_new_tokens = test_case["max_new_tokens"]
    temperature = test_case["temperature"]
    top_p = test_case["top_p"]
    stop_words = test_case["stop_words"]

    sampling_config = SamplingParams(max_tokens=max_new_tokens,
                                     beam_width=1,
                                     stop=stop_words,
                                     temperature=temperature,
                                     top_p=top_p)

    # Run generation with the TRTLLM decoder enabled
    llm = create_llm(model_path)
    outputs = llm.generate(prompts,
                           sampling_params=sampling_config,
                           use_tqdm=True)
    texts = [[completion.text for completion in request_output.outputs]
             for request_output in outputs]
    llm.shutdown()

    # Keep only the text before the first "\n\n"; texts is a list of lists
    # of strings (one list of completions per request).
    texts = [[text.split('\n\n')[0] for text in request_output]
             for request_output in texts]

    # Verify the outputs match the expected completions
    for text, expected in zip(texts, expected_outputs):
        assert similar(text, expected)
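
# Convenience entry point (a sketch, not part of the original file): run this
# module's tests directly with Python instead of invoking pytest externally.
if __name__ == "__main__":
    import sys
    sys.exit(pytest.main([__file__, "-q"]))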