# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#  * Redistributions of source code must retain the above copyright
#    notice, this list of conditions and the following disclaimer.
#  * Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#  * Neither the name of NVIDIA CORPORATION nor the names of its
#    contributors may be used to endorse or promote products derived
#    from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

from typing import Dict, Generator, Optional

import numpy as np
import pytest

# Use PYTHONPATH=../inflight_batcher_llm/tensorrt_llm_bls/1/
from lib.decode import *
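
# Example invocation (the relative path is taken from the comment above and
# may differ per checkout; adjust to your tree):
#   PYTHONPATH=../inflight_batcher_llm/tensorrt_llm_bls/1/ pytest <this file>

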
class MockDecoder(Decoder):
    """Test double for the BLS ``Decoder``.

    Replays canned draft/target generations from ``data_dict`` and counts
    how often each model is invoked, so the tests can assert on the control
    flow of ``Decoder.decode``.
    """

    def __init__(self,
                 streaming=False,
                 accumulate=False,
                 data_dict: Optional[Dict] = None):
        super().__init__(streaming=streaming, accumulate=accumulate)
        self.data_dict = data_dict
        # Indices of the next canned draft/target responses to replay.
        self.draft_step = -1
        self.target_step = -1

        self.draft_num_calls = 0
        self.target_num_calls = 0

    def preprocess(self, request: Request) -> PreprocResponse:
        return PreprocResponse(
            input_ids=np.array([self.data_dict["input_ids"]]),
            input_lengths=np.array([[len(self.data_dict["input_ids"])]]),
            stop_words_list=np.array([[[]]]))

    def _postprocess(self, tokens: np.ndarray,
                     sequence_lengths: Optional[np.ndarray],
                     gen_response: GenerationResponse) -> Response:
        target_output = self.data_dict["target_output"][self.target_step]
        return Response(text_output=np.array([target_output["output_text"]]))

    def _draft_generate_non_streaming(
            self, preproc: PreprocResponse, request: Request,
            num_draft_tokens: int) -> GenerationResponse:
        self.draft_num_calls += 1
        self.draft_step += 1
        draft_output = self.data_dict["draft_output"][self.draft_step]
        response = GenerationResponse(
            output_ids=np.array([[draft_output["output_ids"]]]),
            generation_logits=None,
            sequence_length=np.array([[draft_output["sequence_length"]]]))
        if self.data_dict.get("use_draft_logits", False):
            response.generation_logits = draft_output["generation_logits"]
        return response

    def _generate(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None,
        multimodal_enc_response: Optional[MultimodalEncResponse] = None
    ) -> Generator[GenerationResponse, None, None]:
        # Streaming path: yield every canned target step in order.
        for idx, target_output in enumerate(self.data_dict["target_output"]):
            self.target_num_calls += 1
            self.target_step = idx
            output_len = len(target_output["output_ids"])
            yield GenerationResponse(
                output_ids=np.array([[target_output["output_ids"]]]),
                sequence_length=np.array([[output_len]]))

    def _generate_non_streaming(
        self,
        preproc: PreprocResponse,
        request: Request,
        draft_request: Optional[DraftRequest] = None,
        multimodal_enc_response: Optional[MultimodalEncResponse] = None
    ) -> GenerationResponse:
        self.target_num_calls += 1
        if not self.data_dict["use_speculative"]:
            # Without speculative decoding, a single non-streaming call
            # returns the full completion; after the increment below,
            # target_step points at the final canned step.
            self.target_step = len(self.data_dict["target_output"]) - 2
        else:
            # With speculative decoding, the target model must receive the
            # draft tokens (and, when enabled, matching draft logits).
            assert draft_request is not None
            if draft_request.draft_input_ids is not None:
                assert draft_request.draft_input_ids.shape[1] > 0
                if self.data_dict.get("use_draft_logits", False):
                    assert draft_request.draft_logits is not None
                    assert draft_request.draft_logits.shape[
                        1] == draft_request.draft_input_ids.shape[1]

        self.target_step += 1
        target_output = self.data_dict["target_output"][self.target_step]
        output_len = len(target_output["output_ids"])
        return GenerationResponse(
            output_ids=np.array([[target_output["output_ids"]]]),
            sequence_length=np.array([[output_len]]))
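

# The fixtures below script an entire decode. Non-speculative cases list one
# target step per generated token; speculative cases pair each target step
# with a draft step (num_draft_tokens=3). All token ids, texts, and logit
# shapes are synthetic test values, not real model output.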
decode_testcases = [
    {
        "text_input": "Deep learning is",
        "max_tokens": 10,
        "use_speculative": False,
        "input_ids": [1, 10, 11, 23],
        "target_output": [{
            "output_ids": [1, 10, 11, 23, 7],
            "output_text": "Deep learning is a"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9],
            "output_text": "Deep learning is a subset"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21],
            "output_text": "Deep learning is a subset of"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22],
            "output_text": "Deep learning is a subset of Machine"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
            "output_text": "Deep learning is a subset of Machine learning"
        }]
    },
    {
        "text_input": "Deep learning is",
        "max_tokens": 10,
        "use_speculative": True,
        "num_draft_tokens": 3,
        "use_draft_logits": False,
        "input_ids": [1, 10, 11, 23],
        "target_output": [{
            "output_ids": [1, 10, 11, 23, 7, 9, 21],
            "output_text": "Deep learning is a subset of"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
            "output_text": "Deep learning is a subset of Machine learning"
        }],
        "draft_output": [{
            "output_ids": [1, 10, 11, 23, 7, 9, 22],
            "sequence_length": 7,
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
            "sequence_length": 9,
        }]
    },
    {
        "text_input": "Deep learning is",
        "max_tokens": 10,
        "use_speculative": True,
        "num_draft_tokens": 3,
        "use_draft_logits": True,
        "input_ids": [1, 10, 11, 23],
        "target_output": [{
            "output_ids": [1, 10, 11, 23, 7, 9, 21],
            "output_text": "Deep learning is a subset of"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
            "output_text": "Deep learning is a subset of Machine learning"
        }],
        "draft_output": [{
            "output_ids": [1, 10, 11, 23, 7, 9, 22],
            "sequence_length": 7,
            "generation_logits": np.random.rand(1, 1, 7, 1024),
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
            "sequence_length": 9,
            "generation_logits": np.random.rand(1, 1, 7, 1024),
        }]
    },
]
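

# Note: MockDecoder wraps each "output_ids" list as np.array([[ids]]), i.e. a
# (batch=1, beam=1, seq_len) tensor, which is the layout these tests assume
# for GenerationResponse tensors.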
@pytest.mark.parametrize("test_case", decode_testcases)
def test_decode(test_case):
    request = Request(
        text_input=np.array([[test_case["text_input"]]], dtype=object),
        max_tokens=np.array([[test_case["max_tokens"]]], dtype=np.int32),
        num_draft_tokens=(np.array([[test_case["num_draft_tokens"]]],
                                   dtype=np.int32)
                          if "num_draft_tokens" in test_case else None),
        use_draft_logits=(np.array([[test_case["use_draft_logits"]]],
                                   dtype=bool)
                          if "use_draft_logits" in test_case else None),
        stop_words=np.array([[[]]]))
    # The last entry of target_output is the expected final response.
    expected_res = Response(text_output=np.array(
        [test_case["target_output"][-1]["output_text"]], dtype=object))

    if not test_case["use_speculative"]:
        # Test non-speculative mode.

        # Non-streaming: a single target call returns the full completion.
        d = MockDecoder(data_dict=test_case, streaming=False)
        for res in d.decode(request):
            assert expected_res == res
        assert d.target_num_calls == 1

        # Streaming: one target call per generated step.
        d = MockDecoder(data_dict=test_case, streaming=True)
        final_res = None
        for res in d.decode(request):
            final_res = res
        assert final_res == expected_res
        assert d.target_num_calls == len(test_case["target_output"])
    else:
        # Test speculative decoding: draft and target are each called once
        # per speculation step.
        d = MockDecoder(data_dict=test_case)
        final_res = None
        for res in d.decode(request, speculative_decoding=True):
            final_res = res
        assert final_res == expected_res
        num_steps = len(test_case["draft_output"])
        assert d.target_num_calls == num_steps
        assert d.draft_num_calls == num_steps


length_stop_testcases = [{
    "text_input": "Deep learning is",
    "max_tokens": 1,
    "use_speculative": True,
    "num_draft_tokens": 3,
    "input_ids": [1, 10, 11, 23],
    "target_output": [{
        "output_ids": [1, 10, 11, 23],
        "output_text": "Deep learning is a"
    }, {
        "output_ids": "not important",
        "output_text": "not important"
    }],
    "draft_output": [{
        "output_ids": ["not important"],
        "sequence_length": 0
    }, {
        "output_ids": ["not important"],
        "sequence_length": 0
    }]
}]


@pytest.mark.parametrize("test_case", length_stop_testcases)
def test_length_stop(test_case):
    # Since max_tokens is 1, verify that the first output is returned as the
    # final output and that the draft model is never called.
    request = Request(
        text_input=np.array([[test_case["text_input"]]], dtype=object),
        max_tokens=np.array([[test_case["max_tokens"]]], dtype=np.int32),
        num_draft_tokens=(np.array([[test_case["num_draft_tokens"]]],
                                   dtype=np.int32)
                          if "num_draft_tokens" in test_case else None),
        stop_words=np.array([[[]]]))
    # Index 0 is the expected response.
    expected_res = Response(text_output=np.array(
        [test_case["target_output"][0]["output_text"]], dtype=object))

    d = MockDecoder(data_dict=test_case)
    final_res = None
    for res in d.decode(request, speculative_decoding=True):
        final_res = res
    assert final_res == expected_res
    assert d.target_num_calls == 1
    assert d.draft_num_calls == 0


early_stopping_testcases = [
    {
        "text_input": "Deep learning is",
        "max_tokens": 10,
        "use_speculative": True,
        "num_draft_tokens": 3,
        "input_ids": [1, 10, 11, 23],
        "target_output": [{
            "output_ids": [1, 10, 11, 23, 7, 9, 21],
            "output_text": "Deep learning is a subset of"
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21],
            "output_text": "Deep learning is a subset of Machine"
        }, {
            "output_ids": ["not important"],
            "output_text": "not important"
        }],
        "draft_output": [{
            "output_ids": [1, 10, 11, 23, 7, 9, 22],
            "sequence_length": 7
        }, {
            "output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
            "sequence_length": 9
        }, {
            "output_ids": ["not important"],
            "sequence_length": 0
        }]
    },
]
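

# In the case above, the second target step returns the same output_ids as
# the first, i.e. no new tokens were accepted, so the decode loop is expected
# to stop after two steps and never consume the "not important" third entry.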
@pytest.mark.parametrize("test_case", early_stopping_testcases)
def test_early_stopping(test_case):
    request = Request(
        text_input=np.array([[test_case["text_input"]]], dtype=object),
        max_tokens=np.array([[test_case["max_tokens"]]], dtype=np.int32),
        num_draft_tokens=(np.array([[test_case["num_draft_tokens"]]],
                                   dtype=np.int32)
                          if "num_draft_tokens" in test_case else None),
        stop_words=np.array([[[]]]))
    # Index 1 is the expected response.
    expected_res = Response(text_output=np.array(
        [test_case["target_output"][1]["output_text"]], dtype=object))

    d = MockDecoder(data_dict=test_case)
    final_res = None
    for res in d.decode(request, speculative_decoding=True):
        final_res = res
    assert final_res == expected_res
    assert d.target_num_calls == 2
    assert d.draft_num_calls == 2


def test_request_validation():
    req = Request()
    # text_input and max_tokens are both required.
    with pytest.raises(RequestValidationError):
        req.validate()
    req.text_input = np.array([["input string"]], dtype=object)
    with pytest.raises(RequestValidationError):
        req.validate()
    req.max_tokens = np.array([[10]])
    req.validate()

    # Streaming combined with speculative decoding (num_draft_tokens set)
    # should be rejected by validation.
    req.stream = np.array([[True]])
    req.num_draft_tokens = np.array([[5]])

    with pytest.raises(RequestValidationError):
        req.validate()
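

# Convenience entry point; a sketch, equivalent to invoking pytest on this
# file directly.
if __name__ == "__main__":
    import sys
    sys.exit(pytest.main([__file__] + sys.argv[1:]))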