# TensorRT-LLMs/triton_backend/all_models/tests/test_decode.py

# Copyright 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
# * Redistributions of source code must retain the above copyright
# notice, this list of conditions and the following disclaimer.
# * Redistributions in binary form must reproduce the above copyright
# notice, this list of conditions and the following disclaimer in the
# documentation and/or other materials provided with the distribution.
# * Neither the name of NVIDIA CORPORATION nor the names of its
# contributors may be used to endorse or promote products derived
# from this software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS ``AS IS'' AND ANY
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY
# OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
from typing import Dict, Generator, Optional

import numpy as np
import pytest

# Requires PYTHONPATH=../inflight_batcher_llm/tensorrt_llm_bls/1/ so that
# `lib.decode` resolves, e.g.:
#   PYTHONPATH=../inflight_batcher_llm/tensorrt_llm_bls/1/ pytest test_decode.py
from lib.decode import *


class MockDecoder(Decoder):
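    """Test double for Decoder that replays canned draft/target outputs.

    Each generate hook advances ``draft_step``/``target_step`` through
    ``data_dict`` and counts its invocations, so tests can assert how many
    times the draft and target models were called.
    """
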
def __init__(self,
streaming=False,
accumulate=False,
data_dict: Optional[Dict] = None):
super().__init__(streaming=streaming, accumulate=accumulate)
self.data_dict = data_dict
self.draft_step = -1
self.target_step = -1
self.draft_num_calls = 0
self.target_num_calls = 0

    def preprocess(self, request: Request) -> PreprocResponse:
return PreprocResponse(
input_ids=np.array([self.data_dict["input_ids"]]),
input_lengths=np.array([[len(self.data_dict["input_ids"])]]),
stop_words_list=np.array([[[]]]))

    def _postprocess(self, tokens: np.ndarray,
sequence_lengths: Optional[np.ndarray],
gen_response: GenerationResponse) -> Response:
target_output = self.data_dict["target_output"][self.target_step]
return Response(text_output=np.array([target_output["output_text"]]))

    def _draft_generate_non_streaming(
self, preproc: PreprocResponse, request: Request,
num_draft_tokens: int) -> GenerationResponse:
self.draft_num_calls += 1
self.draft_step += 1
draft_output = self.data_dict["draft_output"][self.draft_step]
response = GenerationResponse(
output_ids=np.array([[draft_output["output_ids"]]]),
generation_logits=None,
sequence_length=np.array([[draft_output["sequence_length"]]]))
        if self.data_dict.get("use_draft_logits", False):
            response.generation_logits = draft_output["generation_logits"]
return response

    def _generate(
self,
preproc: PreprocResponse,
request: Request,
draft_request: Optional[DraftRequest] = None,
multimodal_enc_response: Optional[MultimodalEncResponse] = None
) -> Generator[GenerationResponse, None, None]:
for idx, target_output in enumerate(self.data_dict["target_output"]):
self.target_num_calls += 1
self.target_step = idx
output_len = len(target_output["output_ids"])
yield GenerationResponse(output_ids=np.array(
[[target_output["output_ids"]]]),
sequence_length=np.array([[output_len]]))

    def _generate_non_streaming(
self,
preproc: PreprocResponse,
request: Request,
draft_request: Optional[DraftRequest] = None,
multimodal_enc_response: Optional[MultimodalEncResponse] = None
) -> GenerationResponse:
self.target_num_calls += 1
        # In non-speculative, non-streaming mode return the full completion:
        # jump to the penultimate step so the increment below lands on the
        # final entry of target_output.
        if not self.data_dict["use_speculative"]:
            self.target_step = len(self.data_dict["target_output"]) - 2
        else:
assert draft_request is not None
if draft_request.draft_input_ids is not None:
assert draft_request.draft_input_ids.shape[1] > 0
if self.data_dict.get("use_draft_logits", False):
assert draft_request.draft_logits is not None
assert draft_request.draft_logits.shape[
1] == draft_request.draft_input_ids.shape[1]
self.target_step += 1
target_output = self.data_dict["target_output"][self.target_step]
output_len = len(target_output["output_ids"])
return GenerationResponse(output_ids=np.array(
[[target_output["output_ids"]]]),
sequence_length=np.array([[output_len]]))
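

# Three scenarios: plain autoregressive decoding, speculative decoding
# without draft logits, and speculative decoding with draft logits.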
decode_testcases = [
{
"text_input":
"Deep learning is",
"max_tokens":
10,
"use_speculative":
False,
"input_ids": [1, 10, 11, 23],
"target_output": [{
"output_ids": [1, 10, 11, 23, 7],
"output_text": "Deep learning is a"
}, {
"output_ids": [1, 10, 11, 23, 7, 9],
"output_text": "Deep learning is a subset"
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21],
"output_text": "Deep learning is a subset of"
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22],
"output_text": "Deep learning is a subset of Machine"
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
"output_text":
"Deep learning is a subset of Machine learning"
}]
},
{
"text_input":
"Deep learning is",
"max_tokens":
10,
"use_speculative":
True,
"num_draft_tokens":
3,
"use_draft_logits":
False,
"input_ids": [1, 10, 11, 23],
"target_output": [{
"output_ids": [1, 10, 11, 23, 7, 9, 21],
"output_text": "Deep learning is a subset of"
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
"output_text":
"Deep learning is a subset of Machine learning"
}],
"draft_output": [{
"output_ids": [1, 10, 11, 23, 7, 9, 22],
"sequence_length": 7,
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
"sequence_length": 9,
}]
},
{
"text_input":
"Deep learning is",
"max_tokens":
10,
"use_speculative":
True,
"num_draft_tokens":
3,
"use_draft_logits":
True,
"input_ids": [1, 10, 11, 23],
"target_output": [{
"output_ids": [1, 10, 11, 23, 7, 9, 21],
"output_text": "Deep learning is a subset of"
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
"output_text":
"Deep learning is a subset of Machine learning"
}],
"draft_output": [{
"output_ids": [1, 10, 11, 23, 7, 9, 22],
"sequence_length": 7,
"generation_logits": np.random.rand(1, 1, 7, 1024),
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
"sequence_length": 9,
"generation_logits": np.random.rand(1, 1, 7, 1024),
}]
},
]


@pytest.mark.parametrize("test_case", decode_testcases)
def test_decode(test_case):
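    """Run decode end to end and check that the final Response matches the
    last expected target output, covering the non-streaming, streaming, and
    speculative paths."""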
request = Request(
text_input=np.array([[test_case["text_input"]]], dtype=object),
max_tokens=np.array([[test_case["max_tokens"]]], dtype=np.int32),
num_draft_tokens=(np.array([[test_case["num_draft_tokens"]]],
dtype=np.int32)
if "num_draft_tokens" in test_case else None),
use_draft_logits=(np.array([[test_case["use_draft_logits"]]],
dtype=bool)
if "use_draft_logits" in test_case else None),
stop_words=np.array([[[]]]))
# Last index is the expected response
expected_res = Response(text_output=np.array(
[test_case["target_output"][-1]["output_text"]], dtype=object))
if not test_case["use_speculative"]:
# Test non speculative mode
# non-streaming
d = MockDecoder(data_dict=test_case, streaming=False)
for res in d.decode(request):
assert expected_res == res
assert d.target_num_calls == 1
# streaming
d = MockDecoder(data_dict=test_case, streaming=True)
final_res = None
for res in d.decode(request):
final_res = res
assert final_res == expected_res
assert d.target_num_calls == len(test_case["target_output"])
else:
# Test speculative decoding
d = MockDecoder(data_dict=test_case)
final_res = None
for res in d.decode(request, speculative_decoding=True):
final_res = res
assert final_res == expected_res
num_steps = len(test_case["draft_output"])
assert d.target_num_calls == num_steps
assert d.draft_num_calls == num_steps
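

# max_tokens=1: decode should return after a single target step and never
# invoke the draft model.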
length_stop_testcases = [{
"text_input":
"Deep learning is",
"max_tokens":
1,
"use_speculative":
True,
"num_draft_tokens":
3,
"input_ids": [1, 10, 11, 23],
"target_output": [{
"output_ids": [1, 10, 11, 23],
"output_text": "Deep learning is a"
}, {
"output_ids": "not important",
"output_text": "not important"
}],
"draft_output": [{
"output_ids": ["not important"],
"sequence_length": 0
}, {
"output_ids": ["not important"],
"sequence_length": 0
}]
}]


@pytest.mark.parametrize("test_case", length_stop_testcases)
def test_length_stop(test_case):
    # Since max_tokens is 1, check that the first output is returned as the
    # final output and that the draft model is never called.
request = Request(
text_input=np.array([[test_case["text_input"]]], dtype=object),
max_tokens=np.array([[test_case["max_tokens"]]], dtype=np.int32),
num_draft_tokens=(np.array([[test_case["num_draft_tokens"]]],
dtype=np.int32)
if "num_draft_tokens" in test_case else None),
stop_words=np.array([[[]]]))
# Index 0 is the expected response
expected_res = Response(text_output=np.array(
[test_case["target_output"][0]["output_text"]], dtype=object))
d = MockDecoder(data_dict=test_case)
final_res = None
for res in d.decode(request, speculative_decoding=True):
final_res = res
assert final_res == expected_res
assert d.target_num_calls == 1
assert d.draft_num_calls == 0
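

# The second target step returns the same output ids as the first (no new
# tokens accepted), which ends the speculative decode loop, so the third
# draft/target entries should never be consumed.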
early_stopping_testcases = [
{
"text_input":
"Deep learning is",
"max_tokens":
10,
"use_speculative":
True,
"num_draft_tokens":
3,
"input_ids": [1, 10, 11, 23],
"target_output": [{
"output_ids": [1, 10, 11, 23, 7, 9, 21],
"output_text": "Deep learning is a subset of"
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21],
"output_text": "Deep learning is a subset of Machine"
}, {
"output_ids": ["not important"],
"output_text": "not important"
}],
"draft_output": [{
"output_ids": [1, 10, 11, 23, 7, 9, 22],
"sequence_length": 7
}, {
"output_ids": [1, 10, 11, 23, 7, 9, 21, 22, 11],
"sequence_length": 9
}, {
"output_ids": ["not important"],
"sequence_length": 0
}]
},
]


@pytest.mark.parametrize("test_case", early_stopping_testcases)
def test_early_stopping(test_case):
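    """The target stops advancing after the second step, so decode should
    finish after exactly two draft/target rounds."""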
request = Request(
text_input=np.array([[test_case["text_input"]]], dtype=object),
max_tokens=np.array([[test_case["max_tokens"]]], dtype=np.int32),
num_draft_tokens=(np.array([[test_case["num_draft_tokens"]]],
dtype=np.int32)
if "num_draft_tokens" in test_case else None),
stop_words=np.array([[[]]]))
# Index 1 is the expected response
expected_res = Response(text_output=np.array(
[test_case["target_output"][1]["output_text"]], dtype=object))
d = MockDecoder(data_dict=test_case)
final_res = None
for res in d.decode(request, speculative_decoding=True):
final_res = res
assert final_res == expected_res
assert d.target_num_calls == 2
assert d.draft_num_calls == 2


def test_request_validation():
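    """Request.validate() requires text_input and max_tokens, and rejects
    streaming combined with draft tokens."""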
req = Request()
with pytest.raises(RequestValidationError):
req.validate()
req.text_input = np.array([["input string"]], dtype=object)
with pytest.raises(RequestValidationError):
req.validate()
req.max_tokens = np.array([[10]])
req.validate()
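
    # Streaming together with draft tokens (speculative decoding) is an
    # invalid combination and must fail validation.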
req.stream = np.array([[True]])
req.num_draft_tokens = np.array([[5]])
with pytest.raises(RequestValidationError):
req.validate()