# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import os

# isort: off
import torch
import tensorrt as trt
# isort: on

from transformers import AutoConfig, AutoTokenizer
from utils import (compare_bertcls_result, compare_bertqa_result,
                   decode_bertcls_output, decode_bertqa_output, get_engine_name,
                   intermediate_check, prepare_text_inputs, process_input,
                   temporary_datasets_config)

import tensorrt_llm
from tensorrt_llm import logger
from tensorrt_llm._utils import trt_dtype_to_torch
from tensorrt_llm.runtime import Session, TensorInfo

from transformers import BertConfig, BertPreTrainedModel, BertForQuestionAnswering, BertForSequenceClassification, BertModel  # isort:skip
from transformers import RobertaConfig, RobertaPreTrainedModel, RobertaForQuestionAnswering, RobertaForSequenceClassification, RobertaModel  # isort:skip

OUTPUT_NAME_MAPPING = {
    'BertModel': 'hidden_states',
    'BertForQuestionAnswering': 'logits',
    'BertForSequenceClassification': 'logits',
    'RobertaModel': 'hidden_states',
    'RobertaForQuestionAnswering': 'logits',
    'RobertaForSequenceClassification': 'logits'
}
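
# OUTPUT_NAME_MAPPING above names the single output tensor each engine variant
# exposes: the plain encoders ('BertModel'/'RobertaModel') return the final
# 'hidden_states', while the QA and sequence-classification variants return
# their task-head 'logits'.
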
def parse_arguments():
    parser = argparse.ArgumentParser()
    parser.add_argument('--log_level', type=str, default='info')
    parser.add_argument('--engine_dir', type=str, default='bert_outputs')
    parser.add_argument('--hf_model_dir', type=str, required=True)
    parser.add_argument('--run_hf_test', action='store_true')
    parser.add_argument('--remove_input_padding', action='store_true')
    parser.add_argument('--debug', action='store_true')

    return parser.parse_args()
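
# Typical invocation (the script name and paths below are illustrative and
# depend on how the engine was built and where the HF checkpoint lives):
#   python run.py --engine_dir bert_outputs \
#                 --hf_model_dir bert-base-uncased \
#                 --run_hf_test
# Pass --remove_input_padding only if the engine was built with the
# remove_input_padding plugin option enabled; this is checked against the
# engine's config.json below.
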
if __name__ == '__main__':
    args = parse_arguments()

    tensorrt_llm.logger.set_level(args.log_level)

    config_path = os.path.join(args.engine_dir, 'config.json')
    with open(config_path, 'r') as f:
        config = json.load(f)

    remove_padding = config['build_config']['plugin_config'][
        'remove_input_padding']
    assert args.remove_input_padding == remove_padding, \
        f"The engine was built with remove_input_padding={remove_padding}, " \
        f"but inference was requested with remove_input_padding={args.remove_input_padding}!"

    world_size = config['pretrained_config']['mapping']['world_size']
    assert world_size == tensorrt_llm.mpi_world_size(), \
        f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'

    model_name = config['pretrained_config']['architecture']

    # Roberta doesn't have token_type_ids; use all zeros as a stand-in.
    is_roberta = "Roberta" in model_name

    runtime_rank = tensorrt_llm.mpi_rank() if world_size > 1 else 0

    runtime_mapping = tensorrt_llm.Mapping(world_size,
                                           runtime_rank,
                                           tp_size=world_size)
    torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)
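
    # For tensor-parallel engines (world_size > 1) the script is expected to be
    # launched with one process per GPU, e.g. an MPI-style launcher such as
    # `mpirun -n <world_size> python run.py ...` (the exact launcher is an
    # assumption here); each rank then loads its own engine shard below.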
    serialize_path = get_engine_name(runtime_rank)
    serialize_path = os.path.join(args.engine_dir, serialize_path)

    stream = torch.cuda.current_stream().cuda_stream
    logger.info(f'Loading engine from {serialize_path}')
    with open(serialize_path, 'rb') as f:
        engine_buffer = f.read()
    logger.info('Creating session from engine')
    session = Session.from_serialized_engine(engine_buffer)
    if args.debug:
        session._print_engine_info()
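
    # Session wraps the deserialized TensorRT engine together with an execution
    # context; inputs and outputs are exchanged as CUDA tensors and inference is
    # enqueued on the CUDA stream captured above.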
    # NOTE: prepare input
    with temporary_datasets_config(HF_DATASETS_OFFLINE=False):
        test_inputs = prepare_text_inputs(model_name)
    hf_tokenizer = AutoTokenizer.from_pretrained(args.hf_model_dir)
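
    # Two input layouts are supported. With remove_input_padding, every
    # sequence's tokens are packed into one flat tensor and process_input()
    # additionally returns position_ids and max_input_length. Without it, the
    # tokenizer pads to the longest sequence in the batch and an attention_mask
    # marks the real tokens.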
    if args.remove_input_padding:
        # NOTE: Remove padding
        inputs_without_padding = hf_tokenizer(**test_inputs)
        input_ids_list = [
            torch.tensor(ids).int().cuda()
            for ids in inputs_without_padding['input_ids']
        ]
        # attention_mask_list = inputs_without_padding['attention_mask'],
        if is_roberta:
            token_type_ids_list = [
                torch.zeros_like(torch.tensor(ids)).int().cuda()
                for ids in inputs_without_padding['input_ids']
            ]
        else:
            token_type_ids_list = [
                torch.tensor(ids).int().cuda()
                for ids in inputs_without_padding['token_type_ids']
            ]

        input_ids, input_lengths, token_type_ids, position_ids, max_input_length = \
            process_input(input_ids_list=input_ids_list,
                          token_type_ids_list=token_type_ids_list,
                          is_roberta=is_roberta,
                          padding_idx=config['pretrained_config']['pad_token_id'])

    else:
        # NOTE: Padding: pad to longest seq len
        inputs_with_padding = hf_tokenizer(
            **test_inputs,
            padding=True,
        )

        inputs_without_padding = hf_tokenizer(**test_inputs)
        input_ids = torch.tensor(inputs_with_padding['input_ids']).int().cuda()
        input_lengths = [len(x) for x in inputs_without_padding['input_ids']]
        input_lengths = torch.tensor(input_lengths,
                                     device=input_ids.device,
                                     dtype=torch.int32)
        attention_mask = torch.tensor(inputs_with_padding['attention_mask'],
                                      device=input_ids.device,
                                      dtype=torch.int32)
        if is_roberta:
            token_type_ids = torch.zeros_like(torch.tensor(
                inputs_with_padding['input_ids']),
                                              device=input_ids.device,
                                              dtype=torch.int32)
        else:
            token_type_ids = torch.tensor(inputs_with_padding['token_type_ids'],
                                          device=input_ids.device,
                                          dtype=torch.int32)

    # NOTE: TRT-LLM performs inference
    output_name = OUTPUT_NAME_MAPPING[model_name]
    if args.remove_input_padding:
        # NOTE: Remove padding:
        inputs = {
            "input_ids": input_ids,
            "input_lengths": input_lengths,
            "token_type_ids": token_type_ids,
            "position_ids": position_ids,
            "max_input_length": max_input_length
        }
        output_info = session.infer_shapes([
            TensorInfo("input_ids", trt.DataType.INT32, input_ids.shape),
            TensorInfo("input_lengths", trt.DataType.INT32,
                       input_lengths.shape),
            TensorInfo("token_type_ids", trt.DataType.INT32,
                       token_type_ids.shape),
            TensorInfo("position_ids", trt.DataType.INT32, position_ids.shape),
            TensorInfo("max_input_length", trt.DataType.INT32,
                       max_input_length.shape)
        ])
    else:
        # NOTE: Padding:
        inputs = {
            'input_ids': input_ids,
            'input_lengths': input_lengths,
            'token_type_ids': token_type_ids,
        }
        output_info = session.infer_shapes([
            TensorInfo('input_ids', trt.DataType.INT32, input_ids.shape),
            TensorInfo('input_lengths', trt.DataType.INT32,
                       input_lengths.shape),
            TensorInfo('token_type_ids', trt.DataType.INT32,
                       token_type_ids.shape)
        ])
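
    # infer_shapes() propagates the concrete input shapes through the engine and
    # reports the name, dtype and shape of every output binding, so the output
    # buffers can be allocated on the GPU up front and handed to session.run().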
    outputs = {
        t.name: torch.empty(tuple(t.shape),
                            dtype=trt_dtype_to_torch(t.dtype),
                            device='cuda')
        for t in output_info
    }
    assert output_name in outputs, f'{output_name} not found in outputs, check if build.py set output name correctly'

    logger.info(f"Rank{runtime_rank} is running inference...")
    ok = session.run(inputs=inputs, outputs=outputs, stream=stream)
    assert ok, "Runtime execution failed"
    torch.cuda.synchronize()
    res = outputs[output_name]
    if args.debug:
        logger.info(f"Outputs:{outputs.keys()}")

    # NOTE: load hf model and perform inference as reference (only on rank0)
    if tensorrt_llm.mpi_rank() == 0:
        logger.info(f"Rank{runtime_rank} is generating HF reference...")
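
        # The HF reference is optional (--run_hf_test). globals()[model_name]
        # resolves the architecture string from config.json to the matching
        # transformers class imported above; the reference model is cast to
        # FP16 before the comparison below.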
        if args.run_hf_test:
            hf_bert = globals()[f'{model_name}'].from_pretrained(
                args.hf_model_dir).cuda().to(torch.float16).eval()
            hf_inputs = hf_tokenizer(**test_inputs,
                                     padding=True,
                                     return_tensors="pt")
            hf_inputs = hf_inputs.to(hf_bert.device)
            with torch.no_grad():
                hf_outputs = hf_bert.forward(output_hidden_states=args.debug,
                                             **hf_inputs)

            torch.cuda.synchronize()

    # NOTE: Decode output (only on rank0)
    if tensorrt_llm.mpi_rank() == 0:
        logger.info(f"Rank{runtime_rank} is comparing with HF reference...")
        if model_name == "BertModel" or model_name == "RobertaModel":
            if args.remove_input_padding:
                # Reshape the packed result back to [batch_size, ...] and pad it
                # so the tensors can be compared element-wise.
                from torch.nn.utils.rnn import pad_sequence
                res = torch.split(res, input_lengths.tolist(), dim=0)
                res = pad_sequence(list(res), batch_first=True, padding_value=0)
            else:
                # Apply the attention mask to the TRT-LLM result.
                attention_mask_tmp = attention_mask.unsqueeze(-1)
                res = res * attention_mask_tmp
            if args.run_hf_test:
                ref = hf_outputs.last_hidden_state
                ref = ref * hf_inputs['attention_mask'].unsqueeze(-1)

                if args.debug:
                    intermediate_check(outputs, hf_outputs['hidden_states'],
                                       attention_mask_tmp, logger)

                if world_size == 1:
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=1.5e-2,
                                               atol=1.5e-2)
                else:
                    # The arithmetic order for TP>1 differs from the HF reference,
                    # which always runs with TP=1, so use looser tolerances.
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=4e-2,
                                               atol=2e-2)
                print(f"{model_name} result is all close to HF reference!")
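
        # QA engines emit a single 'logits' tensor whose last dimension stacks
        # the start and end logits; split it, undo the packing (or padding), and
        # decode answer spans with the helpers from utils.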
        if model_name == 'BertForQuestionAnswering' or model_name == 'RobertaForQuestionAnswering':
            if args.remove_input_padding:
                # [num_tokens, 2] -> two tensors of shape [num_tokens, 1]
                res_start_logits, res_end_logits = torch.split(res, 1, -1)

                # Reshape the packed results back to [batch_size, ...].
                res_start_logits = torch.split(res_start_logits,
                                               input_lengths.tolist(),
                                               dim=0)
                res_start_logits = tuple(t.squeeze() for t in res_start_logits)
                res_end_logits = torch.split(res_end_logits,
                                             input_lengths.tolist(),
                                             dim=0)
                res_end_logits = tuple(t.squeeze() for t in res_end_logits)

            else:
                # NOTE: Padding
                # [B, padded_len, 2] -> two tensors of shape [B, padded_len, 1]
                res_start_logits, res_end_logits = torch.split(res, 1, -1)
                # [B, padded_len, 1] -> [B, padded_len]
                res_start_logits = res_start_logits.squeeze()
                res_end_logits = res_end_logits.squeeze()
                res_start_logits = res_start_logits * attention_mask
                res_end_logits = res_end_logits * attention_mask

                res_start_logits = torch.split(res_start_logits, 1, dim=0)
                res_start_logits = tuple(t.squeeze(0) for t in res_start_logits)
                res_end_logits = torch.split(res_end_logits, 1, dim=0)
                res_end_logits = tuple(t.squeeze(0) for t in res_end_logits)

            decode_res = decode_bertqa_output(inputs_text=test_inputs,
                                              hf_tokenizer=hf_tokenizer,
                                              start_logits=res_start_logits,
                                              end_logits=res_end_logits)

            if args.run_hf_test:
                ref_start_logits = hf_outputs.start_logits
                ref_end_logits = hf_outputs.end_logits
                # Mask the HF reference the same way, so logits at padding
                # positions do not affect the comparison.
                ref_start_logits = ref_start_logits * hf_inputs['attention_mask']
                ref_end_logits = ref_end_logits * hf_inputs['attention_mask']

                decode_ref = decode_bertqa_output(inputs_text=test_inputs,
                                                  hf_tokenizer=hf_tokenizer,
                                                  start_logits=ref_start_logits,
                                                  end_logits=ref_end_logits)
                compare_bertqa_result(inputs_text=test_inputs,
                                      res_answers=decode_res,
                                      ref_answers=decode_ref)
        elif model_name == 'BertForSequenceClassification' or model_name == 'RobertaForSequenceClassification':
            hf_config = AutoConfig.from_pretrained(args.hf_model_dir)
            decode_res = decode_bertcls_output(logits=res,
                                               hf_model_config=hf_config,
                                               inputs_text=test_inputs)

            if args.run_hf_test:
                ref = hf_outputs.logits
                if world_size == 1:
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=1.5e-2,
                                               atol=1.5e-2)
                else:
                    # The arithmetic order for TP>1 differs from the HF reference,
                    # which always runs with TP=1, so use looser tolerances.
                    torch.testing.assert_close(actual=res.half(),
                                               expected=ref,
                                               rtol=4e-2,
                                               atol=2e-2)
                decode_ref = decode_bertcls_output(logits=hf_outputs.logits,
                                                   hf_model_config=hf_config,
                                                   inputs_text=test_inputs)
                compare_bertcls_result(inputs_text=test_inputs,
                                       res_answers=decode_res,
                                       ref_answers=decode_ref)