# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
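"""Accuracy/latency comparison for ChatGLM engines built with TensorRT-LLM.

Runs a slice of cnn_dailymail (summarization) or openai_humaneval (code
completion) through the TensorRT-LLM engine and, optionally, the Hugging Face
reference model, then reports per-beam ROUGE scores and total latency.

A typical invocation might look like this (the script name and paths are
placeholders; adjust them to your checkout and engine location):

    python3 summarize.py --model_version 2 \
        --hf_model_location ./chatglm2-6b \
        --engine_dir ./trtModel \
        --batch_size 1 --max_ite 20
"""
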
import argparse
import copy
import json
from pathlib import Path

import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import AutoModel, AutoTokenizer

import tensorrt_llm
import tensorrt_llm.profiler as profiler
from tensorrt_llm.logger import logger
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
                                  ModelConfig, SamplingConfig)

from build import find_engines  # isort:skip

model_name = ""

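# TRT() rebuilds a ModelConfig from the engine's config.json (builder_config
# and plugin_config sections), pins this MPI rank to a GPU, reads the
# serialized engine located by find_engines(), and returns a generation
# session: ChatGLMGenerationSession for the original chatglm-6b, the generic
# GenerationSession for every other ChatGLM variant.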
def TRT(args, config):
    # Record the engine's model name in the module-level global so that the
    # eval helpers in main() (which branch on `model_name == 'chatglm-6b'`)
    # see the real name instead of the empty default.
    global model_name

    model_name = config['builder_config']['name']
    dtype = config['builder_config']['precision']
    world_size = config['builder_config']['tensor_parallel']
    assert world_size == tensorrt_llm.mpi_world_size(), \
        f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'

    remove_input_padding = config['plugin_config']['remove_input_padding']

    model_config = ModelConfig(
        model_name=model_name,
        vocab_size=config['builder_config']['vocab_size'],
        num_layers=config['builder_config']['num_layers'],
        num_heads=config['builder_config']['num_heads'] // world_size,
        num_kv_heads=max(config['builder_config']['num_kv_heads'] // world_size,
                         1),
        hidden_size=config['builder_config']['hidden_size'] // world_size,
        gpt_attention_plugin=bool(
            config['plugin_config']['gpt_attention_plugin']),
        remove_input_padding=remove_input_padding,
        tokens_per_block=config['plugin_config']['tokens_per_block'],
        paged_kv_cache=config['plugin_config']['paged_kv_cache'],
        dtype=dtype,
        use_custom_all_reduce=config['plugin_config']['use_custom_all_reduce'],
    )

    runtime_rank = tensorrt_llm.mpi_rank()
    runtime_mapping = tensorrt_llm.Mapping(world_size,
                                           runtime_rank,
                                           tp_size=world_size)
    torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node)

    serialize_path = find_engines(
        args.engine_dir,
        model_name=model_name,
        dtype=dtype,
        tp_size=world_size,
        rank=runtime_rank,
    )[0]

    tensorrt_llm.logger.set_level(args.log_level)

    with open(serialize_path, 'rb') as f:
        engine_buffer = f.read()

    if model_name == 'chatglm-6b':
        decoder = ChatGLMGenerationSession(
            model_config,
            engine_buffer,
            runtime_mapping,
        )
    else:
        decoder = GenerationSession(
            model_config,
            engine_buffer,
            runtime_mapping,
        )

    return decoder

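# main() loads the tokenizer and evaluation dataset, builds the TensorRT-LLM
# session and/or the Hugging Face model, generates summaries (or code
# completions) for up to max_ite batches, and accumulates per-beam ROUGE
# scores, optionally asserting a rouge1 threshold for the engine output.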
def main(args):
    runtime_rank = tensorrt_llm.mpi_rank()
    logger.set_level(args.log_level)

    test_hf = args.test_hf and runtime_rank == 0  # only run hf on rank 0
    test_trt_llm = args.test_trt_llm
    tokenizer = AutoTokenizer.from_pretrained(
        args.tokenizer,
        padding_side='left',
        trust_remote_code=True,
    )

    if args.eval_type == 'code_completion':
        dataset_name = "openai_humaneval"
        dataset_revision = None
        dataset_input_key = 'prompt'
        dataset_output_key = 'canonical_solution'
    elif args.eval_type == 'summarize':
        dataset_name = "ccdv/cnn_dailymail"
        dataset_revision = "3.0.0"
        dataset_input_key = 'article'
        dataset_output_key = 'highlights'
    args.dataset_path.mkdir(parents=True, exist_ok=True)
    dataset = load_dataset(dataset_name,
                           dataset_revision,
                           cache_dir=args.dataset_path)

    config_path = str(args.engine_dir / 'config.json')
    with open(config_path, 'r') as f:
        config = json.load(f)

    max_batch_size = args.batch_size

    # runtime parameters
    # repetition_penalty = 1
    top_k = args.top_k
    output_len = args.output_len
    test_token_num = 800
    # top_p = 0.0
    # random_seed = 5
    temperature = 1
    num_beams = args.num_beams
    length_penalty = args.length_penalty

    pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0]
    end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0]

    if test_trt_llm:
        tensorrt_llm_gpt = TRT(args, config)

    if test_hf:
        model = AutoModel.from_pretrained(
            args.hf_model_location,
            trust_remote_code=True,
        )
        model.cuda()
        if args.data_type == 'fp16':
            model.half()

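    # eval_tensorrt_llm(): append ' TL;DR: ' for the summarization task,
    # tokenize and truncate each input to test_token_num tokens, pack or
    # right-pad the batch depending on remove_input_padding, then decode
    # output_len new tokens with the engine. Returns the decoded beams and
    # the raw generated token ids.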
    def eval_tensorrt_llm(datapoint, eval_type='summarize'):
        batch_size = len(datapoint)
        append_str = ' TL;DR: ' if eval_type == 'summarize' else ''
        line = copy.copy(datapoint)
        line_encoded = []
        input_lengths = []
        for i in range(batch_size):
            line[i] = line[i] + append_str
            line[i] = line[i].strip()
            line[i] = line[i].replace(" n't", "n't")

            input_id = tokenizer.encode(
                line[i],
                return_tensors='pt',
            ).type(torch.int32)
            if model_name == 'chatglm-6b':
                input_id = input_id[:, -test_token_num:]
            else:
                input_id = input_id[:, :test_token_num]

            line_encoded.append(input_id)
            input_lengths.append(input_id.shape[-1])

        max_length = max(input_lengths)

        if tensorrt_llm_gpt.remove_input_padding:
            line_encoded = [t.to(torch.int32).cuda() for t in line_encoded]
        else:
            # Right-pad every sequence to max_length; ideally this would be
            # done outside the profiled region to avoid adding overhead.
            for i in range(batch_size):
                pad_size = max_length - input_lengths[i]
                pad = torch.ones([1, pad_size], dtype=torch.int32) * pad_id
                line_encoded[i] = torch.cat(
                    [line_encoded[i].to(torch.int32), pad], axis=-1)

            line_encoded = torch.cat(line_encoded, axis=0).cuda()
            input_lengths = torch.tensor(input_lengths,
                                         dtype=torch.int32).cuda()

        sampling_config = SamplingConfig(
            end_id=end_id,
            pad_id=pad_id,
            top_k=top_k,
            num_beams=num_beams,
            length_penalty=length_penalty,
        )

        with torch.no_grad():
            tensorrt_llm_gpt.setup(batch_size,
                                   max_context_length=max_length,
                                   max_new_tokens=output_len,
                                   beam_width=num_beams,
                                   max_kv_cache_length=args.max_kv_cache_len)

            if tensorrt_llm_gpt.remove_input_padding:
                output_ids = tensorrt_llm_gpt.decode_batch(
                    line_encoded, sampling_config)
            else:
                output_ids = tensorrt_llm_gpt.decode(
                    line_encoded,
                    input_lengths,
                    sampling_config,
                )

            torch.cuda.synchronize()

        # Extract a list of tensors of shape beam_width x output_ids.
        if tensorrt_llm_gpt.mapping.is_first_pp_rank():
            output_beams_list = [
                tokenizer.batch_decode(output_ids[batch_idx, :,
                                                  input_lengths[batch_idx]:],
                                       skip_special_tokens=True)
                for batch_idx in range(batch_size)
            ]
            return output_beams_list, output_ids[:, :, max_length:].tolist()
        return [], []

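    # eval_hf(): same preprocessing as eval_tensorrt_llm(), but inputs are
    # left-padded and generation runs through model.generate() on the
    # Hugging Face model. Returns num_beams lists of decoded outputs plus
    # the raw generated token ids.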
    def eval_hf(datapoint, eval_type='summarize'):
        batch_size = len(datapoint)
        append_str = ' TL;DR: ' if eval_type == 'summarize' else ''
        if batch_size > 1:
            logger.warning(
                f"HF does not support batch_size > 1 to verify correctness due to padding and attention mask. Current batch size is {batch_size}"
            )

        line = copy.copy(datapoint)
        line_encoded = []
        input_lengths = []
        for i in range(batch_size):
            line[i] = line[i] + append_str
            line[i] = line[i].strip()
            line[i] = line[i].replace(" n't", "n't")

            input_id = tokenizer.encode(
                line[i],
                return_tensors='pt',
            ).type(torch.int64)
            if model_name == 'chatglm-6b':
                input_id = input_id[:, -test_token_num:]
            else:
                input_id = input_id[:, :test_token_num]

            line_encoded.append(input_id)
            input_lengths.append(input_id.shape[-1])

        max_length = max(input_lengths)

        for i in range(batch_size):
            pad_size = max_length - input_lengths[i]
            pad = torch.ones([1, pad_size], dtype=torch.int64) * pad_id
            line_encoded[i] = torch.cat([pad, line_encoded[i].to(torch.int64)],
                                        axis=-1)

        line_encoded = torch.cat(line_encoded, axis=0).cuda()

        with torch.no_grad():
            output = model.generate(line_encoded,
                                    max_length=len(line_encoded[0]) +
                                    output_len,
                                    top_k=top_k,
                                    temperature=temperature,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    num_beams=num_beams,
                                    num_return_sequences=num_beams,
                                    early_stopping=True,
                                    length_penalty=length_penalty)

        tokens_list = output[:, len(line_encoded[0]):].tolist()
        output = output.reshape([batch_size, num_beams, -1])
        output_lines_list = [
            tokenizer.batch_decode(output[:, i, len(line_encoded[0]):],
                                   skip_special_tokens=True)
            for i in range(num_beams)
        ]

        return output_lines_list, tokens_list

    if test_trt_llm:
        datapoint = dataset['test'][0:1]
        output, _ = eval_tensorrt_llm(datapoint[dataset_input_key],
                                      eval_type=args.eval_type)
        if runtime_rank == 0:
            logger.info(
                "---------------------------------------------------------")
            logger.info("TensorRT-LLM Generated : ")
            logger.info(f" Input : {datapoint[dataset_input_key]}")
            logger.info(f"\n Reference : {datapoint[dataset_output_key]}")
            logger.info(f"\n Output : {output}")
            logger.info(
                "---------------------------------------------------------")

    if test_hf:
        datapoint = dataset['test'][0:1]
        output, _ = eval_hf(datapoint[dataset_input_key],
                            eval_type=args.eval_type)
        logger.info("---------------------------------------------------------")
        logger.info("HF Generated : ")
        logger.info(f" Input : {datapoint[dataset_input_key]}")
        logger.info(f"\n Reference : {datapoint[dataset_output_key]}")
        logger.info(f"\n Output : {output}")
        logger.info("---------------------------------------------------------")

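    # One ROUGE accumulator per beam and per backend. The loop below walks
    # the test split in chunks of max_batch_size for at most max_ite
    # iterations, timing each backend with the tensorrt_llm profiler.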
    metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)]
    metric_hf = [evaluate.load("rouge") for _ in range(num_beams)]
    for i in range(num_beams):
        metric_tensorrt_llm[i].seed = 0
        metric_hf[i].seed = 0

    ite_count = 0
    data_point_idx = 0
    while (data_point_idx < len(dataset['test'])) and (ite_count <
                                                       args.max_ite):
        if runtime_rank == 0:
            logger.debug(
                f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}"
            )
        datapoint = dataset['test'][data_point_idx:(data_point_idx +
                                                    max_batch_size)]

        if test_trt_llm:
            profiler.start('tensorrt_llm')
            output_tensorrt_llm, _ = eval_tensorrt_llm(
                datapoint[dataset_input_key])
            profiler.stop('tensorrt_llm')

        if test_hf:
            profiler.start('hf')
            output_hf, _ = eval_hf(datapoint[dataset_input_key])
            profiler.stop('hf')

        if runtime_rank == 0:
            if test_trt_llm:
                for batch_idx in range(len(output_tensorrt_llm)):
                    for beam_idx in range(num_beams):
                        metric_tensorrt_llm[beam_idx].add_batch(
                            predictions=[
                                output_tensorrt_llm[batch_idx][beam_idx]
                            ],
                            references=[
                                datapoint[dataset_output_key][batch_idx]
                            ])
            if test_hf:
                for beam_idx in range(num_beams):
                    for batch_idx in range(len(output_hf[beam_idx])):
                        metric_hf[beam_idx].add_batch(
                            predictions=[output_hf[beam_idx][batch_idx]],
                            references=[
                                datapoint[dataset_output_key][batch_idx]
                            ])

            logger.debug('-' * 100)
            logger.debug(f"Input : {datapoint[dataset_input_key]}")
            if test_trt_llm:
                logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}')
            if test_hf:
                logger.debug(f'HF Output: {output_hf}')
            logger.debug(f"highlights : {datapoint[dataset_output_key]}")

        data_point_idx += max_batch_size
        ite_count += 1

    if runtime_rank == 0:
        if test_trt_llm:
            np.random.seed(0)  # ROUGE scoring uses sampling, so fix the seed
            logger.info(
                f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)'
            )
            for beam_idx in range(num_beams):
                logger.info(f"TensorRT-LLM beam {beam_idx} result")
                computed_metrics_tensorrt_llm = metric_tensorrt_llm[
                    beam_idx].compute()
                for key in computed_metrics_tensorrt_llm.keys():
                    logger.info(
                        f' {key} : {computed_metrics_tensorrt_llm[key] * 100}')

                if args.check_accuracy and beam_idx == 0:
                    assert computed_metrics_tensorrt_llm[
                        'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold
        if test_hf:
            np.random.seed(0)  # ROUGE scoring uses sampling, so fix the seed
            logger.info(
                f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)'
            )
            for beam_idx in range(num_beams):
                logger.info(f"HF beam {beam_idx} result")
                computed_metrics_hf = metric_hf[beam_idx].compute()
                for key in computed_metrics_hf.keys():
                    logger.info(f' {key} : {computed_metrics_hf[key] * 100}')

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--model_version',
        '-m',
        type=str,
        required=True,
        choices=["1", "2", "3", "2-32k", "3-32k"],
        help=
        '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively'
    )
    parser.add_argument('--hf_model_location', type=str, default=None)
    parser.add_argument(
        '--tokenizer',
        default=None,
        help=
        'tokenizer path; defaults to the model name derived from --model_version if left unspecified'
    )
    parser.add_argument('--test_hf', action='store_true', default=True)
    parser.add_argument('--test_trt_llm', action='store_true', default=True)
    parser.add_argument('--data_type',
                        type=str,
                        choices=['fp32', 'fp16'],
                        default='fp16')
    parser.add_argument('--dataset_path', type=Path, default='dataset')
    parser.add_argument('--log_level', type=str, default='info')
    parser.add_argument('--engine_dir', type=Path, default='trtModel')
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--max_ite', type=int, default=20)
    parser.add_argument('--output_len', type=int, default=100)
    parser.add_argument('--max_kv_cache_len',
                        type=int,
                        default=None,
                        help='The max kv cache length. \
              If the final sequence length exceeds the kv cache length, we will enable cyclic kv cache. \
              If it is set to None, we will use the max sequence length.')
    parser.add_argument('--check_accuracy', action='store_true', default=True)
    parser.add_argument('--tensorrt_llm_rouge1_threshold',
                        type=float,
                        default=15.0)
    parser.add_argument('--num_beams', type=int, default=1)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--eval_type',
                        type=str,
                        default='summarize',
                        choices=['summarize', 'code_completion'])
    parser.add_argument('--length_penalty', type=float, default=1.0)

    args = parser.parse_args()

    # Derive the canonical model name (e.g. chatglm2-6b) from the short
    # --model_version flag.
    if args.model_version == "1":
        args.model_name = "chatglm-6b"
    elif args.model_version in ["2", "3"]:
        args.model_name = "chatglm%s-6b" % args.model_version
    else:
        args.model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0]

    if args.tokenizer is None:
        args.tokenizer = args.model_name

    main(args)