#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from pathlib import Path

import numpy as np
import torch

import run
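
# NOTE: this script drives examples/run.py to generate reference data for the
# ChatGLM family: the tokenized input ids and the engine outputs are written
# under resources/data/<model_name> as inputId-BS{batch}-BM{beam}.npy and
# outputId-BS{batch}-BM{beam}.npy, presumably for the accompanying runtime tests.
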
def generate_output(
    model_name: str = "",
    engine_kind: str = "fp16-plugin",
    num_batchs: int = 1,
    num_beams: int = 1,
    max_output_len: int = 512,
    output_logits: bool = False,
):
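    # NOTE: engine_kind and output_logits are accepted but not consumed below;
    # engine_kind is only referenced in the commented-out TP/PP path selection.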
    examples_chatglm_dir = Path(
        __file__).parent.parent.parent.parent.parent / "examples/chatglm"
    resources_dir = Path(__file__).parent.parent.resolve()

    engine_dir = resources_dir / 'models' / 'rt_engine' / model_name
    '''
    # we do not distinguish TP / PP / engine_kind yet
    tp_size = 1
    pp_size = 1
    tp_pp_dir = 'tp' + str(tp_size) + '-pp' + str(pp_size) + '-gpu/'
    engine_dir = engine_dir / engine_kind / tp_pp_dir
    '''
    data_output_dir = resources_dir / 'data' / model_name
    data_output_dir.mkdir(exist_ok=True, parents=True)
    data_input_file_name = f"inputId-BS{num_batchs}-BM{num_beams}.npy"
    data_output_file_name = f"outputId-BS{num_batchs}-BM{num_beams}.npy"

    input_text = [
        "Born in north-east France, Soyer trained as a",
        "Jen-Hsun Huang was born in Tainan, Taiwan, in 1963. His family",
    ]
    if num_batchs <= 2:
        input_text = input_text[:num_batchs]
    else:
        # repeat the last prompt to fill the batch (list, not string, repetition)
        input_text = input_text + [input_text[-1]] * (num_batchs - 2)
    args = run.parse_arguments([
        '--engine_dir',
        str(engine_dir),
        '--tokenizer_dir',
        str(examples_chatglm_dir / model_name),
        '--input_text',
        *input_text,
        '--output_npy',
        str(data_output_dir / data_output_file_name),
        '--max_output_len',
        str(max_output_len),
        '--num_beams',
        str(num_beams),
    ])

    # Since main() in run.py does not save input_ids, we save them manually here
    model_name, model_version = run.read_model_name(args.engine_dir)
    tokenizer, pad_id, end_id = run.load_tokenizer(
        tokenizer_dir=args.tokenizer_dir,
        model_name=model_name,
        model_version=model_version,
    )
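    # Tokenize the prompts; parse_input returns a list of 1-D token-id tensors
    # whose lengths can differ per prompt.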
    batch_input_ids = run.parse_input(
        tokenizer,
        input_text=input_text,
        prompt_template=None,
        input_file=None,
        add_special_tokens=True,
        max_input_length=512,
        pad_id=pad_id,
        num_prepend_vtokens=[],
        model_name=model_name,
        model_version=model_version,
    )
    input_len = [x.size(0) for x in batch_input_ids]
    max_input_len = max(input_len)

    batch_input_ids_padding = torch.zeros([num_batchs, max_input_len],
                                          dtype=torch.int32) + end_id
    for i, sample in enumerate(batch_input_ids):
        # keep each sample as tokenized (any left padding preserved) and
        # fill the tail of the row with end_id
        batch_input_ids_padding[i, :len(sample)] = sample
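    # An alternative that strips the tokenizer's leading pad tokens is kept
    # below for reference: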
"""
|
|
# padding to right
|
|
nPadding = 0
|
|
for token in sample:
|
|
if token == pad_id:
|
|
nPadding += 1
|
|
else:
|
|
break
|
|
batch_input_ids_padding[i, :len(sample[nPadding:])] = sample[nPadding:]
|
|
"""
|
|
batch_input_ids = batch_input_ids_padding
|
|
|
|
    np.save(data_output_dir / data_input_file_name,
            batch_input_ids.detach().cpu().numpy())

    run.main(args)

    # reshape the flat output saved by run.main into [batch, beam, seq_len]
    output_data = np.load(args.output_npy)
    np.save(args.output_npy, output_data.reshape(num_batchs, num_beams, -1))

if __name__ == '__main__':
    generate_output(model_name='chatglm_6b', num_batchs=1, num_beams=1)

    generate_output(model_name='chatglm2_6b', num_batchs=1, num_beams=1)
    generate_output(model_name='chatglm2_6b', num_batchs=2, num_beams=1)
    generate_output(model_name='chatglm2_6b', num_batchs=1, num_beams=2)

    generate_output(model_name='chatglm3_6b', num_batchs=1, num_beams=1)
    generate_output(model_name='chatglm3_6b', num_batchs=2, num_beams=1)
    generate_output(model_name='chatglm3_6b', num_batchs=1, num_beams=2)

    print("Done")