mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* Update TensorRT-LLM --------- Co-authored-by: Shixiaowei02 <39303645+Shixiaowei02@users.noreply.github.com>
100 lines
3.6 KiB
Python
100 lines
3.6 KiB
Python
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
# SPDX-License-Identifier: Apache-2.0
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
import argparse
|
|
import subprocess
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
|
|
def parse_arguments():
|
|
parser = argparse.ArgumentParser(
|
|
description="Benchmark llama TensorRT-LLM model.")
|
|
parser.add_argument('--model_dir',
|
|
type=str,
|
|
default=None,
|
|
help="Directory with HF model")
|
|
parser.add_argument('--quant_ckpt_path',
|
|
type=str,
|
|
default=None,
|
|
help="Path with quanitzed weights")
|
|
parser.add_argument('--engine_dir',
|
|
type=str,
|
|
default=None,
|
|
help="Directory to store engines")
|
|
|
|
return parser.parse_args()
|
|
|
|
|
|
def main(args):
|
|
dir_level = 3 # How nested is the current file?
|
|
top_level_path = Path(__file__).resolve().parents[dir_level]
|
|
example_path = top_level_path / "examples" / "llama"
|
|
benchmark_path = top_level_path / "benchmarks" / "python"
|
|
|
|
input_seqlen = 100
|
|
output_seqlen = 100
|
|
batch_size = 8
|
|
|
|
build_args = [
|
|
sys.executable,
|
|
str(example_path / "build.py"), "--model_dir",
|
|
str(args.model_dir), "--quant_ckpt_path",
|
|
str(args.quant_ckpt_path), "--dtype", "float16", "--log_level", "info",
|
|
"--use_gpt_attention_plugin", "float16", "--use_gemm_plugin", "float16",
|
|
"--enable_context_fmha", "--use_weight_only", "--weight_only_precision",
|
|
"int4_gptq", "--per_group", "--max_input_len",
|
|
str(input_seqlen), "--max_output_len",
|
|
str(output_seqlen), "--n_positions",
|
|
str(input_seqlen + output_seqlen + 1), "--max_batch_size",
|
|
str(batch_size), "--output_dir",
|
|
str(args.engine_dir)
|
|
]
|
|
benchmark_args = [
|
|
sys.executable,
|
|
str(benchmark_path / "benchmark.py"), "--engine_dir",
|
|
str(args.engine_dir), "--mode", "plugin", "-m", "llama_7b", "--dtype",
|
|
"float16", "--log_level", "info", "--batch_size",
|
|
str(batch_size), "--input_output_len",
|
|
f"{input_seqlen},{output_seqlen}", "--num_beams", "1", "--warm_up", "1",
|
|
"--num_runs", "3", "--duration", "10", "--csv"
|
|
]
|
|
|
|
def run(pass_args):
|
|
print("Running {}".format(" ".join(pass_args)))
|
|
subprocess.run(pass_args)
|
|
|
|
run(build_args)
|
|
run(benchmark_args)
|
|
|
|
|
|
if __name__ == '__main__':
|
|
args = parse_arguments()
|
|
assert args.model_dir, "Please pass in path to model"
|
|
assert args.quant_ckpt_path, "Please pass in path to quantized weights"
|
|
if not args.engine_dir:
|
|
args.engine_dir = Path.cwd() / "engines"
|
|
|
|
assert Path(
|
|
args.model_dir).exists(), "Please pass a valid, existing model path"
|
|
assert Path(args.quant_ckpt_path).exists(
|
|
), "Please pass a valid, existing path to quantized weights"
|
|
|
|
args.model_dir = Path(args.model_dir)
|
|
args.quant_ckpt_path = Path(args.quant_ckpt_path)
|
|
args.engine_dir = Path(args.engine_dir)
|
|
|
|
main(args)
|