TensorRT-LLMs/windows/examples/llama/benchmark.py

# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse
import subprocess
import sys
from pathlib import Path


def parse_arguments():
    parser = argparse.ArgumentParser(
        description="Benchmark llama TensorRT-LLM model.")
    parser.add_argument('--model_dir',
                        type=str,
                        default=None,
                        help="Directory with HF model")
    parser.add_argument('--quant_ckpt_path',
                        type=str,
                        default=None,
                        help="Path with quanitzed weights")
    parser.add_argument('--engine_dir',
                        type=str,
                        default=None,
                        help="Directory to store engines")

    return parser.parse_args()


def main(args):
    dir_level = 3  # How nested is the current file?
    top_level_path = Path(__file__).resolve().parents[dir_level]
    example_path = top_level_path / "examples" / "llama"
    benchmark_path = top_level_path / "benchmarks" / "python"

    input_seqlen = 100
    output_seqlen = 100
    batch_size = 8

    build_args = [
        sys.executable,
        str(example_path / "build.py"), "--model_dir",
        str(args.model_dir), "--quant_ckpt_path",
        str(args.quant_ckpt_path), "--dtype", "float16", "--log_level", "info",
        "--use_gpt_attention_plugin", "float16", "--use_gemm_plugin", "float16",
        "--enable_context_fmha", "--use_weight_only", "--weight_only_precision",
        "int4_gptq", "--per_group", "--max_input_len",
        str(input_seqlen), "--max_output_len",
        str(output_seqlen), "--n_positions",
        str(input_seqlen + output_seqlen + 1), "--max_batch_size",
        str(batch_size), "--output_dir",
        str(args.engine_dir)
    ]
    benchmark_args = [
        sys.executable,
        str(benchmark_path / "benchmark.py"), "--engine_dir",
        str(args.engine_dir), "--mode", "plugin", "-m", "llama_7b", "--dtype",
        "float16", "--log_level", "info", "--batch_size",
        str(batch_size), "--input_output_len",
        f"{input_seqlen},{output_seqlen}", "--num_beams", "1", "--warm_up", "1",
        "--num_runs", "3", "--duration", "10", "--csv"
    ]

    def run(pass_args):
        print("Running {}".format(" ".join(pass_args)))
        subprocess.run(pass_args)

    run(build_args)
    run(benchmark_args)


if __name__ == '__main__':
    args = parse_arguments()
    assert args.model_dir, "Please pass in path to model"
    assert args.quant_ckpt_path, "Please pass in path to quantized weights"
    if not args.engine_dir:
        args.engine_dir = Path.cwd() / "engines"

    assert Path(
        args.model_dir).exists(), "Please pass a valid, existing model path"
    assert Path(args.quant_ckpt_path).exists(
    ), "Please pass a valid, existing path to quantized weights"

    args.model_dir = Path(args.model_dir)
    args.quant_ckpt_path = Path(args.quant_ckpt_path)
    args.engine_dir = Path(args.engine_dir)

    main(args)