# TensorRT-LLM: tests/integration/defs/common.py
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import os
import platform
import re
from difflib import SequenceMatcher
from pathlib import Path

from packaging import version

from .trt_test_alternative import check_call, check_output, exists, is_windows
def venv_check_call(venv, cmd, env=None, **kwargs):
    """Run cmd with the venv's Python via check_call(), from the venv's working directory."""

    def _war_check_call(*args, **kwargs):
        kwargs["cwd"] = venv.get_working_directory()
        return check_call(*args, **kwargs)

    venv.run_cmd(cmd, caller=_war_check_call, env=env, **kwargs)


def venv_check_output(venv, cmd, env=None, **kwargs):
    """Run cmd with the venv's Python via check_output() and return the captured output."""

    def _war_check_output(*args, **kwargs):
        kwargs["cwd"] = venv.get_working_directory()
        return check_output(*args, **kwargs)

    return venv.run_cmd(cmd, caller=_war_check_output, env=env, **kwargs)
def venv_mpi_check_call(venv, mpi_cmd, python_cmd):
"""
This function WAR check_call() to run python_cmd with mpi.
If mpi_cmd = ["mpirun", "-n", "2"] and python_cmd = ["run.py"], the command will be:
"mpirun -n 2 <venv python> run.py"
"""
def _war_check_call(*args, **kwargs):
assert len(args) == 1, "bad args"
arg_list, = args
merged_cmd = copy.deepcopy(mpi_cmd)
merged_cmd.extend(arg_list)
kwargs["cwd"] = venv.get_working_directory()
return check_call(merged_cmd, **kwargs)
venv.run_cmd(python_cmd, caller=_war_check_call)
def venv_mpi_check_output(venv, mpi_cmd, python_cmd, env=None):
"""
This function WAR check_output() to run python_cmd with mpi.
If mpi_cmd = ["mpirun", "-n", "2"] and python_cmd = ["run.py"], the command will be:
"mpirun -n 2 <venv python> run.py"
"""
def _war_check_output(*args, **kwargs):
assert len(args) == 1, "bad args"
arg_list, = args
merged_cmd = copy.deepcopy(mpi_cmd)
merged_cmd.extend(arg_list)
kwargs["cwd"] = venv.get_working_directory()
return check_output(merged_cmd, **kwargs)
return venv.run_cmd(python_cmd, caller=_war_check_output, env=env)
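
# Example (illustrative; paths are hypothetical):
#   venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2"],
#                       ["run.py", "--engine_dir", "/tmp/engines/llama"])
# runs "mpirun -n 2 <venv python> run.py --engine_dir /tmp/engines/llama".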
def parse_mpi_cmd(cmd):
if platform.system() == "Windows":
# Simply fetch necessary args from Linux cmd then fill Windows cmd because:
# 1. We use Microsoft MPI on Windows, while Open-MPI on Linux. Args are not compatible.
# 2. Multi-GPU is actually not supported on Windows for now.
flags = ("-n", "-np")
# append None if not found
indices = [idx for idx in range(len(cmd)) if cmd[idx] in flags] + [
None,
]
index = indices[0]
return ["mpiexec", cmd[index], cmd[index + 1]] if index else cmd
else:
return cmd
class PluginOptions:
def __init__(self,
gpt_attention: str = None,
bert_attention: str = None,
gemm: str = None,
layernorm: str = None):
        self.gpt_attention = gpt_attention
        self.bert_attention = bert_attention
        self.gemm = gemm
        self.layernorm = layernorm
def to_legacy_args(self):
args = []
if self.gpt_attention is not None:
args.extend(["--use_gpt_attention_plugin", self.gpt_attention])
if self.bert_attention is not None:
args.extend(["--use_bert_attention_plugin", self.bert_attention])
if self.gemm is not None:
args.extend(["--use_gemm_plugin", self.gemm])
return args
def to_args(self):
args = []
if self.gpt_attention is not None:
args.extend(["--gpt_attention_plugin", self.gpt_attention])
else:
args.extend(["--gpt_attention_plugin", "disable"])
if self.bert_attention is not None:
args.extend(["--bert_attention_plugin", self.bert_attention])
else:
args.extend(["--bert_attention_plugin", "disable"])
if self.gemm is not None:
args.extend(["--gemm_plugin", self.gemm])
else:
args.extend(["--gemm_plugin", "disable"])
return args
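
# Illustrative only (hypothetical values): how PluginOptions maps to build flags.
#   PluginOptions(gpt_attention="float16", gemm="float16").to_args()
#   -> ["--gpt_attention_plugin", "float16",
#       "--bert_attention_plugin", "disable",
#       "--gemm_plugin", "float16"]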
def prune_checkpoint(llm_venv, checkpoint_dir):
pruned_checkpoint_dir = checkpoint_dir + ".pruned"
prune_cmd = [
"trtllm-prune", f"--checkpoint_dir={checkpoint_dir}",
f"--out_dir={pruned_checkpoint_dir}"
]
check_call(" ".join(prune_cmd), shell=True, env=llm_venv._new_env)
return pruned_checkpoint_dir
def refit_model(llm_venv, engine_dir, unpruned_model_dir):
refit_engine_dir = f"{engine_dir}_refit_full"
    refit_cmd = [
        "trtllm-refit", f"--checkpoint_dir={unpruned_model_dir}",
        f"--engine_dir={engine_dir}", f"--output_dir={refit_engine_dir}"
    ]
check_call(" ".join(refit_cmd), shell=True, env=llm_venv._new_env)
return refit_engine_dir
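
# Illustrative prune-then-refit flow (directories are hypothetical):
#   pruned_dir = prune_checkpoint(llm_venv, "/tmp/ckpt/llama")   # -> "/tmp/ckpt/llama.pruned"
#   ... build an engine from pruned_dir with trtllm-build ...
#   full_engine_dir = refit_model(llm_venv, engine_dir, "/tmp/ckpt/llama")
#   # -> f"{engine_dir}_refit_full", the engine refit with the unpruned weights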
def convert_weights(llm_venv,
example_root,
cmodel_dir,
model,
model_path,
quant_ckpt_path=None,
data_type="float16",
gpus=1,
tp_size=None,
pp_size=None,
model_type=None,
use_parallel_embedding=False,
embedding_sharding_dim=0,
load_by_shard=False,
int8_kv_cache=False,
use_weight_only=False,
workers=1,
processes=None,
smoothquant=0,
per_channel=False,
per_token=False,
fp8_kv_cache=False,
enable_fp8=False,
weight_only_precision=None,
per_group=False,
batch_size=8,
multimodal=False,
ckpt_type='hf',
load_model_on_cpu=False,
**kwargs):
"Convert weights from HF transformers format to FT format"
converted_model_path = os.path.join(cmodel_dir, model, data_type)
script = "convert_checkpoint.py"
tp_size = gpus if tp_size is None else tp_size
pp_size = gpus // tp_size if pp_size is None else pp_size
gpus = tp_size * pp_size
model_dir = f'{converted_model_path}/{gpus}-gpu'
# TODO: add other models command
if "gpt2" in model:
script = "convert_checkpoint.py"
convert_cmd = [
f"{example_root}/{script}", f"--output_dir={model_dir}",
f"--dtype={data_type}", f"--tp_size={tp_size}",
f"--pp_size={pp_size}"
]
if "next" in model:
convert_cmd.extend(["--nemo_ckpt_path", model_path])
else:
convert_cmd.extend(["--model_dir", model_path])
if "smooth" in model:
convert_cmd.extend(["--smoothquant", "0.5"])
if "kv" in model and "int8" in model:
convert_cmd.append("--int8_kv_cache")
elif "t5" in model or "bart" in model or "ul2" in model or "wmt" in model or "nougat" in model or 'pix2struct' in model:
assert model_type, "Encoder-Decoder models must specify model architecture type"
script = "convert_checkpoint.py"
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", converted_model_path, f"--model_type={model_type}",
f"--tp_size={tp_size}", f"--pp_size={pp_size}",
f"--dtype={data_type}"
]
if "nougat" in model:
convert_cmd.append("--nougat")
model_dir = converted_model_path
elif "opt" in model and model_type == "blip2":
convert_cmd = [
f"{example_root}/{script}",
f"--model_dir={model_path}",
f"--output_dir={model_dir}",
f"--model_type={model_type}",
f"--dtype={data_type}",
f"--tp_size={tp_size}",
f"--pp_size={pp_size}",
]
elif "whisper" in model_path:
script = "convert_checkpoint.py"
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", converted_model_path
]
model_dir = converted_model_path
elif "mamba" in model:
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", model_dir, f"--dtype={data_type}",
f"--tp_size={tp_size}"
]
elif "llama" in model or "llava" in model or "vila" in model:
convert_cmd = [
f"{example_root}/{script}", "--output_dir", model_dir,
f"--dtype={data_type}", f"--tp_size={tp_size}",
f"--pp_size={pp_size}"
]
if 'meta-ckpt' in model:
convert_cmd.extend(['--meta_ckpt_dir', model_path])
else:
convert_cmd.extend(['--model_dir', model_path])
if 'code_llama_1gpu' in model:
convert_cmd.extend(['--rotary_base=1000000'])
convert_cmd.extend(['--vocab_size=32016'])
elif 'code_llama' in model:
convert_cmd.extend(['--rotary_base=1000000'])
convert_cmd.extend(['--vocab_size=32000'])
if 'int4_gptq' in model:
convert_cmd.extend([
"--use_weight_only", "--weight_only_precision=int4_gptq",
f"--quant_ckpt_path={quant_ckpt_path}", "--per_group"
])
if 'int8_gptq' in model:
convert_cmd.extend([
"--use_weight_only", "--weight_only_precision=int8_gptq",
f"--quant_ckpt_path={quant_ckpt_path}", "--per_group",
"--group_size=64"
])
if 'awq' in model:
convert_cmd.extend([
"--use_weight_only", "--weight_only_precision=int4_awq",
"--group_size=128"
])
if 'hf_fp8' in model:
convert_cmd.extend(["--use_fp8"])
elif "draft_target_model" in model:
if "gpt" in model_path:
example_name = "gpt"
elif "llama" in model_path:
example_name = "llama"
script = f"{example_root}/../models/core/{example_name}/convert_checkpoint.py"
convert_cmd = [
f"{script}",
"--model_dir",
model_path,
"--output_dir",
model_dir,
f"--dtype={data_type}",
]
elif "prompt_lookup" in model:
if "gpt" in model_path:
example_name = "gpt"
elif "llama" in model_path:
example_name = "llama"
script = f"{example_root}/../models/core/{example_name}/convert_checkpoint.py"
convert_cmd = [
f"{script}",
"--model_dir",
model_path,
"--output_dir",
model_dir,
f"--dtype={data_type}",
]
elif "medusa" in model:
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path[0],
"--medusa_model_dir", model_path[1], "--output_dir", model_dir,
f"--dtype={data_type}", f"--tp_size={tp_size}",
f"--pp_size={pp_size}", "--num_medusa_heads=4"
]
elif "redrafter" in model:
redrafter_num_beams = kwargs.pop("redrafter_num_beams")
redrafter_draft_len_per_beam = kwargs.pop(
"redrafter_draft_len_per_beam")
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path[0],
"--drafter_model_dir", model_path[1], "--output_dir", model_dir,
f"--dtype={data_type}", f"--tp_size={tp_size}",
f"--redrafter_num_beams={redrafter_num_beams}",
f"--redrafter_draft_len_per_beam={redrafter_draft_len_per_beam}"
]
elif "eagle" in model:
if len(model_path) == 2:
# Test the checkpoint released from HF, which requires two separate weights,
# one for the base model and one for the EagleNets.
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path[0],
"--eagle_model_dir", model_path[1], "--output_dir", model_dir,
f"--dtype={data_type}", f"--tp_size={tp_size}",
f"--pp_size={pp_size}", "--num_eagle_layers=4",
"--max_draft_len=63", "--max_non_leaves_per_layer=10"
]
else:
# Test the checkpoint released from ModelOpt, which only requires one weight,
# which includes both the base model and EagleNets, and is an FP8 datatype.
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", model_dir, f"--dtype={data_type}",
f"--tp_size={tp_size}", f"--pp_size={pp_size}",
"--num_eagle_layers=4", "--max_draft_len=63",
"--max_non_leaves_per_layer=10"
]
elif "recurrentgemma" in model:
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", model_dir, f"--dtype={data_type}",
f"--world_size={tp_size}", f"--ckpt_type={ckpt_type}"
]
elif "cogvlm" in model:
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", model_dir, f"--dtype={data_type}",
f"--tp_size={tp_size}", f"--pp_size={pp_size}",
"--use_prompt_tuning"
]
elif "fuyu" in model or "kosmos" in model:
gpt_variant = "kosmos-2" if "kosmos" in model else "persimmon"
convert_cmd = [
f"{example_root}/{script}", "--model_dir", model_path,
"--output_dir", model_dir, "--dtype", data_type, "--gpt_variant",
gpt_variant
]
elif "neva-22b" in model:
convert_cmd = [
f"{example_root}/{script}", "--nemo_ckpt_path", model_path,
"--output_dir", model_dir, "--dtype", data_type,
"--nemo_rename_key", "model:model.language_model",
"attention.linear_qkv.layer_norm_bias:input_layernorm.bias",
"attention.linear_qkv.layer_norm_weight:input_layernorm.weight",
"mlp.linear_fc1.layer_norm_bias:post_attention_layernorm.bias",
"mlp.linear_fc1.layer_norm_weight:post_attention_layernorm.weight",
"linear_qkv:query_key_value", "linear_fc1:dense_h_to_4h",
"linear_fc2:dense_4h_to_h", "linear_proj:dense", "decoder:encoder"
]
elif "video-neva" in model:
nemotron_root = os.path.join(example_root, "../", "nemotron")
if llm_venv:
# Install Python requirements for nemotron
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(nemotron_root, "requirements.txt")
])
qformat = 'full_prec'
model_name = 'nemotron-video-neva'
converted_model_path = os.path.join(cmodel_dir, model_name, qformat)
model_dir = f'{converted_model_path}/{gpus}-gpu'
# Overwrite the model_path with the nemotron model path
model_path = os.path.join(os.path.dirname(os.path.dirname(model_path)),
'nemotron', 'Nemotron-4-15B-SteerLM.nemo')
convert_cmd = [
f"{example_root}/../quantization/quantize.py",
f"--nemo_ckpt_path={model_path}",
"--batch_size=64",
f"--dtype={data_type}",
f"--qformat={qformat}",
f"--output_dir={model_dir}",
]
elif "dit-xl" in model.lower():
convert_cmd = [
f"{example_root}/{script}",
f"--timm_ckpt={model_path}",
f"--output_dir={model_dir}",
f"--dtype={data_type}",
f"--tp_size={tp_size}",
f"--pp_size={pp_size}",
]
if kwargs.get("enable_fp8_linear") is not None:
convert_cmd.append("--fp8_linear")
elif "stdit" in model.lower():
convert_cmd = [
f"{example_root}/{script}",
f"--timm_ckpt={model_path}/model.safetensors",
f"--output_dir={model_dir}",
f"--dtype={data_type}",
f"--tp_size={tp_size}",
f"--pp_size={pp_size}",
]
elif "bert" in model.lower():
convert_cmd = [
f"{example_root}/{script}",
f"--model={model}",
f"--model_dir={model_path}",
f"--output_dir={model_dir}",
f"--dtype={data_type}",
f"--tp_size={tp_size}",
]
elif "granite" in model.lower():
convert_cmd = [
f"{example_root}/{script}",
f"--model_dir={model_path}",
f"--output_dir={model_dir}",
f"--dtype={data_type}",
f"--tp_size={tp_size}",
]
elif "stable-diffusion-3.5" in model.lower():
convert_cmd = [
f"{example_root}/{script}",
f"--model_path={model_path}",
f"--output_dir={model_dir}",
f"--tp_size={tp_size}",
]
else:
convert_cmd = [
f"{example_root}/{script}",
f"--model_dir={model_path}",
f"--output_dir={model_dir}",
f"--dtype={data_type}",
f"--tp_size={tp_size}",
f"--pp_size={pp_size}",
]
if use_parallel_embedding:
convert_cmd.append("--use_parallel_embedding")
convert_cmd.append(f"--embedding_sharding_dim={embedding_sharding_dim}")
if load_by_shard:
convert_cmd.extend(["--load_by_shard"])
if load_model_on_cpu:
convert_cmd.extend(["--load_model_on_cpu"])
if workers > 1:
convert_cmd.extend([f"--workers={workers}"])
if int8_kv_cache:
convert_cmd.append("--int8_kv_cache")
if use_weight_only:
convert_cmd.append("--use_weight_only")
if weight_only_precision:
convert_cmd.append(f"--weight_only_precision={weight_only_precision}")
if processes is not None:
convert_cmd.append(f"--processes={processes}")
if smoothquant > 0:
convert_cmd.append(f"--smoothquant={smoothquant}")
if per_channel:
convert_cmd.append("--per_channel")
if per_token:
convert_cmd.append("--per_token")
if enable_fp8:
convert_cmd.append('--enable_fp8')
if fp8_kv_cache:
convert_cmd.append('--fp8_kv_cache')
if quant_ckpt_path:
convert_cmd.append(f"--quant_ckpt_path={quant_ckpt_path}")
if per_group:
convert_cmd.append("--per_group")
for key, value in kwargs.items():
if isinstance(value, bool):
if value:
convert_cmd.append(f"--{key}")
else:
convert_cmd.extend([f"--{key}={value}"])
if llm_venv:
venv_check_call(llm_venv, convert_cmd)
return model_dir
else:
return convert_cmd, model_dir
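
# Illustrative call (paths are hypothetical): convert a llama HF checkpoint into a
# 2-way tensor-parallel TensorRT-LLM checkpoint.
#   ckpt_dir = convert_weights(llm_venv, "<llama example root>",
#                              cmodel_dir="/tmp/cmodels", model="llama-v2-7b",
#                              model_path="/data/llama-v2-7b-hf", gpus=2)
#   # -> "/tmp/cmodels/llama-v2-7b/float16/2-gpu"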
def similarity_score(a, b):
    "Return the similarity ratio between a and b."
    return SequenceMatcher(None, a, b).ratio()


def similar(a, b, threshold=0.8):
    "Return True if the similarity ratio between a and b is at least threshold."
    return similarity_score(a, b) >= threshold
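
# Illustrative: similarity_score("The cat sat", "The cat sat.") is ~0.96, so
# similar("The cat sat", "The cat sat.") is True with the default 0.8 threshold.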
def generate_summary_cmd(example_root, *args, **kwargs):
"generate summary command"
summarize_script = f"{example_root}/../../../summarize.py" if "core" in example_root else f"{example_root}/../summarize.py"
summary_cmd = [summarize_script, "--test_trt_llm", "--check_accuracy"]
for key, value in kwargs.items():
if isinstance(value, bool):
if value:
summary_cmd.append(f"--{key}")
elif isinstance(value, list): # Support max_attention_window
summary_cmd.extend([f"--{key}", *map(str, value)])
else:
summary_cmd.extend([f"--{key}", f"{value}"])
for arg in args:
summary_cmd.append(f"--{arg}")
return summary_cmd
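
# Illustrative (hypothetical values): keyword arguments become "--key value" pairs,
# list values are expanded, and positional args become bare "--flag" switches.
#   generate_summary_cmd(example_root, "use_py_session",
#                        engine_dir="/tmp/engine", batch_size=1,
#                        max_attention_window=[2048])
#   # -> [<summarize.py>, "--test_trt_llm", "--check_accuracy",
#   #     "--engine_dir", "/tmp/engine", "--batch_size", "1",
#   #     "--max_attention_window", "2048", "--use_py_session"]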
def generate_deterministic_cmd(example_root, *args, **kwargs):
"generate deterministic command"
deterministic_cmd = [
f"{example_root}/mixtral_deterministic.py",
"--check_deterministic_accuracy"
]
for key, value in kwargs.items():
if isinstance(value, bool):
if value:
                deterministic_cmd.append(f"--{key}")
else:
deterministic_cmd.extend([f"--{key}", f"{value}"])
for arg in args:
deterministic_cmd.append(f"--{arg}")
return deterministic_cmd
def quantize_data(llm_venv,
example_root,
model_dir,
dtype,
quantize_dir,
qformat="full_prec",
tp_size=1,
pp_size=1,
cp_size=1,
calib_size=512,
kv_cache_dtype=None,
**kwargs):
"quanize data and return data dir"
model_name = os.path.basename(model_dir)
output_dir = os.path.join(quantize_dir, model_name, dtype, qformat,
f"tp{tp_size}pp{pp_size}")
if kv_cache_dtype:
output_dir = os.path.join(output_dir, kv_cache_dtype)
else:
output_dir = os.path.join(output_dir, "no_kv_cache")
quantize_script = f"{example_root}/../../../quantization/quantize.py" if "core" in example_root else f"{example_root}/../quantization/quantize.py"
quantize_cmd = [
quantize_script,
f"--model_dir={model_dir}",
f"--dtype={dtype}",
f"--qformat={qformat}",
f"--output_dir={output_dir}",
f"--tp_size={tp_size}",
f"--pp_size={pp_size}",
f"--cp_size={cp_size}",
f"--calib_size={calib_size}",
]
if kv_cache_dtype:
quantize_cmd.append(f"--kv_cache_dtype={kv_cache_dtype}")
for key, value in kwargs.items():
if isinstance(value, bool):
if value:
quantize_cmd.append(f"--{key}")
else:
quantize_cmd.extend([f"--{key}", f"{value}"])
if llm_venv:
if not exists(output_dir):
venv_check_call(llm_venv, quantize_cmd)
return output_dir
else:
return quantize_cmd, output_dir
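
# Illustrative call (paths are hypothetical): produce an FP8 checkpoint with an FP8 KV cache.
#   quantize_data(llm_venv, example_root, model_dir="/data/llama-v2-7b-hf",
#                 dtype="float16", quantize_dir="/tmp/quantized",
#                 qformat="fp8", kv_cache_dtype="fp8")
#   # -> "/tmp/quantized/llama-v2-7b-hf/float16/fp8/tp1pp1/fp8"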
def find_tensorrt(ld_library_path):
MAX_SEARCH_HEIGHT = 10
ld_library_path = ld_library_path.split(os.pathsep)
for trt_lib_dir in ld_library_path:
trt_lib_dir = Path(trt_lib_dir)
trt_nvinfer_lib = trt_lib_dir / "libnvinfer.so"
if trt_nvinfer_lib.exists():
trt_root_dir = trt_lib_dir
for i in range(MAX_SEARCH_HEIGHT):
trt_root_dir = trt_root_dir.parent
trt_include_dir = trt_root_dir / "include"
trt_nvinfer_header = trt_include_dir / "NvInfer.h"
if trt_nvinfer_header.exists():
return str(trt_include_dir), str(trt_lib_dir)
return None, None
def get_trt_llm_lib_dir(venv):
output = venv.run_raw(
"import tensorrt_llm; print(f'{tensorrt_llm.__path__[0]}/libs')",
caller=check_output).strip()
if "TensorRT-LLM version: " in output:
output = output.split('\n')[-1]
return output.strip()
def trt_gte(venv, major: int, minor: int = 0):
"""
Check if TRT version is greater than or equal to major.minor
"""
ver = venv.run_output("import tensorrt;print(tensorrt.__version__)")
trt_ver = version.parse(ver)
    return (trt_ver.major, trt_ver.minor) >= (major, minor)
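
# Illustrative usage (assumes pytest is available in the calling test):
#   if not trt_gte(llm_venv, 10, 1):
#       pytest.skip("requires TensorRT >= 10.1")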
def parse_output(text):
"parse output"
results = []
text_lists = re.split(r"Input \[Text \d\]:", text)
for item in text_lists:
item = item.replace(os.linesep, "")
while True:
match = re.search(
r"(Output \[Text \d+ Beam \d+\]: \"(.*?)\")(Output|Input|$)",
item, re.MULTILINE)
if match is None:
break
_, end = match.span(1)
results.append(match.group(2))
item = item[end:]
return results
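
# Illustrative: parse_output() pulls the quoted generations out of run.py-style logs.
#   text = 'Input [Text 0]: "Hello"Output [Text 0 Beam 0]: "Hi there"'
#   parse_output(text)  # -> ["Hi there"]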
def run_and_check(llm_venv, run_cmd, valid_outputs, streaming=False):
print("Running inference...")
output = venv_check_output(llm_venv, run_cmd)
if not streaming:
output = parse_output(output)[0]
assert any([
similar(output, expect, threshold=0.95) for expect in valid_outputs
]), f"output is: {output}"
else:
# Fetch all outputs and expect a monotonically increasing similarity
similarities = []
for suboutput in parse_output(output):
similarities.append(
max([
similarity_score(suboutput, expect)
for expect in valid_outputs
]))
assert (
all(x <= y for x, y in zip(similarities, similarities[1:]))
), f"streaming outputs must have a monotonically increasing similarity score. similarities: {similarities}"
output = parse_output(output)[-1]
assert any([
similar(output, expect, threshold=0.95) for expect in valid_outputs
]), f"output is: {output}"
def get_cpp_benchmark(cpp_benchmark_name, llm_root):
suffix = ".exe" if is_windows() else ""
cpp_benchmark_name += suffix
    # In CI/CD, the cpp benchmark binary is copied to <parent of llm_root>/benchmarks/cpp to avoid package sanity check failures
ci_path = os.path.join(os.path.dirname(os.path.realpath(llm_root)),
"benchmarks", "cpp", cpp_benchmark_name)
if os.path.exists(ci_path):
return ci_path
# In QA, we keep the benchmark build at its original location
qa_path = os.path.join(llm_root, "cpp", "build", "benchmarks",
cpp_benchmark_name)
if os.path.exists(qa_path):
return qa_path
raise Exception(
f"Cannot find cpp benchmark binary in either {ci_path} or {qa_path}. Did you forget --benchmark in building TRT-LLM?"
)
def generate_dummy_loras(
hf_model_dir,
lora_output_dir,
num_loras=1,
lora_rank=8,
target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
zero_weights=False):
import torch
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM
print("Creating pseudo LoRAs...")
model = AutoModelForCausalLM.from_pretrained(
hf_model_dir,
torch_dtype=torch.float16,
device_map="auto",
trust_remote_code=True,
)
lora_config = LoraConfig(r=lora_rank,
target_modules=target_modules,
bias="none",
task_type="CAUSAL_LM")
lora_output_paths = []
for lora_idx in range(num_loras):
lora_model = get_peft_model(model, lora_config)
if zero_weights:
for param in lora_model.parameters():
param.data.zero_()
pseudo_lora_dir = f"{lora_output_dir}/pseudo_lora_{lora_idx}"
lora_model.save_pretrained(pseudo_lora_dir)
lora_output_paths.append(pseudo_lora_dir)
return lora_output_paths
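
# Illustrative call (paths are hypothetical): create two zero-weight LoRA adapters.
#   lora_dirs = generate_dummy_loras("/data/llama-v2-7b-hf", "/tmp/loras",
#                                    num_loras=2, zero_weights=True)
#   # -> ["/tmp/loras/pseudo_lora_0", "/tmp/loras/pseudo_lora_1"]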
def test_multi_lora_support(
hf_model_dir,
tllm_ckpt_dir,
engine_dir,
llm_venv,
example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
use_code_prompts=False,
):
print("Creating dummy LoRAs...")
lora_paths = generate_dummy_loras(
hf_model_dir=hf_model_dir,
lora_output_dir=llm_venv.get_working_directory(),
num_loras=num_loras,
lora_rank=lora_rank,
target_modules=target_hf_modules,
zero_weights=zero_lora_weights)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={tllm_ckpt_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--gemm_plugin=auto",
"--lora_plugin=auto",
"--max_batch_size=8",
"--max_input_len=512",
"--max_seq_len=562",
"--lora_dir",
f"{lora_paths[0]}",
f"{lora_paths[1]}",
"--max_lora_rank=8",
"--lora_target_modules",
*target_trtllm_modules,
"--max_beam_width=1",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if use_code_prompts:
input_prompts = [
"Write a function that outputs the fibonacci sequence.",
"Convert the following C++ code to Python: x = 0;x++;",
"Find the largest prime factor of 42.",
"write a unit test for this function: $(cat fib.py)",
"# A simple python function to remove whitespace from a string:",
"How to load CodeLlama from HuggingFace?",
]
else:
input_prompts = [
"Hey how are you doing today?",
"How is the weather in Seattle, WA?",
"Is it ok to fill diesel in a petrol car?",
"Can you check the top 5 trending songs on spotify?",
"What is the capital of France?",
"How to load CodeLlama from HuggingFace?",
]
print("Run inference with C++ runtime with pybind...")
run_script = f"{example_root}/../../../run.py" if "core" in example_root else f"{example_root}/../run.py"
run_cmd = [
run_script,
f"--tokenizer_dir={hf_model_dir}",
f"--engine_dir={engine_dir}",
"--input_text",
*input_prompts,
"--lora_task_uids",
"-1",
"0",
"1",
"-1",
"0",
"1",
"--top_p=0.5",
"--top_k=0",
"--random_seed=0",
"--max_output_len=30",
]
venv_check_call(llm_venv, run_cmd)
def get_dummy_spec_decoding_heads(hf_model_dir,
save_dir,
mode='medusa',
num_heads=4,
num_layers=1):
import modelopt.torch.opt as mto
import modelopt.torch.speculative as mtsp
import transformers
from modelopt.torch.export import export_hf_checkpoint
# Create the base model.
model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir, trust_remote_code=True)
if mode == "medusa":
config = {
"medusa_num_heads": num_heads,
"medusa_num_layers": num_layers,
}
elif mode == "eagle":
config = {
"eagle_num_layers": num_layers,
"use_input_layernorm_in_first_layer": True,
"use_last_layernorm": False,
}
else:
raise NotImplementedError(f"Unknown mode {mode}.")
mtsp.convert(model, [(mode, config)])
tokenizer = transformers.AutoTokenizer.from_pretrained(hf_model_dir)
tokenizer.pad_token_id = tokenizer.eos_token_id
# Create a dummy trainer.
trainer = transformers.Trainer(model=model, tokenizer=tokenizer)
trainer._move_model_to_device(model, 'cuda')
# Enable HF checkpointing so that the saved model will contain the speculative decoding module.
mto.enable_huggingface_checkpointing()
trainer.save_model(os.path.join(save_dir, 'native'))
tokenizer.save_pretrained(os.path.join(save_dir, 'native'))
import modelopt.torch.quantization as mtq
import modelopt.torch.utils.dataset_utils as dataset_utils
mto.enable_huggingface_checkpointing()
model = transformers.AutoModelForCausalLM.from_pretrained(
os.path.join(save_dir, 'native'))
tokenizer = transformers.AutoTokenizer.from_pretrained(
os.path.join(save_dir, 'native'))
calib_dataloader = dataset_utils.get_dataset_dataloader(
dataset_name="cnn_dailymail",
tokenizer=tokenizer,
batch_size=1,
num_samples=1,
device=model.device,
include_labels=False,
)
quant_cfg = getattr(mtq, "FP8_DEFAULT_CFG")
# Following quantizers are needed for KV cache quantization.
quant_cfg["quant_cfg"]["*output_quantizer"] = {
"num_bits": (4, 3),
"axis": None,
"enable": True,
}
quant_cfg["quant_cfg"]["*k_bmm_quantizer"] = {
"num_bits": (4, 3),
"axis": None,
"enable": True,
}
quant_cfg["quant_cfg"]["*v_bmm_quantizer"] = {
"num_bits": (4, 3),
"axis": None,
"enable": True,
}
calibrate_loop = dataset_utils.create_forward_loop(
calib_dataloader, dataloader=calib_dataloader)
model = mtq.quantize(model, quant_cfg, forward_loop=calibrate_loop)
mtq.print_quant_summary(model)
export_hf_checkpoint(model,
dtype=model.config.torch_dtype,
export_dir=os.path.join(save_dir, 'fp8'))
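
# Illustrative call (paths are hypothetical): attach dummy Medusa heads to a base model
# and export both a native and an FP8-quantized HF checkpoint.
#   get_dummy_spec_decoding_heads("/data/vicuna-7b-hf", "/tmp/medusa_ckpt", mode="medusa")
#   # -> checkpoints under "/tmp/medusa_ckpt/native" and "/tmp/medusa_ckpt/fp8"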