# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
import csv
import json
import os
import re
import shutil
import defs.ci_profiler
import pytest
from defs.common import (convert_weights, generate_summary_cmd, parse_output,
quantize_data, similar,
test_llm_torch_multi_lora_support,
test_multi_lora_support, venv_check_call,
venv_check_output, venv_mpi_check_call)
# yapf: disable
from defs.conftest import (get_device_count, get_device_memory,
get_host_total_memory, get_sm_version,
skip_fp8_pre_ada, skip_no_nvls, skip_post_blackwell,
skip_post_blackwell_ultra, skip_pre_ada,
skip_pre_blackwell)
# yapf: enable
from defs.trt_test_alternative import check_call, exists

# skip trt flow cases on post-Blackwell-Ultra
# if get_sm_version() >= 103:
#     pytest.skip(
#         "TRT workflow tests are not supported on post Blackwell-Ultra architecture",
#         allow_module_level=True)

INPUT_TEXT_1 = "After Washington had returned to Williamsburg, " + \
"Dinwiddie ordered him to lead a larger force to assist Trent in his work. " + \
"While en route, Washington learned of Trent's retreat. " + \
"Since Tanaghrisson had promised support to the British, " + \
"Washington continued toward Fort Duquesne and met with the Mingo leader. " + \
"Learning of a French scouting party in the area, Washington, " + \
"with Tanaghrisson and his party, surprised the Canadians on May 28 " + \
"in what became known as the Battle of Jumonville Glen. " + \
"They killed many of the Canadians, including their commanding officer, " + \
"Joseph Coulon de Jumonville, whose head was reportedly split open by " + \
"Tanaghrisson with a tomahawk. The historian Fred Anderson suggests that " + \
"Tanaghrisson was acting to gain the support of the British and regain " + \
"authority over his own people. They had been inclined to support the French, " + \
"with whom they had long trading relationships. One of Tanaghrisson's men told " + \
"Contrecoeur that Jumonville had been killed by British musket fire. " + \
"Question: Upon learning of a French scounting party in the area, " + \
"what did Washington do? Answer:"
INPUT_TEXT_2 = "Born in north-east France, Soyer trained as a"
@pytest.mark.parametrize("num_beams", [5, 7],
ids=["num_beams_4", "num_beams_7"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_early_finish_beams(llama_example_root, llm_venv, llama_model_root,
engine_dir, cmodel_dir, num_beams):
""" Test the correctness of beam search + streaming versus the outputs of
non-streaming beam search. Both use the cpp runtime.
This test is aimed specifically at checking if shorter finished beams are being put
into the outputs correctly."""
dtype = 'float16'
output_len = 10
input_text = ["want to", "The time is", "Soyer was"]
model_name = os.path.basename(llama_model_root)
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={8}",
f"--max_input_len={924}",
f"--max_seq_len={1024}",
f"--gpt_attention_plugin={dtype}",
f"--gemm_plugin={dtype}",
f"--max_beam_width={num_beams}",
"--context_fmha=enable",
"--use_paged_context_fmha=enable",
"--paged_kv_cache=enable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Running inference...")
    streaming_command = [
        f"{llama_example_root}/../run.py", f"--max_output_len={output_len}",
        f"--engine_dir={engine_dir}", f"--tokenizer_dir={llama_model_root}",
        "--streaming", "--streaming_interval=1", f"--num_beams={num_beams}",
        "--input_text", *input_text
    ]
streaming_outputs = venv_check_output(llm_venv, streaming_command)
joined_nonstreamed_outputs = ""
for length_iterator in range(1, output_len + 1):
        command = [
            f"{llama_example_root}/../run.py",
            f"--max_output_len={length_iterator}", f"--engine_dir={engine_dir}",
            f"--tokenizer_dir={llama_model_root}", f"--num_beams={num_beams}",
            "--input_text", *input_text
        ]
non_streaming_output = venv_check_output(llm_venv, command)
joined_nonstreamed_outputs += "Output from command" + str(
command) + "\n" + non_streaming_output

    def parse_beam_outputs(text: str) -> list[str]:
        """Extract the generated beam texts from run.py's stdout."""
        results = []
        while True:
            match = re.search(
                r"Output \[Text \d+ Beam \d+\]: \"([^\"]*)\"\r?\n", text)
            if match is None:
                break
            _, end = match.span()
            results.append(match.group(1))
            text = text[end:]
        return results
print("STREAMING OUTPUT HERE\n\n\n",
streaming_outputs,
"\n\n\n",
sep="----")
print("NON-STREAMING OUTPUT HERE\n\n\n",
joined_nonstreamed_outputs,
"\n\n\n",
sep="----")
    parsed_streamed_outputs = parse_beam_outputs(streaming_outputs)
    parsed_nonstreamed_outputs = parse_beam_outputs(joined_nonstreamed_outputs)

    def ordered_subset(s1, s2):
        """Check that s1 (the streamed outputs) is an ordered subset of s2
        (the non-streamed outputs); streaming can sometimes skip outputs."""
        s2 = iter(s2)
        try:
            for c in s1:
                while next(s2) != c:
                    pass
            return True
        except StopIteration:
            return False
streaming_is_subset = ordered_subset(parsed_streamed_outputs,
parsed_nonstreamed_outputs)
print("streaming_is_subset ", streaming_is_subset)
assert streaming_is_subset
is_equal = (parsed_streamed_outputs == parsed_nonstreamed_outputs)
print("is_equal", is_equal)
if not is_equal:
print("Differences:")
for streamed, nonstreamed in zip(parsed_streamed_outputs,
parsed_nonstreamed_outputs):
if (streamed != nonstreamed):
print("Streamed:", streamed)
print("Nonstreamed:", nonstreamed)
assert is_equal
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("use_weight_only_groupwise_quant_matmul_plugin",
[True, False],
ids=[
"enable_weight_only_groupwise_quant_matmul_plugin",
"disable_weight_only_groupwise_quant_matmul_plugin"
])
@pytest.mark.parametrize("run_type", ['inference', 'summarization'])
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu(use_weight_only_groupwise_quant_matmul_plugin,
run_type, data_type, llama_example_root,
llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir, num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
model_name = 'llama_v1-{}'.format(run_type)
print("Build engines...")
if not use_weight_only_groupwise_quant_matmul_plugin:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type)
else:
model_name = 'llama_v1-int4_gptq-{}'.format(run_type)
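        # The pre-quantized GPTQ checkpoint is resolved relative to the HF
        # model directory (two levels up).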
llama_gptq_safetensors_root = os.path.join(
llama_model_root, "../..", "int4-quantized-gptq-awq",
"llama-7b-4bit-gs128.safetensors")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
quant_ckpt_path=llama_gptq_safetensors_root)
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}", "--remove_input_padding=enable",
f"--max_beam_width={num_beams}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if run_type == "inference":
print("Run inference...")
venv_check_call(llm_venv, [
f"{llama_example_root}/../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
])
elif run_type == "summarization":
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type=fp16",
f"--engine_dir={engine_dir}", "--check_accuracy",
f"--num_beams={num_beams}", f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_manage_weights_1gpu_summarize(llama_example_root,
llama_model_root,
llm_datasets_root,
llm_rouge_root, llm_venv,
cmodel_dir, engine_dir):
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama_v1-float16",
model_path=llama_model_root,
data_type="float16")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin=float16",
f"--gemm_plugin=disable",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type=fp16",
f"--engine_dir={engine_dir}", "--check_accuracy",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@skip_pre_blackwell
@skip_post_blackwell_ultra
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize("fuse_fp4_quant", ["enable", "disable"],
ids=["enable_fused_quant", "disable_fused_quant"])
@pytest.mark.parametrize(
"norm_quant_fusion", ["enable", "disable"],
ids=["enable_norm_quant_fusion", "disable_norm_quant_fusion"])
@pytest.mark.parametrize(
"llama_model_root",
['llama-v3-8b-instruct-hf', 'llama-3.1-8b', 'llama-3.1-70b-instruct'],
indirect=True)
def test_llm_llama_1gpu_fp4(
mmlu_dataset_root,
data_type,
fp4_type,
fuse_fp4_quant,
norm_quant_fusion,
llama_example_root,
llama_model_root,
llm_venv,
cmodel_dir,
engine_dir,
qcache_dir_without_install_package,
llm_datasets_root,
):
model_name = os.path.basename(llama_model_root)
if fp4_type != "disable":
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="nvfp4",
kv_cache_dtype="fp8",
quantize_dir=qcache_dir_without_install_package)
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--max_batch_size=32"
]
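    # "ootb" exercises the non-plugin (OOTB) FP4 GEMM path, while "plugin"
    # uses the nvfp4 GEMM plugin; "disable" builds without FP4 quantization.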
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
if fp4_type == "plugin" or fuse_fp4_quant == "enable":
build_cmd.extend([
"--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable"
])
if fuse_fp4_quant == "enable":
build_cmd.extend(["--fuse_fp4_quant=enable"])
if norm_quant_fusion == 'enable':
build_cmd.extend(["--norm_quant_fusion=enable"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
accuracy_map = {
'llama-v3-8b-instruct-hf': 61.5,
'Meta-Llama-3.1-8B': 61.0,
'Meta-Llama-3.1-70B-Instruct': 75
}
acc_thres = accuracy_map[model_name]
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={acc_thres}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_pre_blackwell
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize(
"llama_model_root",
['llama-v3-8b-instruct-hf', 'llama-3.1-8b', 'llama-3.1-70b-instruct'],
indirect=True)
def test_llm_llama_1gpu_fp4_model_config(
fp4_type,
llama_example_root,
llama_model_root,
llm_venv,
cmodel_dir,
engine_dir,
qcache_dir_without_install_package,
llm_datasets_root,
):
model_name = os.path.basename(llama_model_root)
if fp4_type != "disable":
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="nvfp4",
kv_cache_dtype="fp8",
quantize_dir=qcache_dir_without_install_package)
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type="float16")
print("Build engines...")
build_cmd = [
"trtllm-build", f"--model_config={model_dir}/config.json",
f"--output_dir={engine_dir}", "--max_batch_size=32"
]
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
if fp4_type == "plugin":
build_cmd.extend([
"--use_paged_context_fmha=enable", "--use_fp8_context_fmha=enable"
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
@skip_pre_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-70b-instruct'],
indirect=True)
def test_llm_llama_2gpu_fp4(mmlu_dataset_root, fp4_type, llama_example_root,
llama_model_root, llm_venv, engine_dir,
qcache_dir_without_install_package,
llm_datasets_root):
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="nvfp4",
tp_size=2,
quantize_dir=qcache_dir_without_install_package)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--max_batch_size=32",
]
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
acc_thres = 75
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={acc_thres}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize("fp4_type", ["plugin", "ootb", "disable"],
ids=["fp4_plugin", "fp4_ootb", "disable_fp4"])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-405b'], indirect=True)
def test_llm_llama_8gpu_fp4(mmlu_dataset_root, fp4_type, llama_example_root,
llama_model_root, llm_venv, engine_dir,
qcache_dir_without_install_package,
llm_datasets_root, upgrade_transformers):
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="nvfp4",
tp_size=8,
quantize_dir=qcache_dir_without_install_package)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--max_batch_size=32", "--workers=4"
]
if fp4_type != "disable":
build_cmd.extend([
"--gemm_plugin=disable"
if fp4_type == "ootb" else "--gemm_plugin=nvfp4"
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
acc_thres = 75
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={acc_thres}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("run_type", ['inference', 'summarization'])
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("fp8_cache", [True, False],
ids=["enable_fp8", "disable_fp8"])
@pytest.mark.parametrize("llama_model_root", [
'llama-v2-7b-hf', 'llama-v3-8b-instruct-hf', 'llama-3.1-8b-instruct-hf-fp8'
],
indirect=True)
def test_llm_llama_1gpu(run_type, data_type, fp8_cache, llama_example_root,
llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
qcache_dir_without_install_package, num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
    use_fp8 = fp8_cache or "fp8" in llama_model_root.lower()
skip_fp8_pre_ada(use_fp8=use_fp8)
model_name = os.path.basename(llama_model_root)
if llama_model_root.endswith('Llama-3.1-8B-Instruct-FP8'):
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama_v3_hf_fp8",
model_path=llama_model_root,
fp8_kv_cache=fp8_cache,
data_type=data_type)
elif fp8_cache:
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
enable_fp8=fp8_cache,
fp8_kv_cache=fp8_cache,
quant_ckpt_path=
f"{qcache_dir_without_install_package}/quantized_fp8/llama_tp1_rank0.npz"
if fp8_cache else None)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if run_type == "inference":
print("Run inference...")
venv_check_call(llm_venv, [
f"{llama_example_root}/../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
])
elif run_type == "summarization":
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 14,
2: 19,
4: 19,
}[num_beams]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_pre_ada
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_1gpu_fp8_kv_cache(
data_type,
llama_example_root,
llama_model_root,
llm_datasets_root,
llm_rouge_root,
llm_venv,
cmodel_dir,
engine_dir,
qcache_dir_without_install_package,
):
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
"--use_paged_context_fmha=enable",
"--use_fp8_context_fmha=enable",
"--max_beam_width=1",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
with open(f"{engine_dir}/config.json") as f:
engine_config = json.load(f)
assert engine_config["build_config"]["plugin_config"][
"use_fp8_context_fmha"] == True
assert engine_config["pretrained_config"]["quantization"][
"kv_cache_quant_algo"] == "FP8"
@pytest.mark.parametrize("use_weight_sparsity", [True],
ids=["enable_weight_sparsity"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_sparsity(llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root, llm_venv,
cmodel_dir, engine_dir,
use_weight_sparsity):
model_name = 'llama_v2'
data_type = 'float16'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--log_level=verbose"
]
if use_weight_sparsity:
build_cmd.extend(["--weight_sparsity"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference...")
venv_check_call(llm_venv, [
f"{llama_example_root}/../run.py", "--max_output_len=50",
f"--tokenizer_dir={llama_v2_tokenizer_model_root}",
f"--engine_dir={engine_dir}", f"--num_beams=1"
])
@skip_post_blackwell
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v3-8b-instruct-hf'],
indirect=True)
def test_llm_llama_v3_int8_gptq_1gpu_summary(data_type, llama_example_root,
llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
model_name = 'llama_v3-int8_gptq'
llama_gptq_safetensors_root = os.path.join(
llama_model_root, "../..", "int8-quantized-gptq",
"llama-3-8b-8bit-gs64-gptq.safetensors")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
quant_ckpt_path=llama_gptq_safetensors_root)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}", "--remove_input_padding=enable",
f"--max_beam_width={num_beams}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type=fp16",
f"--engine_dir={engine_dir}", "--check_accuracy",
"--tensorrt_llm_rouge1_threshold=24", f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['float16'])
@pytest.mark.parametrize("llama_model_root", ['TinyLlama-1.1B-Chat-v1.0'],
indirect=True)
def test_llm_llama_4gpu_pp4(data_type, llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, num_beams):
model_name = os.path.basename(llama_model_root)
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=1,
pp_size=4,
)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin={data_type}",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 12,
}[num_beams]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "4", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_fp8_2gpu_pp2(
data_type, llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, qcache_dir_without_install_package, num_beams):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
tp_size=1,
pp_size=2,
kv_cache_dtype="fp8",
calib_size=64)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
"--use_paged_context_fmha=disable",
"--use_fp8_context_fmha=disable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 13,
2: 19,
4: 19,
}[num_beams]
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_v2_tokenizer_model_root}",
"--data_type=fp16", f"--engine_dir={engine_dir}",
f"--tensorrt_llm_rouge1_threshold={tensorrt_llm_rouge1_threshold}",
"--check_accuracy", f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_gather_logits_2gpu_pp2(llama_example_root,
llama_model_root,
llm_datasets_root, llm_rouge_root,
llama_v2_tokenizer_model_root,
llm_venv, cmodel_dir, engine_dir):
# Check the availability of gather all token logits when pp>1
model_name = 'llama_v2'
data_type = 'float16'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
pp_size=2)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", "--max_batch_size=2",
"--max_beam_width=1", f"--gemm_plugin={data_type}",
f"--gpt_attention_plugin={data_type}", "--gather_context_logits"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_v2_tokenizer_model_root}",
"--data_type=fp16", f"--engine_dir={engine_dir}", "--eval_ppl",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", [
'llama-v2-7b-hf', 'llama-v2-13b-hf', 'llama-v2-70b-hf', 'Llama-2-7B-AWQ',
'Llama-2-7B-GPTQ'
],
indirect=True)
def test_llm_llama_v2_awq_2gpu_summary(llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
qcache_dir_without_install_package):
if (num_beams > 2
or "70b" in llama_model_root) and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
if 'Llama-2-7B-AWQ' in llama_model_root or 'Llama-2-7B-GPTQ' in llama_model_root:
print("Converting model...")
ckpt_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=qcache_dir_without_install_package,
model="llama_v2",
model_path=llama_model_root,
data_type="auto",
tp_size=2,
pp_size=1)
else:
print("Quantizing model...")
ckpt_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
calib_size=32)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_v2_tokenizer_model_root,
data_type="fp16",
engine_dir=engine_dir,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@skip_post_blackwell # AutoQ contains AWQ int4 recipe, which is not supported on Blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_v3_1_autoq_1gpu_mmlu(llama_example_root, llama_model_root,
llm_datasets_root, mmlu_dataset_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
quantize_dir=qcache_dir_without_install_package,
tp_size=1,
calib_size=4,
batch_size=4,
autoq_format='int4_awq,fp8,w4a8_awq',
auto_quantize_bits=5.8)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--max_batch_size=8",
"--max_input_len=4000",
"--max_seq_len=4096",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={63.8}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-70b'], indirect=True)
def test_llm_llama_v3_1_autoq_2gpu_mmlu(llama_example_root, llama_model_root,
llm_datasets_root, mmlu_dataset_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
calib_size=4,
batch_size=4,
autoq_format='int4_awq,fp8,w4a8_awq',
auto_quantize_bits=5.8)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--max_batch_size=8",
"--max_input_len=4000",
"--max_seq_len=4096",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run MMLU test")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={77.58}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("num_beams", [4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b', 'llama-30b'],
indirect=True)
def test_llm_llama_v1_2gpu_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, num_beams):
model_name = 'llama_v1_2gpu'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=2,
tp_size=2,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--check_accuracy", f"--hf_model_dir={llama_model_root}",
f"--engine_dir={engine_dir}", f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}", f"--rouge_dir={llm_rouge_root}"
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_host_memory(480000)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-v2-70b'], indirect=True)
def test_llm_llama_v2_8gpu_summary(llama_example_root, llama_model_root,
llama_v2_tokenizer_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
cmodel_dir, engine_dir, num_beams):
"run llamav2 70 test on 8 gpus"
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
model_name = 'llama_v2-meta-ckpt-70b'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=8,
workers=8,
tp_size=8,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
"--workers=8",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_v2_tokenizer_model_root,
data_type="fp16",
engine_dir=engine_dir,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "8", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("num_beams", [2, 5],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu_paged_kv_cache(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams):
"RCCA https://nvbugs/4283902"
print("Build engines...")
model_name = 'llama_v1-paged_kv_cache'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--max_beam_width={num_beams}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
"--max_batch_size=2",
"--tokens_per_block=16",
"--paged_kv_cache=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type", "fp16",
"--check_accuracy", f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}", f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_v1_4gpu_paged_kv_cache(llama_example_root, llama_model_root,
llm_venv, cmodel_dir, engine_dir):
"""
RCCA https://nvbugs/4251782
RCCA https://nvbugs/4755248
"""
model_name = 'llama_v1-4gpu_paged_kv_cache'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=4,
tp_size=4,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
"--max_batch_size=128",
"--max_input_len=512",
"--max_seq_len=1024",
"--max_beam_width=1",
"--paged_kv_cache=enable",
]
print("Build engines...")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=10",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--max_attention_window_size=128",
"--kv_cache_enable_block_reuse",
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "4", "--allow-run-as-root"],
run_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu_kv_cache_reuse_with_prompt_table(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir):
max_prompt_embedding_table_size = 16
hidden_size = 4096
vocab_size = 32000
input_len = 42
print("Convert checkpoint...")
model_name = 'llama_v1-kv_cache_reuse_w_prompt_table'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}/engines", "--gpt_attention_plugin=float16",
"--gemm_plugin=float16", "--remove_input_padding=enable",
"--max_batch_size=1",
f"--tokens_per_block={max_prompt_embedding_table_size}",
"--paged_kv_cache=enable", "--use_paged_context_fmha=enable",
f"--max_prompt_embedding_table_size={max_prompt_embedding_table_size}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# generate input ids, dummy prompt table and extra ids
input_file = f"{engine_dir}/input_ids.npy"
prompt_table_path = f"{engine_dir}/prompt_table.npy"
extra_ids_file = f"{engine_dir}/extra_ids.npy"
# run the script inside venv since it depends on numpy
venv_script = f'''
import numpy as np
input_ids = [[
i + {vocab_size} if i < {max_prompt_embedding_table_size} else i + 1000
for i in range({input_len})
]]
np.save("{input_file}", np.array(input_ids))
prompt_table_shape = (1, {max_prompt_embedding_table_size}, {hidden_size})
prompt_table = np.random.rand(*prompt_table_shape).astype(np.float16)
np.save("{prompt_table_path}", prompt_table)
extra_ids = [[
1 if i < {max_prompt_embedding_table_size} else 0
for i in range({input_len})
]]
np.save("{extra_ids_file}", np.array(extra_ids))
'''
llm_venv.run(venv_script)
# add --run_profiling to run the request for multiple times
print("Run inference")
run_cmd = [
f"{llama_example_root}/../../../run.py", "--max_output_len=10",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}/engines", f"--input_file={input_file}",
f"--prompt_table_path={prompt_table_path}",
"--kv_cache_enable_block_reuse",
f"--input_token_extra_ids_file={extra_ids_file}", "--run_profiling"
]
venv_check_output(llm_venv, run_cmd)
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize(
"fp8_context_fmha_xqa",
["enable_fp8_context_fmha_xqa", "disable_fp8_context_fmha_xqa"])
@pytest.mark.parametrize("reduce_fusion",
["enable_reduce_fusion", "disable_reduce_fusion"])
@pytest.mark.parametrize("llama_model_root",
['llama-7b', 'llama-v2-13b-hf', 'llama-v2-70b-hf'],
indirect=True)
def test_llm_llama_2gpu_fp8_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
engine_dir,
qcache_dir_without_install_package,
fp8_context_fmha_xqa, reduce_fusion):
"RCCA https://nvbugs/4348560"
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
"--remove_input_padding=enable",
"--workers=2",
"--max_beam_width=4",
]
if "enable" in fp8_context_fmha_xqa:
build_cmd.extend([
"--use_fp8_context_fmha=enable", "--use_paged_context_fmha=enable"
])
if "enable" in reduce_fusion:
build_cmd.extend(["--reduce_fusion=enable"])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type='fp16',
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root,
num_beams=4)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_1gpu_batched_beam_search(llama_example_root,
llama_model_root, llm_datasets_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
"llama run batched beam search on 1 gpu"
qmodel_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
dtype="float16",
quantize_dir=qcache_dir_without_install_package)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--paged_kv_cache=enable",
"--max_batch_size=4",
"--max_beam_width=4",
"--max_input_len=512",
"--max_seq_len=532",
"--gemm_plugin=float16",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
num_beams = 4
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=20",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
f"--num_beams={num_beams}",
"--input_text",
"Miguel de Cervantes wrote",
"Diego Velazquez painted his most famous painting,",
"Miguel de Cervantes wrote",
"Diego Velazquez painted his most famous painting,",
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
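    # The four prompts are two distinct prompts submitted twice, so the beams
    # for prompt idx and prompt idx + 2 must be identical.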
for idx in [0, 1]:
assert all(
[
a == b for (a, b) in zip(
output[num_beams * idx:num_beams * idx +
num_beams], output[num_beams * (idx + 2):num_beams *
(idx + 2) + num_beams])
]
), f"outputs {idx} and {idx+2} don't match: {output[num_beams * idx:num_beams * idx + num_beams]}, {output[num_beams * (idx + 2):num_beams * (idx + 2) + num_beams]}"
expected_output = [
["Don Quixote in 1605. The book is considered the first modern novel."],
[
"Las Meninas, in 1656. The painting is a portrait of King Philip IV",
"\"Las Meninas\" in 1656. The painting depicts King Philip"
],
]
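    # Each beam only has to be similar to one of the reference continuations
    # for its prompt, since beam search may return either phrasing.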
for idx, result in enumerate(output):
assert any(
[
similar(item, result)
for item in expected_output[(idx // num_beams) % 2]
]
), f"output {result} is not similar to any of {expected_output[(idx // num_beams) % 2]}"
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("mmlu_test", [True, False],
ids=["enable_mmlu_test", "disable_mmlu_test"])
@pytest.mark.parametrize(
"fp8_fmha",
["enable_fp8_fmha", "enable_fp8_paged_fmha", "disable_fp8_fmha"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_fp8_summary_and_mmlu(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
mmlu_dataset_root, mmlu_test, llm_venv, engine_dir,
qcache_dir_without_install_package, fp8_fmha):
"run Llama v2 fp8 quantization tests"
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
use_fp8_context_fmha = "enable" if fp8_fmha in [
"enable_fp8_fmha", "enable_fp8_paged_fmha"
] else "disable"
use_paged_context_fmha = "enable" if fp8_fmha == "enable_fp8_paged_fmha" else "disable"
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--use_fp8_context_fmha={use_fp8_context_fmha}",
f"--use_paged_context_fmha={use_paged_context_fmha}",
"--remove_input_padding=enable",
"--max_batch_size=4",
"--max_input_len=2046",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=32",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
print(output)
print("Run Summarization test with batch size = 1")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_model_root}",
"--data_type",
"fp16",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
# rouge1 threshold reduced from 15 (default) to 14 since we now enable fused mlp by default and the scales of two linear layers can be different
"--tensorrt_llm_rouge1_threshold=14",
]
venv_check_call(llm_venv, summary_cmd)
if mmlu_test:
print("Run MMLU test")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy",
f"--accuracy_threshold={45.0}"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_fp8_gemv(llama_example_root, llama_model_root,
llm_datasets_root, llm_venv, engine_dir,
qcache_dir_without_install_package):
"run Llama v2 fp8 quantization tests"
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin=fp8",
"--max_batch_size=4",
"--max_input_len=2048",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=32",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
print(output)
print("Run Summarization test with batch size = 1")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_model_root}",
"--data_type",
"fp16",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--dataset_dir={llm_datasets_root}",
"--tensorrt_llm_rouge1_threshold=14.5",
]
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("gemm_swiglu_plugin", ["fp8"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_gemm_swiglu(llama_example_root, llama_model_root,
llm_datasets_root, llm_venv, engine_dir,
qcache_dir_without_install_package,
gemm_swiglu_plugin, data_type):
"run Llama v2 gemm_swiglu_plugin tests"
if gemm_swiglu_plugin == "fp8":
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
pytest.skip(f"gemm_swiglu_plugin only supports fp8 now")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin=fp8",
f"--gemm_swiglu_plugin={gemm_swiglu_plugin}",
"--remove_input_padding=enable",
"--max_batch_size=4",
"--max_input_len=2048",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
# run.py test.
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=32",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
"--input_text",
INPUT_TEXT_1,
INPUT_TEXT_2,
INPUT_TEXT_2,
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
print(output)
print("Run Summarization test")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_model_root}",
"--data_type",
"fp16",
f"--engine_dir={engine_dir}",
"--check_accuracy",
"--max_ite=40",
f"--dataset_dir={llm_datasets_root}",
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize(
"data_type", [
'float16', 'fp8',
pytest.param('sq_ootb', marks=skip_post_blackwell),
pytest.param('awq', marks=skip_post_blackwell),
pytest.param('int8_wo', marks=skip_post_blackwell)
],
ids=['base_fp16', 'base_fp8', 'base_sq_ootb', 'base_awq', 'base_int8_wo'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-13b-hf'], indirect=True)
@pytest.mark.parametrize("llm_lora_model_root", ['chinese-llama-2-lora-13b'],
indirect=True)
def test_llm_llama_v2_lora_1gpu(data_type, lora_data_type, llama_example_root,
llama_model_root, llm_datasets_root, llm_venv,
cmodel_dir, engine_dir, llm_lora_model_root,
qcache_dir_without_install_package):
"run llama lora test on 1gpu"
print("Build engines...")
model_name = 'llama_v2-lora'
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
elif data_type == 'sq_ootb':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'awq':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
awq_block_size=128,
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'int8_wo':
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
use_weight_only=True,
weight_only_precision='int8')
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--lora_plugin=auto",
"--gemm_plugin=auto",
f"--lora_dir={llm_lora_model_root}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
ref_1 = [
29871, 32160, 33657, 33281, 30214, 30672, 30780, 33820, 32024, 30214,
32083, 33820, 30755, 37432, 32030, 30313, 30214, 30417, 30210, 30505,
34870, 30214, 30417, 30210, 30505, 31656, 39298, 30214, 32063, 30210
]
ref_2 = [
29871, 32160, 33657, 33281, 30214, 30672, 30780, 33820, 32024, 30214,
33759, 41026, 31381, 30769, 31811, 31900, 30214, 36869, 31900, 36869,
31900, 30214, 36869, 31900, 36869, 31900, 31900, 31900, 31900, 31900
]
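    # ref_1 / ref_2 are the expected token ids with and without the LoRA
    # adapter; the exact-match checks below only apply to the float16 base.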
input_text = "今天天气很好,我到公园的时候,"
# TODO change to chinese evaluation task in the future
base_run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=20",
f"--input_text={input_text}",
f"--tokenizer_dir={llm_lora_model_root}",
f"--engine_dir={engine_dir}",
"--no_add_special_tokens",
]
for use_py_session in [True, False]:
if use_py_session:
print("Run inference with Python runtime...")
else:
print("Run inference with C++ runtime...")
print(f"Run inference with lora id 0...")
run_cmd = copy.deepcopy(base_run_cmd)
run_cmd.extend([
"--lora_task_uids=0",
f"--output_csv={llm_venv.get_working_directory()}/use_lora.csv"
])
if use_py_session:
run_cmd.append("--use_py_session")
venv_check_call(llm_venv, run_cmd)
with open(f"{llm_venv.get_working_directory()}/use_lora.csv") as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
assert ref_1 == predict or data_type != "float16"
print(f"Run inference with lora id -1...")
run_cmd = copy.deepcopy(base_run_cmd)
run_cmd.extend([
"--lora_task_uids=-1",
f"--output_csv={llm_venv.get_working_directory()}/no_lora.csv"
])
if use_py_session:
run_cmd.append("--use_py_session")
venv_check_call(llm_venv, run_cmd)
with open(f"{llm_venv.get_working_directory()}/no_lora.csv") as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
assert ref_2 == predict or data_type != "float16"
@pytest.mark.parametrize(
"data_type", ['float16', 'fp8', 'sq_ootb', 'awq', 'int8_wo'],
ids=['base_fp16', 'base_fp8', 'base_sq_ootb', 'base_awq', 'base_int8_wo'])
@pytest.mark.parametrize("llama_model_root", ['llama-v3-8b-hf'], indirect=True)
@pytest.mark.parametrize("llm_dora_model_root",
['commonsense-llama-v3-8b-dora-r32'],
indirect=True)
def test_llm_llama_v3_dora_1gpu(data_type, llama_example_root, llama_model_root,
llm_dora_model_root, llm_datasets_root,
llm_venv, cmodel_dir, engine_dir,
qcache_dir_without_install_package):
"run llama dora test on 1gpu"
print("Build engines...")
model_name = 'llama_v3-dora'
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
elif data_type == 'sq_ootb':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'awq':
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
awq_block_size=128,
quantize_dir=qcache_dir_without_install_package,
calib_size=32)
elif data_type == 'int8_wo':
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
use_weight_only=True,
weight_only_precision='int8')
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
# normalize dora magnitude
dora_weights = f"{llm_venv.get_working_directory()}/dora_weights"
normalize_cmd = [
f"{llama_example_root}/../../../dora/normalize_weights.py", "-i",
llm_dora_model_root, "-b", llama_model_root, "-o", dora_weights
]
venv_check_call(llm_venv, normalize_cmd)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--lora_plugin=auto",
"--dora_plugin=enable",
"--remove_input_padding=enable", # otherwise no cpp runtime
"--kv_cache_type=paged", # otherwise no cpp runtime
"--gemm_plugin=auto",
f"--lora_dir={dora_weights}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
input_tokens = [
128000, 39314, 374, 459, 7754, 430, 16964, 264, 3465, 11, 35526, 449,
459, 1988, 430, 5825, 4726, 2317, 13, 9842, 264, 2077, 430, 36001,
45695, 279, 1715, 382, 394, 17010, 30151, 512, 394, 5321, 5268, 279,
4495, 4320, 311, 279, 3488, 25, 578, 842, 1121, 304, 279, 1920, 315,
7397, 74767, 374, 279, 5788, 315, 13465, 323, 24463, 13, 16299, 3094,
17738, 279, 7314, 315, 7397, 74767, 31931, 16533, 16, 25, 36424, 4907,
374, 42101, 1555, 279, 20282, 13, 22559, 17, 25, 8828, 4907, 374, 16489,
311, 11742, 4907, 13, 22559, 18, 25, 92479, 5237, 25734, 304, 279,
16312, 41255, 3177, 4907, 13, 22559, 19, 25, 8219, 4238, 374, 16489,
1139, 37833, 5237, 25734, 4286, 16533, 3645, 25, 4320, 16, 14, 9399, 17,
14, 9399, 18, 14, 9399, 19, 271, 394, 17010, 5688, 512, 72348, 394,
17010, 6075, 1473
]
out_ref = [
128000, 39314, 374, 459, 7754, 430, 16964, 264, 3465, 11, 35526, 449,
459, 1988, 430, 5825, 4726, 2317, 13, 9842, 264, 2077, 430, 36001,
45695, 279, 1715, 382, 394, 17010, 30151, 512, 394, 5321, 5268, 279,
4495, 4320, 311, 279, 3488, 25, 578, 842, 1121, 304, 279, 1920, 315,
7397, 74767, 374, 279, 5788, 315, 13465, 323, 24463, 13, 16299, 3094,
17738, 279, 7314, 315, 7397, 74767, 31931, 16533, 16, 25, 36424, 4907,
374, 42101, 1555, 279, 20282, 13, 22559, 17, 25, 8828, 4907, 374, 16489,
311, 11742, 4907, 13, 22559, 18, 25, 92479, 5237, 25734, 304, 279,
16312, 41255, 3177, 4907, 13, 22559, 19, 25, 8219, 4238, 374, 16489,
1139, 37833, 5237, 25734, 4286, 16533, 3645, 25, 4320, 16, 14, 9399, 17,
14, 9399, 18, 14, 9399, 19, 271, 394, 17010, 5688, 512, 72348, 394,
17010, 6075, 1473, 394, 279, 4495, 4320, 374, 4320, 18, 128001, 128001,
128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001, 128001,
128001, 128001, 128001, 128001, 128001
]
in_csv = f"{llm_venv.get_working_directory()}/input.csv"
out_csv = f"{llm_venv.get_working_directory()}/output.csv"
with open(in_csv, "w") as f:
writer = csv.writer(f)
writer.writerow(input_tokens)
    base_run_cmd = [
        f"{llama_example_root}/../../../run.py",
        f"--input_file={in_csv}", f"--tokenizer_dir={llama_model_root}",
        f"--engine_dir={engine_dir}", "--max_output_len=32"
    ]
for use_py_session in [True, False]:
if use_py_session:
print("Run inference with Python runtime...")
else:
print("Run inference with C++ runtime...")
print(f"Run inference with lora id 0...")
run_cmd = copy.deepcopy(base_run_cmd)
run_cmd.extend(["--lora_task_uids=0", f"--output_csv={out_csv}"])
if use_py_session:
run_cmd.append("--use_py_session")
venv_check_call(llm_venv, run_cmd)
with open(out_csv) as f:
predict = csv.reader(f)
predict = next(predict)
predict = [int(p) for p in predict]
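        # Exact token-level comparison is only meaningful for the float16
        # baseline; quantized variants may deviate from the reference output.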
assert out_ref == predict or data_type != "float16"
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"tp_pp_size", [(8, 1), (4, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize("test_case", ["pg64317"], indirect=True)
def test_llm_llama_long_alpaca_8gpu_summary(llama_example_root,
llm_long_alpaca_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams, tp_pp_size, test_case):
"llama test for long alpaca"
tp_size, pp_size = tp_pp_size
world_size = 8
assert tp_size * pp_size == world_size, \
f'tp_size({tp_size}) x pp_size({pp_size}) != 8'
model_name = 'llama_long_alpaca'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llm_long_alpaca_model_root,
gpus=world_size,
tp_size=tp_size,
pp_size=pp_size,
data_type="bfloat16")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=bfloat16",
"--remove_input_padding=enable",
"--gemm_plugin=bfloat16",
f"--max_beam_width={num_beams}",
"--max_input_len=32768",
"--max_seq_len=49152",
"--max_batch_size=1",
"--max_num_tokens=32768",
]
print("Build engines...")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
max_output_len = test_case["max_output_len"]
run_cmd = [
f"{llama_example_root}/../../../run.py",
f"--max_output_len={max_output_len}",
f"--input_file={test_case['input_file']}", f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
f"--tokenizer_dir={llm_long_alpaca_model_root}",
"--max_input_length=32768"
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
run_cmd)
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llm_long_alpaca_model_root,
max_input_length=16384,
output_len=max_output_len,
data_type="fp16",
num_beams=num_beams,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_v1_1gpu_streaming_llm(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, cmodel_dir, engine_dir,
num_beams, gemm_plugin):
"Run LLaMa with StreamingLLM"
model_name = 'llama_v1-streamingllm'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
"--streamingllm=enable",
"--max_batch_size=256",
]
if gemm_plugin:
build_cmd.append("--gemm_plugin=float16")
else:
build_cmd.append("--gemm_plugin=disable")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
max_attention_window_size=2048,
sink_token_length=4,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"gpt_attention_plugin", [True, False],
ids=["enable_attention_plugin", "disable_attention_plugin"])
@pytest.mark.parametrize("gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize(
"context_fmha_type",
["enable_context_fmha", "enable_with_fp32_acc", "disable_context_fmha"])
@pytest.mark.parametrize("code_llama_model_root", ['CodeLlama-7b-Instruct'],
indirect=True)
def test_llm_llama_code_llama_1gpu_summary(
llama_example_root, code_llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir, engine_dir, num_beams,
gemm_plugin, gpt_attention_plugin, context_fmha_type):
"Run CodeLlaMa on single gpu"
model_name = 'code_llama_1gpu'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=code_llama_model_root,
data_type="float16",
gpus=1,
tp_size=1,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--max_batch_size={1}",
f"--max_input_len={1024}",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--max_beam_width={num_beams}",
f"--max_seq_len={8192}",
]
if gpt_attention_plugin:
build_cmd.extend(
["--remove_input_padding=enable", "--gpt_attention_plugin=float16"])
else:
build_cmd.append("--gpt_attention_plugin=disable")
build_cmd.append("--remove_input_padding=disable")
build_cmd.append("--paged_kv_cache=disable")
if gemm_plugin:
build_cmd.append("--gemm_plugin=float16")
else:
build_cmd.append("--gemm_plugin=disable")
if context_fmha_type == "enable_context_fmha":
build_cmd.append("--context_fmha=enable")
elif context_fmha_type == "disable_context_fmha":
build_cmd.append("--context_fmha=disable")
print("Build engines...")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=40",
f"--tokenizer_dir={code_llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
"--input_text='In Bash, how do I list all text files?'",
]
if context_fmha_type == "enable_with_fp32_acc":
run_cmd.append("--enable_context_fmha_fp32_acc")
venv_check_call(llm_venv, run_cmd)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=code_llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=17,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell_ultra
@pytest.mark.timeout(7200)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"tp_pp_size", [(4, 1), (2, 2), (8, 1), (4, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize("code_llama_model_root",
['CodeLlama-34b-Instruct', 'CodeLlama-70b-hf'],
indirect=True)
def test_llm_llama_code_llama_multi_gpus_summary(llama_example_root,
code_llama_model_root,
llm_datasets_root,
llm_rouge_root, llm_venv,
cmodel_dir, engine_dir,
num_beams, tp_pp_size):
"Run CodeLlaMa on 4 gpus"
tp_size, pp_size = tp_pp_size
world_size = tp_size * pp_size
if get_device_count() < world_size:
pytest.skip(f"devices are less than {world_size}.")
model_name = 'code_llama'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=code_llama_model_root,
data_type="float16",
gpus=world_size,
tp_size=tp_size,
pp_size=pp_size)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
f"--workers={world_size}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=160",
f"--tokenizer_dir={code_llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
"--input_text='In python, write a function for binary searching an element in an integer array.'",
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
run_cmd)
print("Run inference")
tensorrt_llm_rouge1_threshold = 18 if "70b" in code_llama_model_root else 22
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=code_llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("per_token_channel", [True, False],
ids=["enable_ptpc", "disable_ptpc"])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
@pytest.mark.parametrize("data_type", ["float16", "bfloat16"])
def test_llm_llama_smooth_quant_1gpu_summary(llama_example_root,
llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
per_token_channel, cmodel_dir,
data_type):
"Run smooth quant on single gpu"
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama-smooth",
model_path=llama_model_root,
gpus=1,
smoothquant=0.55,
per_token=per_token_channel,
per_channel=per_token_channel,
calib_dataset=f"{llm_datasets_root}/ccdv/cnn_dailymail",
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
"--remove_input_padding=enable",
f"--gemm_plugin={data_type}",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
rouge1_threshold = 17
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=rouge1_threshold,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("use_weight_only", [True, False],
ids=['enable_weight_only', 'disable_weight_only'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_int8_kv_1gpu_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
use_weight_only,
qcache_dir_without_install_package):
print("Quantizing model...")
qformat = "int8_wo" if use_weight_only else "full_prec"
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat=qformat,
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="int8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=19,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_int8_sq_ootb_1gpu_summary(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams, qcache_dir_without_install_package):
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="int8")
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}", "--gpt_attention_plugin=float16",
"--remove_input_padding=enable", "--gemm_plugin=disable",
f"--max_beam_width={num_beams}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
        # adjusted to 15.2 when using TRT build optimization level 3
        tensorrt_llm_rouge1_threshold=15.2,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_int8sq_2gpu_tp2(data_type, llama_example_root,
llama_model_root,
llama_v2_tokenizer_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, num_beams,
qcache_dir_without_install_package):
if num_beams > 2 and get_device_memory() < 80000:
pytest.skip("device memory is insufficient.")
# Quantize HF llama checkpoint into int8_sq format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="int8_sq",
quantize_dir=qcache_dir_without_install_package,
tp_size=2,
pp_size=1,
calib_size=32)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
"--hf_model_dir",
f"{llama_v2_tokenizer_model_root}",
"--data_type=fp16",
f"--engine_dir={engine_dir}",
"--tensorrt_llm_rouge1_threshold=15",
"--check_accuracy",
f"--num_beams={num_beams}",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
]
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("weight_only_precision", ["int4", "int8"])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_wo_1gpu_summary(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root, llm_venv,
engine_dir, num_beams, cmodel_dir,
weight_only_precision):
skip_fp8_pre_ada(use_fp8=True)
llm_venv.get_working_directory()
model_name = os.path.basename(llama_example_root)
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type="float16",
use_weight_only=True,
weight_only_precision=weight_only_precision,
gpus=1,
tp_size=1,
pp_size=1)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=20.2 if
weight_only_precision == 'int8' else 16,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
def test_llm_llama_int8_kv_awq_1gpu_summary(llama_example_root,
llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv,
engine_dir, num_beams,
qcache_dir_without_install_package):
"Run int8 kv cache on single gpu"
print("Quantizing model...")
ckpt_dir = quantize_data(llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="int4_awq",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="int8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--gemm_plugin=float16",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=15,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("data_type", ['float16', 'fp8'],
ids=['base_fp16', 'base_fp8'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b'], indirect=True)
@pytest.mark.parametrize("llm_lora_model_root",
[("luotuo-lora-7b-0.1", "Japanese-Alpaca-LoRA-7b-v0")],
ids=["luotuo_japan"],
indirect=True)
def test_llm_llama_v1_multiple_lora_1gpu(data_type, lora_data_type,
llama_example_root, llama_model_root,
llm_datasets_root, llm_venv,
cmodel_dir, engine_dir,
llm_lora_model_root,
qcache_dir_without_install_package):
"run llama with multi lora on 1gpu"
first_lora, second_lora = llm_lora_model_root.split(",")
print("Build engines...")
if data_type == 'fp8':
skip_fp8_pre_ada(use_fp8=True)
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama-lora",
model_path=llama_model_root,
gpus=1,
tp_size=1,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--gemm_plugin=auto",
"--lora_plugin=auto",
"--max_batch_size=128",
"--max_input_len=512",
"--max_seq_len=562",
"--lora_dir",
f"{first_lora}",
f"{second_lora}",
"--max_lora_rank=8",
"--lora_target_modules",
"attn_q",
"attn_k",
"attn_v",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
base_run_cmd = [
f"{llama_example_root}/../../../run.py",
"--input_text",
"美国的首都在哪里? \n答案:",
"美国的首都在哪里? \n答案:",
"美国的首都在哪里? \n答案:",
"アメリカ合衆国の首都はどこですか? \n答え:",
"アメリカ合衆国の首都はどこですか? \n答え:",
"アメリカ合衆国の首都はどこですか? \n答え:",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--lora_task_uids",
"-1",
"0",
"1",
"-1",
"0",
"1",
"--top_p=0.5",
"--top_k=0",
"--random_seed=0",
"--max_output_len=10",
]
for use_py_session in [True, False]:
run_cmd = copy.deepcopy(base_run_cmd)
if use_py_session:
print("Run inference with Python runtime...")
run_cmd.append("--use_py_session")
else:
print("Run inference with C++ runtime...")
# TODO: add step to check result
venv_check_call(llm_venv, run_cmd)
@pytest.mark.timeout(7200)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize(
"qformat",
["fp8", pytest.param("int4_awq", marks=skip_post_blackwell)])
@pytest.mark.parametrize(
"tp_pp_size", [(4, 1), (2, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize("code_llama_model_root",
['CodeLlama-34b-Instruct', 'CodeLlama-70b-hf'],
indirect=True)
def test_llm_llama_code_llama_quantization_4gpus_summary(
llama_example_root, code_llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, engine_dir, num_beams, tp_pp_size,
qcache_dir_without_install_package, qformat):
"Run CodeLlaMa on 4 gpus"
skip_fp8_pre_ada(use_fp8=qformat == "fp8")
tp_size, pp_size = tp_pp_size
world_size = tp_size * pp_size
kv_cache_dtype = "fp8" if qformat == "fp8" else "int8"
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=code_llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat=qformat,
quantize_dir=qcache_dir_without_install_package,
tp_size=tp_size,
pp_size=pp_size,
calib_size=32,
kv_cache_dtype=kv_cache_dtype)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--remove_input_padding=enable",
"--gemm_plugin=float16",
"--context_fmha=enable",
f"--max_beam_width={num_beams}",
f"--workers={world_size}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=code_llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=20,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root,
max_ite=100)
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
summary_cmd)
@pytest.mark.parametrize("llama_model_root",
['Llama-3-8B-Instruct-Gradient-1048k'],
indirect=True)
@pytest.mark.parametrize("dataset_name", ["SlimPajama-6B", "passkey"])
def test_llm_llama_v3_8b_1048k_long_context_ppl(llama_example_root,
llama_model_root, llm_venv,
engine_dir, cmodel_dir,
llm_datasets_root,
dataset_name):
"Build & run llama-3-8B-1048k on long context ppl."
if dataset_name == "SlimPajama-6B" and get_device_memory() < 50000:
pytest.skip("GPU memory is insufficient.")
model_name = os.path.basename(llama_model_root)
dtype = 'float16'
max_input_len = 16384
max_output_len = 50
if dataset_name == "passkey":
print("Generate evaluation dataset for passkey.")
gen_cmd = [
f"{llama_example_root}/../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey", "--test_level=4"
]
venv_check_call(llm_venv, gen_cmd)
max_input_len = 128 * 1024
print("Converting checkpoint...")
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=dtype)
print("Building engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}",
f"--max_batch_size={1}",
f"--max_input_len={max_input_len}",
f"--max_seq_len={max_output_len+max_input_len}",
f"--gemm_plugin={dtype}",
"--max_num_tokens=4096",
"--use_paged_context_fmha=enable",
]
if dataset_name == "SlimPajama-6B":
build_cmd.append("--gather_context_logits")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if dataset_name == "passkey":
print("Run passkey evaluation...")
summary_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
f"--max_input_length={max_input_len}",
f"--max_tokens_in_paged_kv_cache={int(max_input_len * 1.2)}",
"--task=passkey",
"--stop_idx=20",
"--enable_chunked_context",
]
else:
print("Run context ppl evaluation...")
summary_cmd = generate_summary_cmd(
llama_example_root,
tokenizer_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
dataset_dir=f"{llm_datasets_root}/{dataset_name}",
eval_task="eval_context_ppl",
max_input_len=max_input_len,
batch_size=1,
max_ite=200, # the samples will be filtered by min_input_length
tensorrt_llm_ppl_threshold=7.8,
max_tokens_in_paged_kv_cache=int(max_input_len * 1.2),
enable_chunked_context=True,
min_input_length=10000)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(8)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("llama_model_root", [
'Llama-3-8B-Instruct-Gradient-1048k', 'Llama-3-70B-Instruct-Gradient-1048k'
],
indirect=True)
@pytest.mark.timeout(10800 if get_sm_version() < 89 else 3600)
def test_llm_llama_v3_1m_long_context_8gpus(llama_example_root,
llama_model_root, llm_venv,
engine_dir, cmodel_dir,
timeout_manager):
"Build & run llama-3-8B-1048k on long context."
model_name = os.path.basename(llama_model_root)
dtype = 'float16'
tp_size, pp_size = 8, 1
world_size = tp_size * pp_size
max_seq_len = 1048576
max_batch_size = 256
# Generate evaluation dataset with timeout management
print("Generate evaluation dataset for passkey.")
with timeout_manager.timed_operation("gen"):
gen_cmd = [
f"{llama_example_root}/../../../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey",
"--test_level=7",
]
venv_check_call(llm_venv,
gen_cmd,
timeout=timeout_manager.remaining_timeout)
# Convert checkpoint with timeout management
print("Converting checkpoint...")
with timeout_manager.timed_operation("convert"):
ckpt_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=dtype,
tp_size=tp_size,
pp_size=pp_size,
timeout=timeout_manager.remaining_timeout)
# Build engines with timeout management
print("Building engines...")
with timeout_manager.timed_operation("build"):
build_cmd = [
"trtllm-build", f"--checkpoint_dir={ckpt_dir}",
f"--output_dir={engine_dir}", f"--gemm_plugin={dtype}",
f"--workers={world_size}", f"--max_seq_len={max_seq_len}",
"--max_num_tokens=4096", "--use_paged_context_fmha=enable",
f'--max_batch_size={max_batch_size}'
]
check_call(" ".join(build_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
# Run passkey evaluation with timeout management
print("Run passkey evaluation...")
with timeout_manager.timed_operation("eval"):
eval_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
f"--max_input_length={max_seq_len-10}",
"--max_tokens_in_paged_kv_cache=1100000",
"--task=passkey",
"--stop_idx=10",
"--enable_chunked_context",
"--tensorrt_llm_accuracy_threshold=0.9",
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
eval_cmd,
timeout=timeout_manager.remaining_timeout)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(8)
@pytest.mark.parametrize("test_type", ['build', 'infer'])
@pytest.mark.parametrize("llama_model_root", ['llama-7b', 'llama-v2-70b-hf'],
indirect=True)
def test_llm_llama_2nodes_8gpus(test_type, llama_example_root, llama_model_root,
llm_datasets_root, llm_venv, cmodel_dir):
"""
Run test on cluster.
1. run build test on 1 node to save engine tp*pp > 8.
2. run infer test on 1/2 nodes.
"""
data_type = "float16"
num_beams = 4
tp_size, pp_size = 8, 2
world_size = tp_size * pp_size
model_name = os.path.basename(llama_model_root)
# engine dir will be saved for infer tests
engine_dir = os.path.join(llama_example_root, "engines", model_name,
data_type, f"{world_size}-gpu",
f"tp{tp_size}pp{pp_size}")
if test_type == "build":
print("Convert weight...")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=tp_size,
pp_size=pp_size)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin={data_type}",
f"--max_beam_width={num_beams}",
f"--workers={world_size}",
"--remove_input_padding=enable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
if test_type == "infer":
        assert exists(engine_dir), f"{engine_dir} does not exist."
print("Run inference...")
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
f"--num_beams={num_beams}",
]
venv_check_call(llm_venv, run_cmd)
print("Run summarize...")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
num_beams=num_beams,
dataset_dir=llm_datasets_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("enable_mha_plugin", [True, False],
ids=["plugin", "ootb"])
@pytest.mark.parametrize("max_gpu_percent", [0.05, 1.0])
@pytest.mark.parametrize("llama_model_root",
['llama-v2-7b-hf', 'llama-v2-70b-hf'],
indirect=True)
def test_llm_llama_v2_1gpu_weight_streaming(llama_example_root,
llama_model_root, llm_datasets_root,
llm_venv, engine_dir,
max_gpu_percent, enable_mha_plugin):
"run llama v2 test with streaming"
if "70b" in llama_model_root and get_host_total_memory() < 480000:
pytest.skip("Host memory is less than 480G.")
print("Convert weights...")
model_name = 'llama2_weight_streaming'
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=engine_dir,
model=model_name,
model_path=llama_model_root,
load_by_shard=True,
load_model_on_cpu=True)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gemm_plugin=disable",
"--max_batch_size=2",
"--max_beam_width=2",
"--weight_streaming",
]
if enable_mha_plugin:
build_cmd += ["--gpt_attention_plugin=float16"]
else:
build_cmd += [
"--gpt_attention_plugin=disable", "--remove_input_padding=disable",
"--paged_kv_cache=disable"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
for gpu_weights_percent in [0, 0.05, 0.1, 0.2, 0.5, 0.9, 1]:
if gpu_weights_percent > max_gpu_percent:
break
print(f"Run inference with gpu_weights_percent={gpu_weights_percent}")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type", "fp16",
"--check_accuracy", f"--engine_dir={engine_dir}", "--num_beams=2",
f"--dataset_dir={llm_datasets_root}",
f"--gpu_weights_percent={gpu_weights_percent}", "--max_ite=1",
"--log_level=verbose"
]
if not enable_mha_plugin:
summary_cmd += ["--use_py_session"] # only py session support
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("deepseek_model_root",
['deepseek-coder-6.7b-instruct'],
indirect=True)
@pytest.mark.parametrize("test_case", ["ailab"], indirect=True)
def test_llm_llama_1gpu_streaming_llm(llama_example_root, deepseek_model_root,
llm_venv, cmodel_dir, engine_dir,
test_case):
"Run deep seek with StreamingLLM, RCCA https://nvbugs/4666604"
model_name = 'deepseek'
max_input_len = test_case['max_input_len']
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=deepseek_model_root)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=auto",
"--gemm_plugin=auto",
"--remove_input_padding=enable",
"--context_fmha=enable",
"--streamingllm=enable",
f"--max_input_len={max_input_len}",
f"--max_seq_len={max_input_len}",
"--max_batch_size=256",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run inference")
run_cmd = [
f"{llama_example_root}/../../../run.py",
f"--tokenizer_dir={deepseek_model_root}",
f"--engine_dir={engine_dir}",
f"--max_input_length={max_input_len}",
f"--input_file={test_case['input_file']}",
"--max_output_len=50",
"--max_attention_window_size=2048",
"--sink_token_length=4",
]
output = venv_check_output(llm_venv, run_cmd)
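    # The expected answer is the Chinese name of the Shanghai AI Laboratory.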
assert "上海人工智能实验室" in output, output
@pytest.mark.parametrize("fp8_quant", [
'disable_fp8',
pytest.param('enable_fp8', marks=skip_post_blackwell),
pytest.param('enable_fp8_meta_recipe', marks=skip_post_blackwell)
])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b', 'llama-3.2-1b'],
indirect=True)
def test_llm_llama_v3_1_1node_single_gpu(llama_example_root, llama_model_root,
llm_venv, cmodel_dir,
llm_datasets_root, llm_rouge_root,
engine_dir, fp8_quant):
"Run llama3.1 test on 1 gpu."
data_type = "bfloat16"
model_name = os.path.basename(llama_model_root)
use_fp8_rowwise = False
use_meta_fp8_rowwise_recipe = False
if fp8_quant == 'enable_fp8':
use_fp8_rowwise = True
elif fp8_quant == 'enable_fp8_meta_recipe':
use_fp8_rowwise = True
use_meta_fp8_rowwise_recipe = True
print("Convert weight...")
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=1,
pp_size=1,
use_fp8_rowwise=use_fp8_rowwise,
use_meta_fp8_rowwise_recipe=use_meta_fp8_rowwise_recipe)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--max_batch_size={8}",
f"--max_seq_len={2048}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
f"--hf_model_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--tensorrt_llm_rouge1_threshold={14}",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("llama_model_root", ['llama-3.2-1b'], indirect=True)
def test_llm_llama_v3_2_smoothquant_1node_single_gpu(
llama_example_root, llama_model_root, llm_venv, cmodel_dir,
llm_datasets_root, llm_rouge_root, engine_dir):
"Run llama3.2-1b smooth quant test on 1 gpu."
data_type = "bfloat16"
model_name = os.path.basename(llama_model_root)
print("Convert weight...")
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
gpus=1,
smoothquant=0.5,
per_token=True,
per_channel=True,
calib_dataset=f"{llm_datasets_root}/ccdv/cnn_dailymail",
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build", f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}", f"--max_batch_size={1}",
f"--max_seq_len={1024}"
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py",
"--test_trt_llm",
f"--hf_model_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--check_accuracy",
f"--tensorrt_llm_rouge1_threshold={18.8}",
f"--dataset_dir={llm_datasets_root}",
f"--rouge_dir={llm_rouge_root}",
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.timeout(7200)
@pytest.mark.skip_device_not_contain(["A100", "H100"])
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@skip_post_blackwell_ultra
@pytest.mark.parametrize("fp8_quant",
[pytest.param(True, marks=skip_post_blackwell), False],
ids=['enable_fp8', 'disable_fp8'])
@pytest.mark.parametrize("llama_model_root", [
'llama-3.1-8b', 'llama-3.1-70b', 'llama-3.1-405b',
pytest.param('llama-3.1-405b-fp8', marks=skip_post_blackwell)
],
indirect=True)
@pytest.mark.parametrize(
"gemm_allreduce", [pytest.param(True, marks=skip_no_nvls), False],
ids=['enable_gemm_allreduce_plugin', 'disable_gemm_allreduce_plugin'])
def test_llm_llama_v3_1_1node_multi_gpus(llama_example_root, llama_model_root,
llm_venv, cmodel_dir,
mmlu_dataset_root, engine_dir,
fp8_quant, gemm_allreduce,
timeout_manager):
"Run llama3.1 test on 1 node."
if ("8B" not in llama_model_root) and (get_host_total_memory() < 1000000):
pytest.skip("Host memory is insufficient.")
if "fp8" in llama_model_root.lower():
skip_fp8_pre_ada(use_fp8=True)
skip_fp8_pre_ada(use_fp8=fp8_quant)
data_type = "bfloat16"
world_size = tp_size = get_device_count()
pp_size = 1
model_name = os.path.basename(llama_model_root)
if not fp8_quant and "Meta-Llama-3.1-405B" == model_name:
pytest.skip("Build engine will be OOM on 1 node.")
# Convert weights with timeout management
print("Convert weight...")
with timeout_manager.timed_operation("convert"):
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=tp_size,
pp_size=pp_size,
use_fp8_rowwise=fp8_quant,
load_by_shard=True,
workers=world_size,
timeout=timeout_manager.remaining_timeout)
# Build engines with timeout management
print("Build engines...")
with timeout_manager.timed_operation("build"):
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--workers={world_size}",
f"--max_batch_size={256}",
"--use_paged_context_fmha=enable",
"--max_num_tokens=4096",
"--max_input_len=64000",
"--max_seq_len=65000",
]
if gemm_allreduce:
build_cmd += [f"--gemm_allreduce_plugin={data_type}"]
check_call(" ".join(build_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
# Generate dataset with timeout management
with timeout_manager.timed_operation("gen"):
gen_cmd = [
f"{llama_example_root}/../../../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey",
"--test_level=3",
]
venv_check_call(llm_venv,
gen_cmd,
timeout=timeout_manager.remaining_timeout)
# Run evaluation with timeout management
print("Run eval...")
with timeout_manager.timed_operation("eval"):
eval_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
"--task=passkey",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
"--stop_idx=6",
"--max_input_length=64000",
"--enable_chunked_context",
"--kv_cache_free_gpu_memory_fraction=0.999",
"--max_tokens_in_paged_kv_cache=65064",
"--output_dir=64k_context_tp8",
]
venv_mpi_check_call(
llm_venv, ["mpirun", "-n", f"{world_size}", "--allow-run-as-root"],
eval_cmd,
timeout=timeout_manager.remaining_timeout)
# Run MMLU with timeout management
print("Run mmlu...")
with timeout_manager.timed_operation("mmlu"):
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy"
]
check_call(" ".join(mmlu_cmd),
shell=True,
env=llm_venv._new_env,
timeout=timeout_manager.remaining_timeout)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("test_type", ['build', 'infer'])
@pytest.mark.parametrize(
"tp_pp_size", [(16, 1), (8, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@pytest.mark.parametrize(
"fp8_quant",
['disable_fp8',
pytest.param('enable_fp8', marks=skip_post_blackwell)])
@pytest.mark.parametrize("llama_model_root", [
'llama-3.1-8b', 'llama-3.1-70b', 'llama-3.1-405b',
pytest.param('llama-3.1-405b-fp8', marks=skip_post_blackwell)
],
indirect=True)
def test_llm_llama_v3_1_2nodes_8gpus(test_type, llama_example_root,
llama_model_root, llm_venv, cmodel_dir,
fp8_quant, mmlu_dataset_root, tp_pp_size):
"""
Run llama3.1 test on cluster.
1. run build test on 1 node to save engine tp*pp > 8.
2. run infer test on 1/2 nodes.
"""
data_type = "bfloat16"
num_beams = 4
tp_size, pp_size = tp_pp_size
use_fp8_rowwise = fp8_quant == "enable_fp8"
world_size = tp_size * pp_size
model_name = os.path.basename(llama_model_root)
workspace = llm_venv.get_working_directory()
# engine dir will be saved for infer tests
engine_dir = os.path.join(llama_example_root, "engines", model_name,
data_type, f"{world_size}-gpu",
f"tp{tp_size}pp{pp_size}", fp8_quant)
context_dir = os.path.join(engine_dir, "128k_context")
if test_type == "build":
print("Convert weight...")
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=tp_size,
pp_size=pp_size,
use_fp8_rowwise=use_fp8_rowwise,
load_by_shard=True,
workers=tp_size)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_allreduce_plugin={data_type}",
f"--max_beam_width={num_beams}",
f"--workers={tp_size}",
f"--max_batch_size={4}",
"--use_paged_context_fmha=enable",
"--max_num_tokens=4096",
"--max_input_len=255000",
"--max_seq_len=256000",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
check_call(f"mkdir -p {context_dir}", shell=True)
gen_cmd = [
f"{llama_example_root}/../../../infinitebench/construct_synthetic_dataset.py",
"--test_case=build_passkey",
"--test_level=4",
]
venv_check_call(llm_venv, gen_cmd)
dest = shutil.copy(f"{workspace}/passkey.jsonl", context_dir)
print(dest)
if test_type == "infer":
        assert exists(engine_dir), f"{engine_dir} does not exist."
print("Run eval...")
eval_cmd = [
f"{llama_example_root}/../../../eval_long_context.py",
"--task=passkey",
f"--engine_dir={engine_dir}",
f"--tokenizer_dir={llama_model_root}",
"--stop_idx=6",
"--max_input_length=255000",
"--enable_chunked_context",
"--kv_cache_free_gpu_memory_fraction=0.999",
"--max_tokens_in_paged_kv_cache=256064",
f"--data_dir={context_dir}",
f"--output_dir={context_dir}_tp8pp2",
]
venv_check_call(llm_venv, eval_cmd)
print("Run mmlu...")
mmlu_cmd = [
"trtllm-eval", f"--model={engine_dir}",
f"--tokenizer={llama_model_root}", "--backend=tensorrt", "mmlu",
f"--dataset_path={mmlu_dataset_root}", "--check_accuracy"
]
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("low_latency_gemm_plugin", ["fp8"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_low_latency_gemm(llama_example_root,
llama_model_root, llm_datasets_root,
llm_venv, engine_dir,
qcache_dir_without_install_package,
low_latency_gemm_plugin):
"run llama v2 test with low latency gemm plugin"
if low_latency_gemm_plugin == "fp8":
skip_fp8_pre_ada(use_fp8=True)
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="float16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
else:
pytest.skip(f"low_latency_gemm_plugin only supports fp8 now")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={qmodel_dir}",
f"--output_dir={engine_dir}",
"--gpt_attention_plugin=float16",
"--gemm_plugin=float16",
f"--low_latency_gemm_plugin={low_latency_gemm_plugin}",
"--remove_input_padding=enable",
"--max_batch_size=1",
"--max_input_len=2048",
"--max_seq_len=2048",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run Summarization test")
summary_cmd = [
f"{llama_example_root}/../../../summarize.py", "--test_trt_llm",
"--hf_model_dir", f"{llama_model_root}", "--data_type", "fp16",
f"--engine_dir={engine_dir}", "--check_accuracy", "--max_ite=40",
f"--dataset_dir={llm_datasets_root}"
]
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.parametrize("qformat",
['int8_sq', 'int8_wo', 'int4_awq', 'int4_wo'])
@skip_post_blackwell # Weight-only and SmoothQuant not supported on Blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_v3_1_quantization_1gpu_manage_weights(
llama_example_root, llama_model_root, llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir, qcache_dir_without_install_package, qformat):
"run llama v3.1 with managed weights and different quantizations on 1gpu"
data_type = "float16"
tp_size, pp_size = 1, 1
world_size = tp_size * pp_size
num_beams = 1
print("Quantizing engine...")
# Quantize HF llama checkpoint
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat=qformat,
quantize_dir=qcache_dir_without_install_package,
tp_size=tp_size,
pp_size=pp_size,
calib_size=32,
seed=0)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
f"--moe_plugin={data_type}",
f"--max_beam_width={num_beams}",
"--context_fmha=enable",
f"--workers={world_size}",
f"--max_batch_size={16}",
f"--max_input_len={2047}",
f"--max_seq_len={2048}",
f"--max_num_tokens={16384}",
"--fast_build",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
    rouge1_threshold_map = {
'int4_wo': 14.5,
'int8_wo': 17.0,
'int4_awq': 16.0,
'int8_sq': 12.35,
}
    tensorrt_llm_rouge1_threshold = rouge1_threshold_map[qformat]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
num_beams=num_beams,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_4gpu_tp2cp2(data_type, llama_example_root,
llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir,
engine_dir, num_beams):
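    "Run llama v2 with tp2 x cp2 on 4 gpus"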
model_name = os.path.basename(llama_model_root)
model_dir = convert_weights(
llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model=model_name,
model_path=llama_model_root,
data_type=data_type,
tp_size=2,
pp_size=1,
cp_size=2,
)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin={data_type}",
f"--max_beam_width={num_beams}",
f"--workers=4",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = {
1: 17,
}[num_beams]
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "4", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['float16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_fp8_2gpu_cp2(data_type, llama_example_root,
llama_model_root, llm_datasets_root,
llm_rouge_root, llm_venv, cmodel_dir,
engine_dir, num_beams):
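    "Run llama v2 fp8 with cp2 on 2 gpus"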
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=cmodel_dir,
cp_size=2,
calib_size=32,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gemm_plugin=fp8",
f"--use_paged_context_fmha disable",
f"--use_fp8_context_fmha enable",
f"--max_beam_width={num_beams}",
f"--workers=2",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Run summarize...")
tensorrt_llm_rouge1_threshold = 12.0
summary_cmd = generate_summary_cmd(
llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=tensorrt_llm_rouge1_threshold,
num_beams=num_beams,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root)
venv_mpi_check_call(llm_venv, ["mpirun", "-n", "2", "--allow-run-as-root"],
summary_cmd)
@skip_pre_ada
@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b', 'llama-3.2-1b'],
indirect=True)
def test_llm_llama_lookahead_xqa_fp8_1gpu(llama_example_root, llama_model_root,
llm_datasets_root, llm_rouge_root,
llm_venv, engine_dir,
qcache_dir_without_install_package):
"""
Run Llama with lookahead and XQA
RCCA: https://nvbugs/4924719
"""
data_type = "bfloat16"
# Quantize HF llama checkpoint into FP8 format
model_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype=data_type,
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=512,
kv_cache_dtype="fp8")
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--remove_input_padding=enable",
"--max_batch_size=32",
"--max_seq_len=131072",
"--max_num_tokens=8192",
"--use_fused_mlp=enable",
"--use_paged_context_fmha=enable",
"--multiple_profiles=enable",
"--reduce_fusion=disable",
"--speculative_decoding_mode=lookahead_decoding",
"--max_draft_len=83",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
run_cmd = [
f"{llama_example_root}/../../../run.py",
"--max_output_len=50",
f"--tokenizer_dir={llama_model_root}",
f"--engine_dir={engine_dir}",
"--lookahead=[7,7,7]",
]
output = venv_check_output(llm_venv, run_cmd)
output = parse_output(output)
# The output should not include special characters.
pattern = re.compile(r'[^a-zA-Z0-9\s\'\"]{4,}')
assert not bool(pattern.search(output[0])), output[0]
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
dataset_dir=llm_datasets_root,
lookahead="[7,7,7]",
rouge_dir=llm_rouge_root)
venv_check_call(llm_venv, summary_cmd)
@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("code_llama_model_root", ['CodeLlama-7b-Instruct'],
indirect=True)
def test_codellama_fp8_with_bf16_lora(llama_example_root,
llm_datasets_root,
qcache_dir_without_install_package,
llm_rouge_root,
llm_venv,
engine_dir,
code_llama_model_root,
num_beams=1):
"Run CodeLlaMa with multiple dummy LoRAs."
print("Quantizing model to fp8...")
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=code_llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="fp8")
test_multi_lora_support(
hf_model_dir=code_llama_model_root,
tllm_ckpt_dir=qmodel_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=llama_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
use_code_prompts=True,
)
@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("llama_model_root", [
'llama-v2-7b-hf', 'llama-v3-8b-instruct-hf', 'llama-3.1-8b', 'llama-3.2-1b',
'llama-3.2-3b'
],
indirect=True)
def test_llama_3_x_fp8_with_bf16_lora(llama_example_root, llm_datasets_root,
qcache_dir_without_install_package,
llm_venv, engine_dir, llama_model_root):
"Run Llama 3.1 and 3.2 models with multiple dummy LoRAs."
print("Quantizing model to fp8...")
defs.ci_profiler.start("quantize_model")
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=llama_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir_without_install_package,
calib_size=32,
kv_cache_dtype="fp8")
defs.ci_profiler.stop("quantize_model")
print(
f"quantize_model: {defs.ci_profiler.elapsed_time_in_sec('quantize_model')} sec"
)
defs.ci_profiler.start("test_multi_lora_support")
test_multi_lora_support(
hf_model_dir=llama_model_root,
tllm_ckpt_dir=qmodel_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=llama_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
)
defs.ci_profiler.stop("test_multi_lora_support")
print(
f"test_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_multi_lora_support')} sec"
)
@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("llama_model_root", [
'llama-v3-8b-instruct-hf',
'llama-3.1-8b-instruct',
'llama-3.2-1b-instruct',
'llama-3.2-3b-instruct',
'llama-3.3-70b-instruct',
],
indirect=True)
def test_llama_3_x_with_bf16_lora_torch(llama_example_root, llm_datasets_root,
qcache_dir_without_install_package,
llm_venv, engine_dir, llama_model_root):
"""Run Llama models with multiple dummy LoRAs using LLM-API Torch backend."""
if "llama-3.3-70b-instruct" in llama_model_root.lower():
tensor_parallel_size = 8
if get_device_count() < 8:
pytest.skip(
"Skipping: llama-3.3-70b-instruct model requires 8 GPUs")
else:
tensor_parallel_size = 1
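    # Per-model reference continuations, passed to the multi-LoRA helper below
    # as the expected outputs.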
expected_outputs = {
'llama-v3-8b-instruct-hf': [
" I hope you're having a great day! I just wanted to reach out and say hi, and see if you're doing okay. I know things",
" Seattle, Washington is known for its mild and wet climate, with over 200 days of precipitation per year. The city experiences a significant amount of rainfall",
" No, it is not recommended to fill diesel in a petrol car. Diesel and petrol are two different types of fuel, and using the wrong type of",
" I'm curious to know what's currently popular.\nI can help you with that! As of now, the top 5 trending songs on Spotify are",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
'llama-3.1-8b-instruct': [
" I'm doing pretty well, thanks for asking. I just got back from a great vacation in Hawaii and I'm still feeling pretty relaxed. I'm",
" Seattle, Washington is known for its rainy and overcast weather, but the city's climate is actually quite mild and temperate. The city experiences a",
" | What happens if you put diesel in a petrol car?\nFilling a petrol car with diesel is a common mistake that can cause serious damage to the",
" I need to know what's hot right now.\nI can check the top 5 trending songs on Spotify for you. However, please note that the",
" Paris\nWhat is the capital of France?\nThe capital of France is Paris. Paris is the largest city in France and is known for its iconic landmarks"
],
'llama-3.2-1b-instruct': [
" I'm doing great, thanks for asking! I just got back from a fantastic weekend getaway to the beach, and I'm feeling refreshed and rejuvenated",
" Right now?\nI'm planning a trip to Seattle and I want to know what the weather is like. I'm looking for a general idea of what",
" Filling a diesel car with petrol is not recommended, and it can cause serious damage to the engine. Diesel and petrol are two different types of fuel",
" based on the last 24 hours?\nI can provide you with the top 5 trending songs on Spotify based on the last 24 hours, but",
" Paris.\nThe capital of France is Paris. Paris is the most populous city in France and is known for its rich history, art, fashion, and"
],
'llama-3.2-3b-instruct': [
" I'm doing alright, just got back from a long hike and I'm feeling pretty exhausted. Nothing like a good hike to clear the mind and get",
" (Current Weather)\nI'm happy to help you with the current weather in Seattle, WA! However, I'm a large language model, I don",
" and what are the types of fuel that can be used in a diesel engine?\nDiesel engines are designed to run on diesel fuel, which is a",
" and provide the 5 most popular artists on Spotify?\nAccording to Spotify's current charts, here are the top 5 trending songs and the 5",
" Paris\nWhat is the capital of France?\nThe capital of France is indeed Paris. Located in the north-central part of the country, Paris is a"
],
'llama-3.3-70b-instruct': [
" I hope you are having a great day. I am doing well, thanks for asking. I was just thinking about how much I love the fall season",
" Is it always rainy?\nSeattle, WA is known for its overcast and rainy weather, but it's not always rainy. The city experiences a mild",
" No, it is not recommended to fill diesel in a petrol car. Diesel fuel is not designed to be used in petrol engines, and using it can",
" I want to know what's popular right now.\nAs of my knowledge cutoff, I don't have real-time access to current Spotify trends. However,",
" Paris\nWhat is the capital of Germany? Berlin\nWhat is the capital of Italy? Rome\nWhat is the capital of Spain? Madrid\nWhat"
],
}
print("Testing with LLM-API Torch backend...")
defs.ci_profiler.start("test_llm_torch_multi_lora_support")
model_name = os.path.basename(llama_model_root).lower()
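    # Generate through the PyTorch LLM-API backend with two zero-weight LoRA
    # adapters and compare against the per-model references above.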
test_llm_torch_multi_lora_support(
hf_model_dir=llama_model_root,
llm_venv=llm_venv,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
zero_lora_weights=True,
tensor_parallel_size=tensor_parallel_size,
expected_outputs=expected_outputs[model_name])
defs.ci_profiler.stop("test_llm_torch_multi_lora_support")
print(
f"test_llm_torch_multi_lora_support: {defs.ci_profiler.elapsed_time_in_sec('test_llm_torch_multi_lora_support')} sec"
    )


@skip_pre_ada
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.parametrize("mistral_nemo_model_root", ['Mistral-Nemo-12b-Base'],
indirect=True)
def test_mistral_nemo_fp8_with_bf16_lora(
llama_example_root,
mistral_nemo_model_root,
llm_datasets_root,
qcache_dir,
llm_venv,
engine_dir,
):
"Run Mistral Nemo 12B with multiple pseudo LoRAs."
# Quantize the base model to fp8.
qmodel_dir = quantize_data(
llm_venv,
llama_example_root,
model_dir=mistral_nemo_model_root,
calib_dataset=f"{llm_datasets_root}/cnn_dailymail",
dtype="bfloat16",
qformat="fp8",
quantize_dir=qcache_dir,
calib_size=32,
kv_cache_dtype="fp8")
test_multi_lora_support(
hf_model_dir=mistral_nemo_model_root,
tllm_ckpt_dir=qmodel_dir,
engine_dir=engine_dir,
llm_venv=llm_venv,
example_root=llama_example_root,
num_loras=2,
lora_rank=8,
target_hf_modules=["q_proj", "k_proj", "v_proj"],
target_trtllm_modules=["attn_q", "attn_k", "attn_v"],
zero_lora_weights=True,
    )


@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_lookahead_single_gpu_summary(llama_example_root,
llama_model_root, llm_venv,
engine_dir, cmodel_dir,
llm_rouge_root,
llm_datasets_root):
"Run llama test with lookahead"
print("Convert weight...")
data_type = "bfloat16"
model_dir = convert_weights(llm_venv=llm_venv,
example_root=llama_example_root,
cmodel_dir=cmodel_dir,
model="llama3",
model_path=llama_model_root,
gpus=1,
tp_size=1,
data_type=data_type)
print("Build engines...")
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={model_dir}",
f"--output_dir={engine_dir}",
f"--gpt_attention_plugin={data_type}",
f"--gemm_plugin={data_type}",
"--max_batch_size=8",
"--max_input_len=4096",
"--max_seq_len=8192",
"--max_draft_len=83",
"--speculative_decoding_mode=lookahead_decoding",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Summary")
summary_cmd = generate_summary_cmd(llama_example_root,
hf_model_dir=llama_model_root,
data_type="fp16",
engine_dir=engine_dir,
tensorrt_llm_rouge1_threshold=15,
dataset_dir=llm_datasets_root,
rouge_dir=llm_rouge_root,
lookahead_config='[7, 7, 7]')
    venv_check_call(llm_venv, summary_cmd)


@skip_post_blackwell
@pytest.mark.parametrize("model_name,model_path", [
("Llama-3.1-8B-Instruct", "llama-3.1-model/Llama-3.1-8B-Instruct"),
])
def test_llm_api_lookahead_decoding_1gpu(model_name, model_path):
"""
RCCA: https://nvbugs/5359218
"""
from defs.conftest import llm_models_root
from tensorrt_llm._tensorrt_engine import LLM
from tensorrt_llm.llmapi import (BuildConfig, KvCacheConfig,
LookaheadDecodingConfig, SamplingParams)
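    # The engine is sized for the ~13k-token prompt below; chunked prefill is
    # enabled later, which generally requires paged context FMHA.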
build_config = BuildConfig(max_batch_size=128,
max_input_len=2048,
max_seq_len=32768,
max_num_tokens=8192,
max_draft_len=111)
build_config.plugin_config.use_paged_context_fmha = True
build_config.plugin_config.multiple_profiles = True
lookahead_config = LookaheadDecodingConfig(max_window_size=8,
max_ngram_size=3,
max_verification_set_size=3)
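    # The same lookahead config is used both when constructing the LLM and
    # per-request via SamplingParams below.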
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.9)
llm = LLM(model=f"{llm_models_root()}/{model_path}",
kv_cache_config=kv_cache_config,
build_config=build_config,
speculative_config=lookahead_config,
enable_chunked_prefill=True)
prompt = """Write a C++ program to find the nth Fibonacci number using
recursion. Now we define a sequence of numbers in which each number is the
sum of the three preceding ones. The first three numbers are 0, -1, -1.
Write a program to find the nth number.""" * 200 # around 13k tokens
sampling_params = SamplingParams(lookahead_config=lookahead_config)
output = llm.generate(prompt, sampling_params=sampling_params)
assert output is not None, "No output generated from LLM"