# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
TensorRT LLM perf tests
"""
import os
import re
import shutil
import sys
from typing import Dict, List, NamedTuple
import pytest
from defs.common import convert_weights, get_cpp_benchmark, quantize_data
from defs.trt_test_alternative import (is_linux, is_windows, print_info,
print_warning)
from ..conftest import get_llm_root, llm_models_root, trt_environment
from .model_yaml_config import get_model_yaml_config
from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds,
PerfMetricType, PerfScriptTestCmds, generate_test_nodes)
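# Older Python versions do not expose re.Pattern; alias the compiled-pattern
# type so the re.Pattern annotations used below keep working.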
if not hasattr(re, "Pattern"):
re.Pattern = type(re.compile(""))
ALLOWED_CONFIGS_CACHE = None # Cache to avoid modifying sys.path many times.
MAP_BY_SOCKET = None
# Model paths in the local dir synced from the internal LLM models repo.
MODEL_PATH_DICT = {
"llama_v2_7b": "llama-models-v2/llama-v2-7b-hf", # not safetensors repo
"llama_v2_13b": "llama-models-v2/llama-v2-13b-hf", # not safetensors repo
"llama_v2_70b": "llama-models-v2/llama-v2-70b-hf", # not safetensors repo
"llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B",
"llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct",
"llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
"llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B",
"llama_v3.3_70b_instruct_fp8":
"modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8",
"llama_v3.1_405b_instruct_fp4":
"llm-models/modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4",
"llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct",
"llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B",
"llama_v3.3_nemotron_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1/",
"llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1",
# "llama_30b": "llama-models/llama-30b-hf",
"mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1",
"mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1",
"mistral_7b_v0.1": "mistral-7b-v0.1",
"deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
"deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
"deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8",
"deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only",
"qwen2_7b_instruct": "Qwen2-7B-Instruct",
"qwen_14b_chat": "Qwen-14B-Chat",
"starcoder2_3b": "starcoder2-3b",
"starcoder_15b": "starcoder2-15b",
"t5": "t5-small", # not supported for trtllm-bench build config
"flan_t5_base":
"flan-t5-small", # not supported for trtllm-bench build config
"flan_t5_large":
"flan-t5-xl", # not supported for trtllm-bench build config
"whisper_large_v3":
"whisper-models/large-v3", # not supported for trtllm-bench tokenizer
"bart_large_cnn": "bart-large-cnn", # not safetensors repo
"mbart_large_50_many_to_one_mmt": "mbart-large-50-many-to-one-mmt",
"mamba_130m": "mamba/mamba-130m-hf",
"mamba_370m": "mamba/mamba-370m-hf",
"mamba_2.8b": "mamba/mamba-2.8b-hf",
"gpt_20b": "gpt-neox-20b",
"gpt_350m_moe": "gpt2-medium",
"phi_3_mini_4k_instruct": "Phi-3/Phi-3-mini-4k-instruct",
"phi_3_mini_128k_instruct": "Phi-3/Phi-3-mini-128k-instruct",
}
# Model paths on HuggingFace.
HF_MODEL_PATH = {
"llama_v2_7b_hf": "meta-llama/Llama-2-7b-hf",
"llama_v2_70b_hf": "meta-llama/Llama-2-70b-hf",
"falcon_180b_hf": "tiiuae/falcon-180B",
"gptj_6b_hf": "EleutherAI/gpt-j-6b",
"llama_v3_8b_hf": "meta-llama/Meta-Llama-3-8B",
"llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B",
"llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8",
"llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct",
"llama_v3_70b_hf": "meta-llama/Meta-Llama-3-70B",
"llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B",
"llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B",
"llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1",
"mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1",
"mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1",
"mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1",
"flan_t5_base_hf": "google/flan-t5-small",
}
LORA_MODEL_PATH = {
"llama_v2_13b": "llama-models-v2/chinese-llama-2-lora-13b",
"mixtral_8x7b_0.1": "chinese-mixtral-lora",
}
TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "")
def cpu_socket_count_gt_1():
global MAP_BY_SOCKET
if MAP_BY_SOCKET is not None:
return MAP_BY_SOCKET
if is_linux():
with open('/proc/cpuinfo') as f:
cpuinfo = f.read()
physical_id_set = set()
for line in cpuinfo.splitlines():
if line.startswith('physical id'):
_, id_ = line.split(':')
physical_id_set.add(id_.strip())
MAP_BY_SOCKET = len(physical_id_set) > 1
else:
MAP_BY_SOCKET = False
return MAP_BY_SOCKET
# A helper function to import allowed_configs.py.
def import_allowed_perf_config():
if trt_environment:
from llm import allowed_configs
else:
global ALLOWED_CONFIGS_CACHE
if ALLOWED_CONFIGS_CACHE is None:
sys.path.append((os.path.join(get_llm_root(),
"tests/integration/defs/perf")))
import allowed_configs
ALLOWED_CONFIGS_CACHE = allowed_configs
else:
allowed_configs = ALLOWED_CONFIGS_CACHE
return allowed_configs
# Regex patterns used to parse the metric results for each metric type.
PERF_METRIC_LOG_QUERIES = {
PerfMetricType.BUILD_TIME:
re.compile(r"Engine generation completed in ([\d\.]+) seconds"),
PerfMetricType.INFERENCE_TIME:
re.compile(r"\[BENCHMARK\].* (?:total_latency|latency)\(ms\) ([\d\.]+)"),
PerfMetricType.FIRST_TOKEN_TIME:
re.compile(r"\[BENCHMARK\].* avg_time_to_first_token\(ms\) ([\d\.]+)"),
PerfMetricType.SEQ_LATENCY:
re.compile(r"\[BENCHMARK\].* avg_sequence_latency\(ms\) ([\d\.]+)"),
PerfMetricType.SEQ_THROUGHPUT:
re.compile(r"\[BENCHMARK\].* seq_throughput\(seq\/sec\) ([\d\.]+)"),
PerfMetricType.TOKEN_THROUGHPUT:
re.compile(
r"\[BENCHMARK\].* (?:token_throughput\(token\/sec\)|tokensPerSec|tokens_per_sec) ([\d\.]+)"
),
PerfMetricType.INFERENCE_PEAK_GPU_MEMORY:
re.compile(r"\[BENCHMARK\].* gpu_peak_mem\(gb\) ([\d\.]+)"),
PerfMetricType.BUILD_PEAK_CPU_MEMORY:
re.compile(
r"Peak memory usage during Engine building and serialization: CPU: ([\d\.]+) .*"
),
PerfMetricType.BUILD_PEAK_GPU_MEMORY:
re.compile(
r"Peak memory usage of TRT CPU/GPU memory allocators: CPU .*, GPU ([\d\.]+) .*"
),
PerfMetricType.ENGINE_SIZE:
re.compile(r".*Total engine size per GPU is ([\d\.]+) MiB.*"),
PerfMetricType.CONTEXT_GPU_MEMORY:
re.compile(r".*Allocated ([\d\.]+) MiB for execution context memory.*"),
PerfMetricType.KV_CACHE_SIZE:
re.compile(r".*Allocated ([\d\.]+) GiB for max tokens in paged KV cache.*"),
}
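# Regex patterns for trtllm-bench output; they are expected to match log lines
# such as "Total Latency (ms):  1234.56" or "Request Throughput (req/sec):  12.34".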
BENCH_PERF_METRIC_LOG_QUERIES = {
PerfMetricType.BUILD_TIME:
re.compile(r"Engine generation completed in ([\d\.]+) seconds"),
PerfMetricType.INFERENCE_TIME:
re.compile(r"Total Latency \(ms\):\s+([\d\.]+)"),
PerfMetricType.TOKEN_THROUGHPUT:
re.compile(r"GPU Output Throughput \(tps\/gpu\):\s+([\d\.]+)"),
PerfMetricType.SEQ_THROUGHPUT:
re.compile(r"Request Throughput \(req\/sec\):\s+([\d\.]+)"),
PerfMetricType.FIRST_TOKEN_TIME:
re.compile(r"Average time-to-first-token \[TTFT\] \(ms\):\s+([\d\.]+)"),
PerfMetricType.OUTPUT_TOKEN_TIME:
re.compile(r"Average time-per-output-token \[TPOT\] \(ms\):\s+([\d\.]+)"),
}
# (Relative threshold, Absolute threshold) for all metric types
PERF_METRIC_THRESHOLD = {
PerfMetricType.BUILD_TIME: (0.1, 30), # Ignore build time regression < 30ms
PerfMetricType.INFERENCE_TIME:
(0.1, 50), # Ignore inference time regression < 50ms
PerfMetricType.FIRST_TOKEN_TIME:
(0.1, 50), # Ignore first token time regression < 50ms
PerfMetricType.OUTPUT_TOKEN_TIME:
(0.1, 50), # Ignore per output token time regression < 50ms
PerfMetricType.SEQ_LATENCY: (0.1, 50), # Ignore latency regression < 50ms
    PerfMetricType.TOKEN_THROUGHPUT: (
        -0.1, 10
    ),  # Ignore throughput regression < 10 tokens/s. A negative rel threshold means larger is better.
    PerfMetricType.SEQ_THROUGHPUT: (
        -0.1, 10
    ),  # Ignore throughput regression < 10 seq/s. A negative rel threshold means larger is better.
PerfMetricType.INFERENCE_PEAK_GPU_MEMORY:
(0.1, 0.1), # Ignore inference peak gpu memory regression < 0.1GiB
PerfMetricType.BUILD_PEAK_CPU_MEMORY:
(0.1, 100), # Ignore build peak cpu memory regression < 100MiB
PerfMetricType.BUILD_PEAK_GPU_MEMORY:
(0.1, 100), # Ignore build peak gpu memory regression < 100MiB
PerfMetricType.ENGINE_SIZE: (0.3,
100), # Ignore engine size regression < 100MiB
PerfMetricType.CONTEXT_GPU_MEMORY:
(0.1, 50), # Ignore context GPU memory < 50MiB
PerfMetricType.KV_CACHE_SIZE: (-0.1, 50), # Ignore value < 50MiB
}
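# How the thresholds are applied (a plausible reading; the actual comparison
# lives in the perf harness utilities): a metric only counts as a regression
# when it is worse than the baseline by more than both the relative and the
# absolute threshold. Negative relative thresholds mark larger-is-better
# metrics such as throughput.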
BUILDER_METRICS = [
PerfMetricType.BUILD_TIME, PerfMetricType.BUILD_PEAK_CPU_MEMORY,
PerfMetricType.BUILD_PEAK_GPU_MEMORY, PerfMetricType.ENGINE_SIZE
]
INFERENCE_METRICS = [
PerfMetricType.INFERENCE_TIME,
PerfMetricType.INFERENCE_PEAK_GPU_MEMORY,
PerfMetricType.CONTEXT_GPU_MEMORY,
]
BERT_CPP_INFERENCE_METRICS = [
PerfMetricType.INFERENCE_TIME,
PerfMetricType.CONTEXT_GPU_MEMORY,
]
MANAGER_INFERENCE_METRICS = [
PerfMetricType.INFERENCE_TIME,
PerfMetricType.TOKEN_THROUGHPUT,
PerfMetricType.CONTEXT_GPU_MEMORY,
PerfMetricType.SEQ_THROUGHPUT,
PerfMetricType.SEQ_LATENCY,
PerfMetricType.KV_CACHE_SIZE,
]
BENCH_INFERENCE_METRICS = [
PerfMetricType.INFERENCE_TIME,
PerfMetricType.TOKEN_THROUGHPUT,
PerfMetricType.SEQ_THROUGHPUT,
]
class PerfTestMetric(NamedTuple):
"""
Configurations of a test metric.
"""
# The original test name used to run the TURTLE test.
original_test_name: str
# The name for this particular metric.
metric_name: str
# The type of this metric.
metric_type: PerfMetricType
# The regex used to parse this metric.
metric_regex: re.Pattern
# The relative threshold to allow for regressions.
metric_threshold: float
# The absolute threshold to allow for regressions.
metric_abs_threshold: float
# The index of the command of this metric.
# Currently, we run 1 build command plus N benchmark commands.
cmd_idx: int
class PerfTestConfig:
"""
Configurations defining the LLM perf test.
This should hold only the attributes that distinguish different tests.
"""
def __init__(
self,
*,
model_name: str = "",
runtime: str = "python",
static_batching: str = "",
api: str = "",
streaming: str = "",
backend: str = "",
mode: str = "plugin",
data_type: str = "float16",
max_batch_size: int = 512,
max_num_tokens: int = 2048,
gpu_weights_percent: float = -1,
batch_sizes: List[int] = [0],
input_lens: List[int] = [8],
output_lens: List[int] = [1],
num_beams: int = 1,
num_loras: int = 0,
num_reqs: int = 512,
concurrency: int = -1,
quantization: str = "",
ep_size: int = None,
tp_size: int = 1,
pp_size: int = 1,
num_gpus: int = 1,
):
# The model name.
self.model_name = model_name
        # Runtime: "cpp" (bertBenchmark), "cppmanager" (gptManagerBenchmark), or "bench" (trtllm-bench).
self.runtime = runtime
# static batching for gptManagerBenchmark
self.static_batching = static_batching
# API Type: only executor is allowed
self.api = api
# Backend Type: pytorch or cpp
self.backend = backend
# Streaming responses
self.streaming = streaming
# Plugin or OOTB mode.
self.mode = mode
# Activation dtype.
self.data_type = data_type
# Percentage of weights that resides on GPU.
self.gpu_weights_percent = gpu_weights_percent
# Max Batch Size to build TRT engine with.
self.max_batch_size = max_batch_size
# Max number of tokens to build TRT engine with.
self.max_num_tokens = max_num_tokens
# List of batch sizes to run benchmark with.
self.batch_sizes = batch_sizes
# List of input lens to run benchmark with.
self.input_lens = input_lens
# List of output lens to run benchmark with.
self.output_lens = output_lens
# Number of beams.
self.num_beams = num_beams
# Number of loras.
self.num_loras = num_loras
# Number of requests.
self.num_reqs = num_reqs
        # Concurrency level (-1 means unset).
self.concurrency = concurrency
# Quantization type.
self.quantization = quantization
# Multiple Profiles
self.multiple_profiles = False
# EP Size
self.ep_size = ep_size
# TP Size
self.tp_size = tp_size
# PP Size
self.pp_size = pp_size
# Number of GPUs.
self.num_gpus = num_gpus
# Just build engines
self.build_only = False
def to_string(self,
custom_bs: int = None,
custom_input_len: int = None,
custom_output_len: int = None) -> str:
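        """
        Serialize this config back into the hyphen-separated test label, e.g.
        "llama_v3.1_8b-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128".
        custom_bs/custom_input_len/custom_output_len override the list-valued
        fields to produce a per-run label.
        """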
# First, add the model name.
entries = [self.model_name]
if self.runtime == "cpp": # bertBenchmark runtime
entries.append(f"cpp")
elif self.runtime == "cppmanager": # gptManagerBenchmark runtime
entries.append(f"cppmanager")
if self.api == "exe": # executor
entries.append(f"exe")
if self.streaming == "streaming":
entries.append(f"streaming")
if self.static_batching == "static_batching":
entries.append(f"static_batching")
elif self.runtime == "bench": # trtllm-bench
entries.append(f"bench")
if self.backend == 'pytorch':
entries.append(f"pytorch")
if self.streaming == "streaming":
entries.append(f"streaming")
# Add mode and dtype.
if self.runtime != "bench":
entries.append(self.mode)
entries.append(self.data_type)
if self.gpu_weights_percent != -1:
entries.append(f"gwp:{self.gpu_weights_percent}")
if self.multiple_profiles:
entries.append(f"mp")
# Add Max batch size.
entries.append(f"maxbs:{self.max_batch_size}")
# Add Max number of tokens.
entries.append(f"maxnt:{self.max_num_tokens}")
if self.build_only:
entries.append(f"build_only")
if self.batch_sizes[0] > 0:
# Add batch size(s).
if custom_bs is None:
bs_label = "+".join([str(x) for x in self.batch_sizes])
else:
bs_label = str(custom_bs)
entries.append(f"bs:{bs_label}")
# Add input/output lens.
if len(self.output_lens) > 0:
if custom_input_len is None:
io_lens = []
for in_len, out_len in zip(self.input_lens, self.output_lens):
io_lens.append(f"{in_len},{out_len}")
io_len_label = "+".join(io_lens)
else:
assert custom_output_len is not None, \
"custom_output_len must be provided if custom_input_len is specified!"
io_len_label = f"{custom_input_len},{custom_output_len}"
entries.append(f"input_output_len:{io_len_label}")
else:
if custom_input_len is None:
len_label = "+".join([str(x) for x in self.input_lens])
else:
len_label = custom_input_len
entries.append(f"input_len:{len_label}")
# Add number of beams.
if self.num_beams > 1:
entries.append(f"beams:{self.num_beams}")
# Add number of loras.
if self.num_loras > 0:
entries.append(f"loras:{self.num_loras}")
# Add quantization type.
if self.quantization != "":
entries.append(f"quant:{self.quantization}")
# Add number of requests.
if self.num_reqs != 512:
entries.append(f"reqs:{self.num_reqs}")
        # Add concurrency.
        if self.concurrency != -1:
            entries.append(f"con:{self.concurrency}")
        # Add EP size.
        if self.ep_size is not None:
            entries.append(f"ep:{self.ep_size}")
# Add TP Size.
if self.tp_size > 1 and self.tp_size != self.num_gpus:
entries.append(f"tp:{self.tp_size}")
# Add PP Size.
if self.pp_size > 1:
entries.append(f"pp:{self.pp_size}")
# Add number of GPUs.
if self.num_gpus > 1:
entries.append(f"gpus:{self.num_gpus}")
# Concatenate labels with "-".
return "-".join(entries)
def __str__(self) -> str:
return self.to_string()
def load_from_str(self, test_param_labels) -> None:
"""
Populate the config properties given the test param string.
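        Example label: "llama_v3.1_8b-bench-pytorch-float16-maxbs:512-maxnt:2048-input_output_len:128,128"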
"""
# Extract configs from test param labels.
labels = test_param_labels.split("-")
self.model_name = labels.pop(0)
assert labels[0] in ["cpp", "cppmanager", "bench"], \
f"Invalid runtime {labels[0]}!"
self.runtime = labels.pop(0)
self.api = labels.pop(0) if labels[0] == "exe" else ""
self.backend = labels.pop(0) if labels[0] == "pytorch" else ""
self.streaming = labels.pop(0) if labels[0] == "streaming" else ""
self.static_batching = labels.pop(
0) if labels[0] == "static_batching" else ""
if self.runtime != "bench":
self.mode = labels.pop(0)
self.data_type = labels.pop(0)
if labels[0].startswith("gwp"):
self.gpu_weights_percent = float(labels.pop(0).replace("gwp:", ""))
if labels[0] == "mp":
self.multiple_profiles = True
labels.pop(0)
if labels[0].startswith("maxbs"):
self.max_batch_size = int(labels.pop(0).replace("maxbs:", ""))
if labels[0].startswith("maxnt"):
self.max_num_tokens = int(labels.pop(0).replace("maxnt:", ""))
if labels[0] == "build_only":
self.build_only = True
labels.pop(0)
if not self.build_only:
if labels[0].startswith("bs:"):
self.batch_sizes = [
int(x) for x in labels.pop(0).replace("bs:", "").split("+")
]
else:
self.batch_sizes = [0]
if labels[0].startswith("input_output_len"):
io_lens = labels.pop(0).replace("input_output_len:",
"").split("+")
self.input_lens = [int(x.split(",")[0]) for x in io_lens]
self.output_lens = [int(x.split(",")[1]) for x in io_lens]
elif labels[0].startswith("input_len"):
self.input_lens = [
int(x)
for x in labels.pop(0).replace("input_len:", "").split("+")
]
self.output_lens = []
else:
raise RuntimeError(
f"Unexpected test name label for seq lens: {labels[0]}!")
if len(labels) > 0:
self.num_beams = 1 if not labels[0].startswith("beams:") else int(
labels.pop(0).replace("beams:", ""))
if len(labels) > 0:
self.num_loras = 0 if not labels[0].startswith("loras:") else int(
labels.pop(0).replace("loras:", ""))
if len(labels) > 0:
self.quantization = "" if not labels[0].startswith(
"quant:") else labels.pop(0).replace("quant:", "")
if len(labels) > 0:
self.num_reqs = 512 if not labels[0].startswith("reqs:") else int(
labels.pop(0).replace("reqs:", ""))
if len(labels) > 0:
self.concurrency = -1 if not labels[0].startswith("con:") else int(
labels.pop(0).replace("con:", ""))
if len(labels) > 0:
self.ep_size = None if not labels[0].startswith("ep:") else int(
labels.pop(0).replace("ep:", ""))
if len(labels) > 0:
self.tp_size = 1 if not labels[0].startswith("tp:") else int(
labels.pop(0).replace("tp:", ""))
if len(labels) > 0:
self.pp_size = 1 if not labels[0].startswith("pp:") else int(
labels.pop(0).replace("pp:", ""))
if len(labels) > 0:
self.num_gpus = 1 if not labels[0].startswith("gpus:") else int(
labels.pop(0).replace("gpus:", ""))
assert len(
labels
) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}"
# Validate the parsed config.
self.validate()
def validate(self):
"""
Validate if the config makes sense.
"""
# Validate model name.
assert len(self.model_name) > 0, "model_name must not be empty!"
assert "-" not in self.model_name, "model_name must not contain '-' character!"
if self.model_name not in MODEL_PATH_DICT.keys(
) and self.model_name not in HF_MODEL_PATH.keys():
allowed_configs = import_allowed_perf_config()
allowed_models = allowed_configs.get_allowed_models()
assert self.model_name in allowed_models, f"model_name {self.model_name} is not in allowed_models!"
# Validate runtime type.
VALID_RUNTIMES = ["cpp", "cppmanager", "bench"]
assert self.runtime in VALID_RUNTIMES, f"Invalid runtime {self.runtime}!"
# Validate plugin mode.
VALID_MODES = ["plugin", "ootb", "ootb_except_mha"]
if self.runtime == "cppmanager":
VALID_MODES += ["plugin_ifb"]
assert self.mode in VALID_MODES, f"Invalid mode {self.mode}!"
# Validate dtype.
VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"]
assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!"
# Validate quantization mode.
if self.model_name in MODEL_PATH_DICT.keys():
VALID_QUANTS = [
"", "nvfp4", "fp8", "int8", "int4_awq", "w4a8_awq", "w4a16_awq",
"int4_wo", "full_prec"
]
else:
VALID_QUANTS = [
"",
"fp8",
"fp8_gemm",
"fp8_kv_cache",
"int8_sq_per_tensor",
"int8_sq_per_token_channel",
"int8_weight_only",
"int4_weight_only",
"int4_weight_only_awq",
"int4_weight_only_gptq",
]
assert self.quantization in VALID_QUANTS, f"Invalid quantization {self.quantization}!"
if self.backend == "pytorch":
            assert self.quantization == "", f"Passing quantization {self.quantization} is not supported for the pytorch backend!"
assert self.num_beams >= 1, f"Invalid num_beams: {self.num_beams}!"
assert self.num_loras >= 0, f"Invalid num_loras: {self.num_loras}!"
assert self.num_reqs >= 1, f"Invalid num_reqs: {self.num_reqs}!"
if self.pp_size > 1:
assert self.model_name in MODEL_PATH_DICT.keys(
), f"Invalid model name for pp size {self.pp_size} test"
if self.num_gpus > 1 and self.tp_size == 1 and self.pp_size == 1:
self.tp_size = self.num_gpus
        if (self.tp_size > 1 or self.pp_size > 1) and self.num_gpus == 1:
self.num_gpus = self.tp_size * self.pp_size
        assert self.num_gpus == self.tp_size * self.pp_size, f"num_gpus must equal tp_size * pp_size: {self.num_gpus}, {self.tp_size}, {self.pp_size}"
if self.gpu_weights_percent != -1:
assert 0 <= self.gpu_weights_percent <= 1, f"Invalid gpu_weights_percent: {self.gpu_weights_percent}!"
if not self.build_only:
if self.runtime != "cppmanager" and self.runtime != "bench":
print(f"runtime: {self.runtime}")
# Validate max batch size.
if self.max_batch_size > 0:
assert max(
self.batch_sizes
) <= self.max_batch_size, f"Batch Size larger than Max Batch Size!"
# Validate bs, seq lens, and num_beams.
assert len(
self.batch_sizes
) > 0 and self.batch_sizes[0] > 0, f"Empty batch sizes!"
assert self.static_batching == "", f"Static Batching only valid for gptManagerBenchmark!"
assert self.api == "", f"API Type only valid for gptManagerBenchmark!"
assert self.streaming == "", f"Streaming only valid for gptManagerBenchmark and trtllm-bench!"
assert len(self.input_lens) > 0, f"Empty input_lens!"
if self.is_bert_like():
assert len(
self.output_lens
) == 0, f"BERT-like models must not have output_lens!"
else:
assert len(
self.output_lens
) > 0, f"GPT-like models and enc-dec models must have output_lens!"
# BERT with small BS is very unstable. Try to avoid it.
if self.is_bert_like():
            if self.runtime == "bench":
self.batch_sizes[
0] = self.max_batch_size if self.max_batch_size > 0 else 1
print(f"batch_sizes: {self.batch_sizes}")
assert all(
[b >= 32 for b in self.batch_sizes]
), f"BERT with small BS is very unstable! Please increase to at least 32."
# GPT-350m and Bloom-560m with small BS are very unstable. Only run these small models with larger BS.
if self.model_name in ["gpt_350m", "bloom_560m"]:
assert all(
[b >= 32 for b in self.batch_sizes]
), f"gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32."
def get_model_family(self) -> str:
"""
Get the model family of the current model.
"""
allowed_configs = import_allowed_perf_config()
allowed_models = allowed_configs.get_allowed_models()
if self.model_name in allowed_models:
return allowed_configs.get_model_family(self.model_name)
else:
return ""
def is_mamba_family(self) -> bool:
"""
Check if the current model family is Mamba.
"""
return self.get_model_family() == 'mamba'
def is_moe_family(self) -> bool:
"""
Check if the current model family is MoE.
"""
allowed_configs = import_allowed_perf_config()
allowed_models = allowed_configs.get_allowed_models()
if self.model_name in allowed_models:
model_config = allowed_configs.get_model_config(self.model_name)
return model_config['moe_num_experts'] > 0 and model_config[
'moe_top_k'] > 0
else:
return False
def get_benchmark_type(self) -> str:
"""
Get the benchmark type of the current model.
"""
allowed_configs = import_allowed_perf_config()
allowed_models = allowed_configs.get_allowed_models()
if self.model_name in allowed_models:
return allowed_configs.get_benchmark_type(self.model_name)
else:
return ""
def is_bert_like(self) -> bool:
"""
Check if the current benchmark is a BERT benchmark.
"""
return self.get_benchmark_type() == "bert"
def is_enc_dec(self) -> bool:
"""
Check if the current benchmark is a EncDec benchmark.
"""
return self.get_benchmark_type() == "enc_dec"
class MultiMetricPerfTest(AbstractPerfScriptTestClass):
"""
Base class for perf tests with multiple metrics.
"""
def __init__(self, full_test_name: str):
# full_test_name is the full test name appearing in TURTLE output.
self._full_test_name = full_test_name
# test_domain_name is the part before "::".
self._test_domain_name = "::".join(full_test_name.split("::")[:-1])
# short_test_name is the part after "::".
self._short_test_name = full_test_name.split("::")[-1]
# short_test_name_body is the part before "[" in short_test_name.
self._short_test_name_body = self._short_test_name.split("[")[0]
# test_param_labels is the part inside "[...]".
self._test_param_labels = full_test_name.split("[")[-1][:-1]
# Load test config from test name.
self._config = PerfTestConfig()
self._config.load_from_str(self._test_param_labels)
# This will store the currently running metric.
self._current_metric = None
def get_test_name(self) -> str:
return str(self._config)
def set_runtime_configs(self, llm_root, working_dir,
perf_cache_fpath) -> None:
if self._config.runtime == "cpp":
if not self._config.is_bert_like():
raise ValueError(
f"Invalid config: '{self._config.runtime}' is only supported for bert-like models!"
)
benchmark_script = get_cpp_benchmark("bertBenchmark", llm_root)
elif self._config.runtime == "cppmanager":
benchmark_script = get_cpp_benchmark("gptManagerBenchmark",
llm_root)
elif self._config.runtime == "bench":
benchmark_script = "trtllm-bench"
else:
raise RuntimeError(f"Invalid runtime {self._config.runtime}.")
allowed_configs = import_allowed_perf_config()
allowed_models = allowed_configs.get_allowed_models()
if self._config.runtime == "bench":
build_script = "trtllm-bench"
elif self._config.pp_size > 1 or self._config.model_name not in allowed_models:
build_script = "trtllm-build"
else:
# build.py is used to build engines for both python and cpp runtime
build_script = os.path.join(llm_root,
"tests/integration/defs/perf/build.py")
self._build_script = build_script
self._benchmark_script = benchmark_script
self._working_dir = working_dir
self._perf_cache_fpath = perf_cache_fpath
self._llm_root = llm_root
def get_convert_weights_command(self, model_dir, engine_dir) -> str:
"""
Get the convert checkpoint command.
"""
if "phi" in self._config.model_name:
example_name = "phi"
else:
example_name = "llama"
if self._config.quantization != "":
command, checkpoint_dir = quantize_data(
llm_venv=None,
example_root=os.path.join(get_llm_root(), "examples", "models",
"core", example_name),
model_dir=model_dir,
calib_dataset=os.path.join(llm_models_root(), "datasets",
"cnn_dailymail"),
dtype=self._config.data_type,
qformat=self._config.quantization,
tp_size=self._config.tp_size,
pp_size=self._config.pp_size,
quantize_dir=engine_dir)
else:
command, checkpoint_dir = convert_weights(
llm_venv=None,
example_root=os.path.join(get_llm_root(), "examples", "models",
"core", example_name),
cmodel_dir=engine_dir,
model=self._config.model_name,
model_path=model_dir,
tp_size=self._config.tp_size,
pp_size=self._config.pp_size,
data_type=self._config.data_type)
command = [f"python3"] + command
return command, checkpoint_dir
def get_convert_lora_weights_command(self, model_dir, engine_dir) -> str:
script = os.path.join(self._llm_root, "examples", "hf_lora_convert.py")
checkpoint_dir = os.path.join(engine_dir, "lora_cpp")
command = [
script, f"-i={model_dir}", "--storage-type=float16",
f"-o={checkpoint_dir}"
]
command = [f"python3"] + command
return command, checkpoint_dir
def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list:
build_cmd = [
self._build_script, f"--output_dir={engine_dir}",
f"--checkpoint_dir={checkpoint_dir}",
f"--workers={self._config.tp_size}",
f"--use_paged_context_fmha=enable", f"--monitor_memory",
f"--max_batch_size={self._config.max_batch_size}"
]
# For Multiple Profiles
if self._config.multiple_profiles:
build_cmd.append(f"--multiple_profiles=enable")
else:
build_cmd.append(f"--multiple_profiles=disable")
num_beams = self._config.num_beams
if num_beams > 1:
build_cmd.append(f"--max_beam_width={num_beams}")
gpu_percent = self._config.gpu_weights_percent
if gpu_percent != -1:
build_cmd += [f"--weight_streaming"]
# For engine inspector
build_cmd.append("--profiling_verbosity=layer_names_only")
if self._config.num_loras > 0:
if "mixtral" in self._config.model_name:
build_cmd.append(f"--lora_plugin=auto")
build_cmd.append(f"--moe_plugin=auto")
build_cmd.append(f"--lora_target_modules")
build_cmd.append(f"attn_q")
build_cmd.append(f"attn_k")
build_cmd.append(f"attn_v")
build_cmd.append(f"attn_dense")
build_cmd.append(f"moe_h_to_4h")
build_cmd.append(f"moe_4h_to_h")
build_cmd.append(f"moe_gate")
build_cmd.append(f"moe_router")
elif "llama" in self._config.model_name:
build_cmd.append(f"--lora_plugin=float16")
build_cmd.append(f"--lora_target_modules")
build_cmd.append(f"attn_q")
build_cmd.append(f"attn_k")
build_cmd.append(f"attn_v")
build_cmd.append(f"attn_dense")
build_cmd.append(f"mlp_h_to_4h")
build_cmd.append(f"mlp_4h_to_h")
build_cmd.append(f"mlp_gate")
if TIMING_CACHE_DIR and not self._config.build_only:
timing_cache = os.path.join(TIMING_CACHE_DIR, "model.cache")
build_cmd.append(f"--input_timing_cache={timing_cache}")
build_cmd.append(f"--output_timing_cache={timing_cache}")
return build_cmd
def get_trtllm_bench_model(self):
model_dir = ""
if self._config.model_name in MODEL_PATH_DICT.keys():
model_dir = os.path.join(llm_models_root(),
MODEL_PATH_DICT[self._config.model_name])
        elif self._config.model_name in HF_MODEL_PATH.keys():
            # Strip the "_hf" suffix and fall back to "" (which makes the
            # caller skip) if the base model has no local path.
            base_name = self._config.model_name.split('_hf')[0]
            if base_name in MODEL_PATH_DICT:
                model_dir = os.path.join(llm_models_root(),
                                         MODEL_PATH_DICT[base_name])
return model_dir
def get_trtllm_bench_build_command(self, engine_dir) -> list:
model_dir = self.get_trtllm_bench_model()
dataset_path = os.path.join(engine_dir, "synthetic_data.json")
if model_dir == "":
pytest.skip("Model Name is not supported by trtllm-bench")
model_name = self._config.model_name
if not model_name.endswith("_hf"):
model_name = model_name + "_hf"
hf_model_name = HF_MODEL_PATH.get(model_name, "")
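        # Illustrative resulting command (paths abbreviated):
        #   trtllm-bench --log_level=info --workspace=<engine_dir>
        #       --model=<hf_model_name> --model_path=<model_dir> build
        #       --dataset=<engine_dir>/synthetic_data.json --tp_size=1 --pp_size=1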
build_cmd = [
self._build_script, f"--log_level=info",
f"--workspace={engine_dir}", f"--model={hf_model_name}",
f"--model_path={model_dir}", "build", f"--dataset={dataset_path}",
f"--tp_size={self._config.tp_size}",
f"--pp_size={self._config.pp_size}"
]
max_seq_len = max(self._config.input_lens) + max(
self._config.output_lens)
build_cmd.append(f"--max_seq_len={max_seq_len}")
if self._config.quantization:
build_cmd.append(
f"--quantization={self._config.quantization.upper()}")
return build_cmd
def get_benchmark_build_command(self, engine_dir) -> list:
mode_flag = self._config.mode.replace("_", "-")
build_cmd = [
self._build_script, f"--model={self._config.model_name}",
"--log_level=info", f"--mode={mode_flag}",
f"--dtype={self._config.data_type}", f"--output_dir={engine_dir}",
"--monitor_memory"
]
if self._config.quantization != "":
build_cmd.append(f"--quantization={self._config.quantization}")
num_beams = self._config.num_beams
if num_beams > 1:
build_cmd.append(f"--max_beam_width={num_beams}")
gpu_percent = self._config.gpu_weights_percent
if gpu_percent != -1:
build_cmd += [f"--weight_streaming"]
if self._config.max_batch_size > 0:
build_cmd.append(f"--max_batch_size={self._config.max_batch_size}")
        # For performance data stability, set opt_num_tokens/opt_batch_size to 8 when max_batch_size is greater than 8.
        # The script falls back to the settings from allowed_configs.py if max_batch_size is set to 0;
        # opt_num_tokens/opt_batch_size is also necessary for stability in that case.
if self._config.max_batch_size > 8 or self._config.max_batch_size == 0:
if self._config.mode in ["plugin_ifb", "plugin", 'ootb_except_mha']:
build_cmd.append("--opt_num_tokens=8")
else:
build_cmd.append("--opt_batch_size=8")
# For Multiple Profiles
if self._config.multiple_profiles:
build_cmd.append("--multiple_profiles")
# For engine inspector
build_cmd.append("--profiling_verbosity=layer_names_only")
if TIMING_CACHE_DIR and not self._config.build_only:
timing_cache = os.path.join(TIMING_CACHE_DIR, "model.cache")
build_cmd.append(f"--input_timing_cache={timing_cache}")
build_cmd.append(f"--output_timing_cache={timing_cache}")
return build_cmd
def get_prepare_data_command(self, engine_dir, input_len,
output_len) -> list:
data_cmd = []
prepare_data_script = os.path.join(self._llm_root, "benchmarks", "cpp",
"prepare_dataset.py")
if self._config.model_name in MODEL_PATH_DICT.keys():
tokenizer_dir = os.path.join(
llm_models_root(), MODEL_PATH_DICT[self._config.model_name])
elif self._config.model_name in HF_MODEL_PATH.keys():
tokenizer_dir = HF_MODEL_PATH[self._config.model_name]
else:
tokenizer_dir = os.path.join(llm_models_root(), "llama-models",
"llama-7b-hf")
if not os.path.exists(engine_dir):
os.makedirs(engine_dir, exist_ok=True)
if self._config.num_loras > 0:
istdev = 16
ostdev = 24
nloras = self._config.num_loras
# lora_data = os.path.join(engine_dir,
# f"token-norm-dist-lora-{nloras}.json")
dataset_path = os.path.join(engine_dir, "synthetic_data.json")
data_cmd += [
"python3", prepare_data_script, f"--stdout",
f"--rand-task-id 0 {nloras-1}", f"--tokenizer={tokenizer_dir}",
f"token-norm-dist", f"--num-requests={self._config.num_reqs}",
f"--input-mean={input_len}", f"--output-mean={output_len}",
f"--input-stdev={istdev}", f"--output-stdev={ostdev}",
f" > {dataset_path}"
]
if self._config.runtime == "cppmanager":
data_cmd += [";"]
generate_rand_lora_script = os.path.join(
self._llm_root, "benchmarks", "cpp", "utils",
"generate_rand_loras.py")
checkpoint_dir = os.path.join(engine_dir, "lora_cpp")
lora_dir = os.path.join(engine_dir, f"loras")
data_cmd += [
"python3", generate_rand_lora_script, checkpoint_dir,
lora_dir, "16"
]
else:
istdev = 0
ostdev = 0
dataset_path = os.path.join(engine_dir, "synthetic_data.json")
if self._build_script == 'trtllm-bench':
data_cmd += [
"python3", prepare_data_script, "--stdout",
f"--tokenizer={tokenizer_dir}", f"token-norm-dist",
f"--num-requests={self._config.num_reqs}",
f"--input-mean={input_len}", f"--output-mean={output_len}",
f"--input-stdev={istdev}", f"--output-stdev={ostdev}",
f" > {dataset_path}"
]
else:
data_cmd += [
"python3", prepare_data_script, f"--output={dataset_path}",
f"--tokenizer={tokenizer_dir}", f"token-norm-dist",
f"--num-requests={self._config.num_reqs}",
f"--input-mean={input_len}", f"--output-mean={output_len}",
f"--input-stdev={istdev}", f"--output-stdev={ostdev}"
]
return data_cmd
def get_python_runtime_benchmark_command(self, engine_dir, bs, input_len,
output_len):
benchmark_cmd = [
self._benchmark_script,
]
if self._config.is_bert_like():
model = "enc"
benchmark_cmd.append(f"--engine_dir={engine_dir}")
elif self._config.is_enc_dec():
model = "enc-dec"
benchmark_cmd.append(
f"--encoder_engine_dir={os.path.join(engine_dir, 'encoder')}")
benchmark_cmd.append(
f"--decoder_engine_dir={os.path.join(engine_dir, 'decoder')}")
else:
model = "dec"
benchmark_cmd.append(f"--engine_dir={engine_dir}")
benchmark_cmd.append(f"--model={model}")
benchmark_cmd += [f"--batch_size={bs}"]
        # Use 3 warm-up runs, at least 10 measured runs, and at least 10 seconds of runtime for now.
benchmark_cmd += [f"--warm_up=3", f"--num_runs=10", f"--duration=10"]
benchmark_cmd += [f"--dtype={self._config.data_type}"]
if self._config.is_bert_like():
benchmark_cmd.append(f"--input_len={input_len}")
else:
benchmark_cmd.append(f"--input_output_len={input_len},{output_len}")
        # Weight streaming doesn't support CUDA Graph for now.
gpu_percent = self._config.gpu_weights_percent
if gpu_percent == -1:
benchmark_cmd.append(f"--enable_cuda_graph")
return benchmark_cmd
def get_gpt_session_runtime_benchmark_command(self, engine_dir, bs,
input_len, output_len):
benchmark_cmd = [
self._benchmark_script,
# This is required to get context GPU info
f"--log_level=info",
]
benchmark_cmd.append(f"--engine_dir={engine_dir}")
if self._config.is_bert_like():
benchmark_cmd.append(f"--model={self._config.model_name}")
num_beams = self._config.num_beams
if num_beams > 1:
benchmark_cmd.append(f"--beam_width={num_beams}")
gpu_percent = self._config.gpu_weights_percent
if gpu_percent != -1:
benchmark_cmd.append(f"--gpu_weights_percent={gpu_percent}")
benchmark_cmd += [f"--batch_size={bs}"]
        # Use 3 warm-up runs, at least 10 measured runs, and at least 10 seconds of runtime for now.
benchmark_cmd += [f"--warm_up=3", f"--num_runs=10", f"--duration=10"]
if not self._config.is_bert_like() and not self._config.is_enc_dec(
) and not self._config.is_mamba_family() and self._config.num_gpus < 8:
# Dump layer information and per-layer profile
benchmark_cmd += ["--dump_layer_info", "--dump_profile"]
# For GPT Models and enc-dec Models
if not self._config.is_bert_like():
benchmark_cmd.append(f"--input_output_len={input_len},{output_len}")
            # Weight streaming doesn't support CUDA Graph for now.
            # MoE OOTB doesn't support CUDA Graph either.
gpu_percent = self._config.gpu_weights_percent
if gpu_percent == -1 and not (self._config.is_moe_family()
and self._config.mode
in ['ootb', 'ootb_except_mha']):
benchmark_cmd.append(f"--enable_cuda_graph")
# For BERT Models:
else:
benchmark_cmd.append(f"--input_len={input_len}")
return benchmark_cmd
def get_trtllm_bench_command(self, engine_dir):
model_dir = self.get_trtllm_bench_model()
model_name = self._config.model_name
dataset_path = os.path.join(engine_dir, "synthetic_data.json")
report_path = os.path.join(engine_dir, "report.json")
if not model_name.endswith("_hf"):
model_name = model_name + "_hf"
hf_model_name = HF_MODEL_PATH.get(model_name, "")
tp_pp_str = f"tp_{self._config.tp_size}_pp_{self._config.pp_size}"
engine_dir = os.path.join(engine_dir, hf_model_name, tp_pp_str)
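        # Illustrative resulting command (paths abbreviated):
        #   trtllm-bench --model=<model>_hf --model_path=<model_dir> throughput
        #       --dataset=<engine_dir>/synthetic_data.json --max_batch_size=512
        #       --max_num_tokens=2048 --report_json=<engine_dir>/report.json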
benchmark_cmd = [
self._benchmark_script,
f"--model={model_name}",
f"--model_path={model_dir}",
"throughput",
f"--dataset={dataset_path}",
f"--max_batch_size={self._config.max_batch_size}",
f"--max_num_tokens={self._config.max_num_tokens}",
f"--report_json={report_path}",
]
if self._config.backend != "pytorch":
benchmark_cmd += [f"--engine_dir={engine_dir}"]
else:
benchmark_cmd += ["--backend=pytorch"]
if self._config.num_reqs > 0:
benchmark_cmd += [f"--num_requests={self._config.num_reqs}"]
if self._config.concurrency != -1:
benchmark_cmd += [f"--concurrency={self._config.concurrency}"]
        if self._config.ep_size is not None:
benchmark_cmd += [f"--ep={self._config.ep_size}"]
if self._config.tp_size > 1:
benchmark_cmd += [f"--tp={self._config.tp_size}"]
if self._config.pp_size > 1:
benchmark_cmd += [f"--pp={self._config.pp_size}"]
if self._config.streaming == "streaming":
benchmark_cmd += [f"--streaming"]
        # Use the default YAML config for the pytorch backend.
if self._config.backend == "pytorch":
import yaml
config = get_model_yaml_config(self._config.to_string())
with open('extra-llm-api-config.yml', 'w') as f:
yaml.dump(config, f, default_flow_style=False)
benchmark_cmd += [
f"--extra_llm_api_options=extra-llm-api-config.yml"
]
return benchmark_cmd
def get_gpt_manager_runtime_benchmark_command(self, engine_dir, bs,
input_len):
benchmark_cmd = [
self._benchmark_script,
# This is required to get context GPU info
f"--log_level=info",
]
if self._config.is_enc_dec():
benchmark_cmd.append(
f"--encoder_engine_dir={os.path.join(engine_dir, 'encoder')}")
benchmark_cmd.append(
f"--decoder_engine_dir={os.path.join(engine_dir, 'decoder')}")
else:
benchmark_cmd.append(f"--engine_dir={engine_dir}")
num_beams = self._config.num_beams
if num_beams > 1:
benchmark_cmd.append(f"--beam_width={num_beams}")
gpu_percent = self._config.gpu_weights_percent
if gpu_percent != -1:
benchmark_cmd.append(f"--gpu_weights_percent={gpu_percent}")
if self._config.num_loras > 0:
nloras = self._config.num_loras
dataset_path = os.path.join(engine_dir,
f"token-norm-dist-lora-{nloras}.json")
lora_dir = os.path.join(engine_dir, f"loras")
eos_id = 2
num_layers = 32 if "mixtral" in self._config.model_name else 40
num_lora_mods = 8 if "mixtral" in self._config.model_name else 7
max_lora_rank = 64
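            # Cache sizing assumptions: an 8 GiB host cache, and a device cache
            # that holds roughly 32 adapters' worth of per-layer module weights
            # (32 * num_layers * num_lora_mods * max_lora_rank).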
benchmark_cmd += [f"--lora_host_cache_bytes=8589934592"]
benchmark_cmd += [
f"--lora_num_device_mod_layers={32 * num_layers * num_lora_mods * max_lora_rank}"
]
benchmark_cmd += [f"--eos_id={eos_id}"]
benchmark_cmd += [f"--lora_dir={lora_dir}"]
else:
dataset_path = os.path.join(engine_dir, "synthetic_data.json")
benchmark_cmd += [f"--dataset={dataset_path}"]
# API Type is executor
if self._config.api == "exe":
benchmark_cmd += [f"--api=executor"]
if self._config.mode == "plugin_ifb":
benchmark_cmd += [
f"--type=UIFB"
] if self._config.is_mamba_family() else ["--type=IFB"]
else:
benchmark_cmd += [f"--type=V1"]
if self._config.streaming == "streaming":
benchmark_cmd += [f"--streaming"]
benchmark_cmd += [f"--scheduler_policy=max_utilization"]
if self._config.static_batching == "static_batching":
benchmark_cmd += [f"--static_emulated_batch_size={bs}"]
if self._config.concurrency != -1:
benchmark_cmd += [f"--concurrency={self._config.concurrency}"]
return benchmark_cmd
def get_commands(self):
# Whether this is python or cpp runtime perf test.
is_python = self._config.runtime == "python"
num_gpus = self._config.num_gpus
if is_python and num_gpus > 1:
# TODO: Fix https://nvbugs/4449875
pytest.skip(
"multi-gpu tests with python runtime is skipped because of hanging issue. See https://nvbugs/4449875"
)
if is_windows() and num_gpus > 1:
pytest.skip(
"multi-gpu not supported on Windows yet, skipped for now")
# Construct engine build command.
engine_dir = self._get_engine_dir()
convert_cmd = []
build_cmd = []
if self._build_script == "trtllm-build" and self._config.model_name in MODEL_PATH_DICT.keys(
):
model_path = MODEL_PATH_DICT[self._config.model_name]
model_dir = os.path.join(llm_models_root(), model_path)
if not os.path.exists(engine_dir):
os.makedirs(engine_dir, exist_ok=True)
convert_cmd, checkpoint_dir = self.get_convert_weights_command(
model_dir, engine_dir)
if self._config.num_loras > 0:
if self._config.model_name in LORA_MODEL_PATH.keys():
model_dir = os.path.join(
llm_models_root(),
LORA_MODEL_PATH[self._config.model_name])
convert_lora_cmd, lora_checkpoint_dir = self.get_convert_lora_weights_command(
model_dir, engine_dir)
convert_cmd += [";"]
convert_cmd += convert_lora_cmd
else:
pytest.skip(
f"There is no LoRA weights model for {self._config.model_name}"
)
build_cmd = self.get_trtllm_build_command(engine_dir,
checkpoint_dir)
elif self._config.runtime == "bench":
if self._config.backend == "pytorch":
                # Skip the engine build step for the pytorch backend.
pass
else:
build_cmd = self.get_trtllm_bench_build_command(engine_dir)
else:
build_cmd = self.get_benchmark_build_command(engine_dir)
# Construct prepare synthetic data command
data_cmds = []
# Construct benchmark commands for each bs and seq len combination.
benchmark_cmds = []
for bs in self._config.batch_sizes:
for len_idx, input_len in enumerate(self._config.input_lens):
output_len = None if self._config.is_bert_like(
) else self._config.output_lens[len_idx]
if is_python:
benchmark_cmd = self.get_python_runtime_benchmark_command(
engine_dir, bs, input_len, output_len)
elif self._config.runtime == "bench":
benchmark_cmd = self.get_trtllm_bench_command(engine_dir)
elif self._config.runtime == "cpp":
benchmark_cmd = self.get_gpt_session_runtime_benchmark_command(
engine_dir, bs, input_len, output_len)
else:
benchmark_cmd = self.get_gpt_manager_runtime_benchmark_command(
engine_dir, bs, input_len)
benchmark_cmds.append(benchmark_cmd)
                if self._config.runtime != "cpp" and not is_python:
data_cmd = self.get_prepare_data_command(
engine_dir, input_len, output_len)
data_cmds.append(data_cmd)
# Construct MPI command.
mpi_cmd = []
        if num_gpus > 1 and num_gpus <= 8 and self._config.runtime != "bench":
if cpu_socket_count_gt_1():
mpi_cmd = [
"mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
"--allow-run-as-root"
]
else:
mpi_cmd = ["mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"]
if self._build_script == "trtllm-bench":
return PerfBenchScriptTestCmds(data_cmds, build_cmd, benchmark_cmds,
mpi_cmd, is_python)
else:
return PerfScriptTestCmds(convert_cmd, build_cmd, data_cmds,
benchmark_cmds, mpi_cmd, is_python)
def get_perf_result(self, outputs: Dict[int, str]) -> float:
"""
Get perf metric result from test output logs.
"""
metric = self._current_metric
cmd_idx = metric.cmd_idx
metric_name = metric.metric_name
num_gpus = self._config.num_gpus
# Make sure we have outputs.
assert cmd_idx in outputs, f"Output log for command {cmd_idx} does not exist!"
# Use the regex to go through the log from the N-th command, where N = cmd_idx.
print_info(
f"Searching for metric {metric_name} from output log of command {cmd_idx} ..."
)
regex_matches = [
metric.metric_regex.search(line)
for line in outputs[cmd_idx].split("\n")
]
metric_values = [
float(match.group(1)) for match in regex_matches if match
]
if len(metric_values) == 0:
if self._build_script == "trtllm-build" and metric.metric_type == PerfMetricType.ENGINE_SIZE:
metric_values = [0.0]
elif self._build_script == "trtllm-bench" and self._config.num_gpus > 1 and metric.metric_type == PerfMetricType.BUILD_TIME:
print_info("skip building process for multi-gpu test"
) #https://nvbugspro.nvidia.com/bug/5210111
metric_values = [0.0]
else:
raise RuntimeError(
f"Cannot find perf result for {metric_name} from perf script logs!"
)
if metric.metric_type in BUILDER_METRICS and metric.metric_type != PerfMetricType.ENGINE_SIZE:
            # For enc-dec models, there are 2 builder perf metrics (encoder and decoder), so combine them.
if self._config.is_enc_dec():
assert len(
metric_values
) == 2 * num_gpus, f"Enc-Dec models must have num of metrics 2*{num_gpus} but got {len(metric_values)}!"
enc_metrics = metric_values[:num_gpus]
dec_metrics = metric_values[num_gpus:]
gather_function = sum
                # Measure BUILD_PEAK_CPU_MEMORY and BUILD_PEAK_GPU_MEMORY with max instead of sum.
if metric.metric_type in [
PerfMetricType.BUILD_PEAK_CPU_MEMORY,
PerfMetricType.BUILD_PEAK_GPU_MEMORY
]:
gather_function = max
metric_values = [
gather_function([x, y])
for x, y in zip(enc_metrics, dec_metrics)
]
                print_info(
                    f"Combining enc builder_perf {enc_metrics} and dec builder_perf {dec_metrics} into {metric_values}."
                )
            # For other models, the number of builder metrics should equal the number of GPUs.
elif self._build_script != "trtllm-build" and self._build_script != "trtllm-bench":
assert len(
metric_values
) == num_gpus, f"num of metrics: {len(metric_values)} should match num_gpus: {num_gpus}"
        # Use the max perf metric across GPUs.
if len(metric_values) > 1:
metric_value = max(metric_values)
print_info(
f"Use max value {metric_value} out of {metric_values} for perf metric {metric_name}."
)
else:
metric_value = metric_values[0]
print_info(
f"Use value {metric_value} for perf metric {metric_name}.")
return metric_value
def get_threshold(self) -> float:
return self._current_metric.metric_threshold
def get_absolute_threshold(self) -> float:
return self._current_metric.metric_abs_threshold
def get_metric_type(self) -> PerfMetricType:
return self._current_metric.metric_type
def run_metrics(self, llm_venv, gpu_clock_lock, session_data_writer,
output_dir):
"""
Run through the commands and parse multiple perf metrics from the logs.
"""
        # Print a marker line to separate cases in the log.
print_info(f"Running perf test for case: {self._short_test_name}")
self._current_cmd_idx = 0
metrics = self._get_metrics()
outputs = {}
result_states = {}
errors = []
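        # "__LUNOWUD" is a TensorRT/Myelin debug environment variable (an
        # assumption based on the helper's name): appending " -time_pass=on"
        # makes the builder log per-pass timing. Any pre-existing value is
        # preserved and restored after the run.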
def add_myelin_time_pass_to(input_env):
time_pass_flag = r" -time_pass=on"
old_myelin_env = input_env.get("__LUNOWUD", "")
if time_pass_flag not in old_myelin_env:
input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag
return old_myelin_env
old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env)
if self._config.runtime == 'bench':
            # Prepare the dataset first for trtllm-bench.
print_info(f"Running command for generating dataset")
outputs = self.run_ex("prepare_dataset",
llm_venv,
gpu_clock_lock,
session_data_writer,
output_dir,
outputs=outputs,
original_test_name="prepare_dataset",
cmd_idx=self._current_cmd_idx)
# Save the result state.
result_state = self.get_result_state()
result_states[self._current_cmd_idx] = result_state
if result_state != "valid":
errors.append(self.get_error())
try:
for metric in metrics:
# Make sure that cmd_idx is in ascending order.
assert metric.cmd_idx >= self._current_cmd_idx, "Command indices must be in ascending order!"
self._current_cmd_idx = metric.cmd_idx
self._current_metric = metric
# If the same command has previously failed, do not run it again.
if self._current_cmd_idx in result_states and result_states[
self._current_cmd_idx] == "failed":
print_warning(
f"Skipped running command for {metric.metric_name} since the previous run failed."
)
continue
# If engine build command already failed, do not run benchmark commands.
if 0 in result_states and result_states[0] == "failed":
print_warning(
f"Skipped running command for {metric.metric_name} since the engine building command failed."
)
continue
# Run the command or reuse the existing output logs.
print_info(f"Running command for {metric.metric_name}")
outputs = self.run_ex(
metric.metric_name,
llm_venv,
gpu_clock_lock,
session_data_writer,
output_dir,
outputs=outputs,
original_test_name=metric.original_test_name,
cmd_idx=self._current_cmd_idx)
# Save the result state.
result_state = self.get_result_state()
result_states[self._current_cmd_idx] = result_state
if result_state != "valid":
errors.append(self.get_error())
finally:
# Clean up engine dir after use.
shutil.rmtree(self._get_engine_dir(), ignore_errors=True)
llm_venv._new_env["__LUNOWUD"] = old_llm_venv
# Check if any commands failed.
if not all([result_states[idx] == "valid" for idx in result_states]):
# If there is only one error, throw it directly.
if len(errors) == 1:
raise errors[0]
# Otherwise, combine all the error messages and re-raise a generic RuntimeError.
msg = "Multiple Errors happened:\n"
for error_idx, e in enumerate(errors):
msg += f"> Error {error_idx+1}/{len(errors)}: {type(e).__name__}: {e}\n"
raise RuntimeError(msg)
def _get_engine_dir(self) -> str:
"""
Get the engine directory to store the engine.
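        e.g. "bs:1+8-input_output_len:128,128" is escaped to
        "bs_1_8-input_output_len_128_128".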
"""
escaped_label = self._test_param_labels.replace("+", "_").replace(
":", "_").replace(",", "_")
return os.path.join(self._working_dir, "perf_engines", escaped_label)
def _get_metrics(self) -> List[PerfTestMetric]:
"""
Generate all the metric configs for the current test.
"""
metrics = []
        # The build command is the first command, except for trtllm-bench,
        # where the dataset-preparation command runs first (see run_metrics).
        cmd_idx = 0 if self._config.runtime != "bench" else 1
if self._config.runtime == "bench":
if self._config.backend == "pytorch":
print_info(
f"Skip building process for {self._config.model_name} as it is pytorch backend"
)
builder_metrics = []
else:
builder_metrics = [PerfMetricType.BUILD_TIME]
else:
builder_metrics = BUILDER_METRICS.copy()
# Add all builder_perf metrics
for metric_type in builder_metrics:
metrics.append(
PerfTestMetric(
original_test_name=self._full_test_name,
metric_name=self._get_metric_name(metric_type),
metric_type=metric_type,
metric_regex=self._get_metric_regex(metric_type),
metric_threshold=self._get_metric_threshold(metric_type),
metric_abs_threshold=self._get_metric_abs_threshold(
metric_type),
cmd_idx=cmd_idx,
))
if self._config.build_only:
return metrics
# Then, construct inference latency and gpu mem usage metrics, for each
# bs and each seq len.
for bs in self._config.batch_sizes:
for len_idx, input_len in enumerate(self._config.input_lens):
cmd_idx += 1
output_len = None if self._config.is_bert_like(
) else self._config.output_lens[len_idx]
# Get list of metrics depending on config.
if self._config.runtime == "bench":
metric_types = BENCH_INFERENCE_METRICS.copy()
if self._config.streaming == "streaming":
metric_types.append(PerfMetricType.FIRST_TOKEN_TIME)
metric_types.append(PerfMetricType.OUTPUT_TOKEN_TIME)
else:
metric_types = INFERENCE_METRICS.copy()
if self._config.runtime == "cpp":
metric_types.append(PerfMetricType.TOKEN_THROUGHPUT)
if self._config.runtime == "cppmanager":
metric_types = MANAGER_INFERENCE_METRICS.copy()
if self._config.streaming == "streaming":
metric_types.append(PerfMetricType.FIRST_TOKEN_TIME)
if self._config.mode != "plugin_ifb" or self._config.is_mamba_family(
):
metric_types.remove(PerfMetricType.KV_CACHE_SIZE)
if self._config.is_bert_like(
) and self._config.runtime == "cpp":
# TODO: bertBenchmark does not report peak GPU memory yet.
metric_types = BERT_CPP_INFERENCE_METRICS
for metric_type in metric_types:
metrics.append(
PerfTestMetric(
original_test_name=self._full_test_name,
metric_name=self._get_metric_name(
metric_type, bs, input_len, output_len),
metric_type=metric_type,
metric_regex=self._get_metric_regex(metric_type),
metric_threshold=self._get_metric_threshold(
metric_type),
metric_abs_threshold=self._get_metric_abs_threshold(
metric_type),
cmd_idx=cmd_idx,
))
return metrics
def _get_metric_name(self,
metric_type: PerfMetricType,
bs: int = None,
input_len: int = None,
output_len: int = None) -> str:
"""
Construct the metric name for given metric_type, bs, input_len, and output_len.
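        Illustrative result (the label part round-trips through to_string):
        "<domain>::test_perf_metric_inference_time[llama_v3.1_8b-bench-...-input_output_len:128,128]"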
"""
if metric_type in BUILDER_METRICS:
# We build one engine for all benchmark runs, so add all bs and seq lens to the metric name.
metric_label = self._config.to_string()
else:
# Otherwise, generate per-bs and per-seqlen label.
metric_label = self._config.to_string(
custom_bs=bs,
custom_input_len=input_len,
custom_output_len=output_len,
)
metric_name = f"test_perf_metric_{metric_type.lower()}"
return self._test_domain_name + "::" + metric_name + "[" + metric_label + "]"
def _get_metric_regex(self, metric_type: PerfMetricType) -> re.Pattern:
"""
Get the regex used to parse the metric result for the metric type.
"""
if self._config.runtime == "bench":
if metric_type not in BENCH_PERF_METRIC_LOG_QUERIES:
raise ValueError(f"Unexpected metric_type: {metric_type}")
return BENCH_PERF_METRIC_LOG_QUERIES[metric_type]
else:
if metric_type not in PERF_METRIC_LOG_QUERIES:
raise ValueError(f"Unexpected metric_type: {metric_type}")
return PERF_METRIC_LOG_QUERIES[metric_type]
def _get_metric_threshold(self, metric_type: PerfMetricType) -> float:
"""
Get the threshold for the metric type.
"""
if metric_type not in PERF_METRIC_THRESHOLD:
raise ValueError(f"Unexpected metric_type: {metric_type}")
return PERF_METRIC_THRESHOLD[metric_type][0]
def _get_metric_abs_threshold(self, metric_type: PerfMetricType) -> float:
"""
Get the absolute threshold for the metric type.
"""
if metric_type not in PERF_METRIC_THRESHOLD:
raise ValueError(f"Unexpected metric_type: {metric_type}")
return PERF_METRIC_THRESHOLD[metric_type][1]
def run_perf_test(perf_case_name, trt_performance_cache_fpath,
trt_gpu_clock_lock, llm_session_data_writer, output_dir,
llm_venv, llm_root):
"""
The actual test definition for TensorRT LLM perf test.
"""
working_dir = llm_venv.get_working_directory()
test_runner = MultiMetricPerfTest(perf_case_name)
test_runner.set_runtime_configs(llm_root, working_dir,
trt_performance_cache_fpath)
test_runner.run_metrics(llm_venv, trt_gpu_clock_lock,
llm_session_data_writer, output_dir)
def generate_perf_tests(session, config, items):
"""
Generate all the perf tests based on test lists to speed up the test collection time.
"""
print_info(f"Dynamically generating perf tests...")
valid_prefixes = [
"perf/test_perf.py::test_perf[",
# TRT pipeline adds "llm/" prefix, so include it so that TRT-LLM perf tests can run in TRT pipelines.
"llm/perf/test_perf.py::test_perf[",
]
items = generate_test_nodes(session, config, items, valid_prefixes,
run_perf_test)
print_info(f"Completed generating perf tests.")
return items