# SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. # SPDX-License-Identifier: Apache-2.0 # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ TensorRT LLM perf tests """ import os import re import shutil import sys from typing import Dict, List, NamedTuple import pytest import yaml from defs.common import get_cpp_benchmark from defs.trt_test_alternative import (is_linux, is_windows, print_info, print_warning) from ..conftest import get_llm_root, llm_models_root, trt_environment from .pytorch_model_config import get_model_yaml_config from .sampler_options_config import get_sampler_options_config from .utils import (AbstractPerfScriptTestClass, PerfBenchScriptTestCmds, PerfDisaggScriptTestCmds, PerfMetricType, generate_test_nodes) if not hasattr(re, "Pattern"): re.Pattern = type(re.compile("")) ALLOWED_CONFIGS_CACHE = None # Cache to avoid modifying sys.path many times. MAP_BY_SOCKET = None # Model PATH of local dir synced from internal LLM models repo MODEL_PATH_DICT = { "llama_v2_7b": "llama-models-v2/llama-v2-7b-hf", # not safetensors repo "llama_v2_13b": "llama-models-v2/llama-v2-13b-hf", # not safetensors repo "llama_v2_70b": "llama-models-v2/llama-v2-70b-hf", # not safetensors repo "llama_v3.1_8b": "llama-3.1-model/Meta-Llama-3.1-8B", "llama_v3.1_8b_instruct": "llama-3.1-model/Llama-3.1-8B-Instruct", "llama_v3.1_8b_instruct_fp8": "llama-3.1-model/Llama-3.1-8B-Instruct-FP8", "llama_v3.1_8b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4", "llama_v3.1_70b": "llama-3.1-model/Meta-Llama-3.1-70B", "llama_v3.3_70b_instruct": "llama-3.3-models/Llama-3.3-70B-Instruct", "llama_v3.1_70b_instruct_fp8": "llama-3.1-model/Llama-3.1-70B-Instruct-FP8", "llama_v3.3_70b_instruct_fp8": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8", "llama_v3.3_70b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4", "llama_v3.1_405b_instruct_fp8": "llama-3.1-model/Llama-3.1-405B-Instruct-FP8", "llama_v3.1_405b_instruct_fp4": "modelopt-hf-model-hub/Llama-3.1-405B-Instruct-fp4", "llama_v3.1_70b_instruct": "llama-3.1-model/Meta-Llama-3.1-70B-Instruct", "llama_v3.2_1b": "llama-3.2-models/Llama-3.2-1B", "llama_v3.1_nemotron_nano_8b": "Llama-3.1-Nemotron-Nano-8B-v1", "llama_v3.1_nemotron_nano_8b_fp8": "Llama-3.1-Nemotron-Nano-8B-v1-FP8", "llama_v3.3_nemotron_super_49b": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1", "llama_v3.3_nemotron_super_49b_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8", "llama_v3.3_nemotron_super_49b_v1.5_fp8": "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8", "llama_v3.1_nemotron_ultra_253b": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1", "llama_v3.1_nemotron_ultra_253b_fp8": "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", "llama_v4_scout_17b_16e_instruct": "llama4-models/Llama-4-Scout-17B-16E-Instruct", "llama_v4_scout_17b_16e_instruct_fp8": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8", "llama_v4_scout_17b_16e_instruct_fp4": "llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4", 
"llama_v4_maverick_17b_128e_instruct": "llama4-models/Llama-4-Maverick-17B-128E-Instruct", "llama_v4_maverick_17b_128e_instruct_fp8": "llama4-models/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8", "mixtral_8x7b_v0.1": "Mixtral-8x7B-v0.1", "mixtral_8x7b_v0.1_instruct": "Mixtral-8x7B-Instruct-v0.1", "mixtral_8x7b_v0.1_instruct_fp8": "Mixtral-8x7B-Instruct-v0.1-fp8", "mixtral_8x7b_v0.1_instruct_fp4": "modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4", "mistral_nemo_12b_base": "Mistral-Nemo-Base-2407", "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B", "deepseek_r1_distill_llama_70b": "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/", "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1", "mistral_7b_v0.1": "mistral-7b-v0.1", "ministral_8b": "Ministral-8B-Instruct-2410", "ministral_8b_fp8": "Ministral-8B-Instruct-2410-FP8", "gemma_3_1b_it": "gemma/gemma-3-1b-it", "gemma_3_27b_it": "gemma/gemma-3-27b-it", "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8", "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4", "gemma_3_12b_it": "gemma/gemma-3-12b-it", "gemma_3_12b_it_fp8": "gemma/gemma-3-12b-it-fp8", "gemma_3_12b_it_fp4": "gemma/gemma-3-12b-it-fp4", "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1", "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4", "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/", "deepseek_r1_0528_fp4": "DeepSeek-R1/DeepSeek-R1-0528-FP4/", "deepseek_r1_0528_fp4_v2": "DeepSeek-R1/DeepSeek-R1-0528-FP4-v2/", "deepseek_v3_lite_fp8": "DeepSeek-V3-Lite/fp8", "deepseek_v3_lite_nvfp4": "DeepSeek-V3-Lite/nvfp4_moe_only", "qwen2_7b_instruct": "Qwen2-7B-Instruct", "qwen_14b_chat": "Qwen-14B-Chat", "qwen3_0.6b": "Qwen3/Qwen3-0.6B", "qwen3_4b_eagle3": "Qwen3/Qwen3-4B", "qwen3_8b": "Qwen3/Qwen3-8B", "qwen3_8b_fp8": "Qwen3/nvidia-Qwen3-8B-FP8", "qwen3_8b_fp4": "Qwen3/nvidia-Qwen3-8B-NVFP4", "qwen3_14b": "Qwen3/Qwen3-14B", "qwen3_14b_fp8": "Qwen3/nvidia-Qwen3-14B-FP8", "qwen3_14b_fp4": "Qwen3/nvidia-Qwen3-14B-NVFP4", "qwen3_30b_a3b": "Qwen3/Qwen3-30B-A3B", "qwen3_30b_a3b_fp4": "Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf", "qwen3_32b": "Qwen3/Qwen3-32B", "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4", "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf", "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf", "qwen2_5_vl_7b_instruct": "Qwen2.5-VL-7B-Instruct", "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8", "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4", "starcoder2_3b": "starcoder2-3b", "starcoder2_7b": "starcoder2-7b", "starcoder2_15b": "starcoder2-15b", "t5": "t5-small", # not supported for trtllm-bench build config "flan_t5_base": "flan-t5-small", # not supported for trtllm-bench build config "flan_t5_large": "flan-t5-xl", # not supported for trtllm-bench build config "whisper_large_v3": "whisper-models/large-v3", # not supported for trtllm-bench tokenizer "bart_large_cnn": "bart-large-cnn", # not safetensors repo "mbart_large_50_many_to_one_mmt": "mbart-large-50-many-to-one-mmt", "mamba_130m": "mamba/mamba-130m-hf", "mamba_370m": "mamba/mamba-370m-hf", "mamba_2.8b": "mamba/mamba-2.8b-hf", "gpt_20b": "gpt-neox-20b", "gpt_350m_moe": "gpt2-medium", "phi_4_mini_instruct": "Phi-4-mini-instruct", "phi_4_reasoning_plus": "Phi-4-reasoning-plus", "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8", "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4", "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_image": 
"multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct", "phi_4_multimodal_instruct_fp4": "multimodals/Phi-4-multimodal-instruct-FP4", "phi_4_multimodal_instruct_fp4_image": "multimodals/Phi-4-multimodal-instruct-FP4", "phi_4_multimodal_instruct_fp4_audio": "multimodals/Phi-4-multimodal-instruct-FP4", "phi_4_multimodal_instruct_fp8_image": "multimodals/Phi-4-multimodal-instruct-FP8", "phi_4_multimodal_instruct_fp8_audio": "multimodals/Phi-4-multimodal-instruct-FP8", "phi_4_multimodal_instruct_fp8": "multimodals/Phi-4-multimodal-instruct-FP8", "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct", "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8", "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b", "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b-Eagle3", "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev", "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2", "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4", "starcoder2_7b": "starcoder2-7b", "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4", } # Model PATH of HuggingFace HF_MODEL_PATH = { "llama_v2_7b_hf": "meta-llama/Llama-2-7b-hf", "llama_v2_70b_hf": "meta-llama/Llama-2-70b-hf", "falcon_180b_hf": "tiiuae/falcon-180B", "gptj_6b_hf": "EleutherAI/gpt-j-6b", "llama_v3_8b_hf": "meta-llama/Meta-Llama-3-8B", "llama_v3.1_8b_hf": "meta-llama/Llama-3.1-8B", "llama_v3.1_8b_instruct_hf": "nvidia/Llama-3.1-8B-Instruct-FP8", "llama_v3.1_70b_instruct_hf": "meta-llama/Meta-Llama-3.1-70B-Instruct", "llama_v3_70b_hf": "meta-llama/Meta-Llama-3-70B", "llama_v3.1_70b_hf": "meta-llama/Llama-3.1-70B", "llama_v3.1_405b_hf": "meta-llama/Llama-3.1-405B", "llama_v3.1_nemotron_nano_8b_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1", "llama_v3.1_nemotron_nano_8b_fp8_hf": "nvidia/Llama-3.1-Nemotron-Nano-8B-v1-FP8", "llama_v3.3_nemotron_super_49b_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1", "llama_v3.3_nemotron_super_49b_fp8_hf": "nvidia/Llama-3_3-Nemotron-Super-49B-v1-FP8", "llama_v3.1_nemotron_ultra_253b_fp8_hf": "nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8", "mixtral_8x7b_v0.1_hf": "mistralai/Mixtral-8x7B-v0.1", "mixtral_8x7b_v0.1_instruct_hf": "mistralai/Mixtral-8x7B-Instruct-v0.1", "mistral_7b_v0.1_hf": "mistralai/Mistral-7B-v0.1", "ministral_8b_hf": "mistralai/Ministral-8B-Instruct-2410", "flan_t5_base_hf": "google/flan-t5-small", "phi_4_mini_instruct_hf": "microsoft/Phi-4-mini-instruct", "gemma_3_1b_it_hf": "google/gemma-3-1b-it", } LORA_MODEL_PATH = { "llama_v2_13b": "llama-models-v2/chinese-llama-2-lora-13b", "mixtral_8x7b_0.1": "chinese-mixtral-lora", "llama_v3.1_8b_instruct_fp8": "lora/llama-3-chinese-8b-instruct-v2-lora/", "ministral_8b": "lora/ministral/Ministral-8B-Instruct-2410-Loras-Dummy", # Dummy LoRA for Ministral "gemma_3_1b_it": "lora/gemma/gemma-3-1b-it-dummy-lora", # Dummy LoRA for Gemma-3-1B-Instruct "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct/vision-lora", "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct/speech-lora", "phi_4_multimodal_instruct_fp4_image": "multimodals/Phi-4-multimodal-instruct-FP4/vision-lora", "phi_4_multimodal_instruct_fp4_audio": "multimodals/Phi-4-multimodal-instruct-FP4/speech-lora", "phi_4_multimodal_instruct_fp8_image": "multimodals/Phi-4-multimodal-instruct-FP8/vision-lora", "phi_4_multimodal_instruct_fp8_audio": "multimodals/Phi-4-multimodal-instruct-FP8/speech-lora", 
} TIMING_CACHE_DIR = os.environ.get("TIMING_CACHE_DIR", "") TRUST_REMOTE_CODE_MODELS = { # these models require explicit trust_remote_code=True "llama_v3.3_nemotron_super_49b", "llama_v3.3_nemotron_super_49b_fp8", "llama_v3.1_nemotron_ultra_253b", "llama_v3.1_nemotron_ultra_253b_fp8", "kimi_k2_nvfp4", } # Autodeploy model configs - maps model name to config file path (relative to TRT-LLM root) AUTODEPLOY_MODEL_CONFIGS = { "nemotron_nano_3_30b_fp8": "examples/auto_deploy/nano_v3.yaml", } def get_model_dir(model_name: str): model_dir = "" if model_name in MODEL_PATH_DICT.keys(): model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name]) elif model_name in HF_MODEL_PATH.keys(): model_dir = os.path.join(llm_models_root(), MODEL_PATH_DICT[model_name.split('_hf')[0]]) return model_dir def get_dataset_path(): return os.path.join(llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json") def cpu_socket_count_gt_1(): global MAP_BY_SOCKET if MAP_BY_SOCKET is not None: return MAP_BY_SOCKET if is_linux(): with open('/proc/cpuinfo') as f: cpuinfo = f.read() physical_id_set = set() for line in cpuinfo.splitlines(): if line.startswith('physical id'): _, id_ = line.split(':') physical_id_set.add(id_.strip()) MAP_BY_SOCKET = len(physical_id_set) > 1 else: MAP_BY_SOCKET = False return MAP_BY_SOCKET # A helper function to import allowed_configs.py. def import_allowed_perf_config(): if trt_environment: from llm import allowed_configs else: global ALLOWED_CONFIGS_CACHE if ALLOWED_CONFIGS_CACHE is None: sys.path.append((os.path.join(get_llm_root(), "tests/integration/defs/perf"))) import allowed_configs ALLOWED_CONFIGS_CACHE = allowed_configs else: allowed_configs = ALLOWED_CONFIGS_CACHE return allowed_configs # Regex commands used to parse the metric result for the metric type. 
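# As a hedged illustration (the sample log line is an assumption, not captured
# tool output), the INFERENCE_TIME query below is consumed roughly like:
#   >>> line = "[BENCHMARK] total_latency(ms) 123.45"
#   >>> m = PERF_METRIC_LOG_QUERIES[PerfMetricType.INFERENCE_TIME].search(line)
#   >>> float(m.group(1))
#   123.45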
PERF_METRIC_LOG_QUERIES = { PerfMetricType.BUILD_TIME: re.compile(r"Engine generation completed in ([\d\.]+) seconds"), PerfMetricType.INFERENCE_TIME: re.compile(r"\[BENCHMARK\].* (?:total_latency|latency)\(ms\) ([\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: re.compile(r"\[BENCHMARK\].* avg_time_to_first_token\(ms\) ([\d\.]+)"), PerfMetricType.SEQ_LATENCY: re.compile(r"\[BENCHMARK\].* avg_sequence_latency\(ms\) ([\d\.]+)"), PerfMetricType.SEQ_THROUGHPUT: re.compile(r"\[BENCHMARK\].* seq_throughput\(seq\/sec\) ([\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: re.compile( r"\[BENCHMARK\].* (?:token_throughput\(token\/sec\)|tokensPerSec|tokens_per_sec) ([\d\.]+)" ), PerfMetricType.INFERENCE_PEAK_GPU_MEMORY: re.compile(r"\[BENCHMARK\].* gpu_peak_mem\(gb\) ([\d\.]+)"), PerfMetricType.BUILD_PEAK_CPU_MEMORY: re.compile( r"Peak memory usage during Engine building and serialization: CPU: ([\d\.]+) .*" ), PerfMetricType.BUILD_PEAK_GPU_MEMORY: re.compile( r"Peak memory usage of TRT CPU/GPU memory allocators: CPU .*, GPU ([\d\.]+) .*" ), PerfMetricType.ENGINE_SIZE: re.compile(r".*Total engine size per GPU is ([\d\.]+) MiB.*"), PerfMetricType.CONTEXT_GPU_MEMORY: re.compile(r".*Allocated ([\d\.]+) MiB for execution context memory.*"), PerfMetricType.KV_CACHE_SIZE: re.compile(r".*Allocated ([\d\.]+) GiB for max tokens in paged KV cache.*"), PerfMetricType.DISAGG_SERVER_E2EL: re.compile(r"Median E2EL \(ms\):\s*(\d+\.?\d*)"), PerfMetricType.DISAGG_SERVER_TTFT: re.compile(r"Median TTFT \(ms\):\s*(\d+\.?\d*)"), } BENCH_PERF_METRIC_LOG_QUERIES = { PerfMetricType.BUILD_TIME: re.compile(r"Engine generation completed in ([\d\.]+) seconds"), PerfMetricType.INFERENCE_TIME: re.compile(r"Total Latency \(ms\):\s+([\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: re.compile(r"GPU Output Throughput \(tps\/gpu\):\s+([\d\.]+)"), PerfMetricType.SEQ_THROUGHPUT: re.compile(r"Request Throughput \(req\/sec\):\s+([\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: re.compile(r"Average time-to-first-token \[TTFT\] \(ms\):\s+([\d\.]+)"), PerfMetricType.OUTPUT_TOKEN_TIME: re.compile(r"Average time-per-output-token \[TPOT\] \(ms\):\s+([\d\.]+)"), PerfMetricType.KV_CACHE_SIZE: re.compile(r".*(?:Allocated ([\d\.]+) GiB for max tokens in paged KV cache|" r"Final KV cache size after resize: ([\d\.]+) GiB).*"), PerfMetricType.PER_USER_OUTPUT_THROUGHPUT: re.compile( r"Per User Output Throughput \[w\/ ctx\] \(tps\/user\):\s+([\d\.]+)"), PerfMetricType.PER_GPU_OUTPUT_THROUGHPUT: re.compile(r"Per GPU Output Throughput \(tps\/gpu\):\s+([\d\.]+)"), } AGGR_SERVER_PERF_METRIC_LOG_QUERIES = { PerfMetricType.SEQ_THROUGHPUT: re.compile(r"Request throughput \(req\/s\):\s+(-?[\d\.]+)"), PerfMetricType.TOKEN_THROUGHPUT: re.compile(r"Output token throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.TOTAL_TOKEN_THROUGHPUT: re.compile(r"Total Token throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.USER_THROUGHPUT: re.compile(r"User throughput \(tok\/s\):\s+(-?[\d\.]+)"), PerfMetricType.FIRST_TOKEN_TIME: re.compile(r"Mean TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_FIRST_TOKEN_TIME: re.compile(r"Median TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_FIRST_TOKEN_TIME: re.compile(r"P99 TTFT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.INTER_TOKEN_TIME: re.compile(r"Mean ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_INTER_TOKEN_TIME: re.compile(r"Median ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_INTER_TOKEN_TIME: re.compile(r"P99 ITL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.OUTPUT_TOKEN_TIME: re.compile(r"Mean TPOT \(ms\):\s+(-?[\d\.]+)"), 
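    # e.g. a serving-summary line "Mean TPOT (ms):    12.34" (the sample line
    # is an assumption) yields 12.34 via group(1); the leading -? also
    # tolerates a negative placeholder value.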
PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME: re.compile(r"Median TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_OUTPUT_TOKEN_TIME: re.compile(r"P99 TPOT \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.INFERENCE_TIME: re.compile(r"Mean E2EL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.MEDIAN_INFERENCE_TIME: re.compile(r"Median E2EL \(ms\):\s+(-?[\d\.]+)"), PerfMetricType.P99_INFERENCE_TIME: re.compile(r"P99 E2EL \(ms\):\s+(-?[\d\.]+)"), } # (Relative threshold, Absolute threshold) for all metric types PERF_METRIC_THRESHOLD = { PerfMetricType.BUILD_TIME: (0.1, 30), # Ignore build time regression < 30ms PerfMetricType.INFERENCE_TIME: (0.1, 50), # Ignore inference time regression < 50ms PerfMetricType.MEDIAN_INFERENCE_TIME: (0.1, 50), # Ignore median inference time regression < 50ms PerfMetricType.P99_INFERENCE_TIME: (0.1, 50), # Ignore p99 inference time regression < 50ms PerfMetricType.FIRST_TOKEN_TIME: (0.1, 50), # Ignore first token time regression < 50ms PerfMetricType.MEDIAN_FIRST_TOKEN_TIME: (0.1, 50), # Ignore median first token time regression < 50ms PerfMetricType.P99_FIRST_TOKEN_TIME: (0.1, 50), # Ignore p99 first token time regression < 50ms PerfMetricType.OUTPUT_TOKEN_TIME: (0.1, 50), # Ignore per output token time regression < 50ms PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME: (0.1, 50), # Ignore median output token time regression < 50ms PerfMetricType.P99_OUTPUT_TOKEN_TIME: (0.1, 50), # Ignore p99 output token time regression < 50ms PerfMetricType.INTER_TOKEN_TIME: (0.1, 50), # Ignore inter token time regression < 50ms PerfMetricType.MEDIAN_INTER_TOKEN_TIME: (0.1, 50), # Ignore median inter token time regression < 50ms PerfMetricType.P99_INTER_TOKEN_TIME: (0.1, 50), # Ignore p99 inter token time regression < 50ms PerfMetricType.SEQ_LATENCY: (0.1, 50), # Ignore latency regression < 50ms PerfMetricType.TOKEN_THROUGHPUT: ( -0.1, 10 ), # Ignore throughput regression < 10 tokens/s. Negative rel threshold is to indicate that larger is better. PerfMetricType.TOTAL_TOKEN_THROUGHPUT: (0.1, 10), PerfMetricType.USER_THROUGHPUT: (0.1, 10), PerfMetricType.SEQ_THROUGHPUT: ( -0.1, 10 ), # Ignore throughput regression < 10 tokens/s. Negative rel threshold is to indicate that larger is better. 
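    # Hedged sketch of how a (rel, abs) pair is meant to be consumed (names
    # and comparison below are illustrative, not this module's code): with
    #   rel, abs_ = PERF_METRIC_THRESHOLD[metric_type]
    # a diff counts as a regression only when it exceeds both margins, i.e.
    #   diff > max(abs(rel) * baseline, abs_)
    # and a negative rel marks a larger-is-better metric.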
    PerfMetricType.INFERENCE_PEAK_GPU_MEMORY:
    (0.1, 0.1),  # Ignore inference peak gpu memory regression < 0.1GiB
    PerfMetricType.BUILD_PEAK_CPU_MEMORY:
    (0.1, 100),  # Ignore build peak cpu memory regression < 100MiB
    PerfMetricType.BUILD_PEAK_GPU_MEMORY:
    (0.1, 100),  # Ignore build peak gpu memory regression < 100MiB
    PerfMetricType.ENGINE_SIZE: (0.3, 100),  # Ignore engine size regression < 100MiB
    PerfMetricType.CONTEXT_GPU_MEMORY:
    (0.1, 50),  # Ignore context GPU memory < 50MiB
    PerfMetricType.KV_CACHE_SIZE: (-0.1, 50),  # Ignore value < 50MiB
    PerfMetricType.DISAGG_SERVER_E2EL: (0.1, 50),  # Ignore E2EL regression < 50ms
    PerfMetricType.DISAGG_SERVER_TTFT: (0.1, 50),  # Ignore TTFT regression < 50ms
}

PERF_METRIC_STRING = {
    PerfMetricType.BUILD_TIME: "build_time",
    PerfMetricType.INFERENCE_TIME: "mean_e2el",
    PerfMetricType.MEDIAN_INFERENCE_TIME: "median_e2el",
    PerfMetricType.P99_INFERENCE_TIME: "p99_e2el",
    PerfMetricType.FIRST_TOKEN_TIME: "mean_ttft",
    PerfMetricType.MEDIAN_FIRST_TOKEN_TIME: "median_ttft",
    PerfMetricType.P99_FIRST_TOKEN_TIME: "p99_ttft",
    PerfMetricType.OUTPUT_TOKEN_TIME: "mean_tpot",
    PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME: "median_tpot",
    PerfMetricType.P99_OUTPUT_TOKEN_TIME: "p99_tpot",
    PerfMetricType.INTER_TOKEN_TIME: "mean_itl",
    PerfMetricType.MEDIAN_INTER_TOKEN_TIME: "median_itl",
    PerfMetricType.P99_INTER_TOKEN_TIME: "p99_itl",
    PerfMetricType.SEQ_LATENCY: "seq_latency",
    PerfMetricType.TOKEN_THROUGHPUT: "token_throughput",
    PerfMetricType.TOTAL_TOKEN_THROUGHPUT: "total_token_throughput",
    PerfMetricType.USER_THROUGHPUT: "user_throughput",
    PerfMetricType.SEQ_THROUGHPUT: "seq_throughput",
    PerfMetricType.INFERENCE_PEAK_GPU_MEMORY: "inference_peak_gpu_memory",
    PerfMetricType.BUILD_PEAK_CPU_MEMORY: "build_peak_cpu_memory",
    PerfMetricType.BUILD_PEAK_GPU_MEMORY: "build_peak_gpu_memory",
    PerfMetricType.ENGINE_SIZE: "engine_size",
    PerfMetricType.CONTEXT_GPU_MEMORY: "context_gpu_memory",
    PerfMetricType.KV_CACHE_SIZE: "kv_cache_size",
    PerfMetricType.PER_USER_OUTPUT_THROUGHPUT: "per_user_output_throughput",
    PerfMetricType.PER_GPU_OUTPUT_THROUGHPUT: "per_gpu_output_throughput",
}

BUILDER_METRICS = [
    PerfMetricType.BUILD_TIME,
    PerfMetricType.BUILD_PEAK_CPU_MEMORY,
    PerfMetricType.BUILD_PEAK_GPU_MEMORY,
    PerfMetricType.ENGINE_SIZE,
]

INFERENCE_METRICS = [
    PerfMetricType.INFERENCE_TIME,
    PerfMetricType.INFERENCE_PEAK_GPU_MEMORY,
    PerfMetricType.CONTEXT_GPU_MEMORY,
]

AGGR_SERVER_METRICS = [
    PerfMetricType.SEQ_THROUGHPUT,
    PerfMetricType.TOKEN_THROUGHPUT,
    PerfMetricType.TOTAL_TOKEN_THROUGHPUT,
    PerfMetricType.USER_THROUGHPUT,
    PerfMetricType.FIRST_TOKEN_TIME,
    PerfMetricType.MEDIAN_FIRST_TOKEN_TIME,
    PerfMetricType.P99_FIRST_TOKEN_TIME,
    PerfMetricType.OUTPUT_TOKEN_TIME,
    PerfMetricType.MEDIAN_OUTPUT_TOKEN_TIME,
    PerfMetricType.P99_OUTPUT_TOKEN_TIME,
    PerfMetricType.INTER_TOKEN_TIME,
    PerfMetricType.MEDIAN_INTER_TOKEN_TIME,
    PerfMetricType.P99_INTER_TOKEN_TIME,
    PerfMetricType.INFERENCE_TIME,
    PerfMetricType.MEDIAN_INFERENCE_TIME,
    PerfMetricType.P99_INFERENCE_TIME,
]

BENCH_INFERENCE_METRICS = [
    PerfMetricType.INFERENCE_TIME,
    PerfMetricType.TOKEN_THROUGHPUT,
    PerfMetricType.SEQ_THROUGHPUT,
    PerfMetricType.KV_CACHE_SIZE,
]

DISAGG_SERVER_METRICS = [
    PerfMetricType.DISAGG_SERVER_E2EL,
    PerfMetricType.DISAGG_SERVER_TTFT,
]


class PerfTestMetric(NamedTuple):
    """
    Configurations of a test metric.
    """
    # The original test name used to run the original perf test.
    original_test_name: str
    # The name for this particular metric.
    metric_name: str
    # The type of this metric.
    metric_type: PerfMetricType
    # The regex used to parse this metric.
metric_regex: re.Pattern # The relative threshold to allow for regressions. metric_threshold: float # The absolute threshold to allow for regressions. metric_abs_threshold: float # The index of the command of this metric. # Currently, we run 1 build command plus N benchmark commands. cmd_idx: int class PerfTestConfig: """ Configurations defining the LLM perf test. This should hold only the attributes that distinguish different tests. """ def __init__( self, *, model_name: str = "", runtime: str = "python", static_batching: str = "", api: str = "", streaming: str = "", backend: str = "", mode: str = "plugin", data_type: str = "float16", max_batch_size: int = 512, max_num_tokens: int = 2048, gpu_weights_percent: float = -1, batch_sizes: List[int] = [0], input_lens: List[int] = [8], output_lens: List[int] = [1], num_beams: int = 1, num_loras: int = 0, num_reqs: int = 512, concurrency: int = -1, quantization: str = "", kv_cache_free_gpu_mem_fraction: float = 0.9, kv_cache_dtype: str = "auto", ep_size: int = None, tp_size: int = 1, pp_size: int = 1, num_gpus: int = 1, # only for torch-backend currently extra: bool = False, # _autodeploy backend specific parameters ad_compile_backend: str = "torch-opt", extra_runtime: str = "trtllm", skip_loading_weights: bool = False, ): # The model name. self.model_name = model_name # Python or cpp/cppmanager runtime. self.runtime = runtime # static batching for gptManagerBenchmark self.static_batching = static_batching # API Type: only executor is allowed self.api = api # Backend Type: pytorch or cpp self.backend = backend # Streaming responses self.streaming = streaming # Plugin or OOTB mode. self.mode = mode # Activation dtype. self.data_type = data_type # Percentage of weights that resides on GPU. self.gpu_weights_percent = gpu_weights_percent # Max Batch Size to build TRT engine with. self.max_batch_size = max_batch_size # Max number of tokens to build TRT engine with. self.max_num_tokens = max_num_tokens # List of batch sizes to run benchmark with. self.batch_sizes = batch_sizes # List of input lens to run benchmark with. self.input_lens = input_lens # List of output lens to run benchmark with. self.output_lens = output_lens # Number of beams. self.num_beams = num_beams # Number of loras. self.num_loras = num_loras # Number of requests. self.num_reqs = num_reqs # Number of concurrency self.concurrency = concurrency # Quantization type. self.quantization = quantization # KV cache free gpu mem fraction self.kv_cache_free_gpu_mem_fraction = kv_cache_free_gpu_mem_fraction # KV Cache dtype self.kv_cache_dtype = kv_cache_dtype # Multiple Profiles self.multiple_profiles = False # EP Size self.ep_size = ep_size # TP Size self.tp_size = tp_size # PP Size self.pp_size = pp_size # Number of GPUs. self.num_gpus = num_gpus # Extra flag to enable pytorch_model_config reading for TRT backend self.extra = extra # _autodeploy backend specific parameters self.ad_compile_backend = ad_compile_backend self.extra_runtime = extra_runtime self.skip_loading_weights = skip_loading_weights # Just build engines self.build_only = False # Whether to run disaggregated server perf test. 
self.is_disagg_server = False self.ctx_server_workers = 0 self.gen_server_workers = 0 def _to_string_disagg(self, entries: List[str]): entries.append(f"disagg_server") if self.ctx_tp_size > 1: entries.append(f"ctx_tp:{self.ctx_tp_size}") if self.ctx_dp_size > 1: entries.append(f"ctx_dp:{self.ctx_dp_size}") if self.ctx_pp_size > 1: entries.append(f"ctx_pp:{self.ctx_pp_size}") if self.gen_tp_size > 1: entries.append(f"gen_tp:{self.gen_tp_size}") if self.gen_dp_size > 1: entries.append(f"gen_dp:{self.gen_dp_size}") if self.gen_pp_size > 1: entries.append(f"gen_pp:{self.gen_pp_size}") return "-".join(entries) def to_string(self, custom_server_name: str = None, custom_client_name: str = None, custom_bs: int = None, custom_input_len: int = None, custom_output_len: int = None, device_subtype: str = None) -> str: # First, add the model name. entries = [self.model_name] # Add device subtype if provided (for autodeploy tests) if device_subtype: entries.append(f"subtype:{device_subtype}") if self.runtime == "cpp": # bertBenchmark runtime entries.append(f"cpp") elif self.runtime == "cppmanager": # gptManagerBenchmark runtime entries.append(f"cppmanager") if self.api == "exe": # executor entries.append(f"exe") if self.streaming == "streaming": entries.append(f"streaming") if self.static_batching == "static_batching": entries.append(f"static_batching") elif self.runtime == "bench": # trtllm-bench entries.append(f"bench") if self.backend == 'pytorch': entries.append(f"pytorch") elif self.backend == '_autodeploy': entries.append(f"_autodeploy") if self.streaming == "streaming": entries.append(f"streaming") elif self.runtime == "disagg_server": # trtllm-serve entries.append(f"disagg_server") return self._to_string_disagg(entries) # Add mode and dtype. if self.runtime != "bench": entries.append(self.mode) entries.append(self.data_type) if self.gpu_weights_percent != -1: entries.append(f"gwp:{self.gpu_weights_percent}") if self.multiple_profiles: entries.append(f"mp") # Add Max batch size. entries.append(f"maxbs:{self.max_batch_size}") # Add Max number of tokens. entries.append(f"maxnt:{self.max_num_tokens}") # Add kv cache free gpu mem fraction. if self.kv_cache_free_gpu_mem_fraction != 0.9: entries.append(f"kv_frac:{self.kv_cache_free_gpu_mem_fraction}") if self.build_only: entries.append(f"build_only") if self.batch_sizes[0] > 0: # Add batch size(s). if custom_bs is None: bs_label = "+".join([str(x) for x in self.batch_sizes]) else: bs_label = str(custom_bs) entries.append(f"bs:{bs_label}") # Add input/output lens. if len(self.output_lens) > 0: if custom_input_len is None: io_lens = [] for in_len, out_len in zip(self.input_lens, self.output_lens): io_lens.append(f"{in_len},{out_len}") io_len_label = "+".join(io_lens) else: assert custom_output_len is not None, \ "custom_output_len must be provided if custom_input_len is specified!" io_len_label = f"{custom_input_len},{custom_output_len}" entries.append(f"input_output_len:{io_len_label}") else: if custom_input_len is None: len_label = "+".join([str(x) for x in self.input_lens]) else: len_label = custom_input_len entries.append(f"input_len:{len_label}") # Add number of beams. if self.num_beams > 1: entries.append(f"beams:{self.num_beams}") # Add number of loras. if self.num_loras > 0: entries.append(f"loras:{self.num_loras}") # Add quantization type. if self.quantization != "": entries.append(f"quant:{self.quantization}") # Add kv cache dtype. if self.kv_cache_dtype != "auto": entries.append(f"kv_cache_dtype:{self.kv_cache_dtype}") # Add number of requests. 
if self.num_reqs != 512: entries.append(f"reqs:{self.num_reqs}") #Add number of concurrency if self.concurrency != -1: entries.append(f"con:{self.concurrency}") #Add EP Size. if self.ep_size != None: entries.append(f"ep:{self.ep_size}") # Add TP Size. if self.tp_size > 1 and self.tp_size != self.num_gpus: entries.append(f"tp:{self.tp_size}") # Add PP Size. if self.pp_size > 1: entries.append(f"pp:{self.pp_size}") # Add number of GPUs. if self.num_gpus > 1: entries.append(f"gpus:{self.num_gpus}") # Add extra flag for llm-api-config.yml. if self.extra: entries.append("extra") # Concatenate labels with "-". return "-".join(entries) def __str__(self) -> str: return self.to_string() def _load_from_str_disagg(self, labels: List[str]) -> None: self.ctx_tp_size = 1 self.ctx_dp_size = 1 self.ctx_pp_size = 1 self.gen_tp_size = 1 self.gen_dp_size = 1 self.gen_pp_size = 1 if labels[0].startswith("ctx_tp:"): self.ctx_tp_size = int(labels.pop(0).replace("ctx_tp:", "")) elif labels[0].startswith("ctx_dp:"): self.ctx_dp_size = int(labels.pop(0).replace("ctx_dp:", "")) elif labels[0].startswith("ctx_pp:"): self.ctx_pp_size = int(labels.pop(0).replace("ctx_pp:", "")) else: raise RuntimeError(f"Wrong label for ctx config: {labels[0]}!") if labels[0].startswith("gen_tp:"): self.gen_tp_size = int(labels.pop(0).replace("gen_tp:", "")) elif labels[0].startswith("gen_dp:"): self.gen_dp_size = int(labels.pop(0).replace("gen_dp:", "")) elif labels[0].startswith("gen_pp:"): self.gen_pp_size = int(labels.pop(0).replace("gen_pp:", "")) else: raise RuntimeError(f"Wrong label for gen config: {labels[0]}!") self.ctx_server_workers = self.ctx_tp_size * self.ctx_dp_size * self.ctx_pp_size self.gen_server_workers = self.gen_tp_size * self.gen_dp_size * self.gen_pp_size self.validate() def load_from_str(self, test_param_labels) -> None: """ Populate the config properties given the test param string. """ # Extract configs from test param labels. labels = test_param_labels.split("-") self.model_name = labels.pop(0) # Check if device subtype is present (for autodeploy tests) self.device_subtype = None if len(labels) > 0 and labels[0].startswith("subtype:"): self.device_subtype = labels.pop(0).replace("subtype:", "") assert labels[0] in ["cpp", "cppmanager", "bench", "disagg_server"], \ f"Invalid runtime {labels[0]}!" 
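        # Worked example (hypothetical label string, mirroring to_string()):
        #   "llama_v3.1_8b-bench-pytorch-maxbs:512-maxnt:2048-input_output_len:128,128-reqs:8-con:1"
        # parses as model_name="llama_v3.1_8b", runtime="bench",
        # backend="pytorch", max_batch_size=512, max_num_tokens=2048,
        # input_lens=[128], output_lens=[128], num_reqs=8, concurrency=1.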
self.runtime = labels.pop(0) if self.runtime == "disagg_server": return self._load_from_str_disagg(labels) self.api = labels.pop(0) if labels[0] == "exe" else "" self.backend = labels.pop(0) if labels[0] in ["pytorch", "_autodeploy" ] else "" self.streaming = labels.pop(0) if labels[0] == "streaming" else "" self.static_batching = labels.pop( 0) if labels[0] == "static_batching" else "" if self.runtime != "bench": self.mode = labels.pop(0) self.data_type = labels.pop(0) if labels[0].startswith("gwp"): self.gpu_weights_percent = float(labels.pop(0).replace("gwp:", "")) if labels[0] == "mp": self.multiple_profiles = True labels.pop(0) if labels[0].startswith("maxbs"): self.max_batch_size = int(labels.pop(0).replace("maxbs:", "")) if labels[0].startswith("maxnt"): self.max_num_tokens = int(labels.pop(0).replace("maxnt:", "")) if labels[0].startswith("kv_frac"): self.kv_cache_free_gpu_mem_fraction = float( labels.pop(0).replace("kv_frac:", "")) if labels[0] == "build_only": self.build_only = True labels.pop(0) if not self.build_only: if labels[0].startswith("bs:"): self.batch_sizes = [ int(x) for x in labels.pop(0).replace("bs:", "").split("+") ] else: self.batch_sizes = [0] if labels[0].startswith("input_output_len"): io_lens = labels.pop(0).replace("input_output_len:", "").split("+") self.input_lens = [int(x.split(",")[0]) for x in io_lens] self.output_lens = [int(x.split(",")[1]) for x in io_lens] elif labels[0].startswith("input_len"): self.input_lens = [ int(x) for x in labels.pop(0).replace("input_len:", "").split("+") ] self.output_lens = [] else: raise RuntimeError( f"Unexpected test name label for seq lens: {labels[0]}!") if len(labels) > 0: self.num_beams = 1 if not labels[0].startswith("beams:") else int( labels.pop(0).replace("beams:", "")) if len(labels) > 0: self.num_loras = 0 if not labels[0].startswith("loras:") else int( labels.pop(0).replace("loras:", "")) if len(labels) > 0: self.quantization = "" if not labels[0].startswith( "quant:") else labels.pop(0).replace("quant:", "") if len(labels) > 0: self.kv_cache_dtype = "auto" if not labels[0].startswith( "kv_cache_dtype:") else labels.pop(0).replace( "kv_cache_dtype:", "") if len(labels) > 0: self.num_reqs = 512 if not labels[0].startswith("reqs:") else int( labels.pop(0).replace("reqs:", "")) if len(labels) > 0: self.concurrency = -1 if not labels[0].startswith("con:") else int( labels.pop(0).replace("con:", "")) if len(labels) > 0: self.ep_size = None if not labels[0].startswith("ep:") else int( labels.pop(0).replace("ep:", "")) if len(labels) > 0: self.tp_size = 1 if not labels[0].startswith("tp:") else int( labels.pop(0).replace("tp:", "")) if len(labels) > 0: self.pp_size = 1 if not labels[0].startswith("pp:") else int( labels.pop(0).replace("pp:", "")) if len(labels) > 0: self.num_gpus = 1 if not labels[0].startswith("gpus:") else int( labels.pop(0).replace("gpus:", "")) if len(labels) > 0: self.extra = True if labels[0] == "extra" else False if self.extra: labels.pop(0) assert len( labels ) == 0, f"Invalid test name! Some labels cannot be parsed: {labels}" # Validate the parsed config. self.validate() def validate(self): """ Validate if the config makes sense. """ # Validate model name. assert len(self.model_name) > 0, "model_name must not be empty!" assert "-" not in self.model_name, "model_name must not contain '-' character!" 
        if (self.model_name not in MODEL_PATH_DICT
                and self.model_name not in HF_MODEL_PATH):
            allowed_configs = import_allowed_perf_config()
            allowed_models = allowed_configs.get_allowed_models()
            assert self.model_name in allowed_models, f"model_name {self.model_name} is not in allowed_models!"

        # Validate runtime type.
        VALID_RUNTIMES = ["cpp", "cppmanager", "bench", "disagg_server"]
        assert self.runtime in VALID_RUNTIMES, f"Invalid runtime {self.runtime}!"

        if self.runtime == "disagg_server":
            # TODO: validate disaggregated server config
            return

        # Validate plugin mode.
        VALID_MODES = ["plugin", "ootb", "ootb_except_mha"]
        if self.runtime == "cppmanager":
            VALID_MODES += ["plugin_ifb"]
        assert self.mode in VALID_MODES, f"Invalid mode {self.mode}!"

        # Validate dtype.
        VALID_DTYPES = ["float32", "float16", "bfloat16", "float8", "float4"]
        assert self.data_type in VALID_DTYPES, f"Invalid data_type {self.data_type}!"
        VALID_KV_CACHE_DTYPES = ["auto", "fp8"]
        assert self.kv_cache_dtype in VALID_KV_CACHE_DTYPES, f"Invalid kv_cache_dtype {self.kv_cache_dtype}!"

        # Validate quantization mode.
        if self.model_name in MODEL_PATH_DICT:
            VALID_QUANTS = [
                "", "nvfp4", "fp8", "int8", "int4_awq", "w4a8_awq",
                "w4a16_awq", "int4_wo", "full_prec"
            ]
        else:
            VALID_QUANTS = [
                "",
                "fp8",
                "fp8_gemm",
                "fp8_kv_cache",
                "int8_sq_per_tensor",
                "int8_sq_per_token_channel",
                "int8_weight_only",
                "int4_weight_only",
                "int4_weight_only_awq",
                "int4_weight_only_gptq",
            ]
        assert self.quantization in VALID_QUANTS, f"Invalid quantization {self.quantization}!"
        if self.backend == "pytorch":
            assert self.quantization == "", \
                f"Passing quantization {self.quantization} is not supported for the pytorch backend!"

        assert self.num_beams >= 1, f"Invalid num_beams: {self.num_beams}!"
        assert self.num_loras >= 0, f"Invalid num_loras: {self.num_loras}!"
        assert self.num_reqs >= 1, f"Invalid num_reqs: {self.num_reqs}!"
        if self.pp_size > 1:
            assert self.model_name in MODEL_PATH_DICT, \
                f"Invalid model name for pp size {self.pp_size} test"

        if self.num_gpus > 1 and self.tp_size == 1 and self.pp_size == 1:
            self.tp_size = self.num_gpus
        if (self.tp_size > 1 or self.pp_size > 1) and self.num_gpus == 1:
            self.num_gpus = self.tp_size * self.pp_size
        assert self.num_gpus == self.tp_size * self.pp_size, \
            f"Number of GPUs must equal TP*PP: {self.num_gpus}, {self.tp_size}, {self.pp_size}"

        if self.gpu_weights_percent != -1:
            assert 0 <= self.gpu_weights_percent <= 1, f"Invalid gpu_weights_percent: {self.gpu_weights_percent}!"

        if not self.build_only:
            assert len(self.input_lens) > 0, "Empty input_lens!"
            if self.is_bert_like():
                assert len(self.output_lens) == 0, \
                    "BERT-like models must not have output_lens!"
            else:
                assert len(self.output_lens) > 0, \
                    "GPT-like models and enc-dec models must have output_lens!"

            # BERT with small BS is very unstable. Try to avoid it.
            if self.is_bert_like():
                if self.runtime == "bench":
                    self.batch_sizes[0] = self.max_batch_size if self.max_batch_size > 0 else 1
                    print(f"batch_sizes: {self.batch_sizes}")
                assert all([b >= 32 for b in self.batch_sizes]), \
                    "BERT with small BS is very unstable! Please increase to at least 32."

            # GPT-350m and Bloom-560m with small BS are very unstable. Only run these small models with larger BS.
            if self.model_name in ["gpt_350m", "bloom_560m"]:
                assert all([b >= 32 for b in self.batch_sizes]), \
                    "gpt_350m and bloom_560m with small BS are very unstable! Please increase to at least 32."

    def get_model_family(self) -> str:
        """
        Get the model family of the current model.
""" allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self.model_name in allowed_models: return allowed_configs.get_model_family(self.model_name) else: return "" def is_mamba_family(self) -> bool: """ Check if the current model family is Mamba. """ return self.get_model_family() == 'mamba' def is_moe_family(self) -> bool: """ Check if the current model family is MoE. """ allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self.model_name in allowed_models: model_config = allowed_configs.get_model_config(self.model_name) return model_config['moe_num_experts'] > 0 and model_config[ 'moe_top_k'] > 0 else: return False def get_benchmark_type(self) -> str: """ Get the benchmark type of the current model. """ allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self.model_name in allowed_models: return allowed_configs.get_benchmark_type(self.model_name) else: return "" def is_bert_like(self) -> bool: """ Check if the current benchmark is a BERT benchmark. """ return self.get_benchmark_type() == "bert" def is_enc_dec(self) -> bool: """ Check if the current benchmark is a EncDec benchmark. """ return self.get_benchmark_type() == "enc_dec" class MultiMetricPerfTest(AbstractPerfScriptTestClass): """ Base class for perf tests with multiple metrics. """ def __init__(self, full_test_name: str): # full_test_name is the full test name appearing in test output. self._full_test_name = full_test_name # test_domain_name is the part before "::". self._test_domain_name = "::".join(full_test_name.split("::")[:-1]) # short_test_name is the part after "::". self._short_test_name = full_test_name.split("::")[-1] # short_test_name_body is the part before "[" in short_test_name. self._short_test_name_body = self._short_test_name.split("[")[0] # test_param_labels is the part inside "[...]". self._test_param_labels = full_test_name.split("[")[-1][:-1] # Load test config from test name. self._config = PerfTestConfig() self._config.load_from_str(self._test_param_labels) # This will store the currently running metric. self._current_metric = None self.lora_dirs = [] # This will store each test's result self._test_results = {} def get_test_name(self) -> str: return str(self._config) def set_runtime_configs(self, llm_root, working_dir, output_dir, perf_cache_fpath, gpu_clock_lock=None) -> None: if self._config.runtime == "cpp": if not self._config.is_bert_like(): raise ValueError( f"Invalid config: '{self._config.runtime}' is only supported for bert-like models!" 
) benchmark_script = get_cpp_benchmark("bertBenchmark", llm_root) elif self._config.runtime == "cppmanager": benchmark_script = get_cpp_benchmark("gptManagerBenchmark", llm_root) elif self._config.runtime == "bench": benchmark_script = "trtllm-bench" elif self._config.runtime == "disagg_server": benchmark_script = None else: raise RuntimeError(f"Invalid runtime {self._config.runtime}.") allowed_configs = import_allowed_perf_config() allowed_models = allowed_configs.get_allowed_models() if self._config.runtime == "bench": build_script = "trtllm-bench" elif self._config.runtime == "aggr_server": build_script = None elif self._config.runtime == "multi_node_disagg_server": build_script = None elif self._config.pp_size > 1 or self._config.model_name not in allowed_models: build_script = "trtllm-build" else: # build.py is used to build engines for both python and cpp runtime build_script = os.path.join(llm_root, "tests/integration/defs/perf/build.py") self._build_script = build_script self._benchmark_script = benchmark_script self._working_dir = working_dir self._output_dir = output_dir self._perf_cache_fpath = perf_cache_fpath self._llm_root = llm_root self._gpu_clock_lock = gpu_clock_lock def get_trtllm_build_command(self, engine_dir, checkpoint_dir) -> list: build_cmd = [ self._build_script, f"--output_dir={engine_dir}", f"--checkpoint_dir={checkpoint_dir}", f"--workers={self._config.tp_size}", f"--use_paged_context_fmha=enable", f"--monitor_memory", f"--max_batch_size={self._config.max_batch_size}" ] # For Multiple Profiles if self._config.multiple_profiles: build_cmd.append(f"--multiple_profiles=enable") else: build_cmd.append(f"--multiple_profiles=disable") num_beams = self._config.num_beams if num_beams > 1: build_cmd.append(f"--max_beam_width={num_beams}") gpu_percent = self._config.gpu_weights_percent if gpu_percent != -1: build_cmd += [f"--weight_streaming"] # For engine inspector build_cmd.append("--profiling_verbosity=layer_names_only") if self._config.num_loras > 0: if "mixtral" in self._config.model_name: build_cmd.append(f"--lora_plugin=auto") build_cmd.append(f"--moe_plugin=auto") build_cmd.append(f"--lora_target_modules") build_cmd.append(f"attn_q") build_cmd.append(f"attn_k") build_cmd.append(f"attn_v") build_cmd.append(f"attn_dense") build_cmd.append(f"moe_h_to_4h") build_cmd.append(f"moe_4h_to_h") build_cmd.append(f"moe_gate") build_cmd.append(f"moe_router") elif "llama" in self._config.model_name: build_cmd.append(f"--lora_plugin=float16") build_cmd.append(f"--lora_target_modules") build_cmd.append(f"attn_q") build_cmd.append(f"attn_k") build_cmd.append(f"attn_v") build_cmd.append(f"attn_dense") build_cmd.append(f"mlp_h_to_4h") build_cmd.append(f"mlp_4h_to_h") build_cmd.append(f"mlp_gate") if TIMING_CACHE_DIR and not self._config.build_only: timing_cache = os.path.join(TIMING_CACHE_DIR, "model.cache") build_cmd.append(f"--input_timing_cache={timing_cache}") build_cmd.append(f"--output_timing_cache={timing_cache}") return build_cmd def get_trtllm_bench_model(self): return get_model_dir(self._config.model_name) def get_trtllm_bench_build_command(self, engine_dir) -> list: model_dir = self.get_trtllm_bench_model() if model_dir == "": pytest.skip("Model Name is not supported by trtllm-bench") model_name = self._config.model_name if not model_name.endswith("_hf"): model_name = model_name + "_hf" hf_model_name = HF_MODEL_PATH.get(model_name, "") build_cmd = [ self._build_script, f"--log_level=info", f"--workspace={engine_dir}", f"--model={hf_model_name}", 
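            # Hedged illustration: for a hypothetical llama_v3.1_8b run with
            # tp=1/pp=1, the flags above and below flatten to roughly
            #   trtllm-bench --log_level=info --workspace=<engine_dir>
            #       --model=meta-llama/Llama-3.1-8B --model_path=<models_root>/llama-3.1-model/Meta-Llama-3.1-8B
            #       build --tp_size=1 --pp_size=1 --max_seq_len=<max isl+osl>
            #       --max_batch_size=512 --max_num_tokens=2048
            # (paths and sizes are assumptions, not captured output).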
f"--model_path={model_dir}", "build", f"--tp_size={self._config.tp_size}", f"--pp_size={self._config.pp_size}" ] max_seq_len = max(self._config.input_lens) + max( self._config.output_lens) build_cmd.append(f"--max_seq_len={max_seq_len}") # Add max_batch_size and max_num_tokens to ensure build matches runtime configuration # Note: trtllm-bench requires both to be specified together (option group constraint) assert self._config.max_batch_size > 0, f"max_batch_size must be > 0, got {self._config.max_batch_size}" assert self._config.max_num_tokens > 0, f"max_num_tokens must be > 0, got {self._config.max_num_tokens}" build_cmd.append(f"--max_batch_size={self._config.max_batch_size}") build_cmd.append(f"--max_num_tokens={self._config.max_num_tokens}") if self._config.quantization: build_cmd.append( f"--quantization={self._config.quantization.upper()}") if self._config.model_name in TRUST_REMOTE_CODE_MODELS: build_cmd.append(f"--trust_remote_code=True") return build_cmd def get_prepare_data_command(self, engine_dir, input_len, output_len) -> list: data_cmd = [] prepare_data_script = os.path.join(self._llm_root, "benchmarks", "cpp", "prepare_dataset.py") if self._config.model_name in MODEL_PATH_DICT.keys(): tokenizer_dir = os.path.join( llm_models_root(), MODEL_PATH_DICT[self._config.model_name]) elif self._config.model_name in HF_MODEL_PATH.keys(): tokenizer_dir = HF_MODEL_PATH[self._config.model_name] else: tokenizer_dir = os.path.join(llm_models_root(), "llama-models", "llama-7b-hf") if not os.path.exists(engine_dir): os.makedirs(engine_dir, exist_ok=True) if self._config.num_loras > 0: istdev = 16 ostdev = 24 nloras = self._config.num_loras dataset_path = os.path.join(engine_dir, "synthetic_data.json") if self._config.model_name in LORA_MODEL_PATH.keys( ) and self._config.backend == "pytorch" and self._config.runtime == "bench": actual_lora_paths = LORA_MODEL_PATH[self._config.model_name] if not isinstance(actual_lora_paths, list): actual_lora_paths = [actual_lora_paths] for i, actual_lora_path in enumerate(actual_lora_paths): if not actual_lora_path.startswith("/"): actual_lora_paths[i] = os.path.join( llm_models_root(), actual_lora_path) lora_dir = os.path.join(engine_dir, "loras") data_cmd += [f"mkdir -p {lora_dir}", ";"] if len(actual_lora_paths) != nloras: raise ValueError( f"Number of LoRA paths ({len(actual_lora_paths)}) does not match requested number of LoRAs ({nloras})" ) for i, lora_path in enumerate(actual_lora_paths): self.lora_dirs.append(f"{lora_dir}/{i}") data_cmd += [f"ln -sf {lora_path} {lora_dir}/{i}", ";"] data_cmd += [ "trtllm-bench", f"--model={tokenizer_dir}", "prepare-dataset", "--output", f"{dataset_path}", f"--rand-task-id 0 {nloras-1}", f"--lora-dir={lora_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] else: pytest.skip( f"LoRA config not supported for {self._config.model_name} with the current backend and runtime." 
) else: istdev = 0 ostdev = 0 dataset_path = os.path.join(engine_dir, "synthetic_data.json") if self._build_script == 'trtllm-bench': data_cmd += [ "trtllm-bench", f"--model={tokenizer_dir}", "prepare-dataset", "--output", f"{dataset_path}", "token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] else: data_cmd += [ "python3", prepare_data_script, f"--output={dataset_path}", f"--tokenizer={tokenizer_dir}", f"token-norm-dist", f"--num-requests={self._config.num_reqs}", f"--input-mean={input_len}", f"--output-mean={output_len}", f"--input-stdev={istdev}", f"--output-stdev={ostdev}" ] return data_cmd def get_trtllm_bench_command(self, engine_dir): model_dir = self.get_trtllm_bench_model() model_name = self._config.model_name dataset_path = os.path.join(engine_dir, "synthetic_data.json") report_path = os.path.join(engine_dir, "report.json") if not model_name.endswith("_hf"): model_name = model_name + "_hf" hf_model_name = HF_MODEL_PATH.get(model_name, "") tp_pp_str = f"tp_{self._config.tp_size}_pp_{self._config.pp_size}" engine_dir = os.path.join(engine_dir, hf_model_name, tp_pp_str) benchmark_cmd = [ self._benchmark_script, f"--model={model_name}", f"--model_path={model_dir}", "throughput", f"--dataset={dataset_path}", f"--max_batch_size={self._config.max_batch_size}", f"--max_num_tokens={self._config.max_num_tokens}", f"--report_json={report_path}", f"--kv_cache_free_gpu_mem_fraction={self._config.kv_cache_free_gpu_mem_fraction}", ] if self._config.backend == "pytorch": benchmark_cmd += ["--backend=pytorch"] elif self._config.backend == "_autodeploy": benchmark_cmd += ["--backend=_autodeploy"] else: benchmark_cmd += [ f"--backend=tensorrt", f"--engine_dir={engine_dir}" ] if self._config.num_reqs > 0: benchmark_cmd += [f"--num_requests={self._config.num_reqs}"] if self._config.concurrency != -1: benchmark_cmd += [f"--concurrency={self._config.concurrency}"] if self._config.ep_size != None: benchmark_cmd += [f"--ep={self._config.ep_size}"] if self._config.tp_size > 1: benchmark_cmd += [f"--tp={self._config.tp_size}"] if self._config.pp_size > 1: benchmark_cmd += [f"--pp={self._config.pp_size}"] if self._config.streaming == "streaming": benchmark_cmd += [f"--streaming"] if self._config.num_gpus > 1: benchmark_cmd += [f"--warmup={2 * self._config.num_gpus}"] #Add extra-llm-api-config.yml for pytorch backend and tensorrt backend with extra flag if self._config.backend == "pytorch" or (self._config.backend == "" and self._config.extra): pytorch_config_path = os.path.join(engine_dir, "extra-llm-api-config.yml") if not os.path.exists(pytorch_config_path): os.makedirs(os.path.dirname(pytorch_config_path), exist_ok=True) config = get_model_yaml_config(self._config.to_string(), lora_dirs=self.lora_dirs) if config: print_info(f"pytorch/TRT model config: {config}") with open(pytorch_config_path, 'w') as f: yaml.dump(config, f, default_flow_style=False) benchmark_cmd += [f"--config={pytorch_config_path}"] # If guided_decoding_backend is set, we need to initialize tokenizer if config.get('guided_decoding_backend') is not None: benchmark_cmd += ["--no_skip_tokenizer_init"] elif self._config.backend == "_autodeploy": autodeploy_config_path = os.path.join(engine_dir, "extra_llm_api_options.yaml") if not os.path.exists(autodeploy_config_path): os.makedirs(os.path.dirname(autodeploy_config_path), exist_ok=True) # Default autodeploy config autodeploy_config = { 'transforms': { 
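                # Hedged sketch: with these defaults, the yaml.dump call below
                # emits (keys sorted alphabetically) roughly:
                #   runtime: trtllm
                #   skip_loading_weights: false
                #   transforms:
                #     compile_model:
                #       backend: torch-opt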
                    'compile_model': {
                        'backend': self._config.ad_compile_backend
                    },
                },
                'runtime': self._config.extra_runtime,
                'skip_loading_weights': self._config.skip_loading_weights
            }

            # If model has a curated config, use it instead
            if self._config.model_name in AUTODEPLOY_MODEL_CONFIGS:
                config_file = os.path.join(
                    self._llm_root,
                    AUTODEPLOY_MODEL_CONFIGS[self._config.model_name])
                if os.path.exists(config_file):
                    with open(config_file, 'r') as f:
                        autodeploy_config = yaml.safe_load(f)

            print_info(f"_autodeploy model config: {autodeploy_config}")
            with open(autodeploy_config_path, 'w') as f:
                yaml.dump(autodeploy_config, f, default_flow_style=False)
            benchmark_cmd += [f"--config={autodeploy_config_path}"]

        # for sampler options
        sampler_options_path = os.path.join(engine_dir, "sampler_options.yml")
        if not os.path.exists(sampler_options_path):
            os.makedirs(os.path.dirname(sampler_options_path), exist_ok=True)
        sampler_config = get_sampler_options_config(self._config.to_string())
        if sampler_config:
            print_info(f"sampler options config: {sampler_config}")
            with open(sampler_options_path, 'w') as f:
                yaml.dump(sampler_config, f, default_flow_style=False)
            benchmark_cmd += [f"--sampler_options={sampler_options_path}"]

        return benchmark_cmd

    def get_commands(self):
        # Whether this is python or cpp runtime perf test.
        is_python = self._config.runtime == "python"
        num_gpus = self._config.num_gpus
        is_disagg = self._config.runtime == "disagg_server"
        if is_disagg:
            ctx_cmd, gen_cmd = self._get_disagg_worker_deploy_command()
            server_cmd = self._get_disagg_server_deploy_command()
            client_cmd = self._get_disagg_client_command()
            benchmark_cmd = self._get_disagg_benchmark_command()
            return PerfDisaggScriptTestCmds(ctx_cmd, gen_cmd, server_cmd,
                                            client_cmd, benchmark_cmd)

        if is_python and num_gpus > 1:
            # TODO: Fix https://nvbugs/4449875
            pytest.skip(
                "multi-gpu tests with the python runtime are skipped because of a hanging issue. See https://nvbugs/4449875"
            )
        if is_windows() and num_gpus > 1:
            pytest.skip(
                "multi-gpu is not supported on Windows yet, skipped for now")

        # Construct engine build command.
        engine_dir = self._get_engine_dir()
        build_cmd = []
        if self._config.runtime == "bench":
            if self._config.backend in ["pytorch", "_autodeploy"]:
                # Skip the engine build step for the pytorch and _autodeploy backends.
                pass
            else:
                build_cmd = self.get_trtllm_bench_build_command(engine_dir)
        else:
            pytest.skip("only the trtllm-bench runtime is supported for now")

        # Construct prepare synthetic data command
        data_cmds = []
        # Construct benchmark commands for each bs and seq len combination.
        benchmark_cmds = []
        for bs in self._config.batch_sizes:
            for len_idx, input_len in enumerate(self._config.input_lens):
                output_len = None if self._config.is_bert_like(
                ) else self._config.output_lens[len_idx]
                if self._config.runtime == "bench":
                    benchmark_cmd = self.get_trtllm_bench_command(engine_dir)
                else:
                    pytest.skip(
                        "only the trtllm-bench runtime is supported for now")
                benchmark_cmds.append(benchmark_cmd)

                data_cmd = self.get_prepare_data_command(
                    engine_dir, input_len, output_len)
                data_cmds.append(data_cmd)
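        # Hedged illustration of one resulting benchmark command for a pytorch
        # run (flags mirror get_trtllm_bench_command; values are assumptions):
        #   trtllm-bench --model=llama_v3.1_8b_hf --model_path=<models_root>/...
        #       throughput --dataset=<engine_dir>/synthetic_data.json
        #       --max_batch_size=512 --max_num_tokens=2048
        #       --report_json=<engine_dir>/report.json
        #       --kv_cache_free_gpu_mem_fraction=0.9 --backend=pytorch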
        # Construct MPI command.
        mpi_cmd = []
        if num_gpus > 1 and num_gpus <= 8 and self._config.runtime != "bench":
            if cpu_socket_count_gt_1():
                mpi_cmd = [
                    "mpirun", "--map-by", "socket", "-n", f"{num_gpus}",
                    "--allow-run-as-root"
                ]
            else:
                mpi_cmd = ["mpirun", "-n", f"{num_gpus}", "--allow-run-as-root"]

        if self._build_script == "trtllm-bench":
            return PerfBenchScriptTestCmds(data_cmds, build_cmd,
                                           benchmark_cmds, mpi_cmd, is_python)
        else:
            pytest.skip("only the trtllm-bench runtime is supported for now")

    def get_perf_result(self, outputs: Dict[int, str]) -> float:
        """
        Get perf metric result from test output logs.
        """
        metric = self._current_metric
        cmd_idx = metric.cmd_idx
        metric_name = metric.metric_name
        num_gpus = self._config.num_gpus

        # Make sure we have outputs.
        assert cmd_idx in outputs, f"Output log for command {cmd_idx} does not exist!"

        # Use all applicable regex patterns to go through the log from the N-th command, where N = cmd_idx.
        print_info(
            f"Searching for metric {metric_name} from output log of command {cmd_idx} ..."
        )
        regex_matches = [
            metric.metric_regex.search(line)
            for line in outputs[cmd_idx].split("\n")
        ]
        metric_values = []
        for match in regex_matches:
            if match:
                # Handle multiple capture groups - use the first non-None group
                value = None
                for i in range(1, len(match.groups()) + 1):
                    if match.group(i) is not None:
                        value = match.group(i)
                        break
                if value is not None:
                    metric_values.append(float(value))

        if len(metric_values) == 0:
            if self._build_script == "trtllm-bench" and self._config.num_gpus > 1 and metric.metric_type == PerfMetricType.BUILD_TIME:
                # The build step is skipped for multi-gpu tests. See https://nvbugspro.nvidia.com/bug/5210111
                print_info("skip building process for multi-gpu test")
                metric_values = [0.0]
            else:
                raise RuntimeError(
                    f"Cannot find perf result for {metric_name} from perf script logs!"
                )

        if metric.metric_type in BUILDER_METRICS and metric.metric_type != PerfMetricType.ENGINE_SIZE:
            # For enc-dec models, there are 2 builder perf metrics, so we add them up.
            if self._config.is_enc_dec():
                assert len(metric_values) == 2 * num_gpus, \
                    f"Enc-Dec models must have num of metrics 2*{num_gpus} but got {len(metric_values)}!"
                enc_metrics = metric_values[:num_gpus]
                dec_metrics = metric_values[num_gpus:]
                gather_function = sum
                # Measure BUILD_PEAK_CPU_MEMORY and BUILD_PEAK_GPU_MEMORY with max instead.
                if metric.metric_type in [
                        PerfMetricType.BUILD_PEAK_CPU_MEMORY,
                        PerfMetricType.BUILD_PEAK_GPU_MEMORY
                ]:
                    gather_function = max
                metric_values = [
                    gather_function([x, y])
                    for x, y in zip(enc_metrics, dec_metrics)
                ]
                print_info(
                    f"Combining enc builder_perf {enc_metrics} and dec builder_perf {dec_metrics} to {metric_values}."
                )
            # For other models, the number of builder metrics should equal the number of GPUs.
            elif self._build_script != "trtllm-build" and self._build_script != "trtllm-bench":
                assert len(metric_values) == num_gpus, \
                    f"num of metrics: {len(metric_values)} should match num_gpus: {num_gpus}"

        # Use max perf metrics across GPUs
        if len(metric_values) > 1:
            metric_value = max(metric_values)
            print_info(
                f"Use max value {metric_value} out of {metric_values} for perf metric {metric_name}."
) else: metric_value = metric_values[0] print_info( f"Use value {metric_value} for perf metric {metric_name}.") return metric_value def get_threshold(self) -> float: return self._current_metric.metric_threshold def get_absolute_threshold(self) -> float: return self._current_metric.metric_abs_threshold def get_metric_type(self) -> PerfMetricType: return self._current_metric.metric_type def run_metrics(self, llm_venv, gpu_clock_lock, session_data_writer, output_dir): """ Run through the commands and parse multiple perf metrics from the logs. """ #print info to separate cases self._current_cmd_idx = 0 metrics = self._get_metrics() commands = self.get_commands() outputs = {} result_states = {} errors = [] # Only trtllm-bench needs to prepare dataset first. if self._config.runtime == 'bench': print_info(f"Running command for generating dataset") outputs = self.run_ex(commands=commands, cmd_idx=self._current_cmd_idx, full_test_name="prepare_dataset", metric_type=None, venv=llm_venv, gpu_clock_lock=gpu_clock_lock, session_data_writer=session_data_writer, output_dir=output_dir, outputs=outputs, original_test_name="prepare_dataset") result_state = self.get_result_state() result_states[self._current_cmd_idx] = result_state if result_state != "valid": errors.append(self.get_error()) try: for metric in metrics: # Make sure that cmd_idx is in ascending order. assert metric.cmd_idx >= self._current_cmd_idx, "Command indices must be in ascending order!" self._current_cmd_idx = metric.cmd_idx self._current_metric = metric # If the same command has previously failed, do not run it again. if self._current_cmd_idx in result_states and result_states[ self._current_cmd_idx] == "failed": print_warning( f"Skipped running command for {metric.metric_name} since the previous run failed." ) continue # If engine build command already failed, do not run benchmark commands. if 0 in result_states and result_states[0] == "failed": print_warning( f"Skipped running command for {metric.metric_name} since the engine building command failed." ) continue # Run the command or reuse the existing output logs. print_info(f"Running command for {metric.metric_name}") outputs = self.run_ex( commands=commands, cmd_idx=self._current_cmd_idx, full_test_name=metric.metric_name, metric_type=metric.metric_type, venv=llm_venv, gpu_clock_lock=gpu_clock_lock, session_data_writer=session_data_writer, output_dir=output_dir, outputs=outputs, original_test_name=metric.original_test_name) # Save the result state. result_state = self.get_result_state() result_states[self._current_cmd_idx] = result_state if result_state != "valid": errors.append(self.get_error()) if self._current_cmd_idx in self._test_results: del self._test_results[self._current_cmd_idx] finally: # Clean up engine dir after use. shutil.rmtree(self._get_engine_dir(), ignore_errors=True) def add_myelin_time_pass_to(input_env): time_pass_flag = r" -time_pass=on" old_myelin_env = input_env.get("__LUNOWUD", "") if time_pass_flag not in old_myelin_env: input_env["__LUNOWUD"] = old_myelin_env + time_pass_flag return old_myelin_env old_llm_venv = add_myelin_time_pass_to(llm_venv._new_env) llm_venv._new_env["__LUNOWUD"] = old_llm_venv # Check if any commands failed. if not all([result_states[idx] == "valid" for idx in result_states]): # If there is only one error, throw it directly. if len(errors) == 1: raise errors[0] # Otherwise, combine all the error messages and re-raise a generic RuntimeError. 
msg = "Multiple Errors happened:\n" for error_idx, e in enumerate(errors): msg += f"> Error {error_idx+1}/{len(errors)}: {type(e).__name__}: {e}\n" raise RuntimeError(msg) def _get_engine_dir(self) -> str: """ Get the engine directory to store the engine. """ escaped_label = self._test_param_labels.replace("+", "_").replace( ":", "_").replace(",", "_") return os.path.join(self._working_dir, "perf_engines", escaped_label) def _get_metrics(self) -> List[PerfTestMetric]: """ Generate all the metric configs for the current test. """ metrics = [] if self._config.runtime == "disagg_server": for metric_type in DISAGG_SERVER_METRICS: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, metric_name=self._get_metric_name( metric_type=metric_type), metric_type=metric_type, metric_regex=self._get_metric_regex(metric_type), metric_threshold=self._get_metric_threshold( metric_type), metric_abs_threshold=self._get_metric_abs_threshold( metric_type), cmd_idx=0, )) return metrics # Build command is the first command. cmd_idx = 0 if self._config.runtime != "bench" else 1 if self._config.runtime == "bench": if self._config.backend in ["pytorch", "_autodeploy"]: print_info( f"Skip building process for {self._config.model_name} as it is {self._config.backend} backend" ) builder_metrics = [] else: builder_metrics = [PerfMetricType.BUILD_TIME] else: builder_metrics = BUILDER_METRICS.copy() # Add all builder_perf metrics for metric_type in builder_metrics: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, metric_name=self._get_metric_name(metric_type=metric_type), metric_type=metric_type, metric_regex=self._get_metric_regex(metric_type), metric_threshold=self._get_metric_threshold(metric_type), metric_abs_threshold=self._get_metric_abs_threshold( metric_type), cmd_idx=cmd_idx, )) if self._config.build_only: return metrics # Then, construct inference latency and gpu mem usage metrics, for each # bs and each seq len. for bs in self._config.batch_sizes: for len_idx, input_len in enumerate(self._config.input_lens): cmd_idx += 1 output_len = None if self._config.is_bert_like( ) else self._config.output_lens[len_idx] # Get list of metrics depending on config. if self._config.runtime == "bench": metric_types = BENCH_INFERENCE_METRICS.copy() if self._config.streaming == "streaming": metric_types.append(PerfMetricType.FIRST_TOKEN_TIME) metric_types.append(PerfMetricType.OUTPUT_TOKEN_TIME) else: metric_types = INFERENCE_METRICS.copy() for metric_type in metric_types: metrics.append( PerfTestMetric( original_test_name=self._full_test_name, metric_name=self._get_metric_name( metric_type=metric_type, bs=bs, input_len=input_len, output_len=output_len), metric_type=metric_type, metric_regex=self._get_metric_regex(metric_type), metric_threshold=self._get_metric_threshold( metric_type), metric_abs_threshold=self._get_metric_abs_threshold( metric_type), cmd_idx=cmd_idx, )) return metrics def _get_metric_name(self, metric_type: PerfMetricType, bs: int = None, input_len: int = None, output_len: int = None, server_name: str = None, client_name: str = None, disagg_config_name: str = None) -> str: """ Construct the metric name for given metric_type, bs, input_len, and output_len. 
""" # Get device subtype for autodeploy tests device_subtype = None if (hasattr(self, '_gpu_clock_lock') and self._gpu_clock_lock and self._config.backend == "_autodeploy"): device_subtype = self._gpu_clock_lock.get_device_subtype() if metric_type in BUILDER_METRICS: # We build one engine for all benchmark runs, so add all bs and seq lens to the metric name. metric_label = self._config.to_string(device_subtype=device_subtype) elif self._config.runtime == "aggr_server": metric_label = self._config.to_string( custom_server_name=server_name, custom_client_name=client_name, ) elif self._config.runtime == "multi_node_disagg_server": metric_label = self._config.to_string( custom_server_name=disagg_config_name) else: # Otherwise, generate per-bs and per-seqlen label. metric_label = self._config.to_string( custom_bs=bs, custom_input_len=input_len, custom_output_len=output_len, device_subtype=device_subtype, ) metric_name = f"test_perf_metric_{metric_type.lower()}" return self._test_domain_name + "::" + metric_name + "[" + metric_label + "]" def _get_metric_regex(self, metric_type: PerfMetricType) -> re.Pattern: """ Get the regex used to parse the metric result for the metric type. """ if self._config.runtime == "bench": if metric_type not in BENCH_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return BENCH_PERF_METRIC_LOG_QUERIES[metric_type] elif self._config.runtime == "aggr_server": if metric_type not in AGGR_SERVER_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return AGGR_SERVER_PERF_METRIC_LOG_QUERIES[metric_type] elif self._config.runtime == "multi_node_disagg_server": if metric_type not in AGGR_SERVER_PERF_METRIC_LOG_QUERIES: raise ValueError(f"Unexpected metric_type: {metric_type}") return AGGR_SERVER_PERF_METRIC_LOG_QUERIES[metric_type] else: pytest.skip("only support trtllm-bench runtime for now") def _get_metric_threshold(self, metric_type: PerfMetricType) -> float: """ Get the threshold for the metric type. """ if metric_type not in PERF_METRIC_THRESHOLD: raise ValueError(f"Unexpected metric_type: {metric_type}") return PERF_METRIC_THRESHOLD[metric_type][0] def _get_metric_abs_threshold(self, metric_type: PerfMetricType) -> float: """ Get the absolute threshold for the metric type. 
""" if metric_type not in PERF_METRIC_THRESHOLD: raise ValueError(f"Unexpected metric_type: {metric_type}") return PERF_METRIC_THRESHOLD[metric_type][1] def _gen_disagg_worker_config(self): ctx_config = { 'max_batch_size': 32, 'max_num_tokens': 4096, 'max_seq_len': 4096, 'tensor_parallel_size': self._config.ctx_tp_size, 'enable_attention_dp': self._config.ctx_dp_size > 1, 'print_iter_log': True, 'disable_overlap_scheduler': True, 'kv_cache_config': { 'enable_block_reuse': False, # 'free_gpu_memory_fraction': ctx_free_gpu_memory_fraction, 'free_gpu_memory_fraction': 0.5, 'dtype': 'fp8', }, 'disable_overlap_scheduler': True, 'cache_transceiver_config': { # 'max_tokens_in_buffer': cache_transceiver_max_num_tokens, 'max_tokens_in_buffer': 4096, 'backend': 'DEFAULT', }, } gen_config = { 'tensor_parallel_size': self._config.gen_tp_size, 'enable_attention_dp': self._config.gen_dp_size > 1, 'pipeline_parallel_size': self._config.gen_pp_size, 'max_batch_size': 32, 'max_num_tokens': 4096, 'max_seq_len': 4096, 'cuda_graph_config': { 'enable_padding': True, 'batch_sizes': [1, 2, 4, 8, 16, 32], }, 'print_iter_log': True, 'kv_cache_config': { 'enable_block_reuse': False, 'free_gpu_memory_fraction': 0.5, 'dtype': 'fp8', }, 'cache_transceiver_config': { 'max_tokens_in_buffer': 4096, 'backend': 'DEFAULT', }, } return ctx_config, gen_config def _gen_disagg_server_config(self): server_config = { 'hostname': 'localhost', 'port': 8000, 'backend': 'pytorch', 'context_servers': { 'num_instances': 1, 'urls': ['localhost:8001'] }, 'generation_servers': { 'num_instances': 1, 'urls': ['localhost:8002'] } } return server_config def _get_disagg_worker_deploy_command(self): ctx_config, gen_config = self._gen_disagg_worker_config() ctx_config_path = os.path.join(self._working_dir, "ctx_config.yaml") gen_config_path = os.path.join(self._working_dir, "gen_config.yaml") with open(ctx_config_path, 'w', encoding='utf-8') as f: yaml.dump(ctx_config, f) with open(gen_config_path, 'w', encoding='utf-8') as f: yaml.dump(gen_config, f) print_info(f"ctx_server_config: {ctx_config}") print_info(f"gen_server_config: {gen_config}") model_path = MODEL_PATH_DICT[self._config.model_name] model_dir = os.path.join(llm_models_root(), model_path) ctx_gpu_list = ",".join( [str(i) for i in range(self._config.ctx_server_workers)]) gen_gpu_list = ",".join([ str(i) for i in range( self._config.ctx_server_workers, self._config.ctx_server_workers + self._config.gen_server_workers) ]) ctx_cmd = f'CUDA_VISIBLE_DEVICES={ctx_gpu_list} trtllm-serve {model_dir} --host localhost --port 8001 --config {ctx_config_path}' gen_cmd = f'CUDA_VISIBLE_DEVICES={gen_gpu_list} trtllm-serve {model_dir} --host localhost --port 8002 --config {gen_config_path}' return ctx_cmd, gen_cmd def _get_disagg_server_deploy_command(self): server_config = self._gen_disagg_server_config() server_config_path = os.path.join(self._working_dir, "server_config.yaml") with open(server_config_path, 'w', encoding='utf-8') as f: yaml.dump(server_config, f) return f'trtllm-serve disaggregated -c {server_config_path} -t 3600 -r 3600' def _get_disagg_client_command(self): client_dir = os.path.join(self._llm_root, "examples/disaggregated/clients") client_cmd = [ 'python3', f'{client_dir}/disagg_client.py', '-c', f'{self._working_dir}/server_config.yaml', '-p', f'{client_dir}/prompts.json', '--ignore-eos', '--server-start-timeout', str(3600) ] return client_cmd def _get_disagg_benchmark_command(self): benchmark_script = os.path.join(self._llm_root, "tensorrt_llm", "serve", "scripts", 
"benchmark_serving.py") model_path = MODEL_PATH_DICT[self._config.model_name] model_dir = os.path.join(llm_models_root(), model_path) shared_gpt_path = os.path.join( llm_models_root(), "datasets", "ShareGPT_V3_unfiltered_cleaned_split.json") benchmark_cmd = [ 'python3', benchmark_script, '--model', model_dir, '--tokenizer', model_dir, '--dataset-name', 'random', '--dataset-path', shared_gpt_path, '--random-input-len', '1024', '--random-output-len', '1024', '--random-prefix-len', '0', '--num-prompts', '320', '--max-concurrency', '32', '--host', 'localhost', '--port', '8000', '--ignore-eos', '--no-test-input', '--percentile-metrics', 'e2el,ttft', ] return benchmark_cmd def run_perf_test(perf_case_name, trt_performance_cache_fpath, trt_gpu_clock_lock, llm_session_data_writer, output_dir, llm_venv, llm_root): """ The actual test definition for TensorRT LLM perf test. """ working_dir = llm_venv.get_working_directory() test_runner = MultiMetricPerfTest(perf_case_name) test_runner.set_runtime_configs(llm_root, working_dir, output_dir, trt_performance_cache_fpath, trt_gpu_clock_lock) test_runner.run_metrics(llm_venv, trt_gpu_clock_lock, llm_session_data_writer, output_dir) def generate_perf_tests(session, config, items): """ Generate all the perf tests based on test lists to speed up the test collection time. """ print_info(f"Dynamically generating perf tests...") valid_prefixes = [ "perf/test_perf.py::test_perf[", # TRT pipeline adds "llm/" prefix, so include it so that TRT-LLM perf tests can run in TRT pipelines. "llm/perf/test_perf.py::test_perf[", ] items = generate_test_nodes(session, config, items, valid_prefixes, run_perf_test) print_info(f"Completed generating perf tests.") return items