[TRTLLM-9834][feat] Transfer to TRTLLM-INFRA Database and Fail post-merge tests if regression (#10282)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
chenfeiz0326 2025-12-31 21:44:59 +08:00 committed by GitHub
parent 464847c6be
commit a23c6f1092
24 changed files with 1995 additions and 1568 deletions

View File

@ -893,7 +893,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Create a unique suffix for the job name
String customSuffix = "${env.BUILD_TAG}-${UUID.randomUUID().toString().replaceAll("-", "").substring(0, 6)}".toLowerCase()
def jobUID = "${cluster.host}-multi_node_test-${customSuffix}"
def disaggMode = stageName.contains("Perf-Sanity-Disagg")
def perfSanityMode = stageName.contains("PerfSanity")
def disaggMode = stageName.contains("PerfSanity-Disagg")
def setSegment = disaggMode
Utils.exec(pipeline, script: "env | sort && pwd && ls -alh")
@ -938,6 +939,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
def scriptExecPathLocal = Utils.createTempLocation(pipeline, "./slurm_exec.sh")
def scriptExecPathNode = "${jobWorkspace}/${jobUID}-slurm_exec.sh"
def coverageConfigFile = "${jobWorkspace}/.coveragerc"
def perfCheckScriptLocal = "${llmSrcLocal}/tests/integration/defs/perf/perf_regression_check.py"
def perfCheckScriptNode = "${jobWorkspace}/${jobUID}-perf_regression_check.py"
stage("[${stageName}] Initializing Test") {
// Create Job Workspace folder in Frontend Node
@ -1020,6 +1023,16 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
coverageConfigFile
)
if (perfSanityMode) {
Utils.copyFileToRemoteHost(
pipeline,
remote,
perfCheckScriptLocal,
perfCheckScriptNode,
true
)
}
// Generate Pytest command
String pytestUtil = ""
if (nodeCount > 1) {
@ -1094,7 +1107,8 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
// Define environment variables to export
def envVarNames = [
'OPEN_SEARCH_DB_BASE_URL',
'OPEN_SEARCH_DB_CREDENTIALS',
'OPEN_SEARCH_DB_CREDENTIALS_USR',
'OPEN_SEARCH_DB_CREDENTIALS_PSW',
'BUILD_ID',
'BUILD_URL',
'JOB_NAME',
@ -1300,6 +1314,22 @@ def runLLMTestlistWithSbatch(pipeline, platform, testList, config=VANILLA_CONFIG
),
numRetries: 3
)
if (perfSanityMode) {
stage("[${stageName}] Check perf result") {
def perfCheckResult = Utils.exec(
pipeline,
script: Utils.sshUserCmd(
remote,
"python3 ${perfCheckScriptNode} ${jobWorkspace}"
),
returnStatus: true
)
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}
echo "Finished test stage execution."
@ -2785,7 +2815,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
error "Some tests still failed after rerun attempts, please check the test report."
}
if (perfMode && !stageName.contains("Perf-Sanity")) {
if (perfMode) {
basePerfFilename = stageName.contains("PyTorch") ? "base_perf_pytorch.csv" : "base_perf.csv"
basePerfPath = "${llmSrc}/tests/integration/defs/perf/${basePerfFilename}"
stage("Check perf result") {
@ -2811,7 +2841,7 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
}
}
if (perfMode && stageName.contains("Perf-Sanity")) {
if (stageName.contains("PerfSanity")) {
stage ("Check perf result") {
def perfCheckResult = sh(
script: """
@ -2820,10 +2850,9 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
""",
returnStatus: true
)
// TODO: Enable this when perf regression check is stable
// if (perfCheckResult != 0) {
// error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
// }
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}
@ -3187,7 +3216,7 @@ def launchTestJobs(pipeline, testFilter)
"RTXPro6000D-4_GPUs-PyTorch-Post-Merge-2": ["rtx-pro-6000d-x4", "l0_rtx_pro_6000", 2, 2, 4],
]
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("-Perf-")), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
@ -3198,7 +3227,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("Pybind")) {
config = PYBIND_CONFIG
}
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3])
runLLMTestlistOnPlatform(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3])
}]]}
fullSet = parallelJobs.keySet()
@ -3219,9 +3248,12 @@ def launchTestJobs(pipeline, testFilter)
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
"DGX_B300-4_GPUs-PyTorch-Post-Merge-2": ["b300-x4", "l0_dgx_b300", 2, 2, 4],
// Perf sanity post merge test
// "DGX_B200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 1, 4],
// "DGX_B200-8_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 1, 8],
// "DGX_B300-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["b300-x4", "l0_dgx_b300_perf_sanity", 1, 1, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x4", "l0_dgx_b200_perf_sanity", 1, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x4", "l0_dgx_b200_perf_sanity", 2, 3, 4],
// "DGX_B200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x4", "l0_dgx_b200_perf_sanity", 3, 3, 4],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["b200-x8", "l0_dgx_b200_perf_sanity", 1, 3, 8],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["b200-x8", "l0_dgx_b200_perf_sanity", 2, 3, 8],
// "DGX_B200-8_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["b200-x8", "l0_dgx_b200_perf_sanity", 3, 3, 8],
]
fullSet += x86SlurmTestConfigs.keySet()
@ -3233,7 +3265,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
}]]}
parallelJobs += parallelSlurmJobs
@ -3252,11 +3284,30 @@ def launchTestJobs(pipeline, testFilter)
"GB200-4_GPUs-PyTorch-2": ["gb200-x4-oci", "l0_gb200_multi_gpus", 2, 2, 4],
"GB200-4_GPUs-PyTorch-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus", 1, 1, 4],
"GB10-PyTorch-Post-Merge-1": ["gb10x-single", "l0_gb10", 1, 1],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 1, 4],
// Disable GB300 stages because the nodes will be offline temporarily.
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// Perf sanity pre merge test
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
// Perf sanity post merge test
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
]
fullSet += SBSASlurmTestConfigs.keySet()
@ -3268,13 +3319,15 @@ def launchTestJobs(pipeline, testFilter)
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 1, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 2, 3, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-Post-Merge-3": ["gb200-oci-trtllm", "l0_gb200_multi_nodes", 3, 3, 8, 2],
// Perf sanity post merge aggr tests
"GB200-8_GPUs-2_Nodes-PyTorch-Perf-Sanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001", 1, 1, 8, 2],
// Perf sanity post merge disagg tests
"GB200-12_GPUs-3_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001", 1, 1, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002", 1, 1, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-Perf-Sanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001", 1, 1, 32, 8],
// Perf sanity pre merge tests
// "GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// Perf sanity post merge tests
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 1, 2, 8, 2],
"GB200-8_GPUs-2_Nodes-PyTorch-PerfSanity-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes", 2, 2, 8, 2],
"GB200-12_GPUs-3_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes", 1, 1, 12, 3],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 1, 2, 24, 6],
// "GB200-24_GPUs-6_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-2": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes", 2, 2, 24, 6],
// "GB200-32_GPUs-8_Nodes-PyTorch-PerfSanity-Disagg-Post-Merge-1": ["gb200-oci-trtllm", "l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes", 1, 1, 32, 8],
]
fullSet += multiNodesSBSAConfigs.keySet()
@ -3292,7 +3345,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 1, values[6] ?: false)
}]]}
parallelJobs += parallelSlurmJobs
@ -3305,7 +3358,7 @@ def launchTestJobs(pipeline, testFilter)
if (key.contains("llvm")) {
config = LLVM_CONFIG
}
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("Perf"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
runLLMTestlistOnSlurm(pipeline, values[0], values[1], config, key.contains("-Perf-"), key, values[2], values[3], values[4] ?: 1, values[5] ?: 2, values[6] ?: false)
}]]}
parallelJobs += parallelMultiNodesSBSAJobs

View File

@ -51,6 +51,7 @@ TEST_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-test_info"
JOB_MACHINE_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-job_machine_info"
FAILED_STEP_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-failed_step_info"
PR_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-pr_info"
PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"
READ_ACCESS_PROJECT_NAME = [
JOB_PROJECT_NAME,
@ -59,9 +60,12 @@ READ_ACCESS_PROJECT_NAME = [
JOB_MACHINE_PROJECT_NAME,
FAILED_STEP_PROJECT_NAME,
PR_PROJECT_NAME,
PERF_SANITY_PROJECT_NAME,
]
WRITE_ACCESS_PROJECT_NAME = []
WRITE_ACCESS_PROJECT_NAME = [
PERF_SANITY_PROJECT_NAME,
]
DISABLE_OPEN_SEARCH_DB_FOR_LOCAL_TEST = False
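For reference, the new index name follows the same naming pattern as the existing `*_PROJECT_NAME` constants. A minimal sketch, assuming hypothetical values for `PROJECT_ROOT` and `MODE` (their real values are defined earlier in `open_search_db.py` and are not shown in this diff):

```python
# Hypothetical values for illustration only.
PROJECT_ROOT = "trtllm"
MODE = "prod"

PERF_SANITY_PROJECT_NAME = f"{PROJECT_ROOT}-ci-{MODE}-perf_sanity_info"
print(PERF_SANITY_PROJECT_NAME)  # -> trtllm-ci-prod-perf_sanity_info
```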

View File

@ -108,7 +108,7 @@ eval $pytestCommand
pytest_exit_code=$?
echo "Rank${SLURM_PROCID} Pytest finished execution with exit code $pytest_exit_code"
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Perf-Sanity* ]]; then
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
if [[ "$stageName" == *PyTorch* ]]; then
basePerfFilename="base_perf_pytorch.csv"
else
@ -135,14 +135,6 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" != *Pe
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi
if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ] && [[ "$stageName" == *Perf-Sanity* ]]; then
echo "Check Perf-Sanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} Perf-Sanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then
final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then

View File

@ -29,12 +29,14 @@ _project_root = os.path.abspath(
os.path.join(os.path.dirname(__file__), '../../../..'))
if _project_root not in sys.path:
sys.path.insert(0, _project_root)
from jenkins.scripts.open_search_db import OpenSearchDB
from jenkins.scripts.open_search_db import (PERF_SANITY_PROJECT_NAME,
OpenSearchDB)
PROJECT_ROOT = "sandbox-temp-trtllm-ci-perf-v1" # "sandbox-trtllm-ci-perf"
TEST_INFO_PROJECT_NAME = f"{PROJECT_ROOT}-test_info"
PRE_MERGE_THRESHOLD = 0.1
POST_MERGE_THRESHOLD = 0.05
POC_PROJECT_NAME = "sandbox-temp-trtllm-ci-perf-v1-test_info"
USE_POC_DB = os.environ.get("USE_POC_DB", "false").lower() == "true"
TEST_INFO_PROJECT_NAME = POC_PROJECT_NAME if USE_POC_DB else PERF_SANITY_PROJECT_NAME
MAX_QUERY_SIZE = 5000
QUERY_LOOKBACK_DAYS = 90
# Metrics where larger is better
MAXIMIZE_METRICS = [
@ -67,7 +69,6 @@ MINIMIZE_METRICS = [
SCENARIO_MATCH_FIELDS = [
"s_runtime",
"s_model_name",
"s_gpu_type",
"l_isl",
"l_osl",
"l_concurrency",
@ -178,49 +179,85 @@ def get_job_info():
}
def query_history_data(gpu_type):
def get_common_values(new_data_dict, match_keys):
"""
Query post-merge data with specific gpu type and model name
Find keys from match_keys where all data entries in new_data_dict have identical values.
Returns a dict with those common key-value pairs.
A key is skipped if any entry is missing it or has a None value.
"""
# Query data from the last 14 days
last_days = 14
if not new_data_dict or not match_keys:
return {}
data_list = list(new_data_dict.values())
if not data_list:
return {}
common_values_dict = {}
for key in match_keys:
# Collect non-None, non-empty values for this key
values = []
for data in data_list:
if key in data and data[key] is not None:
values.append(data[key])
# Skip this key if any entry is missing it or has a None value
if len(values) != len(data_list):
continue
# Check if all valid values are identical
first_value = values[0]
if all(v == first_value for v in values):
common_values_dict[key] = first_value
return common_values_dict
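As an illustration of the helper above, a key is kept only when every entry carries the same non-None value for it; a small sketch with made-up field values (the field names mirror `SCENARIO_MATCH_FIELDS`):

```python
# Illustrative input only; the values are invented for this example.
new_data_dict = {
    0: {"s_runtime": "bench", "s_model_name": "deepseek_r1_fp4_v2", "l_isl": 1024},
    1: {"s_runtime": "bench", "s_model_name": "gpt_oss_120b_fp4", "l_isl": 1024},
}
match_keys = ["s_runtime", "s_model_name", "l_isl"]

# get_common_values(new_data_dict, match_keys) returns
# {"s_runtime": "bench", "l_isl": 1024}
# because s_model_name differs between the two entries and is therefore dropped.
```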
def query_history_data(common_values_dict):
"""
Query post-merge data with common values to narrow down scope.
"""
# Query data from the last 90 days
last_days = QUERY_LOOKBACK_DAYS
# Build must clauses with base filters
must_clauses = [
{
"term": {
"b_is_valid": True
}
},
{
"term": {
"b_is_post_merge": True
}
},
{
"term": {
"b_is_regression": False
}
},
{
"range": {
"ts_created": {
"gte":
int(time.time() - 24 * 3600 * last_days) // (24 * 3600) *
24 * 3600 * 1000,
}
}
},
]
# Add common values as term filters to narrow down the query
for key, value in common_values_dict.items():
must_clauses.append({"term": {key: value}})
json_data = {
"query": {
"bool": {
"must": [
{
"term": {
"b_is_valid": True
}
},
{
"term": {
"b_is_post_merge": True
}
},
{
"term": {
"b_is_regression": False
}
},
{
"term": {
"s_gpu_type": gpu_type
}
},
{
"range": {
"ts_created": {
"gte":
int(time.time() - 24 * 3600 * last_days) //
(24 * 3600) * 24 * 3600 * 1000,
}
}
},
]
"must": must_clauses
},
},
"size": 3000,
"size": MAX_QUERY_SIZE,
}
json_data = json.dumps(json_data)
@ -233,13 +270,13 @@ def query_history_data(gpu_type):
print_info(
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned no response"
)
return []
return None
else:
payload = res.json().get("hits", {}).get("hits", [])
if len(payload) == 0:
# No history data found in database, return empty list
print_info(
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned no data"
f"No history data found in {TEST_INFO_PROJECT_NAME}, returned empty list"
)
return []
for hit in payload:
@ -250,7 +287,7 @@ def query_history_data(gpu_type):
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned data with no _id"
)
# Invalid data, return None
return []
return None
data_list.append(data_dict)
print_info(
f"Successfully query from {TEST_INFO_PROJECT_NAME}, queried {len(data_list)} entries"
@ -259,7 +296,7 @@ def query_history_data(gpu_type):
except Exception as e:
print_info(
f"Fail to query from {TEST_INFO_PROJECT_NAME}, returned error: {e}")
return []
return None
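The function therefore has a three-valued contract, which `get_history_data` below relies on. A short sketch of how a caller distinguishes the cases (the common-values argument is just an example):

```python
# Sketch only; query_history_data is the function defined above.
history = query_history_data({"s_runtime": "bench"})
if history is None:
    print("query failed; regression gating will be skipped")
elif not history:
    print("query succeeded but no post-merge history exists yet")
else:
    print(f"queried {len(history)} history entries")
```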
def match(history_data, new_data, match_keys):
@ -329,7 +366,7 @@ def calculate_best_perf_result(history_data_list, new_data):
return best_metrics
def get_history_data(new_data_dict, gpu_type, match_keys):
def get_history_data(new_data_dict, match_keys, common_values_dict):
"""
Query history post-merge data for each cmd_idx
"""
@ -371,15 +408,23 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
key=lambda x: parse_timestamp(x.get("@timestamp", 0)))
return latest_data
cmd_idxs = new_data_dict.keys()
history_data_list = None
if cmd_idxs:
history_data_list = query_history_data(common_values_dict)
# If query_history_data returned None, it means network failure
if history_data_list is None:
return None, None
# Query was successful (even if empty list), initialize dicts
history_baseline_dict = {}
history_data_dict = {}
cmd_idxs = new_data_dict.keys()
for cmd_idx in cmd_idxs:
history_data_dict[cmd_idx] = []
history_baseline_dict[cmd_idx] = []
history_data_list = []
if cmd_idxs:
history_data_list = query_history_data(gpu_type)
# Process history data if we have any
if history_data_list:
for history_data in history_data_list:
for cmd_idx in cmd_idxs:
@ -390,7 +435,9 @@ def get_history_data(new_data_dict, gpu_type, match_keys):
else:
history_data_dict[cmd_idx].append(history_data)
break
# Sometimes the database has several baselines; we only use the latest one
# If list is empty, set to None for each cmd_idx
for cmd_idx, baseline_list in history_baseline_dict.items():
latest_baseline = get_latest_data(baseline_list)
history_baseline_dict[cmd_idx] = latest_baseline
@ -430,24 +477,27 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
2. For Minimize metrics, if new perf is above baseline * (1 + threshold)
Set it as regressive.
"""
# If history_baseline_dict is None (network failure), skip regression check
if history_baseline_dict is None:
return []
regressive_data_list = []
cmd_idxs = new_data_dict.keys()
# Find regressive test cases
for cmd_idx in cmd_idxs:
for cmd_idx in new_data_dict:
if history_baseline_dict[cmd_idx] is None:
continue
baseline_data = history_baseline_dict[cmd_idx]
history_baseline = history_baseline_dict[cmd_idx]
new_data = new_data_dict[cmd_idx]
is_regressive = False
regressive_metrics = []
# Check MAXIMIZE_METRICS (new should be >= baseline * (1 - threshold))
for metric in MAXIMIZE_METRICS:
if metric not in new_data or metric not in baseline_data:
if metric not in new_data or metric not in history_baseline:
continue
threshold = get_threshold(baseline_data, metric)
baseline_value = baseline_data[metric]
threshold = get_threshold(history_baseline, metric)
baseline_value = history_baseline[metric]
new_value = new_data[metric]
# Regressive if new_value < baseline_value * (1 - threshold)
if new_value < baseline_value * (1 - threshold):
@ -456,10 +506,10 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
# Check MINIMIZE_METRICS (new should be <= baseline * (1 + threshold))
for metric in MINIMIZE_METRICS:
if metric not in new_data or metric not in baseline_data:
if metric not in new_data or metric not in history_baseline:
continue
threshold = get_threshold(baseline_data, metric)
baseline_value = baseline_data[metric]
threshold = get_threshold(history_baseline, metric)
baseline_value = history_baseline[metric]
new_value = new_data[metric]
# Regressive if new_value > baseline_value * (1 + threshold)
if new_value > baseline_value * (1 + threshold):
@ -471,9 +521,9 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
regressive_data = new_data.copy()
# Add baseline values and thresholds for all metrics
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
if metric in baseline_data:
if metric in history_baseline:
baseline_key = f"d_baseline_{metric[2:]}"
regressive_data[baseline_key] = baseline_data[metric]
regressive_data[baseline_key] = history_baseline[metric]
# Copy all threshold keys from baseline
metric_suffix = metric[2:]
@ -482,8 +532,8 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
f"d_threshold_post_merge_{metric_suffix}",
f"d_threshold_pre_merge_{metric_suffix}"
]:
if threshold_key in baseline_data:
regressive_data[threshold_key] = baseline_data[
if threshold_key in history_baseline:
regressive_data[threshold_key] = history_baseline[
threshold_key]
# Add regression info string
@ -495,11 +545,24 @@ def prepare_regressive_test_cases(history_baseline_dict, new_data_dict):
return regressive_data_list
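A worked example of the regression rule for a maximize metric, using the default post-merge threshold defined in this file (the metric name and numbers are illustrative):

```python
# Illustrative only; d_output_token_throughput is assumed to be one of the
# MAXIMIZE_METRICS ("larger is better").
baseline_value = 1000.0   # value stored in the history baseline
threshold = 0.05          # POST_MERGE_THRESHOLD
new_value = 930.0         # value measured by this run

# Maximize metric: regressive if the new value drops below baseline * (1 - threshold).
is_regressive = new_value < baseline_value * (1 - threshold)
print(is_regressive)  # True, since 930.0 < 950.0
```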
def prepare_baseline_data(history_data_dict, new_data_dict):
def _is_valid_baseline(baseline_data):
"""Check if baseline data is valid (non-empty dict)."""
if isinstance(baseline_data, dict) and len(baseline_data) > 0:
return True
return False
def prepare_baseline_data(history_baseline_dict, history_data_dict,
new_data_dict):
"""
Calculate new baseline from history post-merge data and new data.
Then return new baseline data.
"""
# If history_baseline_dict and history_data_dict are both None (network failure),
# return an empty dict so no baseline data is prepared
if history_baseline_dict is None and history_data_dict is None:
return {}
new_baseline_data_dict = {}
cmd_idxs = new_data_dict.keys()
# Find the best history post-merge data for each cmd
@ -507,18 +570,42 @@ def prepare_baseline_data(history_data_dict, new_data_dict):
# Calculate best metrics from history post-merge data and new data
best_metrics = calculate_best_perf_result(history_data_dict[cmd_idx],
new_data_dict[cmd_idx])
# Create new_baseline_data from new_data_dict and set b_is_baseline
new_baseline_data = new_data_dict[cmd_idx].copy()
new_baseline_data["b_is_baseline"] = True
# Add or update baseline metrics and thresholds
for metric, value in best_metrics.items():
new_baseline_data[metric] = value
# Initialize metric_threshold_dict with default thresholds for all metrics
metric_threshold_dict = {}
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:]
post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
new_baseline_data[post_merge_key] = new_baseline_data.get(
post_merge_key, POST_MERGE_THRESHOLD)
new_baseline_data[pre_merge_key] = new_baseline_data.get(
pre_merge_key, PRE_MERGE_THRESHOLD)
metric_threshold_dict[post_merge_key] = POST_MERGE_THRESHOLD
metric_threshold_dict[pre_merge_key] = PRE_MERGE_THRESHOLD
# If history baseline is valid, extract thresholds and update metric_threshold_dict
history_baseline = history_baseline_dict[cmd_idx]
if _is_valid_baseline(history_baseline):
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:]
post_merge_key = f"d_threshold_post_merge_{metric_suffix}"
pre_merge_key = f"d_threshold_pre_merge_{metric_suffix}"
if post_merge_key in history_baseline:
metric_threshold_dict[post_merge_key] = history_baseline[
post_merge_key]
if pre_merge_key in history_baseline:
metric_threshold_dict[pre_merge_key] = history_baseline[
pre_merge_key]
# Update new_baseline_data with best_metrics values
for metric, value in best_metrics.items():
new_baseline_data[metric] = value
# Add all thresholds to new_baseline_data
for threshold_key, threshold_value in metric_threshold_dict.items():
new_baseline_data[threshold_key] = threshold_value
add_id(new_baseline_data)
new_baseline_data_dict[cmd_idx] = new_baseline_data
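The threshold handling above can be summarized as: start from the defaults and let any `d_threshold_*` keys carried by the history baseline override them. A small sketch for a single metric suffix (the suffix is an assumption for illustration):

```python
PRE_MERGE_THRESHOLD = 0.1
POST_MERGE_THRESHOLD = 0.05

# Hypothetical history baseline carrying only a post-merge override.
history_baseline = {"d_threshold_post_merge_output_token_throughput": 0.08}

post_merge_key = "d_threshold_post_merge_output_token_throughput"
pre_merge_key = "d_threshold_pre_merge_output_token_throughput"

# Defaults first, then overridden by whatever the history baseline carries.
thresholds = {post_merge_key: POST_MERGE_THRESHOLD, pre_merge_key: PRE_MERGE_THRESHOLD}
for key in (post_merge_key, pre_merge_key):
    if key in history_baseline:
        thresholds[key] = history_baseline[key]

print(thresholds[post_merge_key], thresholds[pre_merge_key])  # 0.08 0.1
```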
@ -539,7 +626,8 @@ def post_new_perf_data(new_baseline_data_dict, new_data_dict,
if cmd_idx in new_data_dict:
data_list.append(new_data_dict[cmd_idx])
# Only post regressive test cases when post-merge.
if new_baseline_data_dict:
# new_baseline_data_dict is None means pre-merge.
if new_baseline_data_dict and regressive_data_list:
data_list.extend(regressive_data_list)
if not data_list:
return

View File

@ -148,8 +148,8 @@ def main():
job_workspace = sys.argv[1]
if not os.path.isdir(job_workspace):
print(f"Error: {job_workspace} is not a valid directory")
sys.exit(1)
print(f"Skipping perf regression check since {job_workspace} is not a valid directory.")
return 0
perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
all_perf_data = read_yaml_data(perf_data_files)
@ -171,15 +171,33 @@ def main():
print("=" * 60)
print_regression_data(data)
# Split regression data into post-merge and pre-merge categories
post_merge_regressions = [
data for data in all_regression_data if data.get("b_is_post_merge", False)
]
pre_merge_regressions = [
data for data in all_regression_data if not data.get("b_is_post_merge", False)
]
if len(all_regression_data) == 0:
print("\n No regression data found. Perf check is successful.")
return 0
else:
if len(pre_merge_regressions) > 0:
print(
f"\n Warning: Found {len(all_regression_data)} regression data. Perf check is failed."
f"\n Warning: Found {len(pre_merge_regressions)} pre-merge regression data. "
"But we don't fail the check temporarily."
)
if len(post_merge_regressions) > 0:
print(
f"\n Error: Found {len(post_merge_regressions)} post-merge regression data. Perf check is failed."
)
return 1
print("\n No post-merge regression data found. Perf check is successful.")
return 0
if __name__ == "__main__":
sys.exit(main())

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@ -19,22 +19,17 @@ import io
import os
import re
import subprocess
import time
from datetime import datetime
from enum import Enum
from pathlib import Path
from typing import Dict, List, NamedTuple, Optional
import requests
import yaml
from _pytest.nodes import Item
from _pytest.python import Function
from defs.trt_test_alternative import (check_output, popen, print_error,
print_info)
from test_common.http_utils import wait_for_endpoint_ready
from tensorrt_llm._utils import get_free_port
from ..common import get_trt_llm_lib_dir, venv_mpi_check_output
from ..local_venv import PythonVenvRunnerImpl
from ..test_list_parser import parse_test_list
@ -243,55 +238,6 @@ class PerfBenchScriptTestCmds(NamedTuple):
return cmd_str
class PerfAggrScriptTestCmds(NamedTuple):
server_cmds: List[List[str]]
client_cmds: List[List[str]]
names: List[str]
timeout: int
output_dir: str
def run_cmd(self, cmd_idx: int, venv) -> str:
output = ""
server_proc = None
server_file_path = os.path.join(
self.output_dir, f"trtllm-serve.{self.names[cmd_idx]}.log")
client_file_path = os.path.join(
self.output_dir, f"trtllm-benchmark.{self.names[cmd_idx]}.log")
try:
server_hostname = "localhost"
server_port = get_free_port()
server_cmd = add_host_port_to_cmd(self.server_cmds[cmd_idx],
server_hostname, server_port)
print_info(f"Starting server. cmd is {server_cmd}")
with open(server_file_path, 'w') as server_ctx:
server_proc = subprocess.Popen(
server_cmd,
stdout=server_ctx,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
)
wait_for_endpoint_ready(
f"http://{server_hostname}:{server_port}/health",
timeout=self.timeout)
client_cmd = add_host_port_to_cmd(self.client_cmds[cmd_idx],
server_hostname, server_port)
print_info(f"Starting client. cmd is {client_cmd}")
output = subprocess.check_output(
client_cmd,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
).decode()
with open(client_file_path, 'w') as client_ctx:
client_ctx.write(output)
finally:
server_proc.terminate()
server_proc.wait()
return output
def get_cmd_str(self, cmd_idx) -> List[str]:
return ["aggr_server tests, please check config files"]
class PerfDisaggScriptTestCmds(NamedTuple):
ctx_cmd: str
gen_cmd: str
@ -341,249 +287,6 @@ class PerfDisaggScriptTestCmds(NamedTuple):
return ["disaggregated server tests, please check config files"]
class PerfMultiNodeDisaggScriptTestCmds(NamedTuple):
ctx_server_cmds: List[List[str]]
gen_server_cmds: List[List[str]]
disagg_server_cmds: List[List[str]]
benchmark_cmds: List[List[str]]
timeout: int
hostname: str
disagg_serving_type: str
num_ctx_servers: int
num_gen_servers: int
output_dir: str
def _generate_hostname_file(self, cmd_idx: int, port: int):
# Create hostnames directory
hostnames_dir = os.path.join(self.output_dir, f"hostnames-{cmd_idx}")
if not os.path.exists(hostnames_dir):
os.makedirs(hostnames_dir, exist_ok=True)
hostname_file = os.path.join(hostnames_dir,
f"{self.disagg_serving_type}.txt")
with open(hostname_file, 'w') as f:
f.write(f"{self.hostname}:{port}")
def _generate_disagg_server_config(self, cmd_idx: int,
disagg_server_port: int) -> str:
print_info(
f"Generating disagg server config for command index {cmd_idx}")
hostnames_folder = os.path.join(self.output_dir, f"hostnames-{cmd_idx}")
expected_count = self.num_ctx_servers + self.num_gen_servers
start_time = time.time()
hostnames = []
while True:
elapsed_time = time.time() - start_time
print_info(
f"Waiting for hostnames in {hostnames_folder}, elapsed time: {elapsed_time}s, current: {len(hostnames)}, expected: {expected_count}"
)
if elapsed_time > self.timeout:
print_error(
f"Time out. Hostnames files are not ready after {self.timeout}s"
)
time.sleep(10)
if not os.path.exists(hostnames_folder):
continue
hostnames = os.listdir(hostnames_folder)
if len(hostnames) >= expected_count:
break
print_info(
f"All hostnames found in {hostnames_folder} after elapsed time: {elapsed_time}s"
)
# Read ctx and gen hostnames
ctx_hostnames = []
gen_hostnames = []
for hostname_file in hostnames:
hostname_file_path = os.path.join(hostnames_folder, hostname_file)
with open(hostname_file_path, 'r') as f:
hostname_port = f.read().strip()
hostname = hostname_port.split(":")[0]
port = hostname_port.split(":")[1]
print_info(
f"Hostname File: {hostname_file_path} Hostname: {hostname_port} Port: {port}"
)
if hostname_file.startswith("CTX"):
ctx_hostnames.append(hostname_port)
elif hostname_file.startswith("GEN"):
gen_hostnames.append(hostname_port)
server_config = {
'hostname': self.hostname,
'port': disagg_server_port,
'backend': 'pytorch',
'context_servers': {
'num_instances': self.num_ctx_servers,
'urls': ctx_hostnames,
},
'generation_servers': {
'num_instances': self.num_gen_servers,
'urls': gen_hostnames,
}
}
config_path = os.path.join(self.output_dir,
f"server_config.{cmd_idx}.yaml")
with open(config_path, 'w') as f:
yaml.dump(server_config, f)
print_info(f"Server config file {config_path} generated")
return config_path
def _get_disagg_server_hostname_and_port(self, cmd_idx: int) -> tuple:
config_path = os.path.join(self.output_dir,
f"server_config.{cmd_idx}.yaml")
start_time = time.time()
while True:
if os.path.exists(config_path):
print_info(f"Server config file found: {config_path}")
break
elapsed_time = time.time() - start_time
if elapsed_time > self.timeout:
print_error(
f"Server config file {config_path} not found after {self.timeout}s"
)
print_info(
f"Waiting for server config file, elapsed time: {elapsed_time}s"
)
time.sleep(10) # Check every 10 seconds
# Read server config to get hostname and port
with open(config_path, 'r') as f:
server_config = yaml.safe_load(f)
disagg_server_hostname = server_config['hostname']
disagg_server_port = server_config['port']
return disagg_server_hostname, disagg_server_port
def wait_for_benchmark_ready(self,
benchmark_status_file: str,
timeout: int = 7200):
start_time = time.time()
while True:
if os.path.exists(benchmark_status_file):
print_info(
f"Benchmark status file found, terminating server {self.disagg_serving_type}"
)
break
elapsed_time = time.time() - start_time
print_info(
f"Waiting for benchmark status file, elapsed time: {elapsed_time}s"
)
if elapsed_time > timeout:
print_error(
f"Timeout waiting for benchmark status file after {timeout}s, terminating server {self.disagg_serving_type}"
)
break
time.sleep(10) # Check every 10 seconds
def wait_for_endpoint_ready(self, url: str, timeout: int = 7200):
start = time.monotonic()
while True:
elapsed_time = time.monotonic() - start
if elapsed_time > timeout:
print_error(
f"Timeout waiting for endpoint {url} to be ready after {timeout} seconds"
)
break
print_info(
f"Waiting for endpoint {url} to be ready, elapsed time: {elapsed_time}s"
)
try:
time.sleep(10)
if requests.get(url).status_code == 200:
print_info(f"endpoint {url} is ready")
return
except Exception as err:
print_info(
f"endpoint {url} is not ready, with exception: {err}")
print_error(
f"Endpoint {url} did not become ready within {timeout} seconds")
def run_cmd(self, cmd_idx: int, venv) -> str:
output = ""
server_proc = None
benchmark_status_file = os.path.join(self.output_dir,
f"benchmark_status.{cmd_idx}.txt")
port = get_free_port()
if "CTX" in self.disagg_serving_type or "GEN" in self.disagg_serving_type:
self._generate_hostname_file(cmd_idx, port)
server_file_path = os.path.join(
self.output_dir,
f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log")
is_ctx = "CTX" in self.disagg_serving_type
server_cmd = self.ctx_server_cmds[
cmd_idx] if is_ctx else self.gen_server_cmds[cmd_idx]
server_cmd = add_host_port_to_cmd(server_cmd, self.hostname, port)
try:
print_info(
f"Starting server. disagg_serving_type: {self.disagg_serving_type} cmd is {server_cmd}"
)
with open(server_file_path, 'w') as server_ctx:
server_proc = subprocess.Popen(
server_cmd,
stdout=server_ctx,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
)
self.wait_for_benchmark_ready(benchmark_status_file,
timeout=self.timeout)
finally:
print_info(f"Server {self.disagg_serving_type} stopped")
server_proc.terminate()
server_proc.wait()
elif self.disagg_serving_type == "DISAGG_SERVER":
disagg_server_file_path = os.path.join(
self.output_dir,
f"trtllm-serve.{cmd_idx}.{self.disagg_serving_type}.log")
disagg_server_cmd = self.disagg_server_cmds[cmd_idx]
try:
self._generate_disagg_server_config(cmd_idx, port)
print_info(
f"Starting disagg server. disagg_serving_type: {self.disagg_serving_type} disagg server cmd is {disagg_server_cmd}"
)
with open(disagg_server_file_path, 'w') as disagg_server_ctx:
disagg_server_proc = subprocess.Popen(
disagg_server_cmd,
stdout=disagg_server_ctx,
stderr=subprocess.STDOUT,
env=copy.deepcopy(os.environ),
)
self.wait_for_benchmark_ready(benchmark_status_file,
timeout=self.timeout)
finally:
print_info(f"Disagg server {self.disagg_serving_type} stopped")
disagg_server_proc.terminate()
disagg_server_proc.wait()
elif self.disagg_serving_type == "BENCHMARK":
benchmark_file_path = os.path.join(
self.output_dir, f"trtllm-benchmark.{cmd_idx}.log")
try:
disagg_server_hostname, disagg_server_port = self._get_disagg_server_hostname_and_port(
cmd_idx)
benchmark_cmd = add_host_port_to_cmd(
self.benchmark_cmds[cmd_idx], disagg_server_hostname,
disagg_server_port)
self.wait_for_endpoint_ready(
f"http://{disagg_server_hostname}:{disagg_server_port}/health",
timeout=self.timeout,
)
print_info(
f"Starting benchmark. disagg_serving_type: {self.disagg_serving_type} benchmark cmd is {benchmark_cmd}"
)
output = subprocess.check_output(
benchmark_cmd,
env=copy.deepcopy(os.environ),
stderr=subprocess.STDOUT).decode()
with open(benchmark_file_path, 'w') as benchmark_ctx:
benchmark_ctx.write(output)
finally:
with open(benchmark_status_file, 'w') as status_file:
status_file.write("Done")
return output
def get_cmd_str(self, cmd_idx) -> List[str]:
return [
"multi-node disaggregated server tests, please check config files"
]
class AbstractPerfScriptTestClass(abc.ABC):
"""
Abstract class for all script-based perf tests.
@ -715,14 +418,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
cmd_str = commands.get_cmd_str(cmd_idx)
is_prepare_dataset_cmd = 'prepare_dataset' in cmd_str or "prepare-dataset" in cmd_str
is_perf_sanity_test = "perf_sanity" in full_test_name
is_disagg_server = False
if self._config.runtime == "multi_node_disagg_server":
disagg_serving_type = self._config.disagg_configs[0][
'disagg_serving_type']
is_disagg_server = disagg_serving_type != "BENCHMARK"
# Start the timer.
self._start_timestamp = datetime.utcnow()
try:
@ -730,8 +425,7 @@ class AbstractPerfScriptTestClass(abc.ABC):
# Capture the stdout from _gpu_clock_lock because the pipeline JUnit update script tries to parse
# the log to find the GPU clocks.
with io.StringIO() as buf:
# Perf-sanity test doesn't lock gpu clock
if self._gpu_clock_lock and not is_perf_sanity_test:
if self._gpu_clock_lock:
# Lock GPU clock and start monitoring.
with contextlib.redirect_stdout(
buf), self._gpu_clock_lock, tmpDir:
@ -746,7 +440,7 @@ class AbstractPerfScriptTestClass(abc.ABC):
print(collect_and_clean_myelin_time(output))
# Check whether output has error message
if not is_prepare_dataset_cmd and is_perf_sanity_test:
if not is_prepare_dataset_cmd:
self._check_benchmark_output_for_errors(output)
# Print the output log to stdout and cache it.
@ -793,10 +487,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
f"skip writing perf result when calling generating dataset in trtllm-bench."
)
outputs.pop(cmd_idx)
elif is_disagg_server:
print_info(
f"skip writing perf result when running disagg's worker or server."
)
else:
self._perf_result = self.get_perf_result(outputs)
@ -818,11 +508,6 @@ class AbstractPerfScriptTestClass(abc.ABC):
Store the test results in the _test_results.
Write the test results and GPU monitoring data to the output csv and/or yaml files.
"""
# Store the test result
if cmd_idx not in self._test_results:
self._test_results[cmd_idx] = {}
self._test_results[cmd_idx][metric_type] = self._perf_result
# Get GPU monitoring data
self._gpu_monitor_data = self._gpu_clock_lock.get_state_data()
if not self._gpu_monitor_data:

View File

@ -15,9 +15,9 @@ l0_dgx_b200_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- condition:
ranges:
@ -34,8 +34,6 @@ l0_dgx_b200_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_dep2,gpt_oss_fp4_dep4] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-gpt_oss_fp4_blackwell-gpt_oss_fp4_tp4_eagle3] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)

View File

@ -16,9 +16,9 @@ l0_dgx_b300_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_dep8_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tep8_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp8_blackwell-r1_fp8_tp8_mtp3_1k1k] TIMEOUT (180)
- condition:
ranges:
@ -36,6 +36,6 @@ l0_dgx_b300_perf_sanity:
backend: pytorch
orchestrator: mpi
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (180)

View File

@ -1,5 +1,33 @@
version: 0.0.1
l0_gb200_multi_gpus_perf_sanity:
- condition:
ranges:
system_gpu_count:
gte: 4
lte: 4
wildcards:
gpu:
- '*gb200*'
linux_distribution_name: ubuntu*
cpu: aarch64
terms:
stage: pre_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)
- condition:
ranges:
system_gpu_count:
@ -14,6 +42,17 @@ l0_gb200_multi_gpus_perf_sanity:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k]
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (120)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes:
- condition:
ranges:
# 2 nodes with each node has 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_tep8_mtp3] TIMEOUT (90)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_aggr_perf_sanity_2_nodes_001:
- condition:
ranges:
# 2 nodes with each node has 4 GPUs
system_gpu_count:
gte: 8
lte: 8
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_aggr_upload-deepseek_r1_fp4_v2_2_nodes_grace_blackwell-r1_fp4_v2_dep8_mtp1]

View File

@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes:
- condition:
ranges:
# 3 nodes with each node has 4 GPUs
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_disagg_perf_sanity_3_nodes_001:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX] TIMEOUT (90)

View File

@ -0,0 +1,17 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (90)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_001:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)

View File

@ -1,16 +0,0 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_6_nodes_002:
- condition:
ranges:
# 6 nodes with each node has 4 GPUs
system_gpu_count:
gte: 24
lte: 24
wildcards:
gpu:
- '*gb200*'
terms:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_1k1k_ctx2_gen1_dep16_bs128_eplb0_mtp3_ccb-NIXL] TIMEOUT (180)

View File

@ -1,5 +1,5 @@
version: 0.0.1
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes:
- condition:
ranges:
# 8 nodes with each node has 4 GPUs
@ -13,4 +13,4 @@ l0_gb200_multi_nodes_disagg_perf_sanity_8_nodes_001:
stage: post_merge
backend: pytorch
tests:
- perf/test_perf.py::test_perf[perf_sanity_disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (180)
- perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_8k1k_ctx1_gen1_dep32_bs128_eplb0_mtp3_ccb-UCX] TIMEOUT (90)

View File

@ -11,24 +11,69 @@ Performance sanity testing scripts for TensorRT-LLM with configuration-driven te
## Configuration File Types
There are three types of YAML config files for different deployment architectures.
Aggregated config files are in [`tests/scripts/perf-sanity`](./).
Disaggregated config files are in [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf).
There are two modes for perf sanity tests: aggregated (aggr) and disaggregated (disagg).
### 1. Single-Node Aggregated Test Configuration
### Aggregated Mode (aggr)
**File Example**: `deepseek_r1_fp4_v2_grace_blackwell.yaml`
**Config Location**: [`tests/scripts/perf-sanity`](./)
**Use Case**: Single-node performance tests on a single server with multiple GPUs.
**File Naming**: `xxx.yaml` where words are connected by `_` (underscore), not `-` (hyphen).
### 2. Multi-Node Aggregated Test Configuration
**File Examples**:
- `deepseek_r1_fp4_v2_grace_blackwell.yaml` - Single-node aggregated test
- `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml` - Multi-node aggregated test
**File Example**: `deepseek_r1_fp4_v2_2_nodes_grace_blackwell.yaml`
**Use Cases**:
- Single-node: Performance tests on a single server with multiple GPUs
- Multi-node: Model runs across multiple nodes with unified execution
**Use Case**: Multi-node aggregated architecture where model runs across multiple nodes with unified execution.
**Test Case Names**:
```
perf/test_perf_sanity.py::test_e2e[aggr_upload-{config yaml file base name}]
perf/test_perf_sanity.py::test_e2e[aggr_upload-{config yaml file base name}-{server_config_name}]
```
### 3. Multi-Node Disaggregated Test Configuration
- Without server config name: runs all server configs in the YAML file
- With server config name: runs only the specified server config (the `name` field in `server_configs`)
**Examples**:
```
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell]
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k]
```
### Disaggregated Mode (disagg)
**Config Location**: [`tests/integration/defs/perf/disagg/test_configs/disagg/perf`](../../integration/defs/perf/disagg/test_configs/disagg/perf)
**File Naming**: `xxx.yaml` (can contain `-` hyphen).
**File Example**: `deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX.yaml`
**Use Case**: Disaggregated architecture where model runs across multiple nodes with separate context (prefill) and generation (decode) servers.
**Test Case Name**:
```
perf/test_perf_sanity.py::test_e2e[disagg_upload-{config yaml file base name}]
```
**Example**:
```
perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX]
```
## Running Tests
**Important**: Do NOT add the `--perf` flag when running pytest. Perf sanity tests are static test cases and do not use perf mode.
```bash
# Run all server configs in an aggregated test
pytest perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell]
# Run a specific server config in an aggregated test
pytest perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k]
# Run a specific disaggregated test
pytest perf/test_perf_sanity.py::test_e2e[disagg_upload-deepseek-r1-fp4_1k1k_ctx1_gen1_dep8_bs768_eplb0_mtp0_ccb-UCX]
```
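In post-merge perf-sanity stages, the pipeline then runs `tests/integration/defs/perf/perf_regression_check.py` against the job workspace (the directory that collects the per-test `perf_data.yaml` files) and fails the build when the script exits non-zero. A minimal sketch of the same invocation run locally, assuming a workspace path:

```python
import subprocess
import sys

# Hypothetical local workspace containing perf_data.yaml files produced by the tests.
job_workspace = "/tmp/perf_sanity_workspace"

result = subprocess.run(
    [sys.executable, "tests/integration/defs/perf/perf_regression_check.py", job_workspace],
    check=False,
)
# Exit code 0: no post-merge regression (pre-merge regressions only print a warning).
# Exit code 1: post-merge regression detected; CI fails the build.
print(f"perf regression check exit code: {result.returncode}")
```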

View File

@ -35,8 +35,9 @@ server_configs:
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep8_mtp3"
model_name: "deepseek_r1_0528_fp4_v2"
trust_remote_code: true
@ -64,5 +65,5 @@ server_configs:
iterations: 12
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -30,12 +30,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -65,7 +65,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -95,5 +95,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -31,12 +31,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k1k"
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k1k"
@ -66,7 +66,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k1k"
@ -96,7 +96,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
# 8k1k configs
@ -126,12 +126,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_8k1k"
- name: "con2048_iter5_8k1k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 8192
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_8k1k"
@ -161,7 +161,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_8k1k"
@ -191,7 +191,7 @@ server_configs:
iterations: 10
isl: 8192
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
# 1k8k configs
@ -221,12 +221,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con2048_iter10_1k8k"
- name: "con2048_iter5_1k8k"
concurrency: 2048
iterations: 10
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tep4_mtp3_1k8k"
@ -256,7 +256,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp4_v2_tp4_mtp3_1k8k"
@ -286,5 +286,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -30,12 +30,12 @@ server_configs:
decoding_type: 'MTP'
num_nextn_predict_layers: 1
client_configs:
- name: "con4096_iter10_1k1k"
- name: "con4096_iter5_1k1k"
concurrency: 4096
iterations: 10
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp8_tep8_mtp3_1k1k"
@ -65,7 +65,7 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "r1_fp8_tp8_mtp3_1k1k"
@ -95,5 +95,5 @@ server_configs:
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"

View File

@ -4,6 +4,37 @@ metadata:
- B200
- B300
server_configs:
- name: "gpt_oss_fp4_dep4_1k8k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
pipeline_parallel_size: 1
max_batch_size: 640
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 640
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2560_iter5_1k8k"
concurrency: 2560
iterations: 5
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_dep2_1k1k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 2
@ -32,25 +63,23 @@ server_configs:
iterations: 5
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_dep4_1k1k"
- name: "gpt_oss_fp4_tep2_1k8k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 4
moe_expert_parallel_size: 4
tensor_parallel_size: 2
moe_expert_parallel_size: 2
pipeline_parallel_size: 1
max_batch_size: 512
max_batch_size: 128
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: true
attention_dp_config:
enable_balance: true
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 512
max_batch_size: 128
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
@ -58,12 +87,41 @@ server_configs:
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con2048_iter5_1k1k"
concurrency: 2048
iterations: 5
- name: "con128_iter10_1k8k"
concurrency: 128
iterations: 10
isl: 1024
osl: 1024
random_range_ratio: 0.2
osl: 8192
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_tp2_1k8k"
model_name: "gpt_oss_120b_fp4"
tensor_parallel_size: 2
moe_expert_parallel_size: 1
pipeline_parallel_size: 1
max_batch_size: 8
max_num_tokens: 20000
attn_backend: "TRTLLM"
enable_attention_dp: false
moe_config:
backend: 'TRTLLM'
cuda_graph_config:
enable_padding: true
max_batch_size: 8
kv_cache_config:
dtype: 'fp8'
enable_block_reuse: false
free_gpu_memory_fraction: 0.8
num_postprocess_workers: 4
stream_interval: 20
client_configs:
- name: "con8_iter10_1k8k"
concurrency: 8
iterations: 10
isl: 1024
osl: 8192
random_range_ratio: 0.8
backend: "openai"
- name: "gpt_oss_fp4_tp4_eagle3_1k1k"
@ -97,5 +155,5 @@ server_configs:
iterations: 32
isl: 1024
osl: 1024
random_range_ratio: 0.2
random_range_ratio: 0.8
backend: "openai"