[None][fix] Decrease Pre Merge Perf Tests (#10390)

Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
Signed-off-by: Yanchao Lu <yanchaol@nvidia.com>
Co-authored-by: Yanchao Lu <yanchaol@nvidia.com>
Authored by chenfeiz0326 on 2026-01-05 01:21:34 +08:00, committed by GitHub
parent c4f27fa4c0
commit a65b0d4efa
6 changed files with 144 additions and 273 deletions

View File

@@ -2872,21 +2872,6 @@ def runLLMTestlistOnPlatformImpl(pipeline, platform, testList, config=VANILLA_CO
"""
}
}
if (stageName.contains("PerfSanity")) {
stage ("Check PerfSanity Result") {
def perfCheckResult = sh(
script: """
python3 ${llmSrc}/tests/integration/defs/perf/perf_regression_check.py \
${WORKSPACE}/${stageName}
""",
returnStatus: true
)
if (perfCheckResult != 0) {
error "Performance regression detected and failing the build (exit code: ${perfCheckResult})"
}
}
}
}
}
@@ -3319,21 +3304,12 @@ def launchTestJobs(pipeline, testFilter)
// "GB300-PyTorch-1": ["gb300-single", "l0_gb300", 1, 1],
// "GB300-4_GPUs-PyTorch-Post-Merge-1": ["gb300-x4", "l0_gb300_multi_gpus", 1, 1, 4],
// PerfSanity pre-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-4": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 4, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-5": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 5, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-6": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 6, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-11": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 11, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-12": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 12, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 2, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 2, 4],
// PerfSanity post-merge tests
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-7": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 7, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-8": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 8, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-9": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 9, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-10": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 10, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-13": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 13, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-14": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 14, 14, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-1": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 1, 3, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-2": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 2, 3, 4],
"GB200-4_GPUs-PyTorch-PerfSanity-Post-Merge-3": ["gb200-x4-oci", "l0_gb200_multi_gpus_perf_sanity", 3, 3, 4],
]
fullSet += SBSASlurmTestConfigs.keySet()
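
For reference, each entry in these Slurm test configs appears to follow the shape [platform queue, test list name, shard index, shard count, GPU count], e.g. shard 1 of 2 on 4 GPUs. Below is a minimal standalone Python sketch of how a (shard index, shard count) pair can partition a test list; the function and variable names are illustrative, not the pipeline's actual helpers:

def select_shard(tests, shard_id, shard_count):
    # Round-robin split: keep every shard_count-th test, starting at 1-based shard_id.
    return [t for i, t in enumerate(tests) if i % shard_count == shard_id - 1]

all_tests = [f"perf_case_{i}" for i in range(1, 7)]
print(select_shard(all_tests, 1, 2))  # ['perf_case_1', 'perf_case_3', 'perf_case_5']
print(select_shard(all_tests, 2, 2))  # ['perf_case_2', 'perf_case_4', 'perf_case_6']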

View File

@@ -102,7 +102,6 @@ set +e
pytest_exit_code=0
perf_check_exit_code=0
perf_report_exit_code=0
perf_sanity_check_exit_code=0
eval $pytestCommand
pytest_exit_code=$?
@@ -154,20 +153,10 @@ if [ $SLURM_PROCID -eq 0 ] && [ "$perfMode" = "true" ]; then
echo "Rank${SLURM_PROCID} Perf check finished execution with exit code $perf_check_exit_code"
fi
if [ $SLURM_PROCID -eq 0 ] && [[ "$stageName" == *PerfSanity* ]]; then
echo "Check PerfSanity Result"
python3 $llmSrcNode/tests/integration/defs/perf/perf_regression_check.py \
$jobWorkspace
perf_sanity_check_exit_code=$?
echo "Rank${SLURM_PROCID} PerfSanity check finished execution with exit code $perf_sanity_check_exit_code"
fi
if [ "$pytest_exit_code" -ne 0 ]; then
final_exit_code=$pytest_exit_code
elif [ "$perf_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_check_exit_code
elif [ "$perf_sanity_check_exit_code" -ne 0 ]; then
final_exit_code=$perf_sanity_check_exit_code
else
final_exit_code=0
fi

View File

@@ -23,7 +23,7 @@ import time
from datetime import datetime
import yaml
from defs.trt_test_alternative import print_info, print_warning
from defs.trt_test_alternative import print_error, print_info, print_warning
_project_root = os.path.abspath(
os.path.join(os.path.dirname(__file__), '../../../..'))
@@ -660,3 +660,123 @@ def write_regressive_test_cases(regressive_data_list, new_data_dict,
if len(regressive_data_list) > 0:
print_warning(
f"Found {len(regressive_data_list)} regressive test cases")
def _get_metric_keys():
"""Get all metric-related keys for filtering config keys."""
metric_keys = set()
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:] # Strip "d_" prefix
metric_keys.add(metric)
metric_keys.add(f"d_baseline_{metric_suffix}")
metric_keys.add(f"d_threshold_post_merge_{metric_suffix}")
metric_keys.add(f"d_threshold_pre_merge_{metric_suffix}")
return metric_keys
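
As a quick standalone illustration of what _get_metric_keys() produces, assuming for the sake of the example that MAXIMIZE_METRICS contains "d_token_throughput" and MINIMIZE_METRICS contains "d_mean_ttft" (the real lists are defined elsewhere in this module):

MAXIMIZE_METRICS = ["d_token_throughput"]  # assumed example values, not the real lists
MINIMIZE_METRICS = ["d_mean_ttft"]

metric_keys = set()
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
    suffix = metric[2:]  # strip the "d_" prefix, as _get_metric_keys() does
    metric_keys |= {
        metric,
        f"d_baseline_{suffix}",
        f"d_threshold_post_merge_{suffix}",
        f"d_threshold_pre_merge_{suffix}",
    }
print(sorted(metric_keys))
# ['d_baseline_mean_ttft', 'd_baseline_token_throughput', 'd_mean_ttft',
#  'd_threshold_post_merge_mean_ttft', 'd_threshold_post_merge_token_throughput',
#  'd_threshold_pre_merge_mean_ttft', 'd_threshold_pre_merge_token_throughput',
#  'd_token_throughput']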
def _print_perf_data(data):
"""Print performance metrics and config for a single data entry."""
print_info("=== Metrics ===")
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
if metric in data:
value = data.get(metric, "N/A")
print_info(f'"{metric}": {value}')
metric_keys = _get_metric_keys()
print_info("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
value = data[key]
print_info(f'"{key}": {value}')
def _print_regression_data(data, print_func=None):
"""
Print regression info, metrics with baselines/thresholds, and config.
"""
if print_func is None:
print_func = print_info
if "s_regression_info" in data:
print_func("=== Regression Info ===")
print_func(f"{data['s_regression_info']}")
metric_keys = _get_metric_keys()
is_post_merge = data.get("b_is_post_merge", False)
print_func("=== Metrics ===")
for metric in MAXIMIZE_METRICS + MINIMIZE_METRICS:
metric_suffix = metric[2:] # Strip "d_" prefix
baseline_key = f"d_baseline_{metric_suffix}"
if is_post_merge:
threshold_key = f"d_threshold_post_merge_{metric_suffix}"
else:
threshold_key = f"d_threshold_pre_merge_{metric_suffix}"
# Only print if at least one of the keys exists
if metric in data or baseline_key in data or threshold_key in data:
value = data.get(metric, "N/A")
baseline = data.get(baseline_key, "N/A")
threshold = data.get(threshold_key, "N/A")
# Calculate percentage difference between value and baseline
# Positive percentage means better perf, negative means regression
if (isinstance(value, (int, float))
and isinstance(baseline, (int, float)) and baseline != 0):
if metric in MAXIMIZE_METRICS:
# Larger is better: value > baseline is positive (better)
percentage = (value - baseline) / baseline * 100
else:
# Smaller is better: value < baseline is positive (better)
percentage = (baseline - value) / baseline * 100
percentage_str = f"{percentage:+.2f}%"
else:
percentage_str = "N/A"
print_func(
f'"{metric}": {value}, "{baseline_key}": {baseline}, '
f'"{threshold_key}": {threshold}, "diff": {percentage_str}')
print_func("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
if key == "s_regression_info":
continue
value = data[key]
print_func(f'"{key}": {value}')
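
The sign convention in the percentage calculation above (a positive diff always means better than baseline, whether the metric is maximized or minimized) can be sanity-checked with a tiny standalone sketch; the numbers are made up:

def diff_pct(value, baseline, maximize):
    # Mirrors the convention in _print_regression_data:
    # positive -> better than baseline, negative -> regression.
    if maximize:
        return (value - baseline) / baseline * 100
    return (baseline - value) / baseline * 100

print(f"{diff_pct(105.0, 100.0, maximize=True):+.2f}%")   # throughput up 5%  -> +5.00%
print(f"{diff_pct(110.0, 100.0, maximize=False):+.2f}%")  # latency up 10%    -> -10.00%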
def check_perf_regression(regressive_data_list, new_data_dict):
"""
Check performance regression by printing regression data.
"""
# Split regression data into post-merge and pre-merge
post_merge_regressions = [
data for data in regressive_data_list
if data.get("b_is_post_merge", False)
]
pre_merge_regressions = [
data for data in regressive_data_list
if not data.get("b_is_post_merge", False)
]
# Print pre-merge regression data with print_warning
if len(pre_merge_regressions) > 0:
print_warning(
f"Found {len(pre_merge_regressions)} pre-merge regression data")
for i, data in enumerate(pre_merge_regressions):
print_warning(f"\n{'=' * 60}")
print_warning(f"Pre-merge Regression Data #{i + 1}")
print_warning("=" * 60)
_print_regression_data(data, print_func=print_warning)
# Print post-merge regression data with print_warning for content
if len(post_merge_regressions) > 0:
for i, data in enumerate(post_merge_regressions):
print_warning(f"\n{'=' * 60}")
print_warning(f"Post-merge Regression Data #{i + 1}")
print_warning("=" * 60)
_print_regression_data(data, print_func=print_warning)
print_error(
f"Found {len(post_merge_regressions)} post-merge regression data")
# Print summary if no regressions
if len(regressive_data_list) == 0:
print_info("No regression data found.")
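
A minimal sketch of how check_perf_regression splits its input, with made-up dictionaries and the project-specific printers stubbed out (print_info, print_warning, and print_error come from defs.trt_test_alternative in the real module):

print_info = print_warning = print_error = print  # stand-ins for the real printers

regressive_data_list = [
    {"b_is_post_merge": False, "s_regression_info": "pre-merge: throughput below threshold"},
    {"b_is_post_merge": True, "s_regression_info": "post-merge: TTFT above threshold"},
]

pre_merge = [d for d in regressive_data_list if not d.get("b_is_post_merge", False)]
post_merge = [d for d in regressive_data_list if d.get("b_is_post_merge", False)]

# Pre-merge regressions are only warned about; post-merge regressions are also
# reported through print_error, matching check_perf_regression above.
print_warning(f"Found {len(pre_merge)} pre-merge regression data")
print_error(f"Found {len(post_merge)} post-merge regression data")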

View File

@@ -1,203 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import yaml
METRICS = [
"seq_throughput",
"token_throughput",
"total_token_throughput",
"user_throughput",
"mean_tpot",
"median_tpot",
"p99_tpot",
"mean_ttft",
"median_ttft",
"p99_ttft",
"mean_itl",
"median_itl",
"p99_itl",
"mean_e2el",
"median_e2el",
"p99_e2el",
]
def should_skip_execution():
disagg_type = os.getenv("DISAGG_SERVING_TYPE", "")
if (
disagg_type.startswith("GEN")
or disagg_type.startswith("CTX")
or disagg_type == "DISAGG_SERVER"
):
return True
return False
def find_yaml_files(job_workspace, filename):
yaml_files = []
for root, dirs, files in os.walk(job_workspace):
for file in files:
if file == filename:
yaml_files.append(os.path.join(root, file))
return yaml_files
def read_yaml_data(yaml_files):
all_data = []
for file_path in yaml_files:
try:
with open(file_path, "r") as f:
data = yaml.safe_load(f)
if data:
if isinstance(data, list):
all_data.extend(data)
else:
all_data.append(data)
except Exception as e:
print(f"Error reading {file_path}: {e}")
return all_data
def get_metric_keys():
metric_keys = set()
for metric in METRICS:
metric_keys.add(f"d_{metric}")
metric_keys.add(f"d_baseline_{metric}")
metric_keys.add(f"d_threshold_{metric}")
return metric_keys
def print_perf_data(data):
print("=== Metrics ===")
for metric in METRICS:
value_key = f"d_{metric}"
if value_key in data:
value = data.get(value_key, "N/A")
print(f'"{value_key}": {value}')
metric_keys = get_metric_keys()
print("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
value = data[key]
print(f'"{key}": {value}')
def print_regression_data(data):
if "s_regression_info" in data:
print("=== Regression Info ===")
print(f"{data['s_regression_info']}")
metric_keys = get_metric_keys()
print("=== Metrics ===")
for metric in METRICS:
value_key = f"d_{metric}"
baseline_key = f"d_baseline_{metric}"
threshold_key = f"d_threshold_{metric}"
# Only print if at least one of the keys exists
if value_key in data or baseline_key in data or threshold_key in data:
value = data.get(value_key, "N/A")
baseline = data.get(baseline_key, "N/A")
threshold = data.get(threshold_key, "N/A")
# Calculate percentage difference between value and baseline
if (
isinstance(value, (int, float))
and isinstance(baseline, (int, float))
and baseline != 0
):
percentage = (value - baseline) / baseline * 100
percentage_str = f"{percentage:+.2f}%"
else:
percentage_str = "N/A"
print(
f'"{value_key}": {value}, "{baseline_key}": {baseline}, '
f'"{threshold_key}": {threshold}, "diff": {percentage_str}'
)
print("\n=== Config ===")
config_keys = sorted([key for key in data.keys() if key not in metric_keys])
for key in config_keys:
if key == "s_regression_info":
continue
value = data[key]
print(f'"{key}": {value}')
def main():
if should_skip_execution():
print("Skipping check_perf_regression.py due to DISAGG_SERVING_TYPE")
return 0
job_workspace = sys.argv[1]
if not os.path.isdir(job_workspace):
print(f"Skipping perf regression check since {job_workspace} is not a valid directory.")
return 0
perf_data_files = find_yaml_files(job_workspace, "perf_data.yaml")
all_perf_data = read_yaml_data(perf_data_files)
print(f"Found {len(all_perf_data)} perf data")
for i, data in enumerate(all_perf_data):
print(f"\n{'=' * 60}")
print(f"Perf Data #{i + 1}")
print("=" * 60)
print_perf_data(data)
print(f"\n{'=' * 60}\n")
regression_files = find_yaml_files(job_workspace, "regression.yaml")
all_regression_data = read_yaml_data(regression_files)
print(f"Found {len(all_regression_data)} regression data")
for i, data in enumerate(all_regression_data):
print(f"\n{'=' * 60}")
print(f"Regression Data #{i + 1}")
print("=" * 60)
print_regression_data(data)
# Split regression data into post-merge and pre-merge categories
post_merge_regressions = [
data for data in all_regression_data if data.get("b_is_post_merge", False)
]
pre_merge_regressions = [
data for data in all_regression_data if not data.get("b_is_post_merge", False)
]
if len(all_regression_data) == 0:
print("\n No regression data found. Perf check is successful.")
return 0
if len(pre_merge_regressions) > 0:
print(
f"\n Warning: Found {len(pre_merge_regressions)} pre-merge regression data. "
"But we don't fail the check temporarily."
)
if len(post_merge_regressions) > 0:
print(
f"\n Error: Found {len(post_merge_regressions)} post-merge regression data. Perf check is failed."
)
return 1
print("\n No post-merge regression data found. Perf check is successful.")
return 0
if __name__ == "__main__":
sys.exit(main())

View File

@@ -37,13 +37,13 @@ from ..conftest import get_llm_root, llm_models_root
from .open_search_db_utils import (
SCENARIO_MATCH_FIELDS,
add_id,
check_perf_regression,
get_common_values,
get_history_data,
get_job_info,
post_new_perf_data,
prepare_baseline_data,
prepare_regressive_test_cases,
write_regressive_test_cases,
)
from .utils import collect_and_clean_myelin_time
@@ -781,16 +781,17 @@ class DisaggTestCmds(NamedTuple):
return ["multi-node disaggregated server tests, please check config files"]
def parse_select_pattern(select_pattern: str) -> str:
"""Parse select pattern (server config name).
def parse_select_pattern(select_pattern: str) -> list:
"""Parse select pattern (server config names).
Args:
select_pattern: Server config name (e.g., "r1_fp8_dep8_mtp1_1k1k").
select_pattern: Server config names separated by comma
(e.g., "r1_fp4_v2_dep4_mtp1_1k1k,r1_fp4_v2_tep4_mtp3_1k1k,r1_fp4_v2_tp4_mtp3_1k1k").
Returns:
Server config name string.
List of server config name strings.
"""
return select_pattern
return [name.strip() for name in select_pattern.split(",")]
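
For example, the new comma-separated form returns a list of names:

def parse_select_pattern(select_pattern: str) -> list:
    return [name.strip() for name in select_pattern.split(",")]

print(parse_select_pattern("r1_fp4_v2_dep4_mtp1_1k1k, r1_fp4_v2_tep4_mtp3_1k1k"))
# ['r1_fp4_v2_dep4_mtp1_1k1k', 'r1_fp4_v2_tep4_mtp3_1k1k']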
class PerfSanityTestConfig:
@@ -873,11 +874,11 @@ class PerfSanityTestConfig:
def _parse_aggr_config_file(self, config_file_path: str):
"""Parse YAML config file for aggregated server."""
# Parse selection pattern (server config name)
# Parse selection pattern (server config names)
if self.select_pattern:
selected_server_name = parse_select_pattern(self.select_pattern)
selected_server_names = parse_select_pattern(self.select_pattern)
else:
selected_server_name = None
selected_server_names = None
with open(config_file_path, "r") as f:
config = yaml.safe_load(f)
@@ -895,10 +896,10 @@ class PerfSanityTestConfig:
server_client_configs = {}
for server_idx, server_config_data in enumerate(config["server_configs"]):
# Check if this server should be included based on selected_server_name
# Check if this server should be included based on selected_server_names
if (
selected_server_name is not None
and server_config_data.get("name") != selected_server_name
selected_server_names is not None
and server_config_data.get("name") not in selected_server_names
):
continue
@@ -1375,8 +1376,7 @@ class PerfSanityTestConfig:
# Upload the new perf data and baseline data to database
post_new_perf_data(new_baseline_data_dict, new_data_dict, regressive_data_list)
perf_result_output_dir = os.path.join(self._output_dir, self._test_param_labels)
write_regressive_test_cases(regressive_data_list, new_data_dict, perf_result_output_dir)
check_perf_regression(regressive_data_list, new_data_dict)
# Perf sanity test case parameters
@@ -1444,10 +1444,13 @@ def get_disagg_test_cases() -> List[str]:
return test_cases
# Hardcoded multi-test test cases from test db.
MULTI_TEST_TEST_CASES = []
# Generate all test case combinations
# For aggr: {test_type}-{config_yml}, {test_type}-{config_yml}-{server_config_name}
# For disagg: {test_type}-{config_yml}
PERF_SANITY_TEST_CASES = get_aggr_test_cases() + get_disagg_test_cases()
PERF_SANITY_TEST_CASES = get_aggr_test_cases() + get_disagg_test_cases() + MULTI_TEST_TEST_CASES
@pytest.mark.parametrize("perf_sanity_test_case", PERF_SANITY_TEST_CASES)
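
As a small illustration of the naming convention described in the comment above, composed from names that appear in the test list changes below (purely illustrative):

test_type = "aggr_upload"                          # test type label
config_yml = "deepseek_r1_fp4_v2_grace_blackwell"  # config file stem
server_config_name = "r1_fp4_v2_dep4_mtp1_1k1k"    # server config selected within that file

print(f"{test_type}-{config_yml}-{server_config_name}")
# aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k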

View File

@@ -14,20 +14,12 @@ l0_gb200_multi_gpus_perf_sanity:
stage: pre_merge
backend: pytorch
tests:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)
- condition:
ranges:
system_gpu_count:
@@ -45,14 +37,8 @@ l0_gb200_multi_gpus_perf_sanity:
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_8k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_8k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_8k1k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_dep4_mtp1_1k8k] TIMEOUT (120)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tep4_mtp3_1k8k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-deepseek_r1_fp4_v2_grace_blackwell-r1_fp4_v2_tp4_mtp3_1k8k] TIMEOUT (60)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep4_1k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_dep2_1k1k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tep2_1k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp2_1k8k] TIMEOUT (90)
- perf/test_perf_sanity.py::test_e2e[aggr_upload-gpt_oss_120b_fp4_grace_blackwell-gpt_oss_fp4_tp4_eagle3_1k1k] TIMEOUT (90)