[Infra]Remove some old keyword (#4552)

Signed-off-by: qqiao <qqiao@nvidia.com>
Emma Qiao 2025-05-31 13:50:45 +08:00 committed by GitHub
parent 8cb6163a57
commit c945e92fdb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 95 additions and 274 deletions

View File

@ -1504,7 +1504,7 @@ def runInKubernetes(pipeline, podSpec, containerName)
def launchTestJobs(pipeline, testFilter, dockerNode=null)
{
def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
turtleConfigs = [
x86TestConfigs = [
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
@ -1579,7 +1579,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"DGX_H200-4_GPUs-PyTorch-[Post-Merge]-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 2, 4],
]
parallelJobs = turtleConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
@ -1591,13 +1591,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
}]]}
fullSet = parallelJobs.keySet()
slurmX86Configs = [
x86SlurmTestConfigs = [
"RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
"DGX_B200-4_GPUs-PyTorch-[Post-Merge]-1": ["b200-4-gpus", "l0_dgx_b200", 1, 1, 4],
]
fullSet += slurmX86Configs.keySet()
fullSet += x86SlurmTestConfigs.keySet()
parallelSlurmJobs = slurmX86Configs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), {
parallelSlurmJobs = x86SlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
@ -1612,25 +1612,25 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
// Try to match what is being tested on x86 H100_PCIe.
// The total machine time is scaled proportionally according to the number of GPUs of each type.
aarch64Configs = [
SBSATestConfigs = [
"GH200-1": ["gh200", "l0_gh200", 1, 2],
"GH200-2": ["gh200", "l0_gh200", 2, 2],
"GH200-[Post-Merge]": ["gh200", "l0_gh200", 1, 1],
]
fullSet += aarch64Configs.keySet()
fullSet += SBSATestConfigs.keySet()
slurmSBSAConfigs = [
SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-[Post-Merge]-1": ["gb200-4-gpus", "l0_gb200", 1, 1, 4],
]
fullSet += slurmSBSAConfigs.keySet()
fullSet += SBSASlurmTestConfigs.keySet()
if (env.targetArch == AARCH64_TRIPLE) {
parallelJobs = aarch64Configs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
}]]}
// Add SBSA Slurm jobs
parallelSlurmJobs = slurmSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
parallelSlurmJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
def config = LINUX_AARCH64_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG

View File

@ -1,51 +0,0 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script collects tensorrt_llm unit tests and transforms them into TensorRT TURTLE form.
#
# Usage:
# 1. build and install tensorrt_llm python package
# 2. install pytest `pip3 install pytest`
# 3. run `python3 scripts/collect_unittests.py` in tensorrt_llm root directory.
# 4. update the collected tests in the TensorRT TURTLE test definitions.
# - check python list `LLM_UNIT_TESTS` in `<tensorrt repo>/tests/trt-test-defs/turtle/defs/llm/test_llm_unittests.py`.
from subprocess import check_output
KEYWORDS = ["<Module", "<UnitTestCase", "<TestCaseFunction"]
def fetch_tests():
text = check_output(["pytest", "--collect-only", "tests/"])
text = text.decode()
lines = text.split("\n")
lines = [line for line in lines if any([k in line for k in KEYWORDS])]
module, unittest, case = "<bad>", "<bad>", "<bad>"
for line in lines:
if "<Module" in line:
module = line.replace("<Module ", "").replace(">", "").strip()
elif "<UnitTestCase" in line:
unittest = line.replace("<UnitTestCase ", "").replace(">",
"").strip()
elif "<TestCaseFunction" in line:
case = line.replace("<TestCaseFunction ", "").replace(">",
"").strip()
print(f"LLMUnitTestCase(\"{module}\", \"{unittest}.{case}\"),")
if __name__ == "__main__":
fetch_tests()

View File

@ -15,7 +15,6 @@ def _clean_files(src_dir: PathLike, extend_files: str) -> None:
".devcontainer",
"docker/README.md",
"jenkins",
"scripts/collect_unittests.py",
"scripts/package_trt_llm.py",
"scripts/git_replace.py",
"tests/integration",

View File

@ -119,7 +119,7 @@ Due to CI hardware resource limitation, and some cases only run on specific GPUs
In directory `integration/test_lists/test-db`, each yml file corresponds to a GPU type.
In file `jenkins/L0_Test.groovy`, the variable `turtleConfigs` maps yml files to CI stages.
In file `jenkins/L0_Test.groovy`, the variables `x86TestConfigs`, `SBSATestConfigs`, `x86SlurmTestConfigs`, and `SBSASlurmTestConfigs` map yml files to CI stages according to platform and launch method.
Currently the yml files are manually maintained, which requires developers to update them when new test cases are added.
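
For illustration, a minimal Python sketch (hypothetical variable name; the entry and the positional convention are taken from the `x86TestConfigs` diff above) of how one stage name maps to a test-db yml file, a node label, a split, and a GPU count:

# Hypothetical mirror of one x86TestConfigs entry from jenkins/L0_Test.groovy.
# Positional convention assumed from the diff:
#   [node label, test-db yml name, split id, split count, GPU count]
x86_test_configs = {
    "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
}
for stage, (node, yml, split_id, split_count, gpus) in x86_test_configs.items():
    print(f"{stage}: integration/test_lists/test-db/{yml}.yml on {node} "
          f"(split {split_id}/{split_count}, {gpus} GPUs)")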

View File

@ -2121,7 +2121,7 @@ def all_pytest_items():
@pytest.fixture(scope="session")
def turtle_root():
def test_root():
return os.path.dirname(os.path.dirname(__file__))

View File

@ -24,8 +24,7 @@ except ImportError:
@dataclass
class BuildConfig:
# TODO: Use `tensorrt_llm.builder.BuildConfig` when we switch to Pytest from TURTLE.
# Using TURTLE, we cannot do `import tensorrt_llm` in this file.
# TODO: Use `tensorrt_llm.builder.BuildConfig` when we switch to Pytest.
max_input_len: int = 256
max_seq_len: int = 512
opt_batch_size: int = 8

View File

@ -350,7 +350,7 @@ def write_gpu_monitoring_no_test_results(logpath,
def get_log(fpath):
"""
Converts TURTLE log output into an ordered dict of stdout and stderr.
Converts log output into an ordered dict of stdout and stderr.
Used for raw_result for test_result.
Args:

View File

@ -29,7 +29,6 @@ import time
import psutil # type: ignore
# Nvidia
import pynvml # type: ignore
# TURTLE
from defs.trt_test_alternative import print_info, print_warning
from .misc import clean_device_product_name

View File

@ -14,7 +14,7 @@
# limitations under the License.
# -*- coding: utf-8 -*-
"""
Miscellaneous utility code used by trt_test. Should contain all code that is agnostic to remote mode vs local mode of TURTLE.
Miscellaneous utility code used by trt_test. Should contain all code that is agnostic to remote mode vs local mode.
"""
import subprocess as sp

View File

@ -71,7 +71,7 @@ class SessionDataWriter:
def _write_session_perf_logs(self):
"""
Write session data. Should only be called once at the end of the entire
perf session, in otherwords, when TURTLE ends during teardown().
perf session, in other words, only during teardown().
"""
# Output various log files depending on options.
for fmt in self._output_formats:

View File

@ -274,7 +274,7 @@ class PerfTestMetric(NamedTuple):
"""
Configurations of a test metric.
"""
# The original test name used to run the TURTLE test.
# The original test name used to run the perf test.
original_test_name: str
# The name for this particular metric.
metric_name: str
@ -759,7 +759,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
"""
def __init__(self, full_test_name: str):
# full_test_name is the full test name appearing in TURTLE output.
# full_test_name is the full test name appearing in test output.
self._full_test_name = full_test_name
# test_domain_name is the part before "::".
self._test_domain_name = "::".join(full_test_name.split("::")[:-1])
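
As a hedged illustration of the naming convention in the comments above (splitting the full test name on "::"), assuming a made-up pytest-style node id:

# Illustrative only: split a full test name into its domain and case parts,
# mirroring the "::" convention used by MultiMetricPerfTest above.
full_test_name = "perf/test_perf.py::test_perf[llama_v2_7b-bench]"  # hypothetical id
test_domain_name = "::".join(full_test_name.split("::")[:-1])  # "perf/test_perf.py"
test_case_name = full_test_name.split("::")[-1]  # "test_perf[llama_v2_7b-bench]"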

View File

@ -83,7 +83,7 @@ def collect_and_clean_myelin_time(log: str):
class PerfMetricType(str, Enum):
"""
An string-enum type to define what kind of perf metric it is. While it is not used by TURTLE, it is used by QA to
A string-enum type to define what kind of perf metric it is. It is used by QA to
set up special threshold criteria for each type of metric (like >50MB for engine size increase, etc.).
"""
INFERENCE_TIME = "INFERENCE_TIME"
@ -352,13 +352,12 @@ class AbstractPerfScriptTestClass(abc.ABC):
"""
Get the absolute threshold used to flag a perf regression compared to perf baseline.
Perf comparison will only fail if it exceeds both relative and absolute thresholds.
Note: This is not honored by TURTLE for now, but we can add the support later.
"""
return 0.0
def get_metric_type(self) -> PerfMetricType:
"""
Get the type of perf metric. This does not affect TURTLE for now, but QA uses this field to set up special
Get the type of perf metric. QA uses this field to set up special
threshold criteria depending on the metric type.
"""
return PerfMetricType.INFERENCE_TIME
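
A minimal sketch, assuming hypothetical metric-type names and threshold values (only INFERENCE_TIME and the >50MB engine-size example come from the docstrings above), of how a consumer could combine the relative and absolute thresholds described here:

# Hypothetical type-specific absolute thresholds; a regression is flagged only
# when both the relative and the absolute threshold are exceeded, as described above.
ABSOLUTE_THRESHOLDS = {
    "INFERENCE_TIME": 0.0,            # rely on the relative threshold alone
    "ENGINE_SIZE": 50 * 1024 * 1024,  # ~50MB, per the docstring's example
}
def is_regression(metric_type: str, baseline: float, current: float,
                  relative_threshold: float = 0.1) -> bool:
    delta = current - baseline
    relative_hit = baseline > 0 and (delta / baseline) > relative_threshold
    absolute_hit = delta > ABSOLUTE_THRESHOLDS.get(metric_type, 0.0)
    return relative_hit and absolute_hit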

View File

@ -87,7 +87,7 @@ def validate_perf_tests(perf_test_names) -> bool:
return passed
def test_list_validation(turtle_root, all_pytest_items, trt_config,
def test_list_validation(test_root, all_pytest_items, trt_config,
is_trt_environment):
# Don't run test list validation in TRT environment because TRT uses
@ -99,13 +99,13 @@ def test_list_validation(turtle_root, all_pytest_items, trt_config,
return
# Glob all the test list files.
test_list_path = os.path.join(turtle_root, "test_lists", "*", "*.txt")
test_list_path = os.path.join(test_root, "test_lists", "*", "*.txt")
all_test_lists = glob.glob(test_list_path)
assert len(all_test_lists
) > 0, f"Cannot find any test lists with path {test_list_path}!"
# Glob all the test db files.
test_db_path = os.path.join(turtle_root, "test_lists", "*", "*.yml")
test_db_path = os.path.join(test_root, "test_lists", "*", "*.yml")
all_test_dbs = glob.glob(test_db_path)
assert len(all_test_dbs
) > 0, f"Cannot find any test lists with path {test_db_path}!"

View File

@ -7,33 +7,10 @@ import tempfile
import pytest
# pytest_plugins = ["pytester", "trt_test.pytest_plugin"]
USE_TURTLE = True
try:
import trt_test # noqa
except ImportError:
from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
handle_corrections)
from .trt_test_alternative import (SessionDataWriter, check_call,
check_output, print_info)
@pytest.fixture(scope="session")
def trt_config():
return None # tekit shall never call this
@pytest.fixture(scope="session")
def gitlab_token():
return None # tekit shall never call this
@pytest.fixture(scope="session")
def versions_from_infer_device():
pass
USE_TURTLE = False
else:
from trt_test.misc import check_call, check_output, print_info
from trt_test.session_data_writer import SessionDataWriter
USE_TURTLE = True
from .trt_test_alternative import (SessionDataWriter, check_call, check_output,
print_info)
def llm_models_root() -> str:
@ -78,9 +55,9 @@ def trt_performance_cache_fpath(trt_config, trt_performance_cache_name):
return fpath
# Get the executing turtle case name
# Get the executing test case name
@pytest.fixture(autouse=True)
def turtle_case_name(request):
def test_case_name(request):
return request.node.nodeid
@ -121,44 +98,13 @@ def llm_session_data_writer(trt_config, trt_gpu_clock_lock,
session_data_writer.teardown()
if USE_TURTLE:
@pytest.fixture(scope="session")
def trt_py3_venv_factory(trt_py_base_venv_factory):
"""
Session-scoped fixture which provides a factory function to produce a VirtualenvRunner capable of
running Python3 code. Used by other session-scoped fixtures which need to modify the default VirtualenvRunner prolog.
"""
# TODO: remove update env after TURTLE support multi devices
# Temporarily update CUDA_VISIBLE_DEVICES visible device
device_count = get_device_count()
visible_devices = ",".join([str(i) for i in range(device_count)])
print_info(f"Setting CUDA_VISIBLE_DEVICES to {visible_devices}.")
os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
def factory():
return trt_py_base_venv_factory("python3")
return factory
@pytest.fixture(scope="session")
def llm_backend_venv(trt_py3_venv_factory):
"""
The fixture venv used for LLM tests.
"""
venv = trt_py3_venv_factory()
return venv
else:
@pytest.fixture(scope="session")
def custom_user_workspace(request):
@pytest.fixture(scope="session")
def custom_user_workspace(request):
return request.config.getoption("--workspace")
@pytest.fixture(scope="session")
def llm_backend_venv(custom_user_workspace):
@pytest.fixture(scope="session")
def llm_backend_venv(custom_user_workspace):
workspace_dir = custom_user_workspace
subdir = datetime.datetime.now().strftime("ws-%Y-%m-%d-%H-%M-%S")
if workspace_dir is None:
@ -689,71 +635,7 @@ def output_dir(request):
return request.config.getoption("--output-dir")
if USE_TURTLE: # perf tests can not run outside turtle for now
# Cache all the pytest items so that we can do test list validation.
ALL_PYTEST_ITEMS = None # All pytest items available, before deselection.
@pytest.hookimpl(hookwrapper=True, tryfirst=True)
def pytest_collection_modifyitems(session, config, items):
# Flush the current stdout line.
print()
import copy
global ALL_PYTEST_ITEMS
ALL_PYTEST_ITEMS = copy.copy(items)
_ = yield
else:
#
# When test parameters have an empty id, older versions of pytest ignored that parameter entirely when generating the
# test node's ID. This, however, was a bug rather than expected behavior, and it was fixed in newer
# versions of pytest: https://github.com/pytest-dev/pytest/pull/6607. TRT test defs, however, rely on this behavior
# for quite a few test names. This is a hacky WAR that restores the old behavior so that the
# test names do not change. Note: This might break in a future pytest version.
#
# TODO: Remove this hack once the test names are fixed.
#
from _pytest.python import CallSpec2
CallSpec2.id = property(
lambda self: "-".join(map(str, filter(None, self._idlist))))
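
A small hedged example of the node-id behavior this override restores; the id list below is made up for illustration:

# Illustrative only: the patched CallSpec2.id drops empty id fragments,
# while newer pytest keeps them as empty slots in the joined id.
idlist = ["test_basic", "", "max_utilization"]  # "" would come from an empty param id
patched_id = "-".join(map(str, filter(None, idlist)))  # "test_basic-max_utilization"
unpatched_id = "-".join(map(str, idlist))              # "test_basic--max_utilization"
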
# @pytest.hookimpl(tryfirst=True, hookwrapper=True)
# def pytest_collection_modifyitems(config, items):
# testlist_path = config.getoption("--test-list")
# waives_file = config.getoption("--waives-file")
# test_prefix = config.getoption("--test-prefix")
# if test_prefix:
# # Override the internal nodeid of each item to contain the correct test prefix.
# # This is needed for reporting to correctly process the test name in order to bucket
# # it into the appropriate test suite.
# for item in items:
# item._nodeid = "{}/{}".format(test_prefix, item._nodeid)
# regexp = config.getoption("--regexp")
# if testlist_path:
# modify_by_test_list(testlist_path, items, config)
# if regexp is not None:
# deselect_by_regex(regexp, items, test_prefix, config)
# if waives_file:
# apply_waives(waives_file, items, config)
# # We have to remove prefix temporarily before splitting the test list
# # After that change back the test id.
# for item in items:
# if test_prefix and item._nodeid.startswith(f"{test_prefix}/"):
# item._nodeid = item._nodeid[len(f"{test_prefix}/"):]
# yield
# for item in items:
# if test_prefix:
# item._nodeid = f"{test_prefix}/{item._nodeid}"
def deselect_by_regex(regexp, items, test_prefix, config):
def deselect_by_regex(regexp, items, test_prefix, config):
"""Filter out tests based on the patterns specified in the given list of regular expressions.
If a test matches *any* of the expressions in the list it is considered selected."""
compiled_regexes = []
@ -765,8 +647,8 @@ else:
selected = []
deselected = []
corrections = get_test_name_corrections_v2(
set(regex_list), set(it.nodeid for it in items),
corrections = get_test_name_corrections_v2(set(regex_list),
set(it.nodeid for it in items),
CorrectionMode.REGEX)
handle_corrections(corrections, test_prefix)

View File

@ -21,7 +21,7 @@ class PythonVenvRunnerImpl(PythonRunnerInterface):
venv_dir (str): Path to the virtualenv root directory, or None if this is
an externally-built virtualenv
venv_bin (str): Path to the Python executable to use when running tests
workspace (str): Path to the TURTLE workspace
workspace (str): Path to the test workspace
"""
def __init__(self, pip_opts, venv_dir, venv_bin, workspace):

View File

@ -1,6 +0,0 @@
{
"needs_turtle_major": [
5,
6
]
}

View File

@ -198,29 +198,29 @@ l0_a30:
- triton_server/test_triton_llm.py::test_mistral_v1_7b_python_backend[e2e]
- triton_server/test_triton_llm.py::test_gpt_350m_python_backend[accuracy]
- triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_mistral_v1_7b_ifb[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_350m_speculative_decoding_return_logits[False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_t5_small_enc_dec_ifb[test_basic-False-1-top_k_top_p-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-4096-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_benchmark_core_model[llama_v2_7b-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False]
- triton_server/test_triton_llm.py::test_benchmark_core_model[gptj_6b-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_whisper_large_v3_ifb[True-1-top_k_top_p-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-0.5-guaranteed_no_evict-24000-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-tensorrtllm-True-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-python-True-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p-False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_mistral_v1_7b_ifb[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_350m_speculative_decoding_return_logits[False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_t5_small_enc_dec_ifb[test_basic-False-1-top_k_top_p--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap---guaranteed_no_evict--4096-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_benchmark_core_model[llama_v2_7b-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False]
- triton_server/test_triton_llm.py::test_benchmark_core_model[gptj_6b-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_whisper_large_v3_ifb[True-1-top_k_top_p--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-0.5-guaranteed_no_evict--24000-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-tensorrtllm-True-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-python-True-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p--False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llmapi_backend[1-0-disableDecoupleMode-tensorrt_llm]
- triton_server/test_triton_llm.py::test_llmapi_backend[1-0-enableDecoupleMode-tensorrt_llm]
- triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble]
- triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1-False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]