diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index 5311418bde..af075e21b7 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -1504,7 +1504,7 @@ def runInKubernetes(pipeline, podSpec, containerName)
 def launchTestJobs(pipeline, testFilter, dockerNode=null)
 {
     def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
-    turtleConfigs = [
+    x86TestConfigs = [
         "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
         "DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
@@ -1579,7 +1579,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
         "DGX_H200-4_GPUs-PyTorch-[Post-Merge]-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 2, 4],
     ]
 
-    parallelJobs = turtleConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
+    parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -1591,13 +1591,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
     }]]}
     fullSet = parallelJobs.keySet()
 
-    slurmX86Configs = [
+    x86SlurmTestConfigs = [
         "RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
         "DGX_B200-4_GPUs-PyTorch-[Post-Merge]-1": ["b200-4-gpus", "l0_dgx_b200", 1, 1, 4],
    ]
-    fullSet += slurmX86Configs.keySet()
+    fullSet += x86SlurmTestConfigs.keySet()
 
-    parallelSlurmJobs = slurmX86Configs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), {
+    parallelSlurmJobs = x86SlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), {
         def config = VANILLA_CONFIG
         if (key.contains("single-device")) {
             config = SINGLE_DEVICE_CONFIG
@@ -1612,25 +1612,25 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
 
     // Try to match what are being tested on x86 H100_PCIe.
     // The total machine time is scaled proportionally according to the number of each GPU.
-    aarch64Configs = [
+    SBSATestConfigs = [
         "GH200-1": ["gh200", "l0_gh200", 1, 2],
         "GH200-2": ["gh200", "l0_gh200", 2, 2],
         "GH200-[Post-Merge]": ["gh200", "l0_gh200", 1, 1],
     ]
-    fullSet += aarch64Configs.keySet()
+    fullSet += SBSATestConfigs.keySet()
 
-    slurmSBSAConfigs = [
+    SBSASlurmTestConfigs = [
         "GB200-4_GPUs-PyTorch-[Post-Merge]-1": ["gb200-4-gpus", "l0_gb200", 1, 1, 4],
     ]
-    fullSet += slurmSBSAConfigs.keySet()
+    fullSet += SBSASlurmTestConfigs.keySet()
 
     if (env.targetArch == AARCH64_TRIPLE) {
-        parallelJobs = aarch64Configs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
+        parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
            runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
         }]]}
 
        // Add SBSA Slurm jobs
-        parallelSlurmJobs = slurmSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
+        parallelSlurmJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
             def config = LINUX_AARCH64_CONFIG
             if (key.contains("single-device")) {
                 config = SINGLE_DEVICE_CONFIG
diff --git a/scripts/collect_unittests.py b/scripts/collect_unittests.py
deleted file mode 100755
index 32c3d4e3d8..0000000000
--- a/scripts/collect_unittests.py
+++ /dev/null
@@ -1,51 +0,0 @@
-#!/usr/bin/env python3
-# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#
-# This script collects tensorrt_llm unit tests and transform them into TensorRT TURTLE form.
-#
-# Usage:
-# 1. build and install tensorrt_llm python package
-# 2. install pytest `pip3 install pytest`
-# 3. run `python3 scripts/collect_unittests.py` in tensorrt_llm root directory.
-# 4. update the collected tests into TensorRt TURTLE test.
-#    - check python list `LLM_UNIT_TESTS` in `/tests/trt-test-defs/turtle/defs/llm/test_llm_unittests.py`.
-from subprocess import check_output
-
-KEYWORDS = ["<Module ", "<UnitTestCase ", "<TestCaseFunction "]
-
-
-def fetch_tests():
-    output = check_output(
-        ["pytest", "--collect-only", "tests"]).decode()
-    lines = output.split("\n")
-    module = ""
-    unittest = ""
-    case = ""
-    for line in lines:
-        if "<Module " in line:
-            module = line.replace("<Module ", "").replace(">", "").strip()
-        elif "<UnitTestCase " in line:
-            unittest = line.replace("<UnitTestCase ",
-                                    "").replace(">", "").strip()
-        elif "<TestCaseFunction " in line:
-            case = line.replace("<TestCaseFunction ",
-                                "").replace(">", "").strip()
-            print(f"LLMUnitTestCase(\"{module}\", \"{unittest}.{case}\"),")
-
-
-if __name__ == "__main__":
-    fetch_tests()
diff --git a/scripts/package_trt_llm.py b/scripts/package_trt_llm.py
index fcb4cde494..c89b8bb29c 100644
--- a/scripts/package_trt_llm.py
+++ b/scripts/package_trt_llm.py
@@ -15,7 +15,6 @@ def _clean_files(src_dir: PathLike, extend_files: str) -> None:
         ".devcontainer",
         "docker/README.md",
         "jenkins",
-        "scripts/collect_unittests.py",
         "scripts/package_trt_llm.py",
         "scripts/git_replace.py",
         "tests/integration",
diff --git a/tests/README.md b/tests/README.md
index 02195b061e..69c39e9a24 100644
--- a/tests/README.md
+++ b/tests/README.md
@@ -119,7 +119,7 @@ Due to CI hardware resource limitation, and some cases only run on specific GPUs,
 
 In directory `integration/test_lists/test-db`, each yml file corresponds to a GPU type.
 
-In file `jenkins/L0_Test.groovy`, the variable `turtleConfigs` maps yml files to CI stages.
+In file `jenkins/L0_Test.groovy`, the variables `x86TestConfigs`, `SBSATestConfigs`, `x86SlurmTestConfigs` and `SBSASlurmTestConfigs` map yml files to CI stages according to platform and launch method.
 
 Currently the yml files are manually maintained, which requires developer to update them when new test cases are added.
diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py
index 30afaf6de5..83ac026492 100644
--- a/tests/integration/defs/conftest.py
+++ b/tests/integration/defs/conftest.py
@@ -2121,7 +2121,7 @@ def all_pytest_items():
 
 
 @pytest.fixture(scope="session")
-def turtle_root():
+def test_root():
     return os.path.dirname(os.path.dirname(__file__))
 
 
diff --git a/tests/integration/defs/perf/allowed_configs.py b/tests/integration/defs/perf/allowed_configs.py
index 4bfd022da5..5a4885796c 100644
--- a/tests/integration/defs/perf/allowed_configs.py
+++ b/tests/integration/defs/perf/allowed_configs.py
@@ -24,8 +24,7 @@ except ImportError:
 
 @dataclass
 class BuildConfig:
-    # TODO: Use `tensorrt_llm.builder.BuildConfig` when we switch to Pytest from TURTLE.
-    # Using TURTLE, we cannot do `import tensorrt_llm` in this file.
+    # TODO: Use `tensorrt_llm.builder.BuildConfig` when we switch to Pytest.
     max_input_len: int = 256
     max_seq_len: int = 512
     opt_batch_size: int = 8
diff --git a/tests/integration/defs/perf/data_export.py b/tests/integration/defs/perf/data_export.py
index aaa9f84ba4..7615883c18 100644
--- a/tests/integration/defs/perf/data_export.py
+++ b/tests/integration/defs/perf/data_export.py
@@ -350,7 +350,7 @@ def write_gpu_monitoring_no_test_results(logpath,
 
 def get_log(fpath):
     """
-    Converts TURTLE log output into an ordered dict of stdout and stderr.
+    Converts log output into an ordered dict of stdout and stderr.
     Used for raw_result for test_result.
 
     Args:
diff --git a/tests/integration/defs/perf/gpu_clock_lock.py b/tests/integration/defs/perf/gpu_clock_lock.py
index 8a61488860..61c86b89b9 100644
--- a/tests/integration/defs/perf/gpu_clock_lock.py
+++ b/tests/integration/defs/perf/gpu_clock_lock.py
@@ -29,7 +29,6 @@ import time
 import psutil  # type: ignore
 # Nvidia
 import pynvml  # type: ignore
-# TURTLE
 from defs.trt_test_alternative import print_info, print_warning
 
 from .misc import clean_device_product_name
diff --git a/tests/integration/defs/perf/misc.py b/tests/integration/defs/perf/misc.py
index 8070b265f1..031004366a 100644
--- a/tests/integration/defs/perf/misc.py
+++ b/tests/integration/defs/perf/misc.py
@@ -14,7 +14,7 @@
 # limitations under the License.
 # -*- coding: utf-8 -*-
 """
-Miscellaneous utility code used by trt_test. Should contain all code that is agnostic to remote mode vs local mode of TURTLE.
+Miscellaneous utility code used by trt_test. Should contain all code that is agnostic to remote mode vs local mode.
 """
 
 import subprocess as sp
diff --git a/tests/integration/defs/perf/session_data_writer.py b/tests/integration/defs/perf/session_data_writer.py
index dce67b5145..e28ba79be2 100644
--- a/tests/integration/defs/perf/session_data_writer.py
+++ b/tests/integration/defs/perf/session_data_writer.py
@@ -71,7 +71,7 @@ class SessionDataWriter:
     def _write_session_perf_logs(self):
         """
         Write session data. Should only be called once at the end of the entire
-        perf session, in otherwords, when TURTLE ends during teardown().
+        perf session, in other words, only during teardown().
         """
         # Output various log files depending on options.
         for fmt in self._output_formats:
diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index c155c13c7e..6c103155eb 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -274,7 +274,7 @@ class PerfTestMetric(NamedTuple):
     """
     Configurations of a test metric.
     """
-    # The original test name used to run the TURTLE test.
+    # The original test name used to run the perf test.
     original_test_name: str
     # The name for this particular metric.
     metric_name: str
@@ -759,7 +759,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
     """
 
     def __init__(self, full_test_name: str):
-        # full_test_name is the full test name appearing in TURTLE output.
+        # full_test_name is the full test name appearing in test output.
         self._full_test_name = full_test_name
         # test_domain_name is the part before "::".
         self._test_domain_name = "::".join(full_test_name.split("::")[:-1])
diff --git a/tests/integration/defs/perf/utils.py b/tests/integration/defs/perf/utils.py
index 129731947b..4e3d583b9a 100644
--- a/tests/integration/defs/perf/utils.py
+++ b/tests/integration/defs/perf/utils.py
@@ -83,7 +83,7 @@ def collect_and_clean_myelin_time(log: str):
 
 class PerfMetricType(str, Enum):
     """
-    An string-enum type to define what kind of perf metric it is. While it is not used by TURTLE, it is used by QA to
+    A string-enum type to define what kind of perf metric it is. It is used by QA to
     set up special threshold criteria for each type of metrics (like >50MB for engine size increase, etc.).
     """
     INFERENCE_TIME = "INFERENCE_TIME"
@@ -352,13 +352,12 @@ class AbstractPerfScriptTestClass(abc.ABC):
         """
         Get the absolute threshold used to flag a perf regression compared to perf baseline.
         Perf comparison will only fail if it exceeds both relative and absolute thresholds.
-        Note: This is not honored by TURTLE for now, but we can add the support later.
         """
         return 0.0
 
     def get_metric_type(self) -> PerfMetricType:
         """
-        Get the type of perf metric. This does not affect TURTLE for now, but QA uses this field to set up special
+        Get the type of perf metric. QA uses this field to set up special
         threshold criteria depending on the metric type.
         """
         return PerfMetricType.INFERENCE_TIME
diff --git a/tests/integration/defs/test_list_validation.py b/tests/integration/defs/test_list_validation.py
index 500e539dad..d671644641 100644
--- a/tests/integration/defs/test_list_validation.py
+++ b/tests/integration/defs/test_list_validation.py
@@ -87,7 +87,7 @@ def validate_perf_tests(perf_test_names) -> bool:
     return passed
 
 
-def test_list_validation(turtle_root, all_pytest_items, trt_config,
+def test_list_validation(test_root, all_pytest_items, trt_config,
                          is_trt_environment):
 
     # Don't run test list validation in TRT environment because TRT uses
@@ -99,13 +99,13 @@ def test_list_validation(turtle_root, all_pytest_items, trt_config,
         return
 
     # Glob all the test list files.
-    test_list_path = os.path.join(turtle_root, "test_lists", "*", "*.txt")
+    test_list_path = os.path.join(test_root, "test_lists", "*", "*.txt")
     all_test_lists = glob.glob(test_list_path)
     assert len(all_test_lists
                ) > 0, f"Cannot find any test lists with path {test_list_path}!"
 
     # Glob all the test db files.
-    test_db_path = os.path.join(turtle_root, "test_lists", "*", "*.yml")
+    test_db_path = os.path.join(test_root, "test_lists", "*", "*.yml")
     all_test_dbs = glob.glob(test_db_path)
     assert len(all_test_dbs
                ) > 0, f"Cannot find any test lists with path {test_db_path}!"
diff --git a/tests/integration/defs/triton_server/conftest.py b/tests/integration/defs/triton_server/conftest.py
index 7e2ca6ddc0..9cb5642a03 100644
--- a/tests/integration/defs/triton_server/conftest.py
+++ b/tests/integration/defs/triton_server/conftest.py
@@ -7,33 +7,10 @@ import tempfile
 
 import pytest
 
-# pytest_plugins = ["pytester", "trt_test.pytest_plugin"]
-USE_TURTLE = True
-try:
-    import trt_test  # noqa
-except ImportError:
-    from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
-                                   handle_corrections)
-    from .trt_test_alternative import (SessionDataWriter, check_call,
-                                       check_output, print_info)
-
-    @pytest.fixture(scope="session")
-    def trt_config():
-        return None  # tekit shall never call this
-
-    @pytest.fixture(scope="session")
-    def gitlab_token():
-        return None  # tekit shall never call this
-
-    @pytest.fixture(scope="session")
-    def versions_from_infer_device():
-        pass
-
-    USE_TURTLE = False
-else:
-    from trt_test.misc import check_call, check_output, print_info
-    from trt_test.session_data_writer import SessionDataWriter
-    USE_TURTLE = True
+from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
+                               handle_corrections)
+from .trt_test_alternative import (SessionDataWriter, check_call, check_output,
+                                   print_info)
 
 
 def llm_models_root() -> str:
@@ -78,9 +55,9 @@ def trt_performance_cache_fpath(trt_config, trt_performance_cache_name):
     return fpath
 
 
-# Get the executing turtle case name
+# Get the executing test case name
 @pytest.fixture(autouse=True)
-def turtle_case_name(request):
+def test_case_name(request):
     return request.node.nodeid
 
 
@@ -121,52 +98,21 @@ def llm_session_data_writer(trt_config, trt_gpu_clock_lock,
     session_data_writer.teardown()
 
 
-if USE_TURTLE:
+@pytest.fixture(scope="session")
+def custom_user_workspace(request):
+    return request.config.getoption("--workspace")
-
-    @pytest.fixture(scope="session")
-    def trt_py3_venv_factory(trt_py_base_venv_factory):
-        """
-        Session-scoped fixture which provides a factory function to produce a VirtualenvRunner capable of
-        running Python3 code. Used by other session-scoped fixtures which need to modify the default VirtualenvRunner prolog.
-        """
-        # TODO: remove update env after TURTLE support multi devices
-        # Temporarily update CUDA_VISIBLE_DEVICES visible device
-        device_count = get_device_count()
-        visible_devices = ",".join([str(i) for i in range(device_count)])
-
-        print_info(f"Setting CUDA_VISIBLE_DEVICES to {visible_devices}.")
-
-        os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
-
-        def factory():
-            return trt_py_base_venv_factory("python3")
-
-        return factory
-
-    @pytest.fixture(scope="session")
-    def llm_backend_venv(trt_py3_venv_factory):
-        """
-        The fixture venv used for LLM tests.
-        """
-        venv = trt_py3_venv_factory()
-        return venv
-else:
-
-    @pytest.fixture(scope="session")
-    def custom_user_workspace(request):
-        return request.config.getoption("--workspace")
-
-    @pytest.fixture(scope="session")
-    def llm_backend_venv(custom_user_workspace):
-        workspace_dir = custom_user_workspace
-        subdir = datetime.datetime.now().strftime("ws-%Y-%m-%d-%H-%M-%S")
-        if workspace_dir is None:
-            workspace_dir = "triton-backend-test-workspace"
-        workspace_dir = os.path.join(workspace_dir, subdir)
-        from defs.local_venv import PythonVenvRunnerImpl
-        return PythonVenvRunnerImpl("", "", "python3",
-                                    os.path.join(os.getcwd(), workspace_dir))
+@pytest.fixture(scope="session")
+def llm_backend_venv(custom_user_workspace):
+    workspace_dir = custom_user_workspace
+    subdir = datetime.datetime.now().strftime("ws-%Y-%m-%d-%H-%M-%S")
+    if workspace_dir is None:
+        workspace_dir = "triton-backend-test-workspace"
+    workspace_dir = os.path.join(workspace_dir, subdir)
+    from defs.local_venv import PythonVenvRunnerImpl
+    return PythonVenvRunnerImpl("", "", "python3",
+                                os.path.join(os.getcwd(), workspace_dir))
 
 
 @pytest.fixture(scope="session")
@@ -689,98 +635,34 @@ def output_dir(request):
     return request.config.getoption("--output-dir")
 
 
-if USE_TURTLE:  # perf tests can not run outside turtle for now
-    # Cache all the pytest items so that we can do test list validation.
-    ALL_PYTEST_ITEMS = None  # All pytest items available, before deselection.
+def deselect_by_regex(regexp, items, test_prefix, config):
+    """Filter out tests based on the patterns specified in the given list of regular expressions.
+    If a test matches *any* of the expressions in the list it is considered selected."""
+    compiled_regexes = []
+    regex_list = []
+    r = re.compile(regexp)
+    compiled_regexes.append(r)
+    regex_list.append(regexp)
 
-    @pytest.hookimpl(hookwrapper=True, tryfirst=True)
-    def pytest_collection_modifyitems(session, config, items):
-        # Flush the current stdout line.
-        print()
+    selected = []
+    deselected = []
 
-        import copy
+    corrections = get_test_name_corrections_v2(set(regex_list),
+                                               set(it.nodeid for it in items),
+                                               CorrectionMode.REGEX)
+    handle_corrections(corrections, test_prefix)
 
-        global ALL_PYTEST_ITEMS
-        ALL_PYTEST_ITEMS = copy.copy(items)
-        _ = yield
+    for item in items:
+        found = False
+        for regex in compiled_regexes:
+            if regex.search(item.nodeid):
+                found = True
+                break
+        if found:
+            selected.append(item)
+        else:
+            deselected.append(item)
 
-else:
-    #
-    # When test parameters have an empty id, older versions of pytest ignored that parameter when generating the
-    # test node's ID completely. This however was actually a bug, and not expected behavior that got fixed in newer
-    # versions of pytest:https://github.com/pytest-dev/pytest/pull/6607. TRT test defs however rely on this behavior
-    # for quite a few test names. This is a hacky WAR that restores the old behavior back so that the
-    # test names do not change. Note: This might break in a future pytest version.
-    #
-    # TODO: Remove this hack once the test names are fixed.
-    #
-
-    from _pytest.python import CallSpec2
-    CallSpec2.id = property(
-        lambda self: "-".join(map(str, filter(None, self._idlist))))
-
-    # @pytest.hookimpl(tryfirst=True, hookwrapper=True)
-    # def pytest_collection_modifyitems(config, items):
-    #     testlist_path = config.getoption("--test-list")
-    #     waives_file = config.getoption("--waives-file")
-    #     test_prefix = config.getoption("--test-prefix")
-    #     if test_prefix:
-    #         # Override the internal nodeid of each item to contain the correct test prefix.
-    #         # This is needed for reporting to correctly process the test name in order to bucket
-    #         # it into the appropriate test suite.
-    #         for item in items:
-    #             item._nodeid = "{}/{}".format(test_prefix, item._nodeid)
-
-    #     regexp = config.getoption("--regexp")
-
-    #     if testlist_path:
-    #         modify_by_test_list(testlist_path, items, config)
-
-    #     if regexp is not None:
-    #         deselect_by_regex(regexp, items, test_prefix, config)
-
-    #     if waives_file:
-    #         apply_waives(waives_file, items, config)
-
-    #     # We have to remove prefix temporarily before splitting the test list
-    #     # After that change back the test id.
-    #     for item in items:
-    #         if test_prefix and item._nodeid.startswith(f"{test_prefix}/"):
-    #             item._nodeid = item._nodeid[len(f"{test_prefix}/"):]
-    #     yield
-    #     for item in items:
-    #         if test_prefix:
-    #             item._nodeid = f"{test_prefix}/{item._nodeid}"
-
-    def deselect_by_regex(regexp, items, test_prefix, config):
-        """Filter out tests based on the patterns specified in the given list of regular expressions.
-        If a test matches *any* of the expressions in the list it is considered selected."""
-        compiled_regexes = []
-        regex_list = []
-        r = re.compile(regexp)
-        compiled_regexes.append(r)
-        regex_list.append(regexp)
-
-        selected = []
-        deselected = []
-
-        corrections = get_test_name_corrections_v2(
-            set(regex_list), set(it.nodeid for it in items),
-            CorrectionMode.REGEX)
-        handle_corrections(corrections, test_prefix)
-
-        for item in items:
-            found = False
-            for regex in compiled_regexes:
-                if regex.search(item.nodeid):
-                    found = True
-                    break
-            if found:
-                selected.append(item)
-            else:
-                deselected.append(item)
-
-        if deselected:
-            config.hook.pytest_deselected(items=deselected)
-            items[:] = selected
+    if deselected:
+        config.hook.pytest_deselected(items=deselected)
+        items[:] = selected
diff --git a/tests/integration/defs/triton_server/local_venv.py b/tests/integration/defs/triton_server/local_venv.py
index f7ddddc243..77dfb810de 100644
--- a/tests/integration/defs/triton_server/local_venv.py
+++ b/tests/integration/defs/triton_server/local_venv.py
@@ -21,7 +21,7 @@ class PythonVenvRunnerImpl(PythonRunnerInterface):
         venv_dir (str): Path to the virtualenv root directory, or None if this is an externally-built virtualenv
         venv_bin (str): Path to the Python executable to use when running tests
-        workspace (str): Path to the TURTLE workspace
+        workspace (str): Path to the test workspace
     """
 
     def __init__(self, pip_opts, venv_dir, venv_bin, workspace):
diff --git a/tests/integration/defs/triton_server/turtle_defs.json b/tests/integration/defs/triton_server/turtle_defs.json
deleted file mode 100644
index c77f012b04..0000000000
--- a/tests/integration/defs/triton_server/turtle_defs.json
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-    "needs_turtle_major": [
-        5,
-        6
-    ]
-}
diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml
index 2ed2aa319d..28e484dcee 100644
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@@ -198,29 +198,29 @@ l0_a30:
   - triton_server/test_triton_llm.py::test_mistral_v1_7b_python_backend[e2e]
   - triton_server/test_triton_llm.py::test_gpt_350m_python_backend[accuracy]
   - triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
-  - triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_llava[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_mistral_v1_7b_ifb[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_gpt_350m_speculative_decoding_return_logits[False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_t5_small_enc_dec_ifb[test_basic-False-1-top_k_top_p-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-4096-1-1-1-False-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_benchmark_core_model[llama_v2_7b-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False]
-  - triton_server/test_triton_llm.py::test_benchmark_core_model[gptj_6b-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False]
-  - triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_whisper_large_v3_ifb[True-1-top_k_top_p-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-0.5-guaranteed_no_evict-24000-1-1-1-False-ensemble]
-  - triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-tensorrtllm-True-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble-accuracy]
-  - triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-python-True-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble-accuracy]
-  - triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p-False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-True-tensorrt_llm_bls]
-  - triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization---1-1-1-False-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_mistral_v1_7b_ifb[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_gpt_350m_speculative_decoding_return_logits[False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_t5_small_enc_dec_ifb[test_basic-False-1-top_k_top_p--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap---guaranteed_no_evict--4096-1-1-1-False-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_benchmark_core_model[llama_v2_7b-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False]
+  - triton_server/test_triton_llm.py::test_benchmark_core_model[gptj_6b-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False]
+  - triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_whisper_large_v3_ifb[True-1-top_k_top_p--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-0.5-guaranteed_no_evict--24000-1-1-1-False-ensemble]
+  - triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-tensorrtllm-True-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble-accuracy]
+  - triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-python-True-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble-accuracy]
+  - triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p--False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-True-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_llmapi_backend[1-0-disableDecoupleMode-tensorrt_llm]
   - triton_server/test_triton_llm.py::test_llmapi_backend[1-0-enableDecoupleMode-tensorrt_llm]
-  - triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble]
-  - triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1-False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
+  - triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
+  - triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
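
For reference, all four renamed config maps in jenkins/L0_Test.groovy share one tuple shape, which launchTestJobs reads positionally. A minimal annotated sketch in Groovy; the field meanings are inferred from the call sites in this patch (createKubernetesPodConfig and runLLMTestlistOnPlatform), and the shard index/count reading of values[2] and values[3] is an assumption, not confirmed by the source:

    // One x86TestConfigs entry, copied from the patch above and read positionally:
    //   values[0] -> node/platform label passed to createKubernetesPodConfig
    //   values[1] -> test-db yml name under tests/integration/test_lists/test-db
    //   values[2], values[3] -> assumed shard index and shard count for splitting the stage
    //   values[4] -> GPU count, defaulted with `values[4] ?: 1` when absent
    "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],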