[Infra]Remove some old keyword (#4552)

Signed-off-by: qqiao <qqiao@nvidia.com>
Emma Qiao 2025-05-31 13:50:45 +08:00 committed by GitHub
parent 8cb6163a57
commit c945e92fdb
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
17 changed files with 95 additions and 274 deletions

View File

@ -1504,7 +1504,7 @@ def runInKubernetes(pipeline, podSpec, containerName)
def launchTestJobs(pipeline, testFilter, dockerNode=null)
{
def dockerArgs = "-v /mnt/scratch.trt_llm_data:/scratch.trt_llm_data:ro -v /tmp/ccache:${CCACHE_DIR}:rw -v /tmp/pipcache/http-v2:/root/.cache/pip/http-v2:rw --cap-add syslog"
turtleConfigs = [
x86TestConfigs = [
"DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-PyTorch-Others-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
"DGX_H100-4_GPUs-CPP-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
@ -1579,7 +1579,7 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
"DGX_H200-4_GPUs-PyTorch-[Post-Merge]-2": ["dgx-h200-x4", "l0_dgx_h200", 2, 2, 4],
]
parallelJobs = turtleConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
parallelJobs = x86TestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "amd64", values[4] ?: 1, key.contains("Perf")), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
@ -1591,13 +1591,13 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
}]]}
fullSet = parallelJobs.keySet()
slurmX86Configs = [
x86SlurmTestConfigs = [
"RTXPro6000-PyTorch-[Post-Merge]-1": ["rtx-pro-6000", "l0_rtx_pro_6000", 1, 1],
"DGX_B200-4_GPUs-PyTorch-[Post-Merge]-1": ["b200-4-gpus", "l0_dgx_b200", 1, 1, 4],
]
fullSet += slurmX86Configs.keySet()
fullSet += x86SlurmTestConfigs.keySet()
parallelSlurmJobs = slurmX86Configs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), {
parallelSlurmJobs = x86SlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "amd64"), {
def config = VANILLA_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG
@ -1612,25 +1612,25 @@ def launchTestJobs(pipeline, testFilter, dockerNode=null)
// Try to match what is being tested on x86 H100_PCIe.
// The total machine time is scaled proportionally according to the number of GPUs of each type.
aarch64Configs = [
SBSATestConfigs = [
"GH200-1": ["gh200", "l0_gh200", 1, 2],
"GH200-2": ["gh200", "l0_gh200", 2, 2],
"GH200-[Post-Merge]": ["gh200", "l0_gh200", 1, 1],
]
fullSet += aarch64Configs.keySet()
fullSet += SBSATestConfigs.keySet()
slurmSBSAConfigs = [
SBSASlurmTestConfigs = [
"GB200-4_GPUs-PyTorch-[Post-Merge]-1": ["gb200-4-gpus", "l0_gb200", 1, 1, 4],
]
fullSet += slurmSBSAConfigs.keySet()
fullSet += SBSASlurmTestConfigs.keySet()
if (env.targetArch == AARCH64_TRIPLE) {
parallelJobs = aarch64Configs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
parallelJobs = SBSATestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, values[0], "arm64"), {
runLLMTestlistOnPlatform(pipeline, values[0], values[1], LINUX_AARCH64_CONFIG, false, key, values[2], values[3])
}]]}
// Add SBSA Slurm jobs
parallelSlurmJobs = slurmSBSAConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
parallelSlurmJobs = SBSASlurmTestConfigs.collectEntries{key, values -> [key, [createKubernetesPodConfig(LLM_DOCKER_IMAGE, "slurm", "arm64"), {
def config = LINUX_AARCH64_CONFIG
if (key.contains("single-device")) {
config = SINGLE_DEVICE_CONFIG

View File

@ -1,51 +0,0 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# This script collects tensorrt_llm unit tests and transforms them into TensorRT TURTLE form.
#
# Usage:
# 1. build and install tensorrt_llm python package
# 2. install pytest `pip3 install pytest`
# 3. run `python3 scripts/collect_unittests.py` in tensorrt_llm root directory.
# 4. update the collected tests in the TensorRT TURTLE test definitions.
# - check python list `LLM_UNIT_TESTS` in `<tensorrt repo>/tests/trt-test-defs/turtle/defs/llm/test_llm_unittests.py`.
from subprocess import check_output
KEYWORDS = ["<Module", "<UnitTestCase", "<TestCaseFunction"]
def fetch_tests():
text = check_output(["pytest", "--collect-only", "tests/"])
text = text.decode()
lines = text.split("\n")
lines = [line for line in lines if any([k in line for k in KEYWORDS])]
module, unittest, case = "<bad>", "<bad>", "<bad>"
for line in lines:
if "<Module" in line:
module = line.replace("<Module ", "").replace(">", "").strip()
elif "<UnitTestCase" in line:
unittest = line.replace("<UnitTestCase ", "").replace(">",
"").strip()
elif "<TestCaseFunction" in line:
case = line.replace("<TestCaseFunction ", "").replace(">",
"").strip()
print(f"LLMUnitTestCase(\"{module}\", \"{unittest}.{case}\"),")
if __name__ == "__main__":
fetch_tests()

View File

@ -15,7 +15,6 @@ def _clean_files(src_dir: PathLike, extend_files: str) -> None:
".devcontainer",
"docker/README.md",
"jenkins",
"scripts/collect_unittests.py",
"scripts/package_trt_llm.py",
"scripts/git_replace.py",
"tests/integration",

View File

@ -119,7 +119,7 @@ Due to CI hardware resource limitation, and some cases only run on specific GPUs
In directory `integration/test_lists/test-db`, each yml file corresponds to a GPU type.
In file `jenkins/L0_Test.groovy`, the variable `turtleConfigs` maps yml files to CI stages.
In file `jenkins/L0_Test.groovy`, the variables `x86TestConfigs`, `SBSATestConfigs`, `x86SlurmTestConfigs`, and `SBSASlurmTestConfigs` map yml files to CI stages according to platform and launch method.
Currently the yml files are manually maintained, which requires developers to update them when new test cases are added.
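
For illustration, a minimal Python sketch (hypothetical variable name; the entry and the positional convention are taken from the `x86TestConfigs` diff above) of how one stage name maps to a test-db yml file, a node label, a split, and a GPU count:

# Hypothetical mirror of one x86TestConfigs entry from jenkins/L0_Test.groovy.
# Positional convention assumed from the diff:
#   [node label, test-db yml name, split id, split count, GPU count]
x86_test_configs = {
    "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["dgx-h100-x4", "l0_dgx_h100", 1, 1, 4],
}
for stage, (node, yml, split_id, split_count, gpus) in x86_test_configs.items():
    print(f"{stage}: integration/test_lists/test-db/{yml}.yml on {node} "
          f"(split {split_id}/{split_count}, {gpus} GPUs)")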

View File

@ -2121,7 +2121,7 @@ def all_pytest_items():
@pytest.fixture(scope="session")
def turtle_root():
def test_root():
return os.path.dirname(os.path.dirname(__file__))

View File

@ -24,8 +24,7 @@ except ImportError:
@dataclass
class BuildConfig:
# TODO: Use `tensorrt_llm.builder.BuildConfig` when we switch to Pytest from TURTLE.
# Using TURTLE, we cannot do `import tensorrt_llm` in this file.
# TODO: Use `tensorrt_llm.builder.BuildConfig` when we switch to Pytest.
max_input_len: int = 256
max_seq_len: int = 512
opt_batch_size: int = 8

View File

@ -350,7 +350,7 @@ def write_gpu_monitoring_no_test_results(logpath,
def get_log(fpath):
"""
Converts TURTLE log output into an ordered dict of stdout and stderr.
Converts log output into an ordered dict of stdout and stderr.
Used for raw_result for test_result.
Args:

View File

@ -29,7 +29,6 @@ import time
import psutil # type: ignore
# Nvidia
import pynvml # type: ignore
# TURTLE
from defs.trt_test_alternative import print_info, print_warning
from .misc import clean_device_product_name

View File

@ -14,7 +14,7 @@
# limitations under the License.
# -*- coding: utf-8 -*-
"""
Miscellaneous utility code used by trt_test. Should contain all code that is agnostic to remote mode vs local mode of TURTLE.
Miscellaneous utility code used by trt_test. Should contain all code that is agnostic to remote mode vs local mode.
"""
import subprocess as sp

View File

@ -71,7 +71,7 @@ class SessionDataWriter:
def _write_session_perf_logs(self):
"""
Write session data. Should only be called once at the end of the entire
perf session, in otherwords, when TURTLE ends during teardown().
perf session, in other words, only during teardown().
"""
# Output various log files depending on options.
for fmt in self._output_formats:

View File

@ -274,7 +274,7 @@ class PerfTestMetric(NamedTuple):
"""
Configurations of a test metric.
"""
# The original test name used to run the TURTLE test.
# The original test name used to run the perf test.
original_test_name: str
# The name for this particular metric.
metric_name: str
@ -759,7 +759,7 @@ class MultiMetricPerfTest(AbstractPerfScriptTestClass):
"""
def __init__(self, full_test_name: str):
# full_test_name is the full test name appearing in TURTLE output.
# full_test_name is the full test name appearing in test output.
self._full_test_name = full_test_name
# test_domain_name is the part before "::".
self._test_domain_name = "::".join(full_test_name.split("::")[:-1])
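
As a hedged illustration of the naming convention in the comments above (splitting the full test name on "::"), assuming a made-up pytest-style node id:

# Illustrative only: split a full test name into its domain and case parts,
# mirroring the "::" convention used by MultiMetricPerfTest above.
full_test_name = "perf/test_perf.py::test_perf[llama_v2_7b-bench]"  # hypothetical id
test_domain_name = "::".join(full_test_name.split("::")[:-1])  # "perf/test_perf.py"
test_case_name = full_test_name.split("::")[-1]  # "test_perf[llama_v2_7b-bench]"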

View File

@ -83,7 +83,7 @@ def collect_and_clean_myelin_time(log: str):
class PerfMetricType(str, Enum):
"""
An string-enum type to define what kind of perf metric it is. While it is not used by TURTLE, it is used by QA to
A string-enum type to define what kind of perf metric it is. It is used by QA to
set up special threshold criteria for each type of metric (like >50MB for engine size increase, etc.).
"""
INFERENCE_TIME = "INFERENCE_TIME"
@ -352,13 +352,12 @@ class AbstractPerfScriptTestClass(abc.ABC):
"""
Get the absolute threshold used to flag a perf regression compared to perf baseline.
Perf comparison will only fail if it exceeds both relative and absolute thresholds.
Note: This is not honored by TURTLE for now, but we can add the support later.
"""
return 0.0
def get_metric_type(self) -> PerfMetricType:
"""
Get the type of perf metric. This does not affect TURTLE for now, but QA uses this field to set up special
Get the type of perf metric. QA uses this field to set up special
threshold criteria depending on the metric type.
"""
return PerfMetricType.INFERENCE_TIME
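
A minimal sketch, assuming hypothetical metric-type names and threshold values (only INFERENCE_TIME and the >50MB engine-size example come from the docstrings above), of how a consumer could combine the relative and absolute thresholds described here:

# Hypothetical type-specific absolute thresholds; a regression is flagged only
# when both the relative and the absolute threshold are exceeded, as described above.
ABSOLUTE_THRESHOLDS = {
    "INFERENCE_TIME": 0.0,            # rely on the relative threshold alone
    "ENGINE_SIZE": 50 * 1024 * 1024,  # ~50MB, per the docstring's example
}
def is_regression(metric_type: str, baseline: float, current: float,
                  relative_threshold: float = 0.1) -> bool:
    delta = current - baseline
    relative_hit = baseline > 0 and (delta / baseline) > relative_threshold
    absolute_hit = delta > ABSOLUTE_THRESHOLDS.get(metric_type, 0.0)
    return relative_hit and absolute_hit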

View File

@ -87,7 +87,7 @@ def validate_perf_tests(perf_test_names) -> bool:
return passed
def test_list_validation(turtle_root, all_pytest_items, trt_config,
def test_list_validation(test_root, all_pytest_items, trt_config,
is_trt_environment):
# Don't run test list validation in TRT environment because TRT uses
@ -99,13 +99,13 @@ def test_list_validation(turtle_root, all_pytest_items, trt_config,
return
# Glob all the test list files.
test_list_path = os.path.join(turtle_root, "test_lists", "*", "*.txt")
test_list_path = os.path.join(test_root, "test_lists", "*", "*.txt")
all_test_lists = glob.glob(test_list_path)
assert len(all_test_lists
) > 0, f"Cannot find any test lists with path {test_list_path}!"
# Glob all the test db files.
test_db_path = os.path.join(turtle_root, "test_lists", "*", "*.yml")
test_db_path = os.path.join(test_root, "test_lists", "*", "*.yml")
all_test_dbs = glob.glob(test_db_path)
assert len(all_test_dbs
) > 0, f"Cannot find any test lists with path {test_db_path}!"

View File

@ -7,33 +7,10 @@ import tempfile
import pytest
# pytest_plugins = ["pytester", "trt_test.pytest_plugin"]
USE_TURTLE = True
try:
import trt_test # noqa
except ImportError:
from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
from .test_list_parser import (CorrectionMode, get_test_name_corrections_v2,
handle_corrections)
from .trt_test_alternative import (SessionDataWriter, check_call,
check_output, print_info)
@pytest.fixture(scope="session")
def trt_config():
return None # tekit shall never call this
@pytest.fixture(scope="session")
def gitlab_token():
return None # tekit shall never call this
@pytest.fixture(scope="session")
def versions_from_infer_device():
pass
USE_TURTLE = False
else:
from trt_test.misc import check_call, check_output, print_info
from trt_test.session_data_writer import SessionDataWriter
USE_TURTLE = True
from .trt_test_alternative import (SessionDataWriter, check_call, check_output,
print_info)
def llm_models_root() -> str:
@ -78,9 +55,9 @@ def trt_performance_cache_fpath(trt_config, trt_performance_cache_name):
return fpath
# Get the executing turtle case name
# Get the executing test case name
@pytest.fixture(autouse=True)
def turtle_case_name(request):
def test_case_name(request):
return request.node.nodeid
@ -121,44 +98,13 @@ def llm_session_data_writer(trt_config, trt_gpu_clock_lock,
session_data_writer.teardown()
if USE_TURTLE:
@pytest.fixture(scope="session")
def trt_py3_venv_factory(trt_py_base_venv_factory):
"""
Session-scoped fixture which provides a factory function to produce a VirtualenvRunner capable of
running Python3 code. Used by other session-scoped fixtures which need to modify the default VirtualenvRunner prolog.
"""
# TODO: remove update env after TURTLE support multi devices
# Temporarily update CUDA_VISIBLE_DEVICES visible device
device_count = get_device_count()
visible_devices = ",".join([str(i) for i in range(device_count)])
print_info(f"Setting CUDA_VISIBLE_DEVICES to {visible_devices}.")
os.environ["CUDA_VISIBLE_DEVICES"] = visible_devices
def factory():
return trt_py_base_venv_factory("python3")
return factory
@pytest.fixture(scope="session")
def llm_backend_venv(trt_py3_venv_factory):
"""
The fixture venv used for LLM tests.
"""
venv = trt_py3_venv_factory()
return venv
else:
@pytest.fixture(scope="session")
def custom_user_workspace(request):
@pytest.fixture(scope="session")
def custom_user_workspace(request):
return request.config.getoption("--workspace")
@pytest.fixture(scope="session")
def llm_backend_venv(custom_user_workspace):
@pytest.fixture(scope="session")
def llm_backend_venv(custom_user_workspace):
workspace_dir = custom_user_workspace
subdir = datetime.datetime.now().strftime("ws-%Y-%m-%d-%H-%M-%S")
if workspace_dir is None:
@ -689,71 +635,7 @@ def output_dir(request):
return request.config.getoption("--output-dir")
if USE_TURTLE: # perf tests can not run outside turtle for now
# Cache all the pytest items so that we can do test list validation.
ALL_PYTEST_ITEMS = None # All pytest items available, before deselection.
@pytest.hookimpl(hookwrapper=True, tryfirst=True)
def pytest_collection_modifyitems(session, config, items):
# Flush the current stdout line.
print()
import copy
global ALL_PYTEST_ITEMS
ALL_PYTEST_ITEMS = copy.copy(items)
_ = yield
else:
#
# When test parameters have an empty id, older versions of pytest ignored that parameter entirely when generating the
# test node's ID. This, however, was a bug rather than expected behavior, and it was fixed in newer
# versions of pytest: https://github.com/pytest-dev/pytest/pull/6607. TRT test defs, however, rely on this behavior
# for quite a few test names. This is a hacky WAR that restores the old behavior so that the
# test names do not change. Note: This might break in a future pytest version.
#
# TODO: Remove this hack once the test names are fixed.
#
from _pytest.python import CallSpec2
CallSpec2.id = property(
lambda self: "-".join(map(str, filter(None, self._idlist))))
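
A small hedged example of the node-id behavior this override restores; the id list below is made up for illustration:

# Illustrative only: the patched CallSpec2.id drops empty id fragments,
# while newer pytest keeps them as empty slots in the joined id.
idlist = ["test_basic", "", "max_utilization"]  # "" would come from an empty param id
patched_id = "-".join(map(str, filter(None, idlist)))  # "test_basic-max_utilization"
unpatched_id = "-".join(map(str, idlist))              # "test_basic--max_utilization"
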
# @pytest.hookimpl(tryfirst=True, hookwrapper=True)
# def pytest_collection_modifyitems(config, items):
# testlist_path = config.getoption("--test-list")
# waives_file = config.getoption("--waives-file")
# test_prefix = config.getoption("--test-prefix")
# if test_prefix:
# # Override the internal nodeid of each item to contain the correct test prefix.
# # This is needed for reporting to correctly process the test name in order to bucket
# # it into the appropriate test suite.
# for item in items:
# item._nodeid = "{}/{}".format(test_prefix, item._nodeid)
# regexp = config.getoption("--regexp")
# if testlist_path:
# modify_by_test_list(testlist_path, items, config)
# if regexp is not None:
# deselect_by_regex(regexp, items, test_prefix, config)
# if waives_file:
# apply_waives(waives_file, items, config)
# # We have to remove prefix temporarily before splitting the test list
# # After that change back the test id.
# for item in items:
# if test_prefix and item._nodeid.startswith(f"{test_prefix}/"):
# item._nodeid = item._nodeid[len(f"{test_prefix}/"):]
# yield
# for item in items:
# if test_prefix:
# item._nodeid = f"{test_prefix}/{item._nodeid}"
def deselect_by_regex(regexp, items, test_prefix, config):
def deselect_by_regex(regexp, items, test_prefix, config):
"""Filter out tests based on the patterns specified in the given list of regular expressions.
If a test matches *any* of the expressions in the list it is considered selected."""
compiled_regexes = []
@ -765,8 +647,8 @@ else:
selected = []
deselected = []
corrections = get_test_name_corrections_v2(
set(regex_list), set(it.nodeid for it in items),
corrections = get_test_name_corrections_v2(set(regex_list),
set(it.nodeid for it in items),
CorrectionMode.REGEX)
handle_corrections(corrections, test_prefix)

View File

@ -21,7 +21,7 @@ class PythonVenvRunnerImpl(PythonRunnerInterface):
venv_dir (str): Path to the virtualenv root directory, or None if this is
an externally-built virtualenv
venv_bin (str): Path to the Python executable to use when running tests
workspace (str): Path to the TURTLE workspace
workspace (str): Path to the test workspace
"""
def __init__(self, pip_opts, venv_dir, venv_bin, workspace):

View File

@ -1,6 +0,0 @@
{
"needs_turtle_major": [
5,
6
]
}

View File

@ -198,29 +198,29 @@ l0_a30:
- triton_server/test_triton_llm.py::test_mistral_v1_7b_python_backend[e2e]
- triton_server/test_triton_llm.py::test_gpt_350m_python_backend[accuracy]
- triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_mistral_v1_7b_ifb[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1-False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_350m_speculative_decoding_return_logits[False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_t5_small_enc_dec_ifb[test_basic-False-1-top_k_top_p-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-4096-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_benchmark_core_model[llama_v2_7b-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False]
- triton_server/test_triton_llm.py::test_benchmark_core_model[gptj_6b-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-4096-1-1-1-False]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1-False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_whisper_large_v3_ifb[True-1-top_k_top_p-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-0.5-guaranteed_no_evict-24000-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-tensorrtllm-True-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-python-True-1-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p-False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization-1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.7-max_utilization---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_mistral_v1_7b_ifb[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_mistral_v1_multi_models[False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_350m_speculative_decoding_return_logits[False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_t5_small_enc_dec_ifb[test_basic-False-1-top_k_top_p--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap---guaranteed_no_evict--4096-1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[False-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_benchmark_core_model[llama_v2_7b-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False]
- triton_server/test_triton_llm.py::test_benchmark_core_model[gptj_6b-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization-4096--1-1-1-False]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_gpt_speculative_decoding_bls[True-False-1---False-True-True-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_whisper_large_v3_ifb[True-1-top_k_top_p--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-0.5-guaranteed_no_evict--24000-1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llava_onevision[test_video-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llava_onevision[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-False-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-tensorrtllm-True-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_tiny_llama_1b_guided_decoding[xgrammar-python-True-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble-accuracy]
- triton_server/test_triton_llm.py::test_gpt_disaggregated_serving_bls[test_basic-False-1-top_k_top_p--False-True-True-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-0.2-max_utilization---1-1-1-True-tensorrt_llm_bls]
- triton_server/test_triton_llm.py::test_llama_v2_70b_ifb_lad[7-7-7-False-1-lookahead--False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
- triton_server/test_triton_llm.py::test_llmapi_backend[1-0-disableDecoupleMode-tensorrt_llm]
- triton_server/test_triton_llm.py::test_llmapi_backend[1-0-enableDecoupleMode-tensorrt_llm]
- triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10-False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap-guaranteed_no_evict-1-1-1-False-ensemble]
- triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1-False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap-max_utilization-1-1-1-False-ensemble]
- triton_server/test_triton_rcca.py::test_mistral_beam_search[rcca_4714407-True-10---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
- triton_server/test_triton_rcca.py::test_rcca_bug_4934893[Temperature:0.5-TOP_P:0.95-TOP_K:10-False-1---False-True-False-0-2048-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]