test: skip post blackwell (#6357)

Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
This commit is contained in:
xinhe-nv 2025-08-02 01:10:14 +08:00 committed by GitHub
parent 5247df6ae2
commit 263c6c0ad0
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
30 changed files with 258 additions and 194 deletions

View File

@ -50,11 +50,13 @@ class TestGpt2(CliFlowAccuracyTestHarness):
def test_context_fmha_fp32_acc(self):
self.run(extra_summarize_args=["--enable_context_fmha_fp32_acc"])
@skip_post_blackwell
@pytest.mark.parametrize("precision", ["int8", "int4"])
def test_weight_only(self, precision: str):
quant_algo = QuantAlgo.W8A16 if precision == "int8" else QuantAlgo.W4A16
self.run(quant_algo=quant_algo)
@skip_post_blackwell
def test_int8_kv_cache(self):
self.run(kv_cache_quant_algo=QuantAlgo.INT8)
@ -415,6 +417,7 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
EAGLE_MODEL_NAME = "yuhuili/EAGLE-Vicuna-7B-v1.3"
EAGLE_MODEL_PATH = f"{llm_models_root()}/EAGLE-Vicuna-7B-v1.3"
@skip_post_blackwell
def test_lookahead(self, mocker):
mocker.patch.object(CnnDailymail, "MAX_BATCH_SIZE", 8)
@ -425,6 +428,7 @@ class TestVicuna7B(CliFlowAccuracyTestHarness):
],
extra_summarize_args=["--lookahead_config=[7,7,7]"])
@skip_post_blackwell
@parametrize_with_ids("cuda_graph", [False, True])
def test_medusa(self, cuda_graph, mocker):
mocker.patch.object(self.__class__, "EXAMPLE_FOLDER", "medusa")
@ -1104,6 +1108,7 @@ class TestMixtral8x7B(CliFlowAccuracyTestHarness):
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(80000)
@skip_post_blackwell
def test_weight_only_int4_tp2(self):
self.run(quant_algo=QuantAlgo.W4A16,
tp_size=2,
@ -1111,6 +1116,7 @@ class TestMixtral8x7B(CliFlowAccuracyTestHarness):
@pytest.mark.skip_less_device(2)
@pytest.mark.skip_less_device_memory(80000)
@skip_post_blackwell
def test_weight_only_int8_tp2(self):
self.run(quant_algo=QuantAlgo.W8A16,
tp_size=2,

View File

@ -1892,6 +1892,10 @@ skip_post_blackwell = pytest.mark.skipif(
get_sm_version() >= 100,
reason="This test is not supported in post-Blackwell architecture")
skip_post_blackwell_ultra = pytest.mark.skipif(
get_sm_version() >= 103,
reason="This test is not supported in post-Blackwell-Ultra architecture")
skip_device_contain_gb200 = pytest.mark.skipif(
check_device_contain(["GB200"]),
reason="This test is not supported on GB200 or GB100")

View File

@ -18,6 +18,12 @@ from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_count, get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
# Build parameters
@pytest.mark.parametrize(

View File

@ -18,9 +18,15 @@ import os
import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_count
from defs.conftest import get_device_count, get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.fixture(scope="module")
def bindings_example_root(llm_root):

View File

@ -18,12 +18,20 @@ import shutil
import pytest
from defs.common import convert_weights, venv_check_call
from defs.conftest import get_sm_version, skip_post_blackwell
from defs.trt_test_alternative import check_call, exists
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
# TODO: add more test case for input_padding, paged_kv_cache, num_beams
@pytest.mark.skip_less_device_memory(24000)
@pytest.mark.parametrize("use_weight_only", [True, False],
@pytest.mark.parametrize("use_weight_only",
[pytest.param(True, marks=skip_post_blackwell), False],
ids=["enable_weight_only", "disable_weight_only"])
@pytest.mark.parametrize("llm_glm_4_9b_model_root",
["glm-4-9b", "glm-4-9b-chat"],

View File

@ -18,11 +18,19 @@ import os
import pytest
from defs.common import (convert_weights, generate_summary_cmd, venv_check_call,
venv_mpi_check_call)
from defs.conftest import get_gpu_device_list
from defs.conftest import (get_gpu_device_list, get_sm_version,
skip_post_blackwell)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.mark.skip_less_device_memory(80000)
@skip_post_blackwell
@pytest.mark.parametrize("use_weight_only", [True, False],
ids=["enable_weight_only", "disable_weight_only"])
def test_llm_commandr_v01_single_gpu_summary(commandr_example_root,
@ -79,7 +87,8 @@ def test_llm_commandr_v01_single_gpu_summary(commandr_example_root,
@pytest.mark.skip_less_device(4)
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_host_memory(1000000)
@pytest.mark.parametrize("use_weight_only", [True, False],
@pytest.mark.parametrize("use_weight_only",
[pytest.param(True, marks=skip_post_blackwell), False],
ids=["enable_weight_only", "disable_weight_only"])
def test_llm_commandr_plus_4gpus_summary(commandr_example_root,
llm_commandr_plus_model_root,

View File

@ -19,10 +19,16 @@ from copy import deepcopy
import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import (get_device_memory, llm_models_root,
from defs.conftest import (get_device_memory, get_sm_version, llm_models_root,
skip_post_blackwell, skip_pre_hopper)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
# TODO: remove skip after enable Blackwell for Speculative Decoding
@skip_post_blackwell

View File

@ -18,9 +18,15 @@ import os
import pytest
from defs.common import (convert_weights, get_dummy_spec_decoding_heads,
venv_check_call)
from defs.conftest import skip_post_blackwell, skip_pre_ada
from defs.conftest import get_sm_version, skip_post_blackwell, skip_pre_ada
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("use_dynamic_tree", [False, True],

View File

@ -16,10 +16,16 @@
import pytest
from defs.common import (convert_weights, quantize_data, venv_check_call,
venv_mpi_check_call)
from defs.conftest import (get_device_count, skip_fp8_pre_ada,
from defs.conftest import (get_device_count, get_sm_version, skip_fp8_pre_ada,
skip_post_blackwell)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.mark.parametrize("use_fp8", [True, False],
ids=["enable_fp8", "disable_fp8"])
@ -38,8 +44,11 @@ from defs.trt_test_alternative import check_call
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16', 'float32'])
@pytest.mark.parametrize("enc_dec_model_root", [
't5-small', 'flan-t5-small', 'byt5-small', 'bart-large-cnn',
'mbart-large-50-many-to-one-mmt', 'wmt14'
pytest.param('t5-small', marks=skip_post_blackwell),
pytest.param('flan-t5-small', marks=skip_post_blackwell),
pytest.param('byt5-small', marks=skip_post_blackwell), 'bart-large-cnn',
pytest.param('mbart-large-50-many-to-one-mmt', marks=skip_post_blackwell),
'wmt14'
],
indirect=True)
@pytest.mark.parametrize("compare_hf_fp32", [True, False],

View File

@ -17,10 +17,17 @@
import pytest
from defs.common import (convert_weights, generate_summary_cmd, venv_check_call,
venv_mpi_check_call)
from defs.conftest import skip_post_blackwell
from defs.conftest import get_sm_version, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])

View File

@ -24,9 +24,16 @@ from defs.common import (convert_weights, generate_summary_cmd, parse_mpi_cmd,
similarity_score, test_multi_lora_support,
venv_check_call, venv_check_output,
venv_mpi_check_call, venv_mpi_check_output)
from defs.conftest import get_device_memory, skip_fp8_pre_ada, skip_pre_ada
from defs.conftest import (get_device_memory, get_sm_version, skip_fp8_pre_ada,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
INPUT_TEXT_1 = "After Washington had returned to Williamsburg, " + \
"Dinwiddie ordered him to lead a larger force to assist Trent in his work. " + \
"While en route, Washington learned of Trent's retreat. " + \
@ -688,6 +695,7 @@ def test_llm_gpt3_175b_1node_8gpus(gpt_example_root, llm_venv, engine_dir,
])
@skip_post_blackwell
@pytest.mark.parametrize("per_token_channel", [True, False],
ids=["enable_ptpc", "disable_ptpc"])
def test_llm_gpt2_smooth_single_gpu_summary(gpt_example_root, llm_venv,
@ -732,6 +740,7 @@ def test_llm_gpt2_smooth_single_gpu_summary(gpt_example_root, llm_venv,
])
@skip_post_blackwell
def test_llm_gpt2_int8_kv_1gpu(gpt_example_root, llm_venv, llm_gpt2_model_root,
llm_datasets_root, engine_dir, cmodel_dir):
"gpt2 INT8 KV Cache test on 1 gpu"
@ -1360,6 +1369,7 @@ def test_llm_gpt2_starcoder_1node_4gpus(gpt_example_root,
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_host_memory(250000)
def test_llm_gpt2_starcoder_1gpus(gpt_example_root,
llm_gpt2_starcoder_model_root, llm_venv,
@ -1401,6 +1411,7 @@ def test_llm_gpt2_starcoder_1gpus(gpt_example_root,
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_host_memory(250000)
@pytest.mark.parametrize("dtype", ["float16"])
@pytest.mark.parametrize("precision", ["int8", "int4"])
@ -1710,6 +1721,7 @@ def test_llm_gpt2_multi_lora_1gpu(gpt_example_root, llm_venv,
for item in expected_output[idx]]), f"output is {output}"
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("data_type", ['float16', 'fp8'],
ids=['base_fp16', 'base_fp8'])

View File

@ -15,9 +15,15 @@
import pytest
from defs.common import venv_check_call
from defs.conftest import get_gpu_device_list
from defs.conftest import get_gpu_device_list, get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
INPUT_TEXT = """
Write a Python function `find_max(words)` to solve the following problem:\nWrite a function that accepts a list of strings.\nThe list contains different words. Return the word with maximum number\nof unique characters. If multiple strings have maximum number of unique\ncharacters, return the one which comes first in lexicographical order.\nfind_max(["name", "of", "string"]) == "string"\nfind_max(["name", "enam", "game"]) == "enam"\nfind_max(["aaaaaaa", "bb" ,"cc"]) == ""aaaaaaa"
"""

View File

@ -19,8 +19,15 @@ import time
import pytest
from defs.common import (convert_weights, test_multi_lora_support,
venv_mpi_check_call)
from defs.conftest import get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.fixture(scope="module", autouse=True)
def disable_unified_converter():

View File

@ -14,9 +14,15 @@
# limitations under the License.
import pytest
from defs.common import convert_weights, parse_mpi_cmd, venv_mpi_check_call
from defs.conftest import get_device_memory
from defs.conftest import get_device_memory, get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
# @pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1, 2, 4],

View File

@ -36,6 +36,12 @@ from defs.conftest import (get_device_count, get_device_memory,
# yapf: enable
from defs.trt_test_alternative import check_call, exists
# skip trt flow cases on post-Blackwell-Ultra
# NOTE: this module-level skip guard is intentionally left disabled here
# (per-test markers below handle Blackwell skips for this file instead).
# if get_sm_version() >= 103:
#     pytest.skip(
#         "TRT workflow tests are not supported on post Blackwell-Ultra architecture",
#         allow_module_level=True)
INPUT_TEXT_1 = "After Washington had returned to Williamsburg, " + \
"Dinwiddie ordered him to lead a larger force to assist Trent in his work. " + \
"While en route, Washington learned of Trent's retreat. " + \
@ -688,6 +694,7 @@ def test_llm_llama_v2_1gpu_sparsity(llama_example_root, llama_model_root,
])
@skip_post_blackwell
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@ -886,6 +893,7 @@ def test_llm_llama_v2_gather_logits_2gpu_pp2(llama_example_root,
summary_cmd)
@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
def test_llm_llama_v2_1gpu_auto_parallel(llama_example_root, llama_model_root,
llm_venv, cmodel_dir, engine_dir):
@ -911,6 +919,7 @@ def test_llm_llama_v2_1gpu_auto_parallel(llama_example_root, llama_model_root,
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@ -1622,6 +1631,7 @@ def test_llm_llama_v2_1gpu_fp8_gemv(llama_example_root, llama_model_root,
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("gemm_swiglu_plugin", ["fp8"])
@ -1697,7 +1707,12 @@ def test_llm_llama_v2_1gpu_gemm_swiglu(llama_example_root, llama_model_root,
@pytest.mark.parametrize(
"data_type", ['float16', 'fp8', 'sq_ootb', 'awq', 'int8_wo'],
"data_type", [
'float16', 'fp8',
pytest.param('sq_ootb', marks=skip_post_blackwell),
pytest.param('awq', marks=skip_post_blackwell),
pytest.param('int8_wo', marks=skip_post_blackwell)
],
ids=['base_fp16', 'base_fp8', 'base_sq_ootb', 'base_awq', 'base_int8_wo'])
@pytest.mark.parametrize("lora_data_type", ['float16'], ids=['lora_fp16'])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-13b-hf'], indirect=True)
@ -2280,6 +2295,7 @@ def test_llm_llama_code_llama_multi_gpus_summary(llama_example_root,
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@ -2336,6 +2352,7 @@ def test_llm_llama_smooth_quant_1gpu_summary(llama_example_root,
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@ -2385,6 +2402,7 @@ def test_llm_llama_int8_kv_1gpu_summary(llama_example_root, llama_model_root,
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@ -2429,6 +2447,7 @@ def test_llm_llama_int8_sq_ootb_1gpu_summary(
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device(2)
@pytest.mark.parametrize("num_beams", [1],
ids=lambda num_beams: f'nb:{num_beams}')
@ -2488,6 +2507,7 @@ def test_llm_llama_v2_int8sq_2gpu_tp2(data_type, llama_example_root,
summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@ -2543,6 +2563,7 @@ def test_llm_llama_wo_1gpu_summary(llama_example_root, llama_model_root,
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(30000)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@ -2872,7 +2893,9 @@ def test_llm_llama_v2_lora_benchmark_2gpu(llama_example_root, llama_model_root,
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize("qformat", ["fp8", "int4_awq"])
@pytest.mark.parametrize(
"qformat",
["fp8", pytest.param("int4_awq", marks=skip_post_blackwell)])
@pytest.mark.parametrize(
"tp_pp_size", [(4, 1), (2, 2)],
ids=lambda tp_pp_size: f'tp{tp_pp_size[0]}pp{tp_pp_size[1]}')
@ -3268,8 +3291,11 @@ def test_llm_llama_1gpu_streaming_llm(llama_example_root, deepseek_model_root,
assert "上海人工智能实验室" in output, output
@pytest.mark.parametrize(
"fp8_quant", ['disable_fp8', 'enable_fp8', 'enable_fp8_meta_recipe'])
@pytest.mark.parametrize("fp8_quant", [
'disable_fp8',
pytest.param('enable_fp8', marks=skip_post_blackwell),
pytest.param('enable_fp8_meta_recipe', marks=skip_post_blackwell)
])
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b', 'llama-3.2-1b'],
indirect=True)
def test_llm_llama_v3_1_1node_single_gpu(llama_example_root, llama_model_root,
@ -3581,6 +3607,7 @@ def test_llm_llama_v3_1_2nodes_8gpus(test_type, llama_example_root,
check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("low_latency_gemm_plugin", ["fp8"])
@pytest.mark.parametrize("llama_model_root", ['llama-v2-7b-hf'], indirect=True)
@ -3813,6 +3840,7 @@ def test_llm_llama_v2_fp8_2gpu_cp2(data_type, llama_example_root,
@skip_pre_ada
@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b', 'llama-3.2-1b'],
indirect=True)
def test_llm_llama_lookahead_xqa_fp8_1gpu(llama_example_root, llama_model_root,
@ -4014,6 +4042,7 @@ def test_mistral_nemo_fp8_with_bf16_lora(
)
@skip_post_blackwell
@pytest.mark.parametrize("llama_model_root", ['llama-3.1-8b'], indirect=True)
def test_llm_llama_lookahead_single_gpu_summary(llama_example_root,
llama_model_root, llm_venv,

View File

@ -18,16 +18,24 @@ import os
import pytest
from defs.common import (convert_weights, generate_summary_cmd, venv_check_call,
venv_mpi_check_call)
from defs.conftest import get_sm_version, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.mark.parametrize("gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("dtype", ['bfloat16', 'float16'])
@pytest.mark.parametrize("mamba_model_root", [
'mamba-130m', 'mamba-2.8b', 'mamba-1.4b', 'mamba-790m', 'mamba-370m',
'mamba2-130m', 'mamba2-2.7b', 'mamba2-1.3b', 'mamba2-780m', 'mamba2-370m',
'mamba-codestral-7B-v0.1'
pytest.param('mamba-130m', marks=skip_post_blackwell), 'mamba-2.8b',
'mamba-1.4b', 'mamba-790m', 'mamba-370m', 'mamba2-130m', 'mamba2-2.7b',
'mamba2-1.3b', 'mamba2-780m', 'mamba2-370m',
pytest.param('mamba-codestral-7B-v0.1', marks=skip_post_blackwell)
],
indirect=True)
def test_llm_mamba_1gpu(mamba_example_root, mamba_model_root,

View File

@ -18,10 +18,17 @@ import os
import pytest
from defs.common import (convert_weights, get_dummy_spec_decoding_heads,
venv_check_call)
from defs.conftest import skip_fp8_pre_ada
from defs.conftest import get_sm_version, skip_fp8_pre_ada, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("batch_size", [1, 8], ids=['bs1', 'bs8'])
@pytest.mark.parametrize("data_type", ['bfloat16'])
@pytest.mark.parametrize("num_medusa_heads", [4], ids=['4-heads'])
@ -79,6 +86,7 @@ def test_llm_medusa_1gpu(batch_size, data_type, medusa_model_roots,
venv_check_call(llm_venv, summary_cmd)
@skip_post_blackwell
@pytest.mark.parametrize("batch_size", [1, 8], ids=['bs1', 'bs8'])
@pytest.mark.parametrize("data_type", ['bfloat16', 'float16'])
@pytest.mark.parametrize("num_medusa_heads", [4], ids=['4-heads'])

View File

@ -20,9 +20,15 @@ import psutil
import pytest
from defs.common import (convert_weights, quantize_data,
test_multi_lora_support, venv_check_call)
from defs.conftest import skip_post_blackwell, skip_pre_ada
from defs.conftest import get_sm_version, skip_post_blackwell, skip_pre_ada
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
def get_optimal_jobs():
cpu_count = multiprocessing.cpu_count()

View File

@ -19,9 +19,16 @@ import os
import pytest
from defs.common import (convert_weights, generate_summary_cmd, quantize_data,
venv_check_call, venv_mpi_check_call)
from defs.conftest import llm_models_root, skip_post_blackwell, skip_pre_ada
from defs.conftest import (get_sm_version, llm_models_root, skip_post_blackwell,
skip_pre_ada)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("model_name", ['mixtral-8x7b-v0.1-AWQ'])

View File

@ -18,9 +18,16 @@ import os
import pytest
import torch
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory, skip_post_blackwell, skip_pre_ada
from defs.conftest import (get_device_memory, get_sm_version,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.fixture(scope="module")
def multimodal_example_root(llm_root):
@ -623,19 +630,19 @@ def _test_llm_multimodal_general(llm_venv,
reason="Skip due to low memory")),
'llava-onevision-qwen2-7b-ov-hf',
'llava-onevision-qwen2-7b-ov-hf-video',
'nougat-base',
pytest.param('nougat-base', marks=skip_post_blackwell),
'VILA1.5-3b',
'cogvlm-chat',
'fuyu-8b',
'deplot',
pytest.param('deplot', marks=skip_post_blackwell),
pytest.param('neva-22b',
marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
'kosmos-2',
'video-neva',
pytest.param('video-neva', marks=skip_post_blackwell),
pytest.param('Phi-3-vision-128k-instruct', marks=skip_post_blackwell),
pytest.param('Phi-3.5-vision-instruct', marks=skip_post_blackwell),
pytest.param('Phi-4-multimodal-instruct', marks=skip_post_blackwell),
'Llama-3.2-11B-Vision',
pytest.param('Llama-3.2-11B-Vision', marks=skip_post_blackwell),
'Qwen2-VL-7B-Instruct',
'internlm-xcomposer2-vl-7b',
'Mistral-Small-3.1-24B-Instruct-2503',
@ -688,8 +695,8 @@ def test_llm_multimodal_general(llm_venv, llm_root, llm_datasets_root,
'Phi-3-vision-128k-instruct',
'Phi-3.5-vision-instruct',
'Phi-4-multimodal-instruct',
'Llama-3.2-11B-Vision-Instruct',
'Llama-3.2-11B-Vision',
pytest.param('Llama-3.2-11B-Vision-Instruct', marks=skip_post_blackwell),
pytest.param('Llama-3.2-11B-Vision', marks=skip_post_blackwell),
'Qwen2-VL-7B-Instruct',
],
indirect=True)

View File

@ -15,9 +15,15 @@
import pytest
from defs.common import venv_check_call, venv_mpi_check_call
from defs.conftest import skip_fp8_pre_ada
from defs.conftest import get_sm_version, skip_fp8_pre_ada
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.mark.skip_less_device_memory(50000)
@pytest.mark.parametrize("qformat", ["full_prec", "fp8", "int4_awq"])

View File

@ -2,9 +2,15 @@ from pathlib import Path
import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory
from defs.conftest import get_device_memory, get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
ROUGE1_ACCURACY_THRESHOLD = 20

View File

@ -18,9 +18,15 @@ from copy import deepcopy
import pytest
from defs.common import convert_weights, venv_check_call
from defs.conftest import skip_post_blackwell
from defs.conftest import get_sm_version, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
# TODO: remove skip after support NGram on B200
@skip_post_blackwell

View File

@ -23,6 +23,12 @@ from defs.conftest import (get_device_memory, get_sm_version, skip_fp8_pre_ada,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.fixture(scope="module")
def phi_example_root(llm_root, llm_venv):
@ -36,6 +42,7 @@ def phi_example_root(llm_root, llm_venv):
return example_root
@skip_post_blackwell
@pytest.mark.skip_less_device_memory(40000)
@pytest.mark.parametrize("num_beams", [1, 2, 4],
ids=lambda num_beams: f'nb:{num_beams}')

View File

@ -20,10 +20,16 @@ import os
import pytest
from defs.common import (convert_weights, test_multi_lora_support,
venv_check_call, venv_mpi_check_call)
from defs.conftest import (get_device_count, get_device_memory,
from defs.conftest import (get_device_count, get_device_memory, get_sm_version,
skip_post_blackwell, skip_pre_ada)
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.mark.parametrize(
"context_fmha_type",

View File

@ -20,8 +20,15 @@ import re
import pytest
from defs.common import venv_check_call, venv_check_output
from defs.conftest import get_sm_version
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@pytest.fixture(scope="module")
def qwenvl_example_root(llm_root, llm_venv):

View File

@ -19,10 +19,17 @@ from pathlib import Path
import pytest
from defs.common import (convert_weights, generate_summary_cmd, quantize_data,
venv_check_call, venv_mpi_check_call)
from defs.conftest import skip_fp8_pre_ada
from defs.conftest import get_sm_version, skip_fp8_pre_ada, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("gemm_plugin", [True, False],
ids=["enable_gemm_plugin", "disable_gemm_plugin"])
@pytest.mark.parametrize("gpt_attention_plugin", [True, False],

View File

@ -15,9 +15,17 @@
import pytest
from defs.common import convert_weights, venv_check_call
from defs.conftest import get_sm_version, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("batch_size", [8], ids=['bs8'])
@pytest.mark.parametrize("redrafter_num_beams", [5, 8], ids=['nb5', 'nb8'])
@pytest.mark.parametrize("redrafter_draft_len_per_beam", [5], ids=['dl5'])

View File

@ -15,9 +15,15 @@
import pytest
from defs.common import convert_weights, venv_check_call
from defs.conftest import skip_post_blackwell
from defs.conftest import get_sm_version, skip_post_blackwell
from defs.trt_test_alternative import check_call
# skip trt flow cases on post-Blackwell-Ultra
if get_sm_version() >= 103:
pytest.skip(
"TRT workflow tests are not supported on post Blackwell-Ultra architecture",
allow_module_level=True)
@skip_post_blackwell
@pytest.mark.parametrize("use_cpp_runtime", [True, False],

View File

@ -45,193 +45,41 @@ examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (https
examples/test_mistral.py::test_llm_mistral_v1_1gpu[mistral-7b-v0.1-float16-max_attention_window_size_4096-chunked_summarization_long] SKIP (https://nvbugs/5321371)
test_e2e.py::test_openai_chat_structural_tag_example SKIP (https://nvbugspro.nvidia.com/bug/5375594)
cpp/test_e2e.py::test_model[fp8-chatglm-90] SKIP (https://nvbugs/5034830)
full:B200_PCIe/examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-disable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v2_1gpu_gemm_swiglu[llama-v2-7b-hf-fp8-float16] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_int8_kv_1gpu_summary[llama-7b-enable_weight_only-nb:4] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_int8_sq_ootb_1gpu_summary[llama-7b-nb:1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_wo_1gpu_summary[llama-7b-int4-nb:1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_int8_kv_awq_1gpu_summary[llama-7b-nb:4] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v2_1gpu_low_latency_gemm[llama-v2-7b-hf-fp8] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200_PCIe/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200_PCIe/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200_PCIe/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200_PCIe/accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int8] SKIP (Disable for Blackwell)
full:B200_PCIe/accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (Disable for Blackwell)
full:B200_PCIe/accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=False-per_channel=False] SKIP (Disable for Blackwell)
full:B200_PCIe/accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=True-per_channel=True] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp16] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp8] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_awq] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3-mini-128k-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3-small-8k-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3.5-mini-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/functional SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization SKIP (Disable for Blackwell)
full:B200_PCIe/accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] SKIP (Disable for Blackwell)
full:B200_PCIe/accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] SKIP (Disable for Blackwell)
full:B200_PCIe/accuracy/test_cli_flow.py::TestVicuna7B::test_lookahead SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/bindings SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell)
full:B200_PCIe/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_smooth_quant_1gpu_summary[float16-llama-7b-enable_ptpc-nb:4] SKIP (Disable for Blackwell for SQ)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_wo_1gpu_summary[llama-7b-int8-nb:1] SKIP (Disable for Blackwell for WO)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v3_int8_gptq_1gpu_summary[llama-v3-8b-instruct-hf-float16-nb:1] SKIP (Disable for Blackwell for weight only)
full:B200_PCIe/examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only] SKIP (Disable for Blackwell for weight only)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v3_1_quantization_1gpu_manage_weights[llama-3.1-8b-int4_wo] SKIP (Disable for Blackwell for weight only)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v3_1_autoq_1gpu_mmlu[llama-3.1-8b] SKIP (Disable for Blackwell for weight only)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200_PCIe/accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=False-chunked_context=False-typical_acceptance=False] SKIP (Disable for Blackwell for Speculative Dec)
full:B200_PCIe/accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False] SKIP (Disable for Blackwell for Speculative Dec)
full:B200_PCIe/accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False] SKIP (Disable for Blackwell for Speculative Dec)
full:B200_PCIe/examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] SKIP (Disable for Blackwell for Speculative Dec)
full:B200_PCIe/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 96)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 96)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v3_1_1node_single_gpu[llama-3.1-8b-enable_fp8] SKIP (Disable for Blackwell for fp8 rowwise gemm)
full:B200_PCIe/examples/test_llama.py::test_llm_llama_v3_1_1node_single_gpu[llama-3.1-8b-enable_fp8_meta_recipe] SKIP (Disable for Blackwell for fp8 rowwise gemm)
full:B200_PCIe/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12)
full:B200_PCIe/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12)
full:B200_PCIe/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (Disable for Blackwell OOM)
full:B200_PCIe/examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] SKIP (Disable for Blackwell OOM)
full:B200_PCIe/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM)
full:B200/examples/test_llama.py::test_llm_llama_v2_1gpu_auto_parallel[llama-v2-7b-hf] SKIP (Disable for Blackwell)
full:B200/examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200/examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-disable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200/examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200/examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-disable_gemm_plugin] SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_v2_1gpu_gemm_swiglu[llama-v2-7b-hf-fp8-float16] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_int8_kv_1gpu_summary[llama-7b-enable_weight_only-nb:4] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_int8_sq_ootb_1gpu_summary[llama-7b-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_wo_1gpu_summary[llama-7b-int4-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_int8_kv_awq_1gpu_summary[llama-7b-nb:4] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_v2_1gpu_low_latency_gemm[llama-v2-7b-hf-fp8] SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] SKIP (Disable for Blackwell)
full:B200/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200/examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8] SKIP (Disable for Blackwell spec decoding)
full:B200/accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int8] SKIP (Disable for Blackwell)
full:B200/accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (Disable for Blackwell)
full:B200/accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=False-per_channel=False] SKIP (Disable for Blackwell)
full:B200/accuracy/test_cli_flow.py::TestGpt2::test_smooth_quant[per_token=True-per_channel=True] SKIP (Disable for Blackwell)
full:B200/examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp16] SKIP (Disable for Blackwell)
full:B200/examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp8] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_awq] SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_int8_wo] SKIP (Disable for Blackwell)
full:B200/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3-mini-128k-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3-small-8k-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3-small-128k-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_phi.py::test_llm_phi_single_gpu_summary[Phi-3.5-mini-instruct-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_fmha_with_fp32_acc-nb:1] SKIP (Disable for Blackwell)
full:B200/examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-mini-128k-instruct-fp8-float16] SKIP (Disable for Blackwell)
full:B200/examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16] SKIP (Disable for Blackwell)
full:B200/unittest/trt/functional SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization SKIP (Disable for Blackwell)
full:B200/accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=False] SKIP (Disable for Blackwell)
full:B200/accuracy/test_cli_flow.py::TestVicuna7B::test_medusa[cuda_graph=True] SKIP (Disable for Blackwell)
full:B200/accuracy/test_cli_flow.py::TestVicuna7B::test_lookahead SKIP (Disable for Blackwell)
full:B200/unittest/trt/attention/test_bert_attention.py SKIP (Disable for Blackwell)
full:B200/unittest/trt/model/test_mamba.py SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/examples/test_medusa.py::test_llm_medusa_with_qaunt_base_model_1gpu[fp8-use_py_session-medusa-vicuna-7b-v1.3-4-heads-float16-bs1] SKIP (Disable for Blackwell)
full:B200/unittest/bindings SKIP (Disable for Blackwell)
full:B200/unittest/trt/attention/test_sage_attention.py unittest/llmapi/test_llm_download.py unittest/llmapi/test_llm_kv_cache_events.py unittest/trt/model/redrafter unittest/trt/model/test_phi.py unittest/trt/model/test_unet.py unittest/trt/python_plugin unittest/tools unittest/utils unittest/others SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/unittest/trt/quantization/test_weight_only_groupwise_quant_matmul.py SKIP (Disable for Blackwell)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int8-float16] SKIP (Disable for Blackwell)
full:B200/unittest/trt/model/test_gpt.py -k "partition0" SKIP (Disable for Blackwell)
full:B200/unittest/test_model_runner_cpp.py SKIP (Disable for Blackwell)
full:B200/examples/test_llama.py::test_llm_llama_smooth_quant_1gpu_summary[float16-llama-7b-enable_ptpc-nb:4] SKIP (Disable for Blackwell for SQ)
full:B200/examples/test_llama.py::test_llm_llama_wo_1gpu_summary[llama-7b-int8-nb:1] SKIP (Disable for Blackwell for WO)
full:B200/examples/test_llama.py::test_llm_llama_v3_int8_gptq_1gpu_summary[llama-v3-8b-instruct-hf-float16-nb:1] SKIP (Disable for Blackwell for weight only)
full:B200/examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only] SKIP (Disable for Blackwell for weight only)
full:B200/examples/test_llama.py::test_llm_llama_v3_1_quantization_1gpu_manage_weights[llama-3.1-8b-int4_wo] SKIP (Disable for Blackwell for weight only)
full:B200/examples/test_llama.py::test_llm_llama_v3_1_autoq_1gpu_mmlu[llama-3.1-8b] SKIP (Disable for Blackwell for weight only)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200/accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=False-chunked_context=False-typical_acceptance=False] SKIP (Disable for Blackwell for Speculative Dec)
full:B200/accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=False-typical_acceptance=False] SKIP (Disable for Blackwell for Speculative Dec)
full:B200/accuracy/test_cli_flow.py::TestVicuna7B::test_eagle[cuda_graph=True-chunked_context=True-typical_acceptance=False] SKIP (Disable for Blackwell for Speculative Dec)
full:B200/examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8] SKIP (Disable for Blackwell for Speculative Dec)
full:B200/unittest/llmapi/test_llm_models.py -m "part0" SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 80/96)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 96)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support when headsize is 96)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (Disable for Blackwell for context fmha doesn't support custom mask)
full:B200/examples/test_llama.py::test_llm_llama_v3_1_1node_single_gpu[llama-3.1-8b-enable_fp8] SKIP (Disable for Blackwell for fp8 rowwise gemm)
full:B200/examples/test_llama.py::test_llm_llama_v3_1_1node_single_gpu[llama-3.1-8b-enable_fp8_meta_recipe] SKIP (Disable for Blackwell for fp8 rowwise gemm)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (megatron-core 0.8 is not supported in python 3.12)
full:B200/examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] SKIP (megatron-core 0.8 is not supported in python 3.12)
full:B200/accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp4_plugin SKIP (Disable for Blackwell OOM)
full:B200/examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only] SKIP (Disable for Blackwell OOM)
full:B200/unittest/llmapi/test_llm_models.py -m "not (part0 or part1)" SKIP (Disable for Blackwell OOM)
full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-34b-Instruct-tp2pp2-int4_awq-nb:4] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_code_llama_quantization_4gpus_summary[CodeLlama-70b-hf-tp2pp2-int4_awq-nb:1] SKIP (not support on B200)
full:B200/examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-enable_fp8] SKIP (not support on B200)
full:B200/examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-bfloat16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8] SKIP (not support on B200)
full:B200/examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] SKIP (not support on B200)
full:B200/examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-disable_fp8] SKIP (not support on B200)
full:B200/examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-enable_weight_only] SKIP (not support on B200)
full:B200/examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-enable_weight_only] SKIP (not support on B200)
full:B200/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[enable_weight_only] SKIP (not support on B200)
full:B200/examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2_7b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] SKIP (not support on B200)
full:B200/examples/test_qwen.py::test_llm_qwen_single_gpu_summary[qwen2.5_1.5b_instruct-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha_fp32_acc] SKIP (not support on B200)
full:B200/examples/test_qwen.py::test_llm_qwen_7b_int8_kv_1node_1gpus[qwen2_7b_instruct-enable_gemm_plugin-enable_weight_only] SKIP (not support on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_smooth_single_gpu_summary[enable_ptpc] SKIP (not support on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_smooth_single_gpu_summary[disable_ptpc] SKIP (not support on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_int8_kv_1gpu SKIP (not support on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder-int8-float16] SKIP (not support on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder-int4-float16] SKIP (not support on B200)
full:B200/examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2b-int8_sq-bfloat16-8] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_v2_awq_2gpu_summary[llama-v2-7b-hf-nb:1] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_v2_awq_2gpu_summary[Llama-2-7B-AWQ-nb:1] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_v2_awq_2gpu_summary[Llama-2-7B-GPTQ-nb:4] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_v2_lora_1gpu[chinese-llama-2-lora-13b-llama-v2-13b-hf-lora_fp16-base_sq_ootb] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_smooth_quant_1gpu_summary[float16-llama-7b-disable_ptpc-nb:1] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_int8_kv_1gpu_summary[llama-7b-enable_weight_only-nb:1] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_int8_kv_1gpu_summary[llama-7b-disable_weight_only-nb:1] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_v2_int8sq_2gpu_tp2[llama-v2-7b-hf-bfloat16-nb:1] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_int8_kv_awq_1gpu_summary[llama-7b-nb:1] SKIP (not support on B200)
full:B200/accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int4_tp2 SKIP (not supported on B200)
full:B200/accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2 SKIP (not supported on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoderplus-int8-float16] SKIP (not support on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoderplus-int4-float16] SKIP (not support on B200)
full:B200/examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.1-8b] SKIP (No available XQA kernels are found for speculative decoding mode)
full:B200/examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b] SKIP (No available XQA kernels are found for speculative decoding mode)
full:B200/examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1] SKIP (No available XQA kernels are found for speculative decoding mode)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_weight_only[starcoder2-int4-float16] SKIP (not support on B200)
full:B200/examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5064768)
full:B200/accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (not supported on B200)
full:B200/examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (not support on B200)
full:B200/examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (not support on B200)
full:B200/examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (not support on B200)
examples/test_qwen.py::test_llm_qwen_moe_multi_gpu_summary[qwen2_57b_a14b-tp4pp1-context_fmha] SKIP (https://nvbugs/5063469)
examples/test_qwen.py::test_llm_qwen_moe_multi_gpu_summary[qwen2_57b_a14b-tp2pp2-context_fmha_fp32_acc] SKIP (https://nvbugs/5063469)
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora] SKIP (https://nvbugs/5064768)
llmapi/test_llm_e2e.py::test_llmapi_build_command_parameters_align[llama-llama-models-v2/TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5061624)
test_e2e.py::test_openai_consistent_chat SKIP (https://nvbugs/5112075)
full:B200/examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-2-9b-it-fp8-bfloat16-8] SKIP (not supported on B200)
full:B200/examples/test_gpt.py::test_llm_gpt2_starcoder_1gpus SKIP (not supported on B200)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen_7b_chat-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen1.5_7b_chat-eagle1] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_qwen_eagle_1gpu[qwen2_7b_instruct-eagle1] SKIP (https://nvbugs/5206383)
@ -250,12 +98,6 @@ examples/test_eagle.py::test_phi_eagle_1gpu[phi-2-eagle2] SKIP (https://nvbugs/5
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-mini-128k-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3-small-128k-instruct-eagle2] SKIP (https://nvbugs/5206383)
examples/test_eagle.py::test_phi_eagle_1gpu[Phi-3.5-mini-instruct-eagle2] SKIP (https://nvbugs/5206383)
full:B200/examples/test_llama.py::test_llm_llama_lookahead_single_gpu_summary[llama-3.1-8b] SKIP (not supported on B200)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (TRTLLM-GEN does not support custom mask)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (TRTLLM-GEN does not support custom mask)
full:B200/examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (TRTLLM-GEN does not support custom mask)
full:B200/examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (TRTLLM-GEN does not support custom mask)
full:B200/examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-disable_weight_only] SKIP (https://nvbugs/5114743)
examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp16] SKIP (https://nvbugs/5114678)
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8] SKIP (https://nvbugs/5135328)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5141288)