* add rcca tests

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

* skip tests on blackwell

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

---------

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
xinhe-nv 2025-05-27 09:59:47 +08:00 committed by GitHub
parent 157fe62965
commit 59f7622281
12 changed files with 202 additions and 19 deletions
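
Note: the hunks below gate tests with @skip_post_blackwell, a pytest marker defined in the repo's conftest that skips a test on Blackwell-class GPUs. A minimal sketch of how such a marker can be built, assuming a torch-based capability probe (the helper name and threshold are illustrative, not the repo's actual definition):

import pytest
import torch

def _is_blackwell_or_newer() -> bool:
    # Blackwell GPUs report CUDA compute capability 10.x.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 10

# Skip on Blackwell and anything newer; run everywhere else.
skip_post_blackwell = pytest.mark.skipif(
    _is_blackwell_or_newer(),
    reason="test is skipped on Blackwell or newer GPUs")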

View File

@@ -200,6 +200,7 @@ class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness):
         self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+@skip_post_blackwell
 class TestPhi2(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/phi-2"
     MODEL_PATH = f"{llm_models_root()}/phi-2"
@@ -215,6 +216,7 @@ class TestPhi2(CliFlowAccuracyTestHarness):
         self.run(tp_size=2)
 
 
+@skip_post_blackwell
 class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-4k-instruct"
@@ -224,6 +226,7 @@ class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-128k-instruct"
@@ -233,6 +236,7 @@ class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-small-8k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-8k-instruct"
@@ -242,6 +246,7 @@ class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-small-128k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-128k-instruct"
@@ -251,6 +256,7 @@ class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3_5MiniInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3.5/Phi-3.5-mini-instruct"

View File

@@ -21,8 +21,9 @@ from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_pre_ada,
-                        skip_pre_blackwell, skip_pre_hopper)
+from ..conftest import (llm_models_root, parametrize_with_ids,
+                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
+                        skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
                             LlmapiAccuracyTestHarness)
@@ -1015,6 +1016,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-30B-A3B"
 
     @skip_pre_hopper
+    @skip_post_blackwell
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler",
         [(1, 1, 1, False, False, True)],

View File

@@ -14,11 +14,13 @@
 # limitations under the License.
 import csv
 import os
+from copy import deepcopy
 
 import pytest
-from defs.common import convert_weights, venv_check_call
-from defs.conftest import get_device_memory, skip_post_blackwell
+from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
+from defs.conftest import (get_device_memory, llm_models_root,
+                           skip_post_blackwell, skip_pre_hopper)
 from defs.trt_test_alternative import check_call
@@ -144,3 +146,182 @@ def test_llm_draft_target_model_1gpu(batch_size, data_type, draft_len,
             assert (
                 int(dt) == int(b)
             ), f"Output at ({bs=}, {index=}) is different ({dt} v.s. {b})."
+
+
+@skip_post_blackwell
+def test_llm_draft_target_llama_1gpu(llama_example_root, llm_venv, cmodel_dir,
+                                     engine_dir):
+    "RCCA https://nvbugs/5223130"
+    data_type = "float16"
+    max_batch_size = 4
+    max_draft_len = 10
+    max_input_len = 3200
+    max_seq_len = 4800
+    draft_model = os.path.join(llm_models_root(), "llama-3.2-models",
+                               "Llama-3.2-1B")
+    target_model = os.path.join(llm_models_root(), "llama-3.1-model",
+                                "Meta-Llama-3.1-8B")
+
+    print("Build checkpoint ...")
+    draft_model_dir = convert_weights(llm_venv=llm_venv,
+                                      example_root=llama_example_root,
+                                      cmodel_dir=cmodel_dir,
+                                      model="llama3-1b",
+                                      model_path=draft_model,
+                                      data_type=data_type)
+    target_model_dir = convert_weights(llm_venv=llm_venv,
+                                       example_root=llama_example_root,
+                                       cmodel_dir=cmodel_dir,
+                                       model="llama3-8b",
+                                       model_path=target_model,
+                                       data_type=data_type)
+
+    print("Build engines ...")
+    draft_engine_dir = os.path.join(engine_dir, "draft")
+    target_engine_dir = os.path.join(engine_dir, "target")
+    base_build_cmd = [
+        "trtllm-build",
+        f"--max_batch_size={max_batch_size}",
+        f"--max_input_len={max_input_len}",
+        f"--max_seq_len={max_seq_len}",
+        "--use_paged_context_fmha=enable",
+        f"--gemm_plugin={data_type}",
+        "--gather_generation_logits",
+    ]
+    draft_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={draft_model_dir}",
+        f"--output_dir={draft_engine_dir}",
+    ]
+    target_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={target_model_dir}",
+        "--speculative_decoding_mode=draft_tokens_external",
+        f"--max_draft_len={max_draft_len}",
+        f"--output_dir={target_engine_dir}",
+    ]
+    check_call(" ".join(draft_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+    check_call(" ".join(target_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+
+    print("Run inferences ...")
+    run_cmd = [
+        f"{llama_example_root}/../../../run.py",
+        f"--tokenizer_dir={target_model}",
+        f"--draft_engine_dir={draft_engine_dir}",
+        f"--engine_dir={target_engine_dir}",
+        "--draft_target_model_config=[4,[0],[0],True]", "--max_output_len=256",
+        "--kv_cache_enable_block_reuse",
+        "--kv_cache_free_gpu_memory_fraction=0.4",
+        f"--input_text='how does draft-sampling work'"
+    ]
+    venv_check_call(llm_venv, run_cmd)
+
+
+@skip_post_blackwell
+@skip_pre_hopper
+@pytest.mark.skip_less_device(2)
+@pytest.mark.skip_less_device_memory(80000)
+def test_llm_draft_target_llama_fp8_2gpu(llama_example_root, llm_venv,
+                                         qcache_dir, engine_dir,
+                                         llm_datasets_root):
+    "RCCA https://nvbugs/5257681"
+    data_type = "bfloat16"
+    max_batch_size = 16
+    max_draft_len = 5
+    max_input_len = 2000
+    max_seq_len = 4000
+    draft_model = os.path.join(llm_models_root(), "llama-3.1-model",
+                               "Meta-Llama-3.1-8B")
+    target_model = os.path.join(llm_models_root(), "llama-3.3-models",
+                                "Llama-3.3-70B-Instruct")
+    draft_quantized_dir = os.path.join(qcache_dir, "draft")
+    target_quantized_dir = os.path.join(qcache_dir, "target")
+
+    print("Build checkpoint ...")
+    quantize_cmd = [
+        f"{llama_example_root}/../../../quantization/quantize.py",
+        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
+        f"--dtype={data_type}",
+        "--qformat=fp8",
+        "--calib_size=32",
+        "--kv_cache_dtype=fp8",
+        "--tp_size=2",
+    ]
+    draft_quantized_cmd = quantize_cmd + [
+        f"--model_dir={draft_model}",
+        f"--output_dir={draft_quantized_dir}",
+    ]
+    target_quantized_cmd = quantize_cmd + [
+        f"--model_dir={target_model}",
+        f"--output_dir={target_quantized_dir}",
+    ]
+    venv_check_call(llm_venv, draft_quantized_cmd)
+    venv_check_call(llm_venv, target_quantized_cmd)
+
+    print("Build engines ...")
+    draft_engine_dir = os.path.join(engine_dir, "draft")
+    target_engine_dir = os.path.join(engine_dir, "target")
+    base_build_cmd = [
+        "trtllm-build",
+        f"--max_batch_size={max_batch_size}",
+        f"--max_input_len={max_input_len}",
+        f"--max_seq_len={max_seq_len}",
+        "--use_paged_context_fmha=enable",
+        "--gemm_plugin=auto",
+        "--workers=2",
+        "--gather_generation_logits",
+    ]
+    draft_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={draft_quantized_dir}",
+        f"--output_dir={draft_engine_dir}",
+    ]
+    target_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={target_quantized_dir}",
+        f"--output_dir={target_engine_dir}",
+        f"--max_draft_len={max_draft_len}",
+        "--speculative_decoding_mode=draft_tokens_external",
+    ]
+    check_call(" ".join(draft_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+    check_call(" ".join(target_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+
+    INPUT_TEXT = "The United States of America (USA), also known as the United States (U.S.) or America, is a country located primarily in North America. It is a federal republic of 50 states and the federal capital district of Washington, D.C. The 48 contiguous states border Canada to the north and Mexico to the south, with the state of Alaska to the northwest and the islands of Hawaii in Oceania. Indian country includes 574 federally recognized tribes and 326 Indian reservations with tribal sovereignty rights. The U.S. asserts sovereignty over five major island territories and various uninhabited islands in the Pacific Ocean and the Caribbean. It has the world's third-largest land area[c] and third-largest population, exceeding 340 million. Paleo-Indians migrated to North America across"
+
+    print("Run inferences ...")
+    run_cmd = [
+        f"{llama_example_root}/../../../run.py",
+        f"--tokenizer_dir={draft_model}",
+        f"--draft_engine_dir={draft_engine_dir}",
+        f"--engine_dir={target_engine_dir}",
+        f"--input_text={INPUT_TEXT}",
+        "--draft_target_model_config=[3,[0,1],[0,1],False]",
+        "--max_output_len=800",
+        "--kv_cache_enable_block_reuse",
+        "--kv_cache_free_gpu_memory_fraction=0.3",
+        "--run_profiling",
+    ]
+    venv_mpi_check_call(llm_venv, ["mpirun", "-n", "1", "--allow-run-as-root"],
+                        run_cmd)
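
For readers of the two tests above: --draft_target_model_config takes a Python-literal list. The field meanings below are inferred from how the two tests vary the values, not documented in this diff, so treat them as assumptions:

import ast

# "[4,[0],[0],True]" in the 1-GPU test; "[3,[0,1],[0,1],False]" in the 2-GPU test.
draft_len, draft_devices, target_devices, use_logits = ast.literal_eval(
    "[4,[0],[0],True]")
assert draft_len == 4          # draft tokens proposed per step (assumed)
assert draft_devices == [0]    # GPU ids hosting the draft engine (assumed)
assert target_devices == [0]   # GPU ids hosting the target engine (assumed)
assert use_logits is True      # accept drafts via logits, not tokens (assumed)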

View File

@@ -303,6 +303,7 @@ def test_mistral_eagle_1gpu(llm_mistral_model_root,
                             llm_rouge_root=llm_rouge_root)
 
 
+@skip_post_blackwell
 @skip_pre_ada
 @pytest.mark.parametrize("use_dynamic_tree", [False, True],
                          ids=['eagle1', 'eagle2'])

View File

@@ -1065,6 +1065,7 @@ def test_llm_llama_v3_1_autoq_2gpu_mmlu(llama_example_root, llama_model_root,
     check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
 
 
+@skip_post_blackwell
 @pytest.mark.skip_less_device(2)
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("use_auto_parallel", [True, False],

View File

@@ -621,12 +621,13 @@ def _test_llm_multimodal_general(llm_venv,
     'cogvlm-chat',
     'fuyu-8b',
     'deplot',
-    'neva-22b',
+    pytest.param('neva-22b',
+                 marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
     'kosmos-2',
     'video-neva',
     pytest.param('Phi-3-vision-128k-instruct', marks=skip_post_blackwell),
-    'Phi-3.5-vision-instruct',
-    'Phi-4-multimodal-instruct',
+    pytest.param('Phi-3.5-vision-instruct', marks=skip_post_blackwell),
+    pytest.param('Phi-4-multimodal-instruct', marks=skip_post_blackwell),
     'Llama-3.2-11B-Vision',
     'Qwen2-VL-7B-Instruct',
     'internlm-xcomposer2-vl-7b',
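
The hunk above converts plain parametrize entries into pytest.param so that individual models carry their own marks. A self-contained illustration of that mechanism (model names reused from the diff; the skip_post_blackwell stub and test body are placeholders):

import pytest

# Stand-in for the real conftest marker, just to make the example runnable.
skip_post_blackwell = pytest.mark.skip(reason="placeholder marker")

@pytest.mark.parametrize("model", [
    "fuyu-8b",
    pytest.param("neva-22b",
                 marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
    pytest.param("Phi-3.5-vision-instruct", marks=skip_post_blackwell),
])
def test_param_marks(model):
    # Placeholder body; the real tests dispatch into _test_llm_multimodal_general.
    assert isinstance(model, str)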

View File

@@ -1639,6 +1639,7 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
         _check_mem_usage(running_log, [106.3, 0, 0, 0], 8)
 
 
+@skip_post_blackwell
 @pytest.mark.skip_less_device_memory(110000)
 @pytest.mark.skip_less_device(8)
 @pytest.mark.parametrize("model_name,model_path", [

View File

@@ -95,6 +95,8 @@ examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1]
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-llama_v2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
+examples/test_draft_target_model.py::test_llm_draft_target_llama_1gpu
+examples/test_draft_target_model.py::test_llm_draft_target_llama_fp8_2gpu
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1]
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2]
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1]
@@ -184,8 +186,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
@@ -199,7 +199,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]

View File

@@ -46,7 +46,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
 examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:2-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
@@ -56,7 +55,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]

View File

@@ -236,7 +236,6 @@ l0_h100:
   - accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
   - examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
   - examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
-  - examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]

View File

@@ -96,7 +96,6 @@ l0_l40s:
   - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_cpp_session-tp1] # 10 mins
   - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp1]
   - examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] #4 mins
-  - examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
   - examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] # 18mins

View File

@@ -31,7 +31,6 @@ full:L40S/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disabl
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/4955671)
 examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-enable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4967883)
 full:GH200/cpp/test_e2e.py::test_model[fp8-gptj-90] SKIP (https://nvbugspro.nvidia.com/bug/4979920)
-full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugspro.nvidia.com/bug/4979845)
 full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-7b-int8_sq-bfloat16-8] SKIP (https://nvbugs/4988782)
 examples/test_llama.py::test_llm_llama_v3_8b_1048k_long_context_ppl[SlimPajama-6B-Llama-3-8B-Instruct-Gradient-1048k] SKIP (https://nvbugs/4993898)
@@ -370,7 +369,6 @@ full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:in
 full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573)
 examples/test_qwen.py::test_llm_qwen_moe_single_gpu_summary[qwen1.5_moe_a2.7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha] SKIP (https://nvbugs/5180961)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214245)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
@@ -386,12 +384,9 @@ examples/test_eagle.py::test_mistral_eagle_1gpu[mistral-7b-v0.1-eagle2] SKIP (ht
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5214239)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5220761)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5220761)
 examples/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5219531)
 examples/test_medusa.py::test_llama_medusa_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5219535)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5227342)
 perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
 perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
 perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP
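
The waive entries above follow one line per test: a pytest node ID, the keyword SKIP, and usually a tracking bug in parentheses (a few carry the bug as a trailing comment instead). A hedged parser for the common parenthesized form, for illustration only:

import re

line = ("examples/test_multimodal.py::test_llm_multimodal_general"
        "[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] "
        "SKIP (https://nvbugs/5214245)")
# Node IDs contain no whitespace, so \S+ captures the full bracketed id.
m = re.match(r"^(?P<test>\S+)\s+SKIP\s+\((?P<bug>[^)]+)\)", line)
assert m is not None
assert m.group("bug") == "https://nvbugs/5214245"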