* add rcca tests

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

* skip tests on blackwell

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>

---------

Signed-off-by: xinhe-nv <200704525+xinhe-nv@users.noreply.github.com>
xinhe-nv 2025-05-27 09:59:47 +08:00 committed by GitHub
parent 157fe62965
commit 59f7622281
12 changed files with 202 additions and 19 deletions
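
Note: the hunks below gate tests with @skip_post_blackwell, a pytest marker defined in the repo's conftest that skips a test on Blackwell-class GPUs. A minimal sketch of how such a marker can be built, assuming a torch-based capability probe (the helper name and threshold are illustrative, not the repo's actual definition):

import pytest
import torch

def _is_blackwell_or_newer() -> bool:
    # Blackwell GPUs report CUDA compute capability 10.x.
    if not torch.cuda.is_available():
        return False
    major, _minor = torch.cuda.get_device_capability()
    return major >= 10

# Skip on Blackwell and anything newer; run everywhere else.
skip_post_blackwell = pytest.mark.skipif(
    _is_blackwell_or_newer(),
    reason="test is skipped on Blackwell or newer GPUs")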

View File

@@ -200,6 +200,7 @@ class TestNemotronMini4BInstruct(CliFlowAccuracyTestHarness):
         self.run(quant_algo=QuantAlgo.FP8, kv_cache_quant_algo=QuantAlgo.FP8)
 
 
+@skip_post_blackwell
 class TestPhi2(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/phi-2"
     MODEL_PATH = f"{llm_models_root()}/phi-2"
@@ -215,6 +216,7 @@ class TestPhi2(CliFlowAccuracyTestHarness):
         self.run(tp_size=2)
 
 
+@skip_post_blackwell
 class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-mini-4k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-4k-instruct"
@@ -224,6 +226,7 @@ class TestPhi3Mini4kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-mini-128k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-mini-128k-instruct"
@@ -233,6 +236,7 @@ class TestPhi3Mini128kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-small-8k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-8k-instruct"
@@ -242,6 +246,7 @@ class TestPhi3Small8kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3-small-128k-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3/Phi-3-small-128k-instruct"
@@ -251,6 +256,7 @@ class TestPhi3Small128kInstruct(CliFlowAccuracyTestHarness):
         self.run(dtype='auto')
 
 
+@skip_post_blackwell
 class TestPhi3_5MiniInstruct(CliFlowAccuracyTestHarness):
     MODEL_NAME = "microsoft/Phi-3.5-mini-instruct"
     MODEL_PATH = f"{llm_models_root()}/Phi-3.5/Phi-3.5-mini-instruct"

View File

@@ -21,8 +21,9 @@ from tensorrt_llm.llmapi import KvCacheConfig, MTPDecodingConfig, SamplingParams
 from tensorrt_llm.models.modeling_utils import QuantConfig
 from tensorrt_llm.quantization import QuantAlgo
 
-from ..conftest import (llm_models_root, parametrize_with_ids, skip_pre_ada,
-                        skip_pre_blackwell, skip_pre_hopper)
+from ..conftest import (llm_models_root, parametrize_with_ids,
+                        skip_post_blackwell, skip_pre_ada, skip_pre_blackwell,
+                        skip_pre_hopper)
 from .accuracy_core import (GSM8K, MMLU, CnnDailymail, GPQADiamond,
                             LlmapiAccuracyTestHarness)
@@ -1015,6 +1016,7 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-30B-A3B"
 
     @skip_pre_hopper
+    @skip_post_blackwell
     @pytest.mark.parametrize(
         "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler",
         [(1, 1, 1, False, False, True)],

View File

@@ -14,11 +14,13 @@
 # limitations under the License.
 import csv
 import os
+from copy import deepcopy
 
 import pytest
-from defs.common import convert_weights, venv_check_call
-from defs.conftest import get_device_memory, skip_post_blackwell
+from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
+from defs.conftest import (get_device_memory, llm_models_root,
+                           skip_post_blackwell, skip_pre_hopper)
 from defs.trt_test_alternative import check_call
@@ -144,3 +146,182 @@ def test_llm_draft_target_model_1gpu(batch_size, data_type, draft_len,
             assert (
                 int(dt) == int(b)
             ), f"Output at ({bs=}, {index=}) is different ({dt} v.s. {b})."
+
+
+@skip_post_blackwell
+def test_llm_draft_target_llama_1gpu(llama_example_root, llm_venv, cmodel_dir,
+                                     engine_dir):
+    "RCCA https://nvbugs/5223130"
+    data_type = "float16"
+    max_batch_size = 4
+    max_draft_len = 10
+    max_input_len = 3200
+    max_seq_len = 4800
+    draft_model = os.path.join(llm_models_root(), "llama-3.2-models",
+                               "Llama-3.2-1B")
+    target_model = os.path.join(llm_models_root(), "llama-3.1-model",
+                                "Meta-Llama-3.1-8B")
+
+    print("Build checkpoint ...")
+    draft_model_dir = convert_weights(llm_venv=llm_venv,
+                                      example_root=llama_example_root,
+                                      cmodel_dir=cmodel_dir,
+                                      model="llama3-1b",
+                                      model_path=draft_model,
+                                      data_type=data_type)
+    target_model_dir = convert_weights(llm_venv=llm_venv,
+                                       example_root=llama_example_root,
+                                       cmodel_dir=cmodel_dir,
+                                       model="llama3-8b",
+                                       model_path=target_model,
+                                       data_type=data_type)
+
+    print("Build engines ...")
+    draft_engine_dir = os.path.join(engine_dir, "draft")
+    target_engine_dir = os.path.join(engine_dir, "target")
+    base_build_cmd = [
+        "trtllm-build",
+        f"--max_batch_size={max_batch_size}",
+        f"--max_input_len={max_input_len}",
+        f"--max_seq_len={max_seq_len}",
+        "--use_paged_context_fmha=enable",
+        f"--gemm_plugin={data_type}",
+        "--gather_generation_logits",
+    ]
+    draft_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={draft_model_dir}",
+        f"--output_dir={draft_engine_dir}",
+    ]
+    target_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={target_model_dir}",
+        "--speculative_decoding_mode=draft_tokens_external",
+        f"--max_draft_len={max_draft_len}",
+        f"--output_dir={target_engine_dir}",
+    ]
+    check_call(" ".join(draft_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+    check_call(" ".join(target_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+
+    print("Run inferences ...")
+    run_cmd = [
+        f"{llama_example_root}/../../../run.py",
+        f"--tokenizer_dir={target_model}",
+        f"--draft_engine_dir={draft_engine_dir}",
+        f"--engine_dir={target_engine_dir}",
+        "--draft_target_model_config=[4,[0],[0],True]", "--max_output_len=256",
+        "--kv_cache_enable_block_reuse",
+        "--kv_cache_free_gpu_memory_fraction=0.4",
+        f"--input_text='how does draft-sampling work'"
+    ]
+    venv_check_call(llm_venv, run_cmd)
+
+
+@skip_post_blackwell
+@skip_pre_hopper
+@pytest.mark.skip_less_device(2)
+@pytest.mark.skip_less_device_memory(80000)
+def test_llm_draft_target_llama_fp8_2gpu(llama_example_root, llm_venv,
+                                         qcache_dir, engine_dir,
+                                         llm_datasets_root):
+    "RCCA https://nvbugs/5257681"
+    data_type = "bfloat16"
+    max_batch_size = 16
+    max_draft_len = 5
+    max_input_len = 2000
+    max_seq_len = 4000
+    draft_model = os.path.join(llm_models_root(), "llama-3.1-model",
+                               "Meta-Llama-3.1-8B")
+    target_model = os.path.join(llm_models_root(), "llama-3.3-models",
+                                "Llama-3.3-70B-Instruct")
+    draft_quantized_dir = os.path.join(qcache_dir, "draft")
+    target_quantized_dir = os.path.join(qcache_dir, "target")
+
+    print("Build checkpoint ...")
+    quantize_cmd = [
+        f"{llama_example_root}/../../../quantization/quantize.py",
+        f"--calib_dataset={llm_datasets_root}/cnn_dailymail",
+        f"--dtype={data_type}",
+        "--qformat=fp8",
+        "--calib_size=32",
+        "--kv_cache_dtype=fp8",
+        "--tp_size=2",
+    ]
+    draft_quantized_cmd = quantize_cmd + [
+        f"--model_dir={draft_model}",
+        f"--output_dir={draft_quantized_dir}",
+    ]
+    target_quantized_cmd = quantize_cmd + [
+        f"--model_dir={target_model}",
+        f"--output_dir={target_quantized_dir}",
+    ]
+    venv_check_call(llm_venv, draft_quantized_cmd)
+    venv_check_call(llm_venv, target_quantized_cmd)
+
+    print("Build engines ...")
+    draft_engine_dir = os.path.join(engine_dir, "draft")
+    target_engine_dir = os.path.join(engine_dir, "target")
+    base_build_cmd = [
+        "trtllm-build",
+        f"--max_batch_size={max_batch_size}",
+        f"--max_input_len={max_input_len}",
+        f"--max_seq_len={max_seq_len}",
+        "--use_paged_context_fmha=enable",
+        "--gemm_plugin=auto",
+        "--workers=2",
+        "--gather_generation_logits",
+    ]
+    draft_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={draft_quantized_dir}",
+        f"--output_dir={draft_engine_dir}",
+    ]
+    target_model_build_cmd = base_build_cmd + [
+        f"--checkpoint_dir={target_quantized_dir}",
+        f"--output_dir={target_engine_dir}",
+        f"--max_draft_len={max_draft_len}",
+        "--speculative_decoding_mode=draft_tokens_external",
+    ]
+    check_call(" ".join(draft_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+    check_call(" ".join(target_model_build_cmd),
+               shell=True,
+               env=llm_venv._new_env)
+
+    INPUT_TEXT = "The United States of America (USA), also known as the United States (U.S.) or America, is a country located primarily in North America. It is a federal republic of 50 states and the federal capital district of Washington, D.C. The 48 contiguous states border Canada to the north and Mexico to the south, with the state of Alaska to the northwest and the islands of Hawaii in Oceania. Indian country includes 574 federally recognized tribes and 326 Indian reservations with tribal sovereignty rights. The U.S. asserts sovereignty over five major island territories and various uninhabited islands in the Pacific Ocean and the Caribbean. It has the world's third-largest land area[c] and third-largest population, exceeding 340 million. Paleo-Indians migrated to North America across"
+
+    print("Run inferences ...")
+    run_cmd = [
+        f"{llama_example_root}/../../../run.py",
+        f"--tokenizer_dir={draft_model}",
+        f"--draft_engine_dir={draft_engine_dir}",
+        f"--engine_dir={target_engine_dir}",
+        f"--input_text={INPUT_TEXT}",
+        "--draft_target_model_config=[3,[0,1],[0,1],False]",
+        "--max_output_len=800",
+        "--kv_cache_enable_block_reuse",
+        "--kv_cache_free_gpu_memory_fraction=0.3",
+        "--run_profiling",
+    ]
+    venv_mpi_check_call(llm_venv, ["mpirun", "-n", "1", "--allow-run-as-root"],
+                        run_cmd)
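
For readers of the two tests above: --draft_target_model_config takes a Python-literal list. The field meanings below are inferred from how the two tests vary the values, not documented in this diff, so treat them as assumptions:

import ast

# "[4,[0],[0],True]" in the 1-GPU test; "[3,[0,1],[0,1],False]" in the 2-GPU test.
draft_len, draft_devices, target_devices, use_logits = ast.literal_eval(
    "[4,[0],[0],True]")
assert draft_len == 4          # draft tokens proposed per step (assumed)
assert draft_devices == [0]    # GPU ids hosting the draft engine (assumed)
assert target_devices == [0]   # GPU ids hosting the target engine (assumed)
assert use_logits is True      # accept drafts via logits, not tokens (assumed)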

View File

@@ -303,6 +303,7 @@ def test_mistral_eagle_1gpu(llm_mistral_model_root,
                             llm_rouge_root=llm_rouge_root)
 
 
+@skip_post_blackwell
 @skip_pre_ada
 @pytest.mark.parametrize("use_dynamic_tree", [False, True],
                          ids=['eagle1', 'eagle2'])

View File

@@ -1065,6 +1065,7 @@ def test_llm_llama_v3_1_autoq_2gpu_mmlu(llama_example_root, llama_model_root,
     check_call(" ".join(mmlu_cmd), shell=True, env=llm_venv._new_env)
 
 
+@skip_post_blackwell
 @pytest.mark.skip_less_device(2)
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.parametrize("use_auto_parallel", [True, False],

View File

@@ -621,12 +621,13 @@ def _test_llm_multimodal_general(llm_venv,
     'cogvlm-chat',
     'fuyu-8b',
     'deplot',
-    'neva-22b',
+    pytest.param('neva-22b',
+                 marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
     'kosmos-2',
     'video-neva',
     pytest.param('Phi-3-vision-128k-instruct', marks=skip_post_blackwell),
-    'Phi-3.5-vision-instruct',
-    'Phi-4-multimodal-instruct',
+    pytest.param('Phi-3.5-vision-instruct', marks=skip_post_blackwell),
+    pytest.param('Phi-4-multimodal-instruct', marks=skip_post_blackwell),
     'Llama-3.2-11B-Vision',
     'Qwen2-VL-7B-Instruct',
     'internlm-xcomposer2-vl-7b',
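
The hunk above converts plain parametrize entries into pytest.param so that individual models carry their own marks. A self-contained illustration of that mechanism (model names reused from the diff; the skip_post_blackwell stub and test body are placeholders):

import pytest

# Stand-in for the real conftest marker, just to make the example runnable.
skip_post_blackwell = pytest.mark.skip(reason="placeholder marker")

@pytest.mark.parametrize("model", [
    "fuyu-8b",
    pytest.param("neva-22b",
                 marks=pytest.mark.skip(reason="RCCA https://nvbugs/5220761")),
    pytest.param("Phi-3.5-vision-instruct", marks=skip_post_blackwell),
])
def test_param_marks(model):
    # Placeholder body; the real tests dispatch into _test_llm_multimodal_general.
    assert isinstance(model, str)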

View File

@@ -1639,6 +1639,7 @@ def test_ptp_quickstart_advanced_deepseek_r1_8gpus(llm_root, llm_venv,
         _check_mem_usage(running_log, [106.3, 0, 0, 0], 8)
 
 
+@skip_post_blackwell
 @pytest.mark.skip_less_device_memory(110000)
 @pytest.mark.skip_less_device(8)
 @pytest.mark.parametrize("model_name,model_path", [

View File

@@ -95,6 +95,8 @@ examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1]
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
 examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-llama_v2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
+examples/test_draft_target_model.py::test_llm_draft_target_llama_1gpu
+examples/test_draft_target_model.py::test_llm_draft_target_llama_fp8_2gpu
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1]
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2]
 examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs1]
@@ -184,8 +186,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
@@ -199,7 +199,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]

View File

@@ -46,7 +46,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
 examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:2-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
@@ -56,7 +55,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-fl
 examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
 examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]

View File

@@ -236,7 +236,6 @@ l0_h100:
   - accuracy/test_llm_api.py::TestMistral_Nemo_12B_Base::test_fp8
   - examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] # 7 mins
   - examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
-  - examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]

View File

@@ -96,7 +96,6 @@ l0_l40s:
   - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_cpp_session-tp1] # 10 mins
   - examples/test_gpt.py::test_llm_gpt2_next_prompt_tuning[use_py_session-tp1]
   - examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] #4 mins
-  - examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
   - examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
   - examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-fp8] # 18mins

View File

@@ -31,7 +31,6 @@ full:L40S/examples/test_commandr.py::test_llm_commandr_plus_4gpus_summary[disabl
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/4955671)
 examples/test_whisper.py::test_llm_whisper_general[large-v3-enable_gemm_plugin-enable_attention_plugin-enable_weight_only-float16-nb:1-use_python_runtime] SKIP (https://nvbugs/4967883)
 full:GH200/cpp/test_e2e.py::test_model[fp8-gptj-90] SKIP (https://nvbugspro.nvidia.com/bug/4979920)
-full:GH200/examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugspro.nvidia.com/bug/4979845)
 full:GH200/unittest/trt/model_api/test_model_quantization.py SKIP (https://nvbugspro.nvidia.com/bug/4979955)
 examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu[gemma-7b-int8_sq-bfloat16-8] SKIP (https://nvbugs/4988782)
 examples/test_llama.py::test_llm_llama_v3_8b_1048k_long_context_ppl[SlimPajama-6B-Llama-3-8B-Instruct-Gradient-1048k] SKIP (https://nvbugs/4993898)
@@ -370,7 +369,6 @@ full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:in
 full:RTX_PRO_6000_Blackwell_Server_Edition/perf/test_perf.py::test_perf[quant:int8_sq_per_token_channel] SKIP (https://nvbugspro.nvidia.com/bug/5161074)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5174573)
 examples/test_qwen.py::test_llm_qwen_moe_single_gpu_summary[qwen1.5_moe_a2.7b_chat-enable_paged_kv_cache-enable_remove_input_padding-enable_weight_only-enable_fmha] SKIP (https://nvbugs/5180961)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214245)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-disable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-no_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
 examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_py_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin] SKIP (https://nvbugs/5214221)
@@ -386,12 +384,9 @@ examples/test_eagle.py::test_mistral_eagle_1gpu[mistral-7b-v0.1-eagle2] SKIP (ht
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5214239)
 examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5214239)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5220761)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5220761)
 examples/test_cli_flow.py::TestSantacoder::test_auto_dtype SKIP (https://nvbugs/5219531)
 examples/test_medusa.py::test_llama_medusa_1gpu[llama-3.1-8b] SKIP (https://nvbugs/5219535)
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-4-mini-instruct-fp8-bfloat16] SKIP (https://nvbugspro.nvidia.com/bug/5226339)
-examples/test_multimodal.py::test_llm_multimodal_general[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:True-nb:1] SKIP (https://nvbugs/5227342)
 perf/test_perf.py::test_perf[t5-bench-float16-input_output_len:128,20] SKIP # https://nvbugspro.nvidia.com/bug/5207477
 perf/test_perf.py::test_perf[flan_t5_base-bench-float16-input_output_len:128,20] SKIP
 perf/test_perf.py::test_perf[flan_t5_large-bench-float16-input_output_len:128,20] SKIP
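
The waive entries above follow one line per test: a pytest node ID, the keyword SKIP, and usually a tracking bug in parentheses (a few carry the bug as a trailing comment instead). A hedged parser for the common parenthesized form, for illustration only:

import re

line = ("examples/test_multimodal.py::test_llm_multimodal_general"
        "[neva-22b-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1] "
        "SKIP (https://nvbugs/5214245)")
# Node IDs contain no whitespace, so \S+ captures the full bracketed id.
m = re.match(r"^(?P<test>\S+)\s+SKIP\s+\((?P<bug>[^)]+)\)", line)
assert m is not None
assert m.group("bug") == "https://nvbugs/5214245"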