Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00.
tests: update sanity tests & fix tests (#5906)

Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>

Commit 509363d858 (parent f4e0425a7b).
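The diff below applies one mechanical refactor across the accuracy tests: instead of binding `llm = LLM(...)`, asserting on it, and only then entering `with llm:`, each test now constructs the object directly in the `with` statement (`with LLM(...) as llm:`) and moves the post-construction assertions inside the block, so the engine is torn down even when an assertion fails. The sketch below illustrates the pattern only; `FakeLLM` is a hypothetical stand-in for illustration, not the real `LLM` class or test harness used by these tests.

class FakeLLM:
    """Stand-in for the real LLM class; only the context-manager shape matters here."""

    def __init__(self, model: str, **kwargs):
        self.model = model
        self.kwargs = kwargs

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        # The real class shuts down its executor here, even if the body raised.
        return False


# Old style (the removed lines): the object is built first, asserted on,
# and only then entered, so a failed assert leaves the engine running.
llm = FakeLLM("Llama-3.1-8B-Instruct-FP8", enable_chunked_prefill=True)
assert llm.kwargs["enable_chunked_prefill"]
with llm:
    pass  # task.evaluate(llm)

# New style (the added lines): construction, assertions, and evaluation
# all live inside one with-block, so cleanup always runs.
with FakeLLM("Llama-3.1-8B-Instruct-FP8", enable_chunked_prefill=True) as llm:
    assert llm.kwargs["enable_chunked_prefill"]
    # task.evaluate(llm)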
@@ -84,11 +84,10 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
attn_backend=attn_backend,
# https://nvbugspro.nvidia.com/bug/5345391
disable_overlap_scheduler=True)
llm = LLM(self.MODEL_PATH,
enable_chunked_prefill=True,
max_num_tokens=512,
**pytorch_config)
with llm:
with LLM(self.MODEL_PATH,
enable_chunked_prefill=True,
max_num_tokens=512,
**pytorch_config) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@@ -107,8 +106,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
)
llm = LLM(self.MODEL_PATH, **pytorch_config)
with llm:
with LLM(self.MODEL_PATH, **pytorch_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -134,11 +132,10 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
attn_backend=attn_backend,
disable_overlap_scheduler=torch_compile,
)
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
**pytorch_config)
with llm:
with LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
**pytorch_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -162,14 +159,13 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config["kv_cache_dtype"] = "fp8"
llm = LLM(
f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
quant_config=quant_config,
**pytorch_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
with llm:
with LLM(
f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
quant_config=quant_config,
**pytorch_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -201,31 +197,30 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
if fp8kv:
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config["kv_cache_dtype"] = "fp8"
llm = LLM(
f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
quant_config=quant_config,
**pytorch_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
with llm:
with LLM(
f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
quant_config=quant_config,
**pytorch_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@skip_pre_hopper
def test_fp8_llm_sampler(self):
model_path = f"{llm_models_root()}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
llm = LLM(model_path, enable_trtllm_sampler=True, max_batch_size=256)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
with LLM(model_path, enable_trtllm_sampler=True,
max_batch_size=256) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8

sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
)
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
)

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm,
sampling_params=sampling_params,
@@ -245,13 +240,11 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
speculative_model_dir=eagle_model_dir)

llm = LLM(model=target_model_dir,
**pytorch_config,
kv_cache_config=kv_cache_config,
speculative_config=spec_config,
build_config=None)

with llm:
with LLM(model=target_model_dir,
**pytorch_config,
kv_cache_config=kv_cache_config,
speculative_config=spec_config,
build_config=None) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)

@@ -269,12 +262,10 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
is_public_pool=True,
)

llm = LLM(model=self.MODEL_PATH,
**pytorch_config,
kv_cache_config=kv_cache_config,
speculative_config=spec_config)

with llm:
with LLM(model=self.MODEL_PATH,
**pytorch_config,
kv_cache_config=kv_cache_config,
speculative_config=spec_config) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
@@ -291,17 +282,17 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
task = JsonModeEval(self.MODEL_NAME)
task.evaluate(llm)

@pytest.mark.timeout(7200)
@pytest.mark.skip_less_device(4)
@pytest.mark.parametrize("backend", ["xgrammar", "llguidance"])
def test_guided_decoding_4gpus(self, backend: str, mocker):
mocker.patch.dict(os.environ, {"TRTLLM_XGUIDANCE_LENIENT": "1"})
llm = LLM(self.MODEL_PATH,
guided_decoding_backend=backend,
disable_overlap_scheduler=True,
cuda_graph_config=CudaGraphConfig(),
tensor_parallel_size=2,
pipeline_parallel_size=2)
with llm:
with LLM(self.MODEL_PATH,
guided_decoding_backend=backend,
disable_overlap_scheduler=True,
cuda_graph_config=CudaGraphConfig(),
tensor_parallel_size=2,
pipeline_parallel_size=2) as llm:
task = JsonModeEval(self.MODEL_NAME)
task.evaluate(llm)

@@ -591,12 +582,11 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
mtp_config = None
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
llm = LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
**pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with llm:
with LLM(self.MODEL_PATH,
kv_cache_config=kv_cache_config,
**pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -634,15 +624,14 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
mtp_config = None
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with llm:
with LLM(self.MODEL_PATH,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -686,18 +675,17 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn,
use_mtp_vanilla=True)

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -742,20 +730,19 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)

llm = LLM(
f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config,
)
with LLM(
f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config,
) as llm:

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -773,12 +760,11 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
padding_enabled=True,
),
)
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
**pytorch_config,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
with llm:
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
kv_cache_config=kv_cache_config,
**pytorch_config,
speculative_config=mtp_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -799,15 +785,14 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
quant_config = QuantConfig()
quant_config.quant_algo = QuantAlgo.FP8_BLOCK_SCALES

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=4,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
with llm:
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=4,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -856,21 +841,20 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -926,23 +910,21 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)

llm = LLM(
f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config,
)
with LLM(
f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config,
) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -966,13 +948,12 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
pytorch_backend_options = dict(cuda_graph_config=CudaGraphConfig(),
moe_backend="WIDEEP",
moe_load_balancer=eplb_config)
llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=4,
moe_expert_parallel_size=4,
kv_cache_config=kv_cache_config,
**pytorch_backend_options,
enable_attention_dp=True)
with llm:
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/fp8",
tensor_parallel_size=4,
moe_expert_parallel_size=4,
kv_cache_config=kv_cache_config,
**pytorch_backend_options,
enable_attention_dp=True) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -990,14 +971,13 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
mtp_config = None
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
llm = LLM(self.MODEL_PATH,
tensor_parallel_size=4,
moe_expert_parallel_size=4,
kv_cache_config=kv_cache_config,
enable_attention_dp=True,
**pytorch_config,
speculative_config=mtp_config)
with llm:
with LLM(self.MODEL_PATH,
tensor_parallel_size=4,
moe_expert_parallel_size=4,
kv_cache_config=kv_cache_config,
enable_attention_dp=True,
**pytorch_config,
speculative_config=mtp_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1018,14 +998,13 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_backend_options["kv_cache_dtype"] = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=4,
moe_expert_parallel_size=4,
kv_cache_config=kv_cache_config,
**pytorch_backend_options,
enable_attention_dp=True,
quant_config=quant_config)
with llm:
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only",
tensor_parallel_size=4,
moe_expert_parallel_size=4,
kv_cache_config=kv_cache_config,
**pytorch_backend_options,
enable_attention_dp=True,
quant_config=quant_config) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1066,18 +1045,16 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config["kv_cache_dtype"] = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1130,21 +1107,19 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config["kv_cache_dtype"] = "fp8"

llm = LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with LLM(f"{llm_models_root()}/DeepSeek-V3-Lite/nvfp4_moe_only_mtp",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1195,22 +1170,19 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config["kv_cache_dtype"] = "fp8"

llm = LLM(model_path,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with LLM(model_path,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
if quant_dtype == "fp8":
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
elif quant_dtype == "nvfp4":
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4

if quant_dtype == "fp8":
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
elif quant_dtype == "nvfp4":
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4

if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1258,24 +1230,23 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
quant_config.kv_cache_quant_algo = QuantAlgo.FP8
pytorch_config["kv_cache_dtype"] = "fp8"

llm = LLM(model_path,
kv_cache_config=kv_cache_config,
enable_chunked_prefill=True,
max_num_tokens=512,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=True,
speculative_config=mtp_config)
with LLM(model_path,
kv_cache_config=kv_cache_config,
enable_chunked_prefill=True,
max_num_tokens=512,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=True,
speculative_config=mtp_config) as llm:

if quant_dtype == "fp8":
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
elif quant_dtype == "nvfp4":
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if quant_dtype == "fp8":
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
elif quant_dtype == "nvfp4":
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4

if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1370,23 +1341,22 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
mtp_config = None
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
llm = LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
max_batch_size=max_batch_size,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1-FP4",
max_batch_size=max_batch_size,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:

assert llm.args.moe_backend == moe_backend
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
assert llm.args.moe_backend == moe_backend
assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
@@ -1421,21 +1391,20 @@ class TestDeepSeekR1(LlmapiAccuracyTestHarness):
mtp_config = None
if mtp_nextn > 0:
mtp_config = MTPDecodingConfig(num_nextn_predict_layers=mtp_nextn)
llm = LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1",
max_batch_size=max_batch_size,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config)
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
with LLM(f"{llm_models_root()}/DeepSeek-R1/DeepSeek-R1",
max_batch_size=max_batch_size,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
kv_cache_config=kv_cache_config,
**pytorch_config,
quant_config=quant_config,
enable_attention_dp=attention_dp,
speculative_config=mtp_config) as llm:
assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES
if fp8kv:
assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8

with llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
@@ -1646,13 +1615,12 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp)
with llm:
with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
@@ -1668,13 +1636,12 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp)
with llm:
with LLM(f"{llm_models_root()}/Qwen3/Qwen3-8B",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
@@ -1696,13 +1663,12 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp)
with llm:
with LLM(f"{llm_models_root()}/Qwen3/Qwen3-30B-A3B-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1717,14 +1683,12 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp)
with llm:
with LLM(f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_fp8_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1764,14 +1728,13 @@ class TestQwen3_30B_A3B(LlmapiAccuracyTestHarness):
moe_backend=moe_backend,
)

llm = LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp)
with llm:
with LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp) as llm:
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)

@@ -1790,13 +1753,12 @@ class TestQwen3_32B(LlmapiAccuracyTestHarness):
disable_overlap_scheduler=not overlap_scheduler,
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

llm = LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp)
with llm:
with LLM(f"{llm_models_root()}/Qwen3/Qwen3-32B-FP8",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp) as llm:
task = CnnDailymail(self.MODEL_NAME)
task.evaluate(llm)
task = MMLU(self.MODEL_NAME)
@@ -1819,15 +1781,14 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
cuda_graph_config=CudaGraphConfig() if cuda_graph else None)

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
llm = LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp,
kv_cache_config=kv_cache_config)
with llm:
with LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp,
kv_cache_config=kv_cache_config) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
@@ -1849,15 +1810,14 @@ class TestQwen3_235B_A22B(LlmapiAccuracyTestHarness):
moe_backend=moe_backend)

kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6)
llm = LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp,
kv_cache_config=kv_cache_config)
with llm:
with LLM(
f"{llm_models_root()}/Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
moe_expert_parallel_size=ep_size,
**pytorch_config,
enable_attention_dp=attention_dp,
kv_cache_config=kv_cache_config) as llm:
task = MMLU(self.MODEL_NAME)
task.evaluate(llm)
task = GSM8K(self.MODEL_NAME)
@@ -169,8 +169,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instr
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4]
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
@@ -194,8 +192,6 @@ examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:2-bfloat16-bs:1-cpp_e2e:False-nb:1]
# Multimodal Executor Cpp E2E Tests
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
@@ -1,157 +1,102 @@
examples/test_bert.py::test_llm_bert_general[compare_hf-enable_remove_input_padding-use_attention_plugin-enable_context_fmha-tp:1-pp:1-float16-BertModel-bert/bert-base-uncased]
examples/test_chatglm.py::test_llm_glm_4_9b_single_gpu_summary[glm-4-9b-chat-enable_weight_only]
examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[disable_weight_only]
examples/test_commandr.py::test_llm_commandr_v01_single_gpu_summary[enable_weight_only]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_logits-draft_len_8-float16-bs1]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[no_streaming-llama_v2-use_cpp_session-use_tokens-draft_len_4-float16-bs2]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_logits-draft_len_4-float16-bs2]
examples/test_draft_target_model.py::test_llm_draft_target_model_1gpu[streaming-gpt2-use_cpp_session-use_tokens-draft_len_8-float16-bs1]
examples/test_eagle.py::test_llm_eagle_1gpu_modelopt_ckpt[llama3.1-eagle-8b-hf_v0.5-float16-bs8]
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle1]
examples/test_eagle.py::test_llm_eagle_1gpu[EAGLE-Vicuna-7B-v1.3-float16-bs1-eagle2]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-bart-large-cnn-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-disable_fp8]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:1-pp:1-nb:1-enable_fp8]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-flan-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-disable_fp8]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-mbart-large-50-many-to-one-mmt-float16-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:2-nb:1-enable_fp8]
examples/test_enc_dec.py::test_llm_enc_dec_general[compare_hf-t5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-enable_fp8]
examples/test_enc_dec.py::test_llm_enc_dec_general[no_compare_hf-byt5-small-float32-enable_gemm_plugin-enable_attention_plugin-enable_paged_kv_cache-tp:2-pp:1-nb:1-disable_fp8]
examples/test_exaone.py::test_llm_exaone_1gpu[enable_weight_only-exaone_3.0_7.8b_instruct-float16-nb:1]
examples/test_exaone.py::test_llm_exaone_2gpu[exaone_3.0_7.8b_instruct-float16-nb:1]
examples/test_gemma.py::test_llm_gemma_1gpu_summary_vswa[gemma-3-1b-it-other-bfloat16-8]
examples/test_gemma.py::test_llm_hf_gemma_quantization_1gpu_vswa[gemma-3-1b-it-fp8-bfloat16-8]
examples/test_gemma.py::test_hf_gemma_fp8_base_bf16_multi_lora[gemma-3-1b-it]
examples/test_gpt.py::test_llm_gpt_starcoder_lora_1gpu[peft-lora-starcoder2-15b-unity-copilot-starcoder2-lora_fp16-base_fp8]
examples/test_gpt.py::test_streaming_beam[batch_size_3-return_all_generated_tokens-num_beams_4]
examples/test_llama.py::test_llm_llama_lookahead_xqa_fp8_1gpu[llama-3.2-1b]
examples/test_llama.py::test_llm_llama_v1_4gpu_paged_kv_cache[llama-3.1-8b]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba-codestral-7B-v0.1-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-1.3b-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-130m-float16-enable_gemm_plugin]
examples/test_mamba.py::test_llm_mamba_1gpu[mamba2-370m-float16-enable_gemm_plugin]
examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1]
examples/test_medusa.py::test_llm_medusa_1gpu[use_cpp_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs1]
examples/test_medusa.py::test_llm_medusa_1gpu[use_py_session-medusa-vicuna-7b-v1.3-4-heads-bfloat16-bs8]
examples/test_mixtral.py::test_llm_mixtral_moe_plugin_fp8_lora_4gpus[Mixtral-8x7B-v0.1-chinese-mixtral-lora]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3-vision-128k-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Phi-3.5-vision-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Qwen2-VL-7B-Instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:4]
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[deplot-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-hf-vision-trtllm-pp:1-tp:2-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[video-neva-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[Llama-3.2-11B-Vision-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False-nb:1]
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-scienceqa-Llama-3.2-11B-Vision-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False]
# Multimodal Executor Cpp E2E Tests
examples/test_multimodal.py::test_llm_multimodal_general[VILA1.5-3b-pp:1-tp:1-float16-bs:8-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]
examples/test_multimodal.py::test_llm_multimodal_general[llava-1.5-7b-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:True-nb:1]

examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-full_prec]
examples/test_nemotron.py::test_llm_nemotron_3_8b_1gpu[bfloat16-int4_awq]
examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-fp8]
examples/test_nemotron.py::test_llm_nemotron_4_15b_1gpu[bfloat16-full_prec]
examples/test_nemotron_nas.py::test_nemotron_nas_summary_1gpu[DeciLM-7B]
examples/test_phi.py::test_llm_phi_1node_2gpus_summary[Phi-3.5-MoE-instruct-nb:1]
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp16]
examples/test_phi.py::test_llm_phi_lora_1gpu[Phi-3-mini-4k-instruct-ru-lora-Phi-3-mini-4k-instruct-lora_fp16-base_fp8]
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3.5-mini-instruct-fp8-float16]
examples/test_prompt_lookup.py::test_llm_prompt_lookup_1gpu[no_streaming-gpt2-use_cpp_session-use_tokens-max_matching_ngram_size_2-prompt_lookup_num_tokens_8-float16-bs2]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-disable_quant-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-fp8-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int8_sq-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_recurrentgemma.py::test_llm_recurrentgemma_1gpu[use_cpp_session-recurrentgemma-2b-use_paged_cache-int4_awq-float16-enable_attn_plugin-enable_gemm_plugin]
examples/test_redrafter.py::test_llm_redrafter_1gpu[use_cpp_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb8-bs8]
examples/test_redrafter.py::test_llm_redrafter_1gpu[use_py_session-redrafter-vicuna-7b-v1.3-bfloat16-dl5-nb5-bs8]
examples/test_whisper.py::test_llm_whisper_general[large-v3-disable_gemm_plugin-enable_attention_plugin-disable_weight_only-float16-nb:1-use_cpp_runtime]
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
test_e2e.py::test_llama_e2e[use_py_session--]
llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-codellama/CodeLlama-7b-Instruct-hf] # 5min
llmapi/test_llm_e2e.py::test_llmapi_load_engine_from_build_command[llama-llama-models/llama-7b-hf] # 5min
test_e2e.py::test_mistral_e2e[use_cpp_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session-remove_input_padding--]
test_e2e.py::test_mistral_e2e[use_py_session---]
test_e2e.py::test_openai_multi_chat_example
test_e2e.py::test_openai_consistent_chat

# Accuracy test list
accuracy/test_cli_flow.py::TestStarcoder2_3B::test_auto_dtype
accuracy/test_cli_flow.py::TestMinitron4BBase::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Mini4kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Mini128kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Small8kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3Small128kInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi3_5MiniInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestPhi4MiniInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_fp8
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[disable_norm_quant_fusion-disable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_8BInstruct::test_nvfp4_gemm_plugin[enable_norm_quant_fusion-enable_fused_quant]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_medusa_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_cli_flow.py::TestMistral7B::test_fp8_tp4pp2
accuracy/test_cli_flow.py::TestMistral7B::test_beam_search
accuracy/test_cli_flow.py::TestMixtral8x7B::test_fp8_tp2pp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int4_tp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_weight_only_int8_tp2
accuracy/test_cli_flow.py::TestMixtral8x7B::test_plugin_tp8[renormalize-tensor_parallel]
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_trtllmgen]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp4]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8]
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales[mtp=disable-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=2-fp8kv=False-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype_gsm8k
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
accuracy/test_llm_api_pytorch.py::TestNemotronNas::test_auto_dtype_tp8
accuracy/test_llm_api_pytorch.py::TestQwen2_7BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype

# Pivot to Pytorch test cases.
accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_load_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_conditional_disaggregation[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_events[TinyLlama-1.1B-Chat-v1.0]
test_e2e.py::test_openai_consistent_chat
test_e2e.py::test_openai_multi_chat_example
test_e2e.py::test_ptp_quickstart
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-405B-FP8-llama-3.1-model/Llama-3.1-405B-Instruct-FP8]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-BF16-llama-3.1-model/Meta-Llama-3.1-70B]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-BF16-Mixtral-8x7B-v0.1]
test_e2e.py::test_ptp_quickstart_advanced_8gpus[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Llama-3.2-11B-Vision]
test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False]
test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2-vl-7b-instruct-Qwen2-VL-7B-Instruct-video-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-image-True]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False]
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True]
test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
test_e2e.py::test_qwen_e2e_cpprunner_large_new_tokens[DeepSeek-R1-Distill-Qwen-1.5B-DeepSeek-R1-Distill-Qwen-1.5B]
test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
test_e2e.py::test_trtllm_bench_pytorch_backend_sanity[meta-llama/Llama-3.1-8B-llama-3.1-8b-hf-nvfp4-False-False]

# PyTorch flow disaggregated tests
disaggregated/test_disaggregated.py::test_disaggregated_multi_gpu_with_mpirun[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_single_gpu_with_mpirun_trt_backend[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_cuda_graph[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_attention_dp_one_mtp[DeepSeek-V3-Lite-fp8]
disaggregated/test_disaggregated.py::test_disaggregated_cache_aware_balance[TinyLlama-1.1B-Chat-v1.0]
disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
test_e2e.py::test_trtllm_benchmark_serving

@@ -449,3 +449,9 @@ examples/test_multimodal.py::test_llm_multimodal_general[llava-v1.6-mistral-7b-h
examples/test_multimodal.py::test_llm_fp8_multimodal_general[fp8-fp8-cnn_dailymail-Qwen2-VL-7B-Instruct-pp:1-tp:1-bfloat16-bs:1-cpp_e2e:False] SKIP (https://nvbugs/5385987)
examples/test_multimodal.py::test_llm_multimodal_general[Phi-4-multimodal-instruct-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5385992)
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[throughput_tp8] SKIP (https://nvbugs/5377914)
test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B] SKIP (https://nvbugs/5387375)
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-video-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423)
examples/test_multimodal.py::test_llm_multimodal_general[llava-onevision-qwen2-7b-ov-hf-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387423)
examples/test_multimodal.py::test_llm_multimodal_general[kosmos-2-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387422)
examples/test_multimodal.py::test_llm_multimodal_general[fuyu-8b-pp:1-tp:1-float16-bs:1-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5387424)
test_e2e.py::test_ptp_quickstart SKIP (https://nvbugs/5387762)