[TRTQA-2920][chore] improve hang tests (#6781)

Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>

parent 8845e0f065
commit e35fca4272
@@ -505,11 +505,12 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
         task = JsonModeEval(self.MODEL_NAME)
         task.evaluate(llm)

     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize("tp,pp", [(1, 2), (2, 1), (2, 2)],
                              ids=["tp1pp2", "tp2pp1", "tp2pp2"])
     @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
     def test_tp_pp_symmetric(self, tp, pp, testset):
+        if tp * pp * 2 > get_device_count():
+            pytest.skip(f"Not enough devices for tp={tp}*pp={pp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, pp, tp, pp,
                                  tp, get_accuracy_task(testset))
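The added guard is the substance of the hang fix: a multi-rank job launched with more ranks than visible GPUs never completes its first collective, so the test stalls instead of failing fast. The factor of 2 apparently accounts for the separate context and generation instances that run_parallel_test launches. A minimal sketch of the pattern, assuming get_device_count() is a thin wrapper over torch.cuda.device_count() (the wrapper below is an assumption for illustration, not the repo's helper):

import pytest
import torch


def get_device_count() -> int:
    # Assumed stand-in for the repo's helper: number of visible CUDA devices.
    return torch.cuda.device_count()


def guard_world_size(world_size: int) -> None:
    # Skip up front rather than let a collective wait forever on ranks
    # that can never be scheduled.
    if get_device_count() < world_size:
        pytest.skip(f"Needs {world_size} GPUs, found {get_device_count()}")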
@@ -517,6 +518,9 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
     @parametrize_with_ids("gen_tp", [1, 2])
     @pytest.mark.parametrize("testset", ["GSM8K", "MMLU"])
     def test_ctx_pp_gen_tp_asymmetric(self, ctx_pp, gen_tp, testset):
+        if ctx_pp * gen_tp * 2 > get_device_count():
+            pytest.skip(
+                f"Not enough devices for ctx_pp={ctx_pp}*gen_tp={gen_tp} test")
         return run_parallel_test(self.MODEL_NAME, self.MODEL_PATH, ctx_pp, 1, 1,
                                  gen_tp, get_accuracy_task(testset))
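This test and the DeepSeek one below use a parametrize_with_ids helper alongside the stock pytest.mark.parametrize. A plausible sketch of what it does, assuming it merely derives readable test IDs of the form name=value (the real helper lives in the repo's test utilities and may differ):

import pytest


def parametrize_with_ids(name, values):
    # Assumed behavior: plain pytest.mark.parametrize plus generated IDs,
    # e.g. "gen_tp=1" / "gen_tp=2", so test reports stay readable.
    return pytest.mark.parametrize(
        name, values, ids=[f"{name}={v}" for v in values])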
@@ -527,6 +531,7 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
     MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"

+    @pytest.mark.skip_less_device(8)
     @pytest.mark.parametrize("overlap_scheduler", [False, True])
     def test_auto_dtype(self, overlap_scheduler):
         ctx_server_config = {"disable_overlap_scheduler": True}
@@ -565,6 +570,7 @@ class TestDeepSeekV3Lite(LlmapiAccuracyTestHarness):
     MODEL_NAME = "deepseek-ai/DeepSeek-V3-Lite"
     MODEL_PATH = f"{llm_models_root()}/DeepSeek-V3-Lite/bf16"

+    @pytest.mark.skip_less_device(8)
     @parametrize_with_ids("overlap_scheduler", [True, False])
     @parametrize_with_ids("mtp_nextn",
                           [0, pytest.param(2, marks=skip_pre_hopper)])
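skip_less_device(8) is a custom marker, not a pytest built-in. One way such a marker can be honored from conftest.py, assuming it takes the minimum device count as its only argument (the hook body below is an assumption about the repo's implementation, not a copy of it):

import pytest
import torch


def pytest_collection_modifyitems(config, items):
    # Runs once after collection; converts the marker into a skip when the
    # machine has fewer GPUs than the test declares it needs.
    available = torch.cuda.device_count()
    for item in items:
        marker = item.get_closest_marker("skip_less_device")
        if marker is not None and available < marker.args[0]:
            item.add_marker(pytest.mark.skip(
                reason=f"needs {marker.args[0]} devices, found {available}"))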
@@ -250,7 +250,7 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
                 enable_padding=True),
         )
         kv_cache_config = KvCacheConfig(
-            enable_block_reuse=True
+            enable_block_reuse=True, free_gpu_memory_fraction=0.8
         )  # both one-model and two-model support this feature

         eagle_model_dir = f"{llm_models_root()}/EAGLE3-LLaMA3.1-Instruct-8B"
@@ -280,7 +280,8 @@ class TestLlama3_1_8BInstruct(LlmapiAccuracyTestHarness):
             cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
         )

-        kv_cache_config = KvCacheConfig(enable_block_reuse=False)
+        kv_cache_config = KvCacheConfig(enable_block_reuse=False,
+                                        free_gpu_memory_fraction=0.8)

         spec_config = NGramDecodingConfig(
             max_draft_len=4,
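Both KvCacheConfig changes cap the KV cache at 80% of free GPU memory instead of letting it claim everything; the headroom presumably goes to the draft/EAGLE3 engines these speculative-decoding tests load alongside the target model, which otherwise contend for memory and can stall. A minimal sketch of wiring the capped config into the LLM API, using only the tensorrt_llm.llmapi surface this diff already imports and a placeholder model path:

from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

kv_cache_config = KvCacheConfig(
    enable_block_reuse=False,
    # Leave ~20% of free VRAM for the draft model and runtime buffers.
    free_gpu_memory_fraction=0.8,
)

llm = LLM(model="/path/to/Llama-3.1-8B-Instruct",  # placeholder path
          kv_cache_config=kv_cache_config)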