[https://nvbugs/5625990][chore] Add test coverage for current incapability of the KV cache manager (#8829)

Signed-off-by: eopXD <yuehtingc@nvidia.com>
Yueh-Ting (eop) Chen, 2025-11-04 16:35:45 +08:00
parent 67208f1512
commit bd1c9c0af4
2 changed files with 67 additions and 0 deletions

accuracy/test_llm_api_pytorch.py

@@ -1076,6 +1076,21 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=False,
            enable_partial_reuse=False,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
            free_gpu_memory_fraction=0.1,
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
@@ -1089,6 +1104,54 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_reuse_partial_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            enable_partial_reuse=True,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            enable_partial_reuse=False,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
            free_gpu_memory_fraction=0.1,
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    @pytest.mark.skip(
        reason=
        "Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
    )
    def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            enable_partial_reuse=True,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
            free_gpu_memory_fraction=0.1,
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_chunked_prefill_without_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(

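All four new cases share one VSWA (variable sliding window attention) layout and differ only in the reuse and memory knobs. As a minimal sketch, assuming the TensorRT-LLM LLM API import path (tensorrt_llm.llmapi.KvCacheConfig) and a hypothetical helper name not present in this commit, the matrix the tests walk reduces to:

# Sketch only, not part of the diff: make_vswa_config is a hypothetical
# helper capturing the three knobs the new tests vary.
from tensorrt_llm.llmapi import KvCacheConfig

def make_vswa_config(reuse: bool, partial: bool,
                     low_memory: bool) -> KvCacheConfig:
    """Build the VSWA KV cache config shared by the tests above: five
    512-token window entries plus one 32768-token entry."""
    kwargs = dict(
        enable_block_reuse=reuse,
        enable_partial_reuse=partial,
        max_attention_window=[512, 512, 512, 512, 512, 32768],
    )
    if low_memory:
        # The low-memory variants cap the KV cache pool at 10% of free
        # GPU memory, so blocks must be evicted and re-allocated mid-run.
        kwargs["free_gpu_memory_fraction"] = 0.1
    return KvCacheConfig(**kwargs)

The skipped case is reuse=True, partial=True, low_memory=True: block reuse plus partial reuse under memory pressure is the one combination that currently drops accuracy (bug 5625990).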
l0_h100 test list

@@ -37,6 +37,10 @@ l0_h100:
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_without_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_without_reuse_low_memory_available
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_partial_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)
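
Each entry above is a pytest node ID, so any single case can be run on its own. A minimal invocation sketch, assuming the working directory is the integration-test root so the relative path resolves:

# Sketch only: run one newly listed case; the node ID is copied verbatim
# from the l0_h100 list above.
import pytest

pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct"
    "::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse",
])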