[https://nvbugs/5625990][chore] Add test coverage for current incapability of the KV cache manager (#8829)

Signed-off-by: eopXD <yuehtingc@nvidia.com>
Yueh-Ting (eop) Chen, 2025-11-04 16:35:45 +08:00
parent 67208f1512
commit bd1c9c0af4
2 changed files with 67 additions and 0 deletions

accuracy/test_llm_api_pytorch.py

@@ -1076,6 +1076,21 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=False,
            enable_partial_reuse=False,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
            free_gpu_memory_fraction=0.1,
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
@@ -1089,6 +1104,54 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_reuse_partial_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            enable_partial_reuse=True,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            enable_partial_reuse=False,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
            free_gpu_memory_fraction=0.1,
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    @pytest.mark.skip(
        reason=
        "Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
    )
    def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
            enable_block_reuse=True,
            enable_partial_reuse=True,
            max_attention_window=[512, 512, 512, 512, 512, 32768],
            free_gpu_memory_fraction=0.1,
        )
        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
            task = GSM8K(self.MODEL_NAME)
            task.evaluate(llm)
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)

    def test_auto_dtype_vswa_chunked_prefill_without_reuse(self):
        # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(

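All four new cases share one VSWA (variable sliding window attention) layout and differ only in the reuse and memory knobs. As a minimal sketch, assuming the TensorRT-LLM LLM API import path (tensorrt_llm.llmapi.KvCacheConfig) and a hypothetical helper name not present in this commit, the matrix the tests walk reduces to:

# Sketch only, not part of the diff: make_vswa_config is a hypothetical
# helper capturing the three knobs the new tests vary.
from tensorrt_llm.llmapi import KvCacheConfig

def make_vswa_config(reuse: bool, partial: bool,
                     low_memory: bool) -> KvCacheConfig:
    """Build the VSWA KV cache config shared by the tests above: five
    512-token window entries plus one 32768-token entry."""
    kwargs = dict(
        enable_block_reuse=reuse,
        enable_partial_reuse=partial,
        max_attention_window=[512, 512, 512, 512, 512, 32768],
    )
    if low_memory:
        # The low-memory variants cap the KV cache pool at 10% of free
        # GPU memory, so blocks must be evicted and re-allocated mid-run.
        kwargs["free_gpu_memory_fraction"] = 0.1
    return KvCacheConfig(**kwargs)

The skipped case is reuse=True, partial=True, low_memory=True: block reuse plus partial reuse under memory pressure is the one combination that currently drops accuracy (bug 5625990).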
l0_h100 test list

@@ -37,6 +37,10 @@ l0_h100:
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_without_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_without_reuse_low_memory_available
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_partial_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse
- accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False]
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True]
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)
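
Each entry above is a pytest node ID, so any single case can be run on its own. A minimal invocation sketch, assuming the working directory is the integration-test root so the relative path resolves:

# Sketch only: run one newly listed case; the node ID is copied verbatim
# from the l0_h100 list above.
import pytest

pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct"
    "::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse",
])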