Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[https://nvbugs/5625990][chore] Add test coverage for current incapability of the KV cache manager (#8829)
Signed-off-by: eopXD <yuehtingc@nvidia.com>
commit bd1c9c0af4
parent 67208f1512
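For context, here is a minimal standalone sketch of the VSWA (variable sliding window attention) KV cache setup the new tests exercise, assuming the public tensorrt_llm LLM API; the model name and prompt are illustrative placeholders, not values taken from this commit.

# Minimal sketch of the VSWA KV cache configuration the new tests exercise.
# Assumes the public tensorrt_llm LLM API; the model name and prompt are
# placeholders, not part of this commit.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Gemma-3-style layout: five sliding-window layers (512-token windows)
# interleaved with one global-attention layer (32768 tokens), matching the
# six-entry window list used throughout the diff below.
kv_cache_config = KvCacheConfig(
    enable_block_reuse=True,
    enable_partial_reuse=True,
    max_attention_window=[512, 512, 512, 512, 512, 32768],
    free_gpu_memory_fraction=0.1,  # small cache pool, forces early eviction
)

with LLM("google/gemma-3-1b-it", kv_cache_config=kv_cache_config) as llm:
    output = llm.generate("What is the capital of France?")
    print(output.outputs[0].text)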
@@ -1076,6 +1076,21 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_auto_dtype_vswa_without_reuse_low_memory_available(self):
+        # NOTE: Test with VSWA kv cache config.
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=False,
+            enable_partial_reuse=False,
+            max_attention_window=[512, 512, 512, 512, 512, 32768],
+            free_gpu_memory_fraction=0.1,
+        )
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
     def test_auto_dtype_vswa_reuse(self):
         # NOTE: Test with VSWA kv cache config.
         kv_cache_config = KvCacheConfig(
@@ -1089,6 +1104,54 @@ class TestGemma3_1BInstruct(LlmapiAccuracyTestHarness):
             task = MMLU(self.MODEL_NAME)
             task.evaluate(llm)
 
+    def test_auto_dtype_vswa_reuse_partial_reuse(self):
+        # NOTE: Test with VSWA kv cache config.
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=True,
+            enable_partial_reuse=True,
+            max_attention_window=[512, 512, 512, 512, 512, 32768],
+        )
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    def test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse(self):
+        # NOTE: Test with VSWA kv cache config.
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=True,
+            enable_partial_reuse=False,
+            max_attention_window=[512, 512, 512, 512, 512, 32768],
+            free_gpu_memory_fraction=0.1,
+        )
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @pytest.mark.skip(
+        reason=
+        "Currently failing due to accuracy drop, https://nvbugspro.nvidia.com/bug/5625990"
+    )
+    def test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse(self):
+        # NOTE: Test with VSWA kv cache config.
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=True,
+            enable_partial_reuse=True,
+            max_attention_window=[512, 512, 512, 512, 512, 32768],
+            free_gpu_memory_fraction=0.1,
+        )
+
+        with LLM(self.MODEL_PATH, kv_cache_config=kv_cache_config) as llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+
     def test_auto_dtype_vswa_chunked_prefill_without_reuse(self):
         # NOTE: Test with VSWA kv cache config.
        kv_cache_config = KvCacheConfig(
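Taken together, the new tests sweep the block-reuse / partial-reuse / memory-pressure matrix for VSWA. A compact summary of the configurations added above, with the skipped combination marked:

# Reuse/memory matrix covered by the new VSWA tests (values from the diff above).
# Tuple: (enable_block_reuse, enable_partial_reuse, free_gpu_memory_fraction)
VSWA_TEST_MATRIX = {
    "without_reuse_low_memory_available": (False, False, 0.1),
    "reuse_partial_reuse": (True, True, None),  # default memory fraction
    "reuse_low_memory_available_no_partial_reuse": (True, False, 0.1),
    # Skipped: accuracy drop under partial reuse + low memory (nvbug 5625990).
    "reuse_low_memory_available_partial_reuse": (True, True, 0.1),
}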
@@ -37,6 +37,10 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_without_reuse
   - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_chunked_prefill_reuse
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_without_reuse_low_memory_available
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_partial_reuse
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_no_partial_reuse
+  - accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16[attn_backend=TRTLLM-torch_compile=True]
   - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM] TIMEOUT (90)
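Once the underlying bug is fixed and the skip marker removed, the failing combination can be run on its own. A hypothetical invocation, assuming the repo-relative path implied by the test list above and a matching working directory:

# Hypothetical reproduction entry point; the file path is inferred from the
# test list above and depends on the directory pytest is launched from.
import pytest

pytest.main([
    "accuracy/test_llm_api_pytorch.py"
    "::TestGemma3_1BInstruct"
    "::test_auto_dtype_vswa_reuse_low_memory_available_partial_reuse",
    "-q",
])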