[nvbug 5300551] test: increase block count in eviction test (#5465)

Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com>
Zheng Duan 2025-07-01 10:48:25 +08:00 committed by GitHub
parent 9fe1dd6be1
commit 1824c44004
2 changed files with 7 additions and 3 deletions

@@ -401,7 +401,7 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
         # send a dummy request for initialization
         dummy_request = {
             "model": self.model_name,
-            "prompt": [3] * 200,
+            "prompt": [3] * 2000,
             "max_tokens": 1,
             "ignore_eos": True,
             "temperature": 0.0,
@@ -426,15 +426,16 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
                     prompt=dummy_request["prompt"])
         server, info = await self.gen_router.get_next_server(openai_request)
         first_match = info["matches"][0]
+        logger.info(f"Matched blocks: {first_match}")
         assert first_match > 0
         await self.gen_router.finish_request(openai_request)
         # flood requests until eviction
-        batch_size = 8
+        batch_size = 64
         blocks_per_request = 32
         requests = [copy.copy(dummy_request) for _ in range(batch_size)]
         has_evicted = False
-        for i in range(0, block_pool_size // blocks_per_request + 10,
+        for i in range(0, block_pool_size // blocks_per_request * 2,
                        batch_size):
             logger.info(f"Flooding request {i} ~ {i + batch_size - 1}")
             prompt_len = self.gen_router._tokens_per_block * blocks_per_request - 10
@@ -454,6 +455,8 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
         # the dummy request's reusable length decreases after eviction
         server, info = await self.gen_router.get_next_server(openai_request)
+        logger.info(
+            f"Matched blocks: {first_match} -> {info['matches'][0]}")
         assert info["matches"][0] < first_match
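For reference, a minimal back-of-the-envelope sketch (plain Python, not part of the diff) of why the larger constants should force eviction. The tokens_per_block and block_pool_size values below are hypothetical placeholders; in the test they come from the router and the worker's KV cache configuration, and the estimate assumes the flooding prompts share little prefix, so each request occupies fresh blocks.

# Illustrative sizing check for the flooding loop above.
# NOTE: tokens_per_block and block_pool_size are assumed placeholder values,
# not the ones the test actually reads from the router / worker config.
tokens_per_block = 32
block_pool_size = 2048

blocks_per_request = 32
batch_size = 64

# Each flooding prompt is sized just under a whole number of blocks, so one
# request occupies roughly blocks_per_request KV cache blocks.
prompt_len = tokens_per_block * blocks_per_request - 10

# The loop bound is about twice the number of requests needed to fill the
# pool, so earlier blocks (including the dummy request's) must get evicted.
loop_bound = block_pool_size // blocks_per_request * 2
num_batches = -(-loop_bound // batch_size)  # ceiling division
total_requests = num_batches * batch_size
blocks_requested = total_requests * blocks_per_request

print(f"prompt_len = {prompt_len} tokens (~{blocks_per_request} blocks per request)")
print(f"{num_batches} batches x {batch_size} requests = {total_requests} flooding requests")
print(f"~{blocks_requested} blocks requested vs. a pool of {block_pool_size} "
      f"({blocks_requested / block_pool_size:.1f}x), so eviction is expected")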

@@ -50,6 +50,7 @@ l0_b200:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
+- disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
 - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
 - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
 - test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]