mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[nvbug 5300551] test: increase block count in eviction test (#5465)
Signed-off-by: zhengd-nv <200704041+zhengd-nv@users.noreply.github.com>
This commit is contained in:
parent
9fe1dd6be1
commit
1824c44004
@@ -401,7 +401,7 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
|
||||
# send a dummy request for initialization
|
||||
dummy_request = {
|
||||
"model": self.model_name,
|
||||
- "prompt": [3] * 200,
|
||||
+ "prompt": [3] * 2000,
|
||||
"max_tokens": 1,
|
||||
"ignore_eos": True,
|
||||
"temperature": 0.0,
|
||||
@@ -426,15 +426,16 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
|
||||
prompt=dummy_request["prompt"])
|
||||
server, info = await self.gen_router.get_next_server(openai_request)
|
||||
first_match = info["matches"][0]
|
||||
+ logger.info(f"Matched blocks: {first_match}")
|
||||
assert first_match > 0
|
||||
await self.gen_router.finish_request(openai_request)
|
||||
|
||||
# flood requests until eviction
|
||||
- batch_size = 8
|
||||
+ batch_size = 64
|
||||
blocks_per_request = 32
|
||||
requests = [copy.copy(dummy_request) for _ in range(batch_size)]
|
||||
has_evicted = False
|
||||
- for i in range(0, block_pool_size // blocks_per_request + 10,
|
||||
+ for i in range(0, block_pool_size // blocks_per_request * 2,
|
||||
batch_size):
|
||||
logger.info(f"Flooding request {i} ~ {i + batch_size - 1}")
|
||||
prompt_len = self.gen_router._tokens_per_block * blocks_per_request - 10
|
||||
@@ -454,6 +455,8 @@ class KvCacheAwareRouterTester(BasicWorkerTester):
|
||||
|
||||
# the dummy request's reusable length decreases after eviction
|
||||
server, info = await self.gen_router.get_next_server(openai_request)
|
||||
+ logger.info(
|
||||
+ f"Matched blocks: {first_match} -> {info['matches'][0]}")
|
||||
assert info["matches"][0] < first_match
|
||||
|
||||
|
||||
|
||||
@@ -50,6 +50,7 @@ l0_b200:
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_no_kv_cache_reuse[quant_dtype=nvfp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_trtllm]
|
||||
+ - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
|
||||
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
|
||||
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
|
||||
- test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
|
||||
|
||||
Loading…
Reference in New Issue
Block a user