mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-19 01:05:12 +08:00
[https://nvbugs/5688388][fix] fix: Reducing num request in disagg test to speed up (#9598)
Signed-off-by: Patrice Castonguay <55748270+pcastonguay@users.noreply.github.com>
This commit is contained in:
parent
a560ba5546
commit
3991aa9c72
@@ -351,8 +351,8 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
     max_tokens = 25

     requests = []
-    # Send 256 requests to make sure the context worker is saturated
-    for _ in range(256):
+    # Send 32 requests to make sure the context worker is saturated
+    for _ in range(32):
         requests.append(
             (prompt, SamplingParams(max_tokens=1, ignore_eos=True),
              DisaggregatedParams(request_type="context_only")))
Loading…
Reference in New Issue
Block a user