Mirror of https://github.com/vllm-project/vllm.git (synced 2026-06-06 00:16:14 +00:00)
fix: pad dummy run query_start_loc (#44603)
Signed-off-by: UranusSeven <109661872+UranusSeven@users.noreply.github.com>
This commit is contained in:
@@ -5787,6 +5787,9 @@ class GPUModelRunner(
     num_scheduled_tokens, self.query_pos.np
 )
 self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens
+self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1].fill(
+    cum_num_tokens[-1]
+)
 self.query_start_loc.copy_to_gpu()
 
 # Sync block table CPU->GPU so cleared rows from
Reference in New Issue
Block a user