diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 261995f4b01..8c726be337e 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -5787,6 +5787,9 @@ class GPUModelRunner( num_scheduled_tokens, self.query_pos.np ) self.query_start_loc.np[1 : num_reqs + 1] = cum_num_tokens + self.query_start_loc.np[num_reqs + 1 : num_reqs_padded + 1].fill( + cum_num_tokens[-1] + ) self.query_start_loc.copy_to_gpu() # Sync block table CPU->GPU so cleared rows from