mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
[Bugfix] MiniCPM-V-4.6 video inference crash: placeholder count mismatches visual embedding count (#44509)
Signed-off-by: tc-mb <tianchi_cai@icloud.com>
This commit is contained in:
@@ -5,7 +5,9 @@
|
||||
from collections.abc import Iterable, Mapping
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
from PIL import Image as PILImage
|
||||
from torch import nn
|
||||
from transformers import MiniCPMV4_6Config
|
||||
|
||||
@@ -30,7 +32,7 @@ from vllm.multimodal.inputs import (
|
||||
MultiModalFieldConfig,
|
||||
NestedTensors,
|
||||
)
|
||||
from vllm.multimodal.parse import ImageProcessorItems, VideoProcessorItems
|
||||
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, VideoProcessorItems
|
||||
from vllm.multimodal.processing.processor import (
|
||||
PromptReplacement,
|
||||
PromptUpdateDetails,
|
||||
@@ -239,12 +241,34 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
|
||||
|
||||
per_video_pixel_values: list[torch.Tensor] = []
|
||||
per_video_tgt_sizes: list[torch.Tensor] = []
|
||||
per_video_image_sizes: list[torch.Tensor] = []
|
||||
|
||||
for video in parsed_videos:
|
||||
# video is iterable of frames (PIL Image or numpy array).
|
||||
all_slices: list[torch.Tensor] = []
|
||||
ts_list: list[torch.Tensor] = []
|
||||
frame_sizes: list[torch.Tensor] = []
|
||||
for frame in video:
|
||||
# Record per-frame (W, H) for video_image_sizes so that
|
||||
# get_video_prompt_texts can consume a consistent frame size.
|
||||
if isinstance(frame, PILImage.Image):
|
||||
w, h = frame.size
|
||||
elif isinstance(frame, np.ndarray):
|
||||
if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
|
||||
# HWC (e.g. from np.array(PIL.Image))
|
||||
h, w = frame.shape[0], frame.shape[1]
|
||||
else:
|
||||
# CHW
|
||||
_, h, w = frame.shape
|
||||
elif isinstance(frame, torch.Tensor):
|
||||
if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
|
||||
h, w = frame.shape[0], frame.shape[1]
|
||||
else:
|
||||
_, h, w = frame.shape
|
||||
else:
|
||||
raise TypeError(f"Unsupported frame type: {type(frame)}")
|
||||
frame_sizes.append(torch.tensor([w, h], dtype=torch.long, device="cpu"))
|
||||
|
||||
ip_out = image_processor([frame], **video_mm_kwargs)
|
||||
pv = ip_out["pixel_values"] # (1, C, P, sum_W)
|
||||
ts = ip_out["target_sizes"] # (n_slices, 2)
|
||||
@@ -275,6 +299,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
|
||||
|
||||
per_video_pixel_values.append(out)
|
||||
per_video_tgt_sizes.append(torch.cat(ts_list, dim=0))
|
||||
per_video_image_sizes.append(torch.stack(frame_sizes))
|
||||
|
||||
if not per_video_pixel_values:
|
||||
return {}
|
||||
@@ -282,6 +307,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
|
||||
return {
|
||||
"video_pixel_values": per_video_pixel_values,
|
||||
"video_tgt_sizes": per_video_tgt_sizes,
|
||||
"video_image_sizes": per_video_image_sizes,
|
||||
}
|
||||
|
||||
def _get_prompt_updates(
|
||||
@@ -327,6 +353,31 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
|
||||
)
|
||||
|
||||
def get_video_replacement(item_idx: int):
|
||||
# Prefer video_image_sizes from processed data so that the
|
||||
# placeholder count is driven by the same frame sizes that the
|
||||
# vision tower will actually consume.
|
||||
video_mm_kwargs = out_mm_kwargs.get("video")
|
||||
if video_mm_kwargs is not None and item_idx < len(video_mm_kwargs):
|
||||
video_item = video_mm_kwargs[item_idx]
|
||||
image_sizes_elem = video_item.get("video_image_sizes")
|
||||
if image_sizes_elem is not None and image_sizes_elem.data is not None:
|
||||
# image_sizes_elem.data: (num_frames, 2) – each row is [W, H]
|
||||
image_sizes = image_sizes_elem.data
|
||||
num_frames = image_sizes.shape[0]
|
||||
frame_size = ImageSize(
|
||||
width=int(image_sizes[0, 0].item()),
|
||||
height=int(image_sizes[0, 1].item()),
|
||||
)
|
||||
return PromptUpdateDetails.select_text(
|
||||
self.get_video_prompt_texts(
|
||||
frame_size,
|
||||
num_frames,
|
||||
downsample_mode=ds_mode,
|
||||
video_idx=item_idx,
|
||||
),
|
||||
video_embed_text,
|
||||
)
|
||||
|
||||
videos = mm_items.get_items(
|
||||
"video",
|
||||
(MiniCPMVVideoEmbeddingItems, VideoProcessorItems),
|
||||
|
||||
@@ -378,7 +378,14 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem | None]):
|
||||
if isinstance(image, PILImage.Image):
|
||||
return ImageSize(*image.size)
|
||||
if isinstance(image, (np.ndarray, torch.Tensor)):
|
||||
_, h, w = image.shape
|
||||
if image.ndim == 3 and image.shape[-1] in (1, 3, 4):
|
||||
# HWC format (e.g. from np.array(PIL.Image) via
|
||||
# _get_video_with_metadata). PIL images are always
|
||||
# channels-last.
|
||||
h, w = image.shape[0], image.shape[1]
|
||||
else:
|
||||
# CHW format (standard PyTorch / numpy convention).
|
||||
_, h, w = image.shape
|
||||
return ImageSize(w, h)
|
||||
|
||||
assert_never(image)
|
||||
|
||||
Reference in New Issue
Block a user