[Bugfix] MiniCPM-V-4.6 video inference crash: placeholder count mismatches visual embedding count (#44509)

Signed-off-by: tc-mb <tianchi_cai@icloud.com>
This commit is contained in:
tc-mb
2026-06-04 23:22:30 +08:00
committed by GitHub
parent b21443e23c
commit 3dbb4e0ace
2 changed files with 60 additions and 2 deletions
+52 -1
View File
@@ -5,7 +5,9 @@
from collections.abc import Iterable, Mapping
from typing import Any
import numpy as np
import torch
from PIL import Image as PILImage
from torch import nn
from transformers import MiniCPMV4_6Config
@@ -30,7 +32,7 @@ from vllm.multimodal.inputs import (
MultiModalFieldConfig,
NestedTensors,
)
from vllm.multimodal.parse import ImageProcessorItems, VideoProcessorItems
from vllm.multimodal.parse import ImageProcessorItems, ImageSize, VideoProcessorItems
from vllm.multimodal.processing.processor import (
PromptReplacement,
PromptUpdateDetails,
@@ -239,12 +241,34 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
per_video_pixel_values: list[torch.Tensor] = []
per_video_tgt_sizes: list[torch.Tensor] = []
per_video_image_sizes: list[torch.Tensor] = []
for video in parsed_videos:
# video is iterable of frames (PIL Image or numpy array).
all_slices: list[torch.Tensor] = []
ts_list: list[torch.Tensor] = []
frame_sizes: list[torch.Tensor] = []
for frame in video:
# Record per-frame (W, H) for video_image_sizes so that
# get_video_prompt_texts can consume a consistent frame size.
if isinstance(frame, PILImage.Image):
w, h = frame.size
elif isinstance(frame, np.ndarray):
if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
# HWC (e.g. from np.array(PIL.Image))
h, w = frame.shape[0], frame.shape[1]
else:
# CHW
_, h, w = frame.shape
elif isinstance(frame, torch.Tensor):
if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
h, w = frame.shape[0], frame.shape[1]
else:
_, h, w = frame.shape
else:
raise TypeError(f"Unsupported frame type: {type(frame)}")
frame_sizes.append(torch.tensor([w, h], dtype=torch.long, device="cpu"))
ip_out = image_processor([frame], **video_mm_kwargs)
pv = ip_out["pixel_values"] # (1, C, P, sum_W)
ts = ip_out["target_sizes"] # (n_slices, 2)
@@ -275,6 +299,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
per_video_pixel_values.append(out)
per_video_tgt_sizes.append(torch.cat(ts_list, dim=0))
per_video_image_sizes.append(torch.stack(frame_sizes))
if not per_video_pixel_values:
return {}
@@ -282,6 +307,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
return {
"video_pixel_values": per_video_pixel_values,
"video_tgt_sizes": per_video_tgt_sizes,
"video_image_sizes": per_video_image_sizes,
}
def _get_prompt_updates(
@@ -327,6 +353,31 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
)
def get_video_replacement(item_idx: int):
# Prefer video_image_sizes from processed data so that the
# placeholder count is driven by the same frame sizes that the
# vision tower will actually consume.
video_mm_kwargs = out_mm_kwargs.get("video")
if video_mm_kwargs is not None and item_idx < len(video_mm_kwargs):
video_item = video_mm_kwargs[item_idx]
image_sizes_elem = video_item.get("video_image_sizes")
if image_sizes_elem is not None and image_sizes_elem.data is not None:
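# image_sizes_elem.data has shape (num_frames, 2); each row is [W, H].
# NOTE(review): only row 0 is read below, so every frame of a video is
# presumed to share the first frame's size - confirm.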
image_sizes = image_sizes_elem.data
num_frames = image_sizes.shape[0]
frame_size = ImageSize(
width=int(image_sizes[0, 0].item()),
height=int(image_sizes[0, 1].item()),
)
return PromptUpdateDetails.select_text(
self.get_video_prompt_texts(
frame_size,
num_frames,
downsample_mode=ds_mode,
video_idx=item_idx,
),
video_embed_text,
)
videos = mm_items.get_items(
"video",
(MiniCPMVVideoEmbeddingItems, VideoProcessorItems),
+7
View File
@@ -378,6 +378,13 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem | None]):
if isinstance(image, PILImage.Image):
return ImageSize(*image.size)
if isinstance(image, (np.ndarray, torch.Tensor)):
if image.ndim == 3 and image.shape[-1] in (1, 3, 4):
# HWC format (e.g. from np.array(PIL.Image) via
# _get_video_with_metadata). Arrays converted from multi-band
# PIL images are always channels-last.
h, w = image.shape[0], image.shape[1]
else:
# CHW format (standard PyTorch / numpy convention).
_, h, w = image.shape
return ImageSize(w, h)