From 3dbb4e0acef59930bef53b5b8e15ed869cf0a499 Mon Sep 17 00:00:00 2001 From: tc-mb <157115220+tc-mb@users.noreply.github.com> Date: Thu, 4 Jun 2026 23:22:30 +0800 Subject: [PATCH] [Bugfix] MiniCPM-V-4.6 video inference crash: placeholder count mismatches visual embedding count (#44509) Signed-off-by: tc-mb --- vllm/model_executor/models/minicpmv4_6.py | 53 ++++++++++++++++++++++- vllm/multimodal/parse.py | 9 +++- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/vllm/model_executor/models/minicpmv4_6.py b/vllm/model_executor/models/minicpmv4_6.py index d2d465b7e5a..c49af904769 100644 --- a/vllm/model_executor/models/minicpmv4_6.py +++ b/vllm/model_executor/models/minicpmv4_6.py @@ -5,7 +5,9 @@ from collections.abc import Iterable, Mapping from typing import Any +import numpy as np import torch +from PIL import Image as PILImage from torch import nn from transformers import MiniCPMV4_6Config @@ -30,7 +32,7 @@ from vllm.multimodal.inputs import ( MultiModalFieldConfig, NestedTensors, ) -from vllm.multimodal.parse import ImageProcessorItems, VideoProcessorItems +from vllm.multimodal.parse import ImageProcessorItems, ImageSize, VideoProcessorItems from vllm.multimodal.processing.processor import ( PromptReplacement, PromptUpdateDetails, @@ -239,12 +241,34 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor): per_video_pixel_values: list[torch.Tensor] = [] per_video_tgt_sizes: list[torch.Tensor] = [] + per_video_image_sizes: list[torch.Tensor] = [] for video in parsed_videos: # video is iterable of frames (PIL Image or numpy array). all_slices: list[torch.Tensor] = [] ts_list: list[torch.Tensor] = [] + frame_sizes: list[torch.Tensor] = [] for frame in video: + # Record per-frame (W, H) for video_image_sizes so that + # get_video_prompt_texts can consume a consistent frame size. + if isinstance(frame, PILImage.Image): + w, h = frame.size + elif isinstance(frame, np.ndarray): + if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4): + # HWC (e.g. from np.array(PIL.Image)) + h, w = frame.shape[0], frame.shape[1] + else: + # CHW + _, h, w = frame.shape + elif isinstance(frame, torch.Tensor): + if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4): + h, w = frame.shape[0], frame.shape[1] + else: + _, h, w = frame.shape + else: + raise TypeError(f"Unsupported frame type: {type(frame)}") + frame_sizes.append(torch.tensor([w, h], dtype=torch.long, device="cpu")) + ip_out = image_processor([frame], **video_mm_kwargs) pv = ip_out["pixel_values"] # (1, C, P, sum_W) ts = ip_out["target_sizes"] # (n_slices, 2) @@ -275,6 +299,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor): per_video_pixel_values.append(out) per_video_tgt_sizes.append(torch.cat(ts_list, dim=0)) + per_video_image_sizes.append(torch.stack(frame_sizes)) if not per_video_pixel_values: return {} @@ -282,6 +307,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor): return { "video_pixel_values": per_video_pixel_values, "video_tgt_sizes": per_video_tgt_sizes, + "video_image_sizes": per_video_image_sizes, } def _get_prompt_updates( @@ -327,6 +353,31 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor): ) def get_video_replacement(item_idx: int): + # Prefer video_image_sizes from processed data so that the + # placeholder count is driven by the same frame sizes that the + # vision tower will actually consume. + video_mm_kwargs = out_mm_kwargs.get("video") + if video_mm_kwargs is not None and item_idx < len(video_mm_kwargs): + video_item = video_mm_kwargs[item_idx] + image_sizes_elem = video_item.get("video_image_sizes") + if image_sizes_elem is not None and image_sizes_elem.data is not None: + # image_sizes_elem.data: (num_frames, 2) – each row is [W, H] + image_sizes = image_sizes_elem.data + num_frames = image_sizes.shape[0] + frame_size = ImageSize( + width=int(image_sizes[0, 0].item()), + height=int(image_sizes[0, 1].item()), + ) + return PromptUpdateDetails.select_text( + self.get_video_prompt_texts( + frame_size, + num_frames, + downsample_mode=ds_mode, + video_idx=item_idx, + ), + video_embed_text, + ) + videos = mm_items.get_items( "video", (MiniCPMVVideoEmbeddingItems, VideoProcessorItems), diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py index f2187effab0..cdedd194227 100644 --- a/vllm/multimodal/parse.py +++ b/vllm/multimodal/parse.py @@ -378,7 +378,14 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem | None]): if isinstance(image, PILImage.Image): return ImageSize(*image.size) if isinstance(image, (np.ndarray, torch.Tensor)): - _, h, w = image.shape + if image.ndim == 3 and image.shape[-1] in (1, 3, 4): + # HWC format (e.g. from np.array(PIL.Image) via + # _get_video_with_metadata). PIL images are always + # channels-last. + h, w = image.shape[0], image.shape[1] + else: + # CHW format (standard PyTorch / numpy convention). + _, h, w = image.shape return ImageSize(w, h) assert_never(image)