[Bugfix] MiniCPM-V-4.6 video inference crash: placeholder count mismatches visual embedding count (#44509)

Signed-off-by: tc-mb <tianchi_cai@icloud.com>
2026-06-06 00:16:14 +00:00 · 2026-06-04 23:22:30 +08:00
parent b21443e23c
commit 3dbb4e0ace
2 changed files with 60 additions and 2 deletions
@@ -5,7 +5,9 @@
 from collections.abc import Iterable, Mapping
 from typing import Any

+import numpy as np
 import torch
+from PIL import Image as PILImage
 from torch import nn
 from transformers import MiniCPMV4_6Config

@@ -30,7 +32,7 @@ from vllm.multimodal.inputs import (
    MultiModalFieldConfig,
    NestedTensors,
 )
-from vllm.multimodal.parse import ImageProcessorItems, VideoProcessorItems
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, VideoProcessorItems
 from vllm.multimodal.processing.processor import (
    PromptReplacement,
    PromptUpdateDetails,
@@ -239,12 +241,34 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):

        per_video_pixel_values: list[torch.Tensor] = []
        per_video_tgt_sizes: list[torch.Tensor] = []
+        per_video_image_sizes: list[torch.Tensor] = []

        for video in parsed_videos:
            # video is iterable of frames (PIL Image or numpy array).
            all_slices: list[torch.Tensor] = []
            ts_list: list[torch.Tensor] = []
+            frame_sizes: list[torch.Tensor] = []
            for frame in video:
+                # Record per-frame (W, H) for video_image_sizes so that
+                # get_video_prompt_texts can consume a consistent frame size.
+                if isinstance(frame, PILImage.Image):
+                    w, h = frame.size
+                elif isinstance(frame, np.ndarray):
+                    if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
+                        # HWC (e.g. from np.array(PIL.Image))
+                        h, w = frame.shape[0], frame.shape[1]
+                    else:
+                        # CHW
+                        _, h, w = frame.shape
+                elif isinstance(frame, torch.Tensor):
+                    if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
+                        h, w = frame.shape[0], frame.shape[1]
+                    else:
+                        _, h, w = frame.shape
+                else:
+                    raise TypeError(f"Unsupported frame type: {type(frame)}")
+                frame_sizes.append(torch.tensor([w, h], dtype=torch.long, device="cpu"))
+
                ip_out = image_processor([frame], **video_mm_kwargs)
                pv = ip_out["pixel_values"]  # (1, C, P, sum_W)
                ts = ip_out["target_sizes"]  # (n_slices, 2)
@@ -275,6 +299,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):

            per_video_pixel_values.append(out)
            per_video_tgt_sizes.append(torch.cat(ts_list, dim=0))
+            per_video_image_sizes.append(torch.stack(frame_sizes))

        if not per_video_pixel_values:
            return {}
@@ -282,6 +307,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
        return {
            "video_pixel_values": per_video_pixel_values,
            "video_tgt_sizes": per_video_tgt_sizes,
+            "video_image_sizes": per_video_image_sizes,
        }

    def _get_prompt_updates(
@@ -327,6 +353,31 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
            )

        def get_video_replacement(item_idx: int):
+            # Prefer video_image_sizes from processed data so that the
+            # placeholder count is driven by the same frame sizes that the
+            # vision tower will actually consume.
+            video_mm_kwargs = out_mm_kwargs.get("video")
+            if video_mm_kwargs is not None and item_idx < len(video_mm_kwargs):
+                video_item = video_mm_kwargs[item_idx]
+                image_sizes_elem = video_item.get("video_image_sizes")
+                if image_sizes_elem is not None and image_sizes_elem.data is not None:
+                    # image_sizes_elem.data: (num_frames, 2) – each row is [W, H]
+                    image_sizes = image_sizes_elem.data
+                    num_frames = image_sizes.shape[0]
+                    frame_size = ImageSize(
+                        width=int(image_sizes[0, 0].item()),
+                        height=int(image_sizes[0, 1].item()),
+                    )
+                    return PromptUpdateDetails.select_text(
+                        self.get_video_prompt_texts(
+                            frame_size,
+                            num_frames,
+                            downsample_mode=ds_mode,
+                            video_idx=item_idx,
+                        ),
+                        video_embed_text,
+                    )
+
            videos = mm_items.get_items(
                "video",
                (MiniCPMVVideoEmbeddingItems, VideoProcessorItems),
@@ -378,7 +378,14 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem | None]):
        if isinstance(image, PILImage.Image):
            return ImageSize(*image.size)
        if isinstance(image, (np.ndarray, torch.Tensor)):
-            _, h, w = image.shape
+            if image.ndim == 3 and image.shape[-1] in (1, 3, 4):
+                # HWC format (e.g. from np.array(PIL.Image) via
+                # _get_video_with_metadata); arrays converted from PIL
+                # images are always channels-last.
+                h, w = image.shape[0], image.shape[1]
+            else:
+                # CHW format (standard PyTorch convention).
+                _, h, w = image.shape
            return ImageSize(w, h)

        assert_never(image)