From 3dbb4e0acef59930bef53b5b8e15ed869cf0a499 Mon Sep 17 00:00:00 2001
From: tc-mb <157115220+tc-mb@users.noreply.github.com>
Date: Thu, 4 Jun 2026 23:22:30 +0800
Subject: [PATCH] [Bugfix] MiniCPM-V-4.6 video inference crash: placeholder
 count mismatches visual embedding count (#44509)

Signed-off-by: tc-mb <tianchi_cai@icloud.com>
---
 vllm/model_executor/models/minicpmv4_6.py | 53 ++++++++++++++++++++++-
 vllm/multimodal/parse.py                  |  9 +++-
 2 files changed, 60 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/minicpmv4_6.py b/vllm/model_executor/models/minicpmv4_6.py
index d2d465b7e5a..c49af904769 100644
--- a/vllm/model_executor/models/minicpmv4_6.py
+++ b/vllm/model_executor/models/minicpmv4_6.py
@@ -5,7 +5,9 @@
 from collections.abc import Iterable, Mapping
 from typing import Any
 
+import numpy as np
 import torch
+from PIL import Image as PILImage
 from torch import nn
 from transformers import MiniCPMV4_6Config
 
@@ -30,7 +32,7 @@ from vllm.multimodal.inputs import (
     MultiModalFieldConfig,
     NestedTensors,
 )
-from vllm.multimodal.parse import ImageProcessorItems, VideoProcessorItems
+from vllm.multimodal.parse import ImageProcessorItems, ImageSize, VideoProcessorItems
 from vllm.multimodal.processing.processor import (
     PromptReplacement,
     PromptUpdateDetails,
@@ -239,12 +241,34 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
 
         per_video_pixel_values: list[torch.Tensor] = []
         per_video_tgt_sizes: list[torch.Tensor] = []
+        per_video_image_sizes: list[torch.Tensor] = []
 
         for video in parsed_videos:
             # video is iterable of frames (PIL Image or numpy array).
             all_slices: list[torch.Tensor] = []
             ts_list: list[torch.Tensor] = []
+            frame_sizes: list[torch.Tensor] = []
             for frame in video:
+                # Record per-frame (W, H) for video_image_sizes so that
+                # get_video_prompt_texts can consume a consistent frame size.
+                if isinstance(frame, PILImage.Image):
+                    w, h = frame.size
+                elif isinstance(frame, np.ndarray):
+                    if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
+                        # HWC (e.g. from np.array(PIL.Image))
+                        h, w = frame.shape[0], frame.shape[1]
+                    else:
+                        # CHW
+                        _, h, w = frame.shape
+                elif isinstance(frame, torch.Tensor):
+                    if frame.ndim == 3 and frame.shape[-1] in (1, 3, 4):
+                        h, w = frame.shape[0], frame.shape[1]
+                    else:
+                        _, h, w = frame.shape
+                else:
+                    raise TypeError(f"Unsupported frame type: {type(frame)}")
+                frame_sizes.append(torch.tensor([w, h], dtype=torch.long, device="cpu"))
+
                 ip_out = image_processor([frame], **video_mm_kwargs)
                 pv = ip_out["pixel_values"]  # (1, C, P, sum_W)
                 ts = ip_out["target_sizes"]  # (n_slices, 2)
@@ -275,6 +299,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
 
             per_video_pixel_values.append(out)
             per_video_tgt_sizes.append(torch.cat(ts_list, dim=0))
+            per_video_image_sizes.append(torch.stack(frame_sizes))
 
         if not per_video_pixel_values:
             return {}
@@ -282,6 +307,7 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
         return {
             "video_pixel_values": per_video_pixel_values,
             "video_tgt_sizes": per_video_tgt_sizes,
+            "video_image_sizes": per_video_image_sizes,
         }
 
     def _get_prompt_updates(
@@ -327,6 +353,31 @@ class MiniCPMV4_6MultiModalProcessor(MiniCPMVMultiModalProcessor):
             )
 
         def get_video_replacement(item_idx: int):
+            # Prefer video_image_sizes from processed data so that the
+            # placeholder count is driven by the same frame sizes that the
+            # vision tower will actually consume.
+            video_mm_kwargs = out_mm_kwargs.get("video")
+            if video_mm_kwargs is not None and item_idx < len(video_mm_kwargs):
+                video_item = video_mm_kwargs[item_idx]
+                image_sizes_elem = video_item.get("video_image_sizes")
+                if image_sizes_elem is not None and image_sizes_elem.data is not None:
+                    # image_sizes_elem.data: (num_frames, 2) – each row is [W, H]
+                    image_sizes = image_sizes_elem.data
+                    num_frames = image_sizes.shape[0]
+                    frame_size = ImageSize(
+                        width=int(image_sizes[0, 0].item()),
+                        height=int(image_sizes[0, 1].item()),
+                    )
+                    return PromptUpdateDetails.select_text(
+                        self.get_video_prompt_texts(
+                            frame_size,
+                            num_frames,
+                            downsample_mode=ds_mode,
+                            video_idx=item_idx,
+                        ),
+                        video_embed_text,
+                    )
+
             videos = mm_items.get_items(
                 "video",
                 (MiniCPMVVideoEmbeddingItems, VideoProcessorItems),
diff --git a/vllm/multimodal/parse.py b/vllm/multimodal/parse.py
index f2187effab0..cdedd194227 100644
--- a/vllm/multimodal/parse.py
+++ b/vllm/multimodal/parse.py
@@ -378,7 +378,14 @@ class VideoProcessorItems(ProcessorBatchItems[HfVideoItem | None]):
         if isinstance(image, PILImage.Image):
             return ImageSize(*image.size)
         if isinstance(image, (np.ndarray, torch.Tensor)):
-            _, h, w = image.shape
+            if image.ndim == 3 and image.shape[-1] in (1, 3, 4):
+                # HWC format (e.g. from np.array(PIL.Image) via
+                # _get_video_with_metadata).  Arrays converted from PIL
+                # images are channels-last.
+                h, w = image.shape[0], image.shape[1]
+            else:
+                # CHW format (standard PyTorch / numpy convention).
+                _, h, w = image.shape
             return ImageSize(w, h)
 
         assert_never(image)