[Bugfix][Multimodal] PyAV video backend returns keyframes labeled as targets (#42586)

Signed-off-by: Ranran <hzz5361@psu.edu>
2026-06-06 00:16:14 +00:00 · 2026-05-14 11:56:59 -04:00
parent a7737cb4f3
commit f3d5360591
3 changed files with 100 additions and 6 deletions
@@ -13,7 +13,7 @@ from vllm.multimodal.video import (
    VideoLoader,
 )

-from .utils import create_video_from_image
+from .utils import create_long_gop_video, create_video_from_image

 pytestmark = pytest.mark.cpu_test

@@ -364,6 +364,49 @@ def test_pyav_dynamic_backend_loads_frames(
        assert metadata["video_backend"] == "pyav_dynamic"


+def test_pyav_backend_returns_target_frames_not_keyframes():
+    """Regression test: PyAV must decode forward past the seek keyframe.
+
+    container.seek() snaps backward to the nearest keyframe. With a long GOP
+    (here: one keyframe at frame 0), a decoder that does not advance forward
+    to the target PTS collapses every sampled slot onto the keyframe. This
+    test encodes a per-frame marker on the green channel and verifies the
+    returned frames are distinct, ordered, and match the requested indices.
+    """
+    num_frames = 50  # total frames encoded into the synthetic clip
+    num_sampled = 4  # number of frames requested from the loader
+    height, width = 64, 64  # frame size used for encoding and for the shape check
+
+    video_bytes = create_long_gop_video(
+        num_frames=num_frames, width=width, height=height
+    )
+
+    loader = VIDEO_LOADER_REGISTRY.load("opencv")  # NOTE(review): "opencv" key with backend="pyav" below; presumably this loader dispatches on the kwarg; confirm
+    frames, metadata = loader.load_bytes(
+        video_bytes, num_frames=num_sampled, backend="pyav"
+    )
+    assert frames.shape == (num_sampled, height, width, 3)  # one 3-channel frame per sampled slot
+
+    requested = list(metadata["frames_indices"])  # indices the loader reports for the returned frames
+    assert len(requested) == num_sampled
+
+    actual = [int(f[height // 2, width // 2, 1]) for f in frames]  # green value at the centre pixel = per-frame marker
+
+    assert len(set(actual)) == num_sampled, (  # a collapse onto one keyframe shows up as duplicate markers
+        f"PyAV returned only {len(set(actual))} distinct frames for "
+        f"{num_sampled} requested indices: markers={actual}, "
+        f"requested={requested}. Keyframe-snap regression."
+    )
+
+    assert actual == sorted(actual), f"Returned frames out of order: markers={actual}"
+
+    for marker, want_idx in zip(actual, requested):  # compare each decoded marker with the index reported for its slot
+        assert abs(marker - want_idx) <= 10, (  # NOTE(review): tolerance presumably absorbs lossy H.264/yuv420p colour rounding; confirm
+            f"Frame mismatch: requested index {want_idx}, "
+            f"got marker {marker} (tolerance ±10)"
+        )
+
+
@pytest.mark.parametrize(
    "loader_key, kwargs, expected_num_frames",
    [
@@ -66,6 +66,43 @@ def create_video_from_image(
    return video_path


+def create_long_gop_video(
+    num_frames: int = 50,
+    fps: int = 30,
+    width: int = 64,
+    height: int = 64,
+) -> bytes:
+    """Encode an H.264 clip with one keyframe and green-channel = frame index.
+
+    The marker lets a test recover which frame the decoder actually returned,
+    independent of any metadata label.
+    """
+    import io
+
+    import av
+
+    buf = io.BytesIO()  # in-memory sink, so no temporary file is written
+    with av.open(buf, mode="w", format="mp4") as container:  # mux an MP4 straight into the buffer
+        stream = container.add_stream("h264", rate=fps)
+        stream.width = width
+        stream.height = height
+        stream.pix_fmt = "yuv420p"
+        stream.codec_context.gop_size = num_frames  # keyframe interval spans the whole clip
+        stream.codec_context.max_b_frames = 0  # disable B-frames
+        stream.codec_context.options = {  # also pin x264's own keyframe interval and switch off scene-cut keyframes
+            "x264-params": (f"scenecut=0:keyint={num_frames}:min-keyint={num_frames}")
+        }
+        for i in range(num_frames):
+            img = np.zeros((height, width, 3), dtype=np.uint8)  # all-black RGB frame
+            img[:, :, 1] = i  # green channel carries the frame index; only valid while i fits in uint8
+            frame = av.VideoFrame.from_ndarray(img, format="rgb24")
+            for packet in stream.encode(frame):
+                container.mux(packet)
+        for packet in stream.encode():  # encode() with no frame flushes packets still buffered in the encoder
+            container.mux(packet)
+    return buf.getvalue()  # complete encoded file as bytes
+
+
 def cosine_similarity(A: npt.NDArray, B: npt.NDArray, axis: int = -1) -> npt.NDArray:
    """Compute cosine similarity between two vectors."""
    return np.sum(A * B, axis=axis) / (
@@ -390,7 +390,7 @@ class PyAVVideoBackendMixin:
        fps: float,
        duration: float,
    ) -> tuple[npt.NDArray, list[int]]:
-        """Decode target frames via per-frame seek + keyframe decode."""
+        """Decode target frames via per-frame seek + forward decode to PTS."""
        stream = container.streams.video[0]
        # SLICE parallelizes within a single frame without the
        # one-frame-per-thread latency penalty of FRAME threading.
@@ -402,14 +402,28 @@ class PyAVVideoBackendMixin:
        frame_interval = 1.0 / fps if fps > 0 else 0.1
        max_ts = max(0.0, duration - frame_interval) if duration > 0 else float("inf")

+        decoder = None
+        last_pts = None
        for idx in frame_indices:
            ts = min(idx / fps, max_ts) if fps > 0 else 0.0
            pts = int(ts / time_base)
-            container.seek(pts, stream=stream)
-            frame = next(container.decode(video=0), None)
-            if frame is not None:
-                frames_list.append(frame.to_ndarray(format="rgb24"))
+            # seek() snaps backward to a keyframe; reuse the running decoder
+            # while targets advance monotonically to avoid re-decoding the
+            # GOP prefix once per requested frame.
+            if decoder is None or last_pts is None or pts <= last_pts:
+                container.seek(pts, stream=stream)
+                decoder = container.decode(video=0)
+            chosen = None
+            for frame in decoder:
+                if frame.pts is not None and frame.pts >= pts:
+                    chosen = frame
+                    last_pts = frame.pts
+                    break
+            if chosen is not None:
+                frames_list.append(chosen.to_ndarray(format="rgb24"))
                valid_indices.append(idx)
+            else:
+                decoder = None

        if not frames_list:
            return np.empty((0,), dtype=np.uint8), valid_indices