[MM][Misc] Support image+video mixed inputs (per prompt) for VLM examples (#40335)

Signed-off-by: shen-shanshan <467638484@qq.com>
2026-06-06 00:16:14 +00:00 · 2026-04-21 11:43:25 +08:00
parent 989cc12d88
commit b47840019e
1 changed files with 291 additions and 101 deletions
@@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData:
 def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=5,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        trust_remote_code=True,
    )

+    image_placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
+    video_placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
+
    if modality == "image":
-        placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
@@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "LGAI-EXAONE/EXAONE-4.5-33B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<vision><|image_pad|></vision>"
+    video_placeholder = "<vision><|video_pad|></vision>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
            "<|system|>\nYou are a helpful assistant.<|endofturn|>\n"
-            f"<|user|>\n<vision>{placeholder}</vision>"
+            f"<|user|>\n{placeholder}"
            f"{question}<|endofturn|>\n"
            "<|assistant|>\n"
        )
@@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
 def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.1V-9B-Thinking"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        enforce_eager=True,
    )

+    image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
    if modality == "image":
-        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
@@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
 def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        enforce_eager=True,
        tensor_parallel_size=4,
    )

+    image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
    if modality == "image":
-        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
@@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
 def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-4.5V-FP8"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        enforce_eager=True,
        tensor_parallel_size=4,
    )

+    image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
    if modality == "image":
-        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
@@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
 def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "zai-org/GLM-OCR"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
            "size": {"shortest_edge": 12544, "longest_edge": 47040000},
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        enforce_eager=True,
    )

+    image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+    video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+
    if modality == "image":
-        placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
@@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision(
    model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
-        max_model_len=8192 if modality == "image" else 16384,
-        limit_mm_per_prompt={modality: 1},
+        max_model_len=16384 if modality in ("video", "image+video") else 8192,
+        limit_mm_per_prompt=mm_limit,
    )

    messages = list()
@@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision(
                    }
                ]
            )
+        elif modality == "image+video":
+            messages.append(
+                [
+                    {
+                        "role": "user",
+                        "content": [
+                            {
+                                "type": "image",
+                                "ocr": "",
+                                "lens_keywords": "",
+                                "lens_local_keywords": "",
+                            },
+                            {
+                                "type": "video",
+                            },
+                            {
+                                "type": "text",
+                                "text": question,
+                            },
+                        ],
+                    }
+                ]
+            )
        else:
            raise ValueError(f"Unsupported modality: {modality}")

@@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
 def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "internlm/Intern-S1-mini"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        enforce_eager=True,
    )

+    image_placeholder = "<IMG_CONTEXT>"
+    video_placeholder = "<video>"
+
    if modality == "image":
-        placeholder = "<IMG_CONTEXT>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<video>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + "\n" + video_placeholder

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
@@ -909,20 +975,26 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
 def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "internlm/Intern-S1-Pro"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
        max_num_seqs=2,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        enforce_eager=True,
        tensor_parallel_size=4,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
@@ -943,17 +1015,23 @@ def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
 def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "OpenGVLab/InternVL3-2B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        max_model_len=8192,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<image>"
+    video_placeholder = "<video>"
+
    if modality == "image":
-        placeholder = "<image>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<video>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + "\n" + video_placeholder

    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
    messages = [
@@ -1010,21 +1088,27 @@ def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData:
 def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-8B-Preview"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -1041,21 +1125,27 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Kwai-Keye/Keye-VL-1.5-8B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=8192,
        trust_remote_code=True,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -1259,22 +1349,26 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat

 # LLaVA-OneVision
 def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
-    if modality == "video":
-        prompts = [
-            f"<|im_start|>user <video>\n{question}<|im_end|><|im_start|>assistant\n"
-            for question in questions
-        ]
+    image_placeholder = "<image>"
+    video_placeholder = "<video>"

-    elif modality == "image":
-        prompts = [
-            f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
-            for question in questions
-        ]
+    if modality == "image":
+        placeholder = image_placeholder
+    elif modality == "video":
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + "\n" + video_placeholder

+    prompts = [
+        (f"<|im_start|>user {placeholder}\n{question}<|im_end|><|im_start|>assistant\n")
+        for question in questions
+    ]
+
+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
        max_model_len=16384,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

    return ModelRequestData(
@@ -1307,7 +1401,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:

 # MiniCPM-V
 def run_minicpmv_base(questions: list[str], modality: str, model_name):
-    assert modality in ["image", "video"]
+    assert modality in ["image", "video", "image+video"]
    # If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa

    # 2.0
@@ -1329,12 +1423,13 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
    # o2.6: image, video, audio
    # model_name = "openbmb/MiniCPM-o-2_6"
    tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )
    # NOTE The stop_token_ids are different for various versions of MiniCPM-V
    # 2.0
@@ -1347,17 +1442,22 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
    stop_tokens = ["<|im_end|>", "<|endoftext|>"]
    stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]

-    modality_placeholder = {
-        "image": "(<image>./</image>)",
-        "video": "(<video>./</video>)",
-    }
+    image_placeholder = "(<image>./</image>)"
+    video_placeholder = "(<video>./</video>)"
+
+    if modality == "image":
+        placeholder = image_placeholder
+    elif modality == "video":
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + "\n" + video_placeholder

    prompts = [
        tokenizer.apply_chat_template(
            [
                {
                    "role": "user",
-                    "content": f"{modality_placeholder[modality]}\n{question}",
+                    "content": f"{placeholder}\n{question}",
                }
            ],
            tokenize=False,
@@ -1466,20 +1566,24 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
 def run_molmo2(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "allenai/Molmo2-8B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        trust_remote_code=True,
        dtype="bfloat16",
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
        max_num_batched_tokens=36864,
    )

+    image_placeholder = "<|image|>"
+    video_placeholder = "<|video|>"
+
    if modality == "image":
-        placeholder = "<|image|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video|>"
-    else:
-        raise ValueError(f"Unsupported modality for molmo2: {modality}")
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        f"{placeholder}<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
@@ -1563,19 +1667,25 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
 def run_openpangu_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "FreedomIntelligence/openPangu-VL-7B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=4,
        trust_remote_code=True,
        enforce_eager=True,
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "[unused19]"
+    video_placeholder = "[unused32]"
+
    if modality == "image":
-        placeholder = "[unused19]"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "[unused32]"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
@@ -1623,18 +1733,25 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
 def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "AIDC-AI/Ovis2.5-2B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
        max_num_seqs=2,
        trust_remote_code=True,
        dtype="half",
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )
+
+    image_placeholder = "<image>"
+    video_placeholder = "<video>"
+
    if modality == "image":
-        placeholder = "<image>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<video>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + "\n" + video_placeholder

    prompts = [
        f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
@@ -1846,6 +1963,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen2-VL-7B-Instruct"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -1855,18 +1973,23 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
            "min_pixels": 28 * 28,
            "max_pixels": 1280 * 28 * 28,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -1883,6 +2006,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen2.5-VL-3B-Instruct"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -1892,18 +2016,23 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -1920,6 +2049,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_qwen2_5_omni(questions: list[str], modality: str):
    model_name = "Qwen/Qwen2.5-Omni-7B"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -1929,13 +2059,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_bos|><|IMAGE|><|vision_eos|>"
+    video_placeholder = "<|vision_bos|><|VIDEO|><|vision_eos|>"
+
    if modality == "image":
-        placeholder = "<|IMAGE|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|VIDEO|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    default_system = (
        "You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
@@ -1946,7 +2081,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
    prompts = [
        (
            f"<|im_start|>system\n{default_system}<|im_end|>\n"
-            f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -1962,6 +2097,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
 def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen3-VL-4B-Instruct"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -1971,18 +2107,23 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -1999,6 +2140,7 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
 def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -2008,18 +2150,23 @@ def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
            "max_pixels": 1280 * 28 * 28,
            "fps": 1,
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -2190,6 +2337,7 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
 def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
    model_name = "omni-research/Tarsier2-Recap-7b"

+    mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
    engine_args = EngineArgs(
        model=model_name,
        max_model_len=4096,
@@ -2197,18 +2345,23 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
            "architectures": ["Tarsier2ForConditionalGeneration"],
            "model_type": "tarsier2",
        },
-        limit_mm_per_prompt={modality: 1},
+        limit_mm_per_prompt=mm_limit,
    )

+    image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
+    video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
+
    if modality == "image":
-        placeholder = "<|image_pad|>"
+        placeholder = image_placeholder
    elif modality == "video":
-        placeholder = "<|video_pad|>"
+        placeholder = video_placeholder
+    elif modality == "image+video":
+        placeholder = image_placeholder + video_placeholder

    prompts = [
        (
            "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
-            f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
+            f"<|im_start|>user\n{placeholder}"
            f"{question}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )
@@ -2357,6 +2510,24 @@ def get_multi_modal_input(args):
            "questions": vision_chunk_questions,
        }

+    if args.modality == "image+video":
+        image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
+        needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
+        video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
+        metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
+        img_video_questions = [
+            "What is shown in the image? What happens in the video?",
+            "Describe both the image and the video content.",
+        ]
+
+        return {
+            "data": {
+                "image": image,
+                "video": ([(video, metadata)] if needs_metadata else video),
+            },
+            "questions": img_video_questions,
+        }
+
    msg = f"Modality {args.modality} is not supported."
    raise ValueError(msg)

@@ -2439,7 +2610,7 @@ def parse_args():
        "--modality",
        type=str,
        default="image",
-        choices=["image", "video", "vision_chunk"],
+        choices=["image", "video", "image+video", "vision_chunk"],
        help="Modality of the input.",
    )
    parser.add_argument(
@@ -2546,23 +2717,42 @@ def main(args):
        else req_data.sampling_params
    )

+    def _mm_data(data, modality):
+        """Build ``multi_modal_data``; mixed input is split into both keys."""
+        mixed = modality == "image+video"
+        return {k: data[k] for k in ("image", "video")} if mixed else {modality: data}
+
+    def _mm_uuid(uuid, modality):
+        """Build ``multi_modal_uuids``; mixed input gets a distinct video id."""
+        mixed = modality == "image+video"
+        return {"image": uuid, "video": uuid + "v"} if mixed else {modality: uuid}
+
+    def _mm_empty(modality):
+        """Build a ``multi_modal_data`` dict whose media entries are all None."""
+        names = ("image", "video") if modality == "image+video" else (modality,)
+        return dict.fromkeys(names)
+
    assert args.num_prompts > 0
    if args.num_prompts == 1:
        # Single inference
        uuid = "uuid_0"
        inputs = {
            "prompt": prompts[0],
-            "multi_modal_data": {modality: data},
-            "multi_modal_uuids": {modality: uuid},
+            "multi_modal_data": _mm_data(data, modality),
+            "multi_modal_uuids": _mm_uuid(uuid, modality),
        }
        inputs_with_empty_media = {
            "prompt": prompts[0],
-            "multi_modal_data": {modality: None},
-            "multi_modal_uuids": {modality: uuid},
+            "multi_modal_data": _mm_empty(modality),
+            "multi_modal_uuids": _mm_uuid(uuid, modality),
        }
    else:
        # Batch inference
        if args.image_repeat_prob is not None:
+            if modality == "image+video":
+                raise ValueError(
+                    "--image-repeat-prob is not supported for 'image+video' modality"
+                )
            # Repeat images with specified probability of "image_repeat_prob"
            inputs, inputs_with_empty_media = apply_image_repeat(
                args.image_repeat_prob,
@@ -2572,7 +2762,7 @@ def main(args):
                modality,
            )
        else:
-            # Use the same image for all prompts
+            # Use the same image/video for all prompts
            inputs = []
            inputs_with_empty_media = []
            for i in range(args.num_prompts):
@@ -2580,15 +2770,15 @@ def main(args):
                inputs.append(
                    {
                        "prompt": prompts[i % len(prompts)],
-                        "multi_modal_data": {modality: data},
-                        "multi_modal_uuids": {modality: uuid},
+                        "multi_modal_data": _mm_data(data, modality),
+                        "multi_modal_uuids": _mm_uuid(uuid, modality),
                    }
                )
                inputs_with_empty_media.append(
                    {
                        "prompt": prompts[i % len(prompts)],
-                        "multi_modal_data": {modality: None},
-                        "multi_modal_uuids": {modality: uuid},
+                        "multi_modal_data": _mm_empty(modality),
+                        "multi_modal_uuids": _mm_uuid(uuid, modality),
                    }
                )