[MM][Misc] Support image+video mixed inputs (per prompt) for VLM examples (#40335)

Signed-off-by: shen-shanshan <467638484@qq.com>
This commit is contained in:
Shanshan Shen
2026-04-21 11:43:25 +08:00
committed by GitHub
parent 989cc12d88
commit b47840019e
+291 -101
View File
@@ -394,18 +394,24 @@ def run_eagle2_5(questions: list[str], modality: str) -> ModelRequestData:
def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "baidu/ERNIE-4.5-VL-28B-A3B-PT"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=5,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
trust_remote_code=True,
)
image_placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
video_placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
if modality == "image":
placeholder = "Picture 1:<|IMAGE_START|><|image@placeholder|><|IMAGE_END|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "Video 1:<|VIDEO_START|><|video@placeholder|><|VIDEO_END|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -425,6 +431,7 @@ def run_ernie45_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "LGAI-EXAONE/EXAONE-4.5-33B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -434,18 +441,23 @@ def run_exaone4_5(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<vision><|image_pad|></vision>"
video_placeholder = "<vision><|video_pad|></vision>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|system|>\nYou are a helpful assistant.<|endofturn|>\n"
f"<|user|>\n<vision>{placeholder}</vision>"
f"<|user|>\n{placeholder}"
f"{question}<|endofturn|>\n"
"<|assistant|>\n"
)
@@ -566,6 +578,7 @@ def run_glm4v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.1V-9B-Thinking"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -574,14 +587,19 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
enforce_eager=True,
)
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -602,6 +620,7 @@ def run_glm4_1v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -610,15 +629,20 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
enforce_eager=True,
tensor_parallel_size=4,
)
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -639,6 +663,7 @@ def run_glm4_5v(questions: list[str], modality: str) -> ModelRequestData:
def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-4.5V-FP8"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -647,15 +672,20 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
enforce_eager=True,
tensor_parallel_size=4,
)
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -676,6 +706,7 @@ def run_glm4_5v_fp8(questions: list[str], modality: str) -> ModelRequestData:
def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
model_name = "zai-org/GLM-OCR"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -684,14 +715,19 @@ def run_glm_ocr(questions: list[str], modality: str) -> ModelRequestData:
"size": {"shortest_edge": 12544, "longest_edge": 47040000},
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
enforce_eager=True,
)
image_placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
video_placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
if modality == "image":
placeholder = "<|begin_of_image|><|image|><|end_of_image|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|begin_of_video|><|video|><|end_of_video|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -772,11 +808,12 @@ def run_hyperclovax_seed_vision(
model_name = "naver-hyperclovax/HyperCLOVAX-SEED-Vision-Instruct-3B"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192 if modality == "image" else 16384,
limit_mm_per_prompt={modality: 1},
max_model_len=16384 if modality in ("video", "image+video") else 8192,
limit_mm_per_prompt=mm_limit,
)
messages = list()
@@ -828,6 +865,29 @@ def run_hyperclovax_seed_vision(
}
]
)
elif modality == "image+video":
messages.append(
[
{
"role": "user",
"content": [
{
"type": "image",
"ocr": "",
"lens_keywords": "",
"lens_local_keywords": "",
},
{
"type": "video",
},
{
"type": "text",
"text": question,
},
],
}
]
)
else:
raise ValueError(f"Unsupported modality: {modality}")
@@ -876,19 +936,25 @@ def run_idefics3(questions: list[str], modality: str) -> ModelRequestData:
def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-mini"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
enforce_eager=True,
)
image_placeholder = "<IMG_CONTEXT>"
video_placeholder = "<video>"
if modality == "image":
placeholder = "<IMG_CONTEXT>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<video>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
@@ -909,20 +975,26 @@ def run_interns1(questions: list[str], modality: str) -> ModelRequestData:
def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
model_name = "internlm/Intern-S1-Pro"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
max_num_seqs=2,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
enforce_eager=True,
tensor_parallel_size=4,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
@@ -943,17 +1015,23 @@ def run_interns1_pro(questions: list[str], modality: str) -> ModelRequestData:
def run_internvl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "OpenGVLab/InternVL3-2B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
max_model_len=8192,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<image>"
video_placeholder = "<video>"
if modality == "image":
placeholder = "<image>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<video>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
messages = [
@@ -1010,21 +1088,27 @@ def run_kanana_v(questions: list[str], modality: str) -> ModelRequestData:
def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-8B-Preview"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
trust_remote_code=True,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -1041,21 +1125,27 @@ def run_keye_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_keye_vl1_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Kwai-Keye/Keye-VL-1.5-8B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=8192,
trust_remote_code=True,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -1259,22 +1349,26 @@ def run_llava_next_video(questions: list[str], modality: str) -> ModelRequestDat
# LLaVA-OneVision
def run_llava_onevision(questions: list[str], modality: str) -> ModelRequestData:
if modality == "video":
prompts = [
f"<|im_start|>user <video>\n{question}<|im_end|><|im_start|>assistant\n"
for question in questions
]
image_placeholder = "<image>"
video_placeholder = "<video>"
elif modality == "image":
prompts = [
f"<|im_start|>user <image>\n{question}<|im_end|><|im_start|>assistant\n"
for question in questions
]
if modality == "image":
placeholder = image_placeholder
elif modality == "video":
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
prompts = [
(f"<|im_start|>user {placeholder}\n{question}<|im_end|><|im_start|>assistant\n")
for question in questions
]
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model="llava-hf/llava-onevision-qwen2-7b-ov-hf",
max_model_len=16384,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
return ModelRequestData(
@@ -1307,7 +1401,7 @@ def run_mantis(questions: list[str], modality: str) -> ModelRequestData:
# MiniCPM-V
def run_minicpmv_base(questions: list[str], modality: str, model_name):
assert modality in ["image", "video"]
assert modality in ["image", "video", "image+video"]
# If you want to use `MiniCPM-o-2_6` with audio inputs, check `audio_language.py` # noqa
# 2.0
@@ -1329,12 +1423,13 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
# o2.6: image, video, audio
# model_name = "openbmb/MiniCPM-o-2_6"
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
# NOTE The stop_token_ids are different for various versions of MiniCPM-V
# 2.0
@@ -1347,17 +1442,22 @@ def run_minicpmv_base(questions: list[str], modality: str, model_name):
stop_tokens = ["<|im_end|>", "<|endoftext|>"]
stop_token_ids = [tokenizer.convert_tokens_to_ids(i) for i in stop_tokens]
modality_placeholder = {
"image": "(<image>./</image>)",
"video": "(<video>./</video>)",
}
image_placeholder = "(<image>./</image>)"
video_placeholder = "(<video>./</video>)"
if modality == "image":
placeholder = image_placeholder
elif modality == "video":
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
prompts = [
tokenizer.apply_chat_template(
[
{
"role": "user",
"content": f"{modality_placeholder[modality]}\n{question}",
"content": f"{placeholder}\n{question}",
}
],
tokenize=False,
@@ -1466,20 +1566,24 @@ def run_molmo(questions: list[str], modality: str) -> ModelRequestData:
def run_molmo2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "allenai/Molmo2-8B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
trust_remote_code=True,
dtype="bfloat16",
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
max_num_batched_tokens=36864,
)
image_placeholder = "<|image|>"
video_placeholder = "<|video|>"
if modality == "image":
placeholder = "<|image|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video|>"
else:
raise ValueError(f"Unsupported modality for molmo2: {modality}")
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
f"{placeholder}<|im_start|>user\n{question}<|im_end|>\n<|im_start|>assistant\n"
@@ -1563,19 +1667,25 @@ def run_nvlm_d(questions: list[str], modality: str) -> ModelRequestData:
def run_openpangu_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "FreedomIntelligence/openPangu-VL-7B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=4,
trust_remote_code=True,
enforce_eager=True,
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "[unused19]"
video_placeholder = "[unused32]"
if modality == "image":
placeholder = "[unused19]"
placeholder = image_placeholder
elif modality == "video":
placeholder = "[unused32]"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
@@ -1623,18 +1733,25 @@ def run_ovis(questions: list[str], modality: str) -> ModelRequestData:
def run_ovis2_5(questions: list[str], modality: str) -> ModelRequestData:
model_name = "AIDC-AI/Ovis2.5-2B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
max_num_seqs=2,
trust_remote_code=True,
dtype="half",
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<image>"
video_placeholder = "<video>"
if modality == "image":
placeholder = "<image>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<video>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + "\n" + video_placeholder
prompts = [
f"<|im_start|>user\n\n{placeholder}\n{question}<|im_end|>\n<|im_start|>assistant\n"
@@ -1846,6 +1963,7 @@ def run_qwen_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen2-VL-7B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -1855,18 +1973,23 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
"min_pixels": 28 * 28,
"max_pixels": 1280 * 28 * 28,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -1883,6 +2006,7 @@ def run_qwen2_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen2.5-VL-3B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -1892,18 +2016,23 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -1920,6 +2049,7 @@ def run_qwen2_5_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen2_5_omni(questions: list[str], modality: str):
model_name = "Qwen/Qwen2.5-Omni-7B"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -1929,13 +2059,18 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_bos|><|IMAGE|><|vision_eos|>"
video_placeholder = "<|vision_bos|><|VIDEO|><|vision_eos|>"
if modality == "image":
placeholder = "<|IMAGE|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|VIDEO|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
default_system = (
"You are Qwen, a virtual human developed by the Qwen Team, Alibaba "
@@ -1946,7 +2081,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
prompts = [
(
f"<|im_start|>system\n{default_system}<|im_end|>\n"
f"<|im_start|>user\n<|vision_bos|>{placeholder}<|vision_eos|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -1962,6 +2097,7 @@ def run_qwen2_5_omni(questions: list[str], modality: str):
def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-4B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -1971,18 +2107,23 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -1999,6 +2140,7 @@ def run_qwen3_vl(questions: list[str], modality: str) -> ModelRequestData:
def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
model_name = "Qwen/Qwen3-VL-30B-A3B-Instruct"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -2008,18 +2150,23 @@ def run_qwen3_vl_moe(questions: list[str], modality: str) -> ModelRequestData:
"max_pixels": 1280 * 28 * 28,
"fps": 1,
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -2190,6 +2337,7 @@ def run_tarsier(questions: list[str], modality: str) -> ModelRequestData:
def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
model_name = "omni-research/Tarsier2-Recap-7b"
mm_limit = {"image": 1, "video": 1} if modality == "image+video" else {modality: 1}
engine_args = EngineArgs(
model=model_name,
max_model_len=4096,
@@ -2197,18 +2345,23 @@ def run_tarsier2(questions: list[str], modality: str) -> ModelRequestData:
"architectures": ["Tarsier2ForConditionalGeneration"],
"model_type": "tarsier2",
},
limit_mm_per_prompt={modality: 1},
limit_mm_per_prompt=mm_limit,
)
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
video_placeholder = "<|vision_start|><|video_pad|><|vision_end|>"
if modality == "image":
placeholder = "<|image_pad|>"
placeholder = image_placeholder
elif modality == "video":
placeholder = "<|video_pad|>"
placeholder = video_placeholder
elif modality == "image+video":
placeholder = image_placeholder + video_placeholder
prompts = [
(
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
f"<|im_start|>user\n<|vision_start|>{placeholder}<|vision_end|>"
f"<|im_start|>user\n{placeholder}"
f"{question}<|im_end|>\n"
"<|im_start|>assistant\n"
)
@@ -2357,6 +2510,24 @@ def get_multi_modal_input(args):
"questions": vision_chunk_questions,
}
if args.modality == "image+video":
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
needs_metadata = args.model_type in MODELS_NEED_VIDEO_METADATA
video = VideoAsset(name="baby_reading", num_frames=args.num_frames).np_ndarrays
metadata = VideoAsset(name="baby_reading", num_frames=args.num_frames).metadata
img_video_questions = [
"What is shown in the image? What happens in the video?",
"Describe both the image and the video content.",
]
return {
"data": {
"image": image,
"video": ([(video, metadata)] if needs_metadata else video),
},
"questions": img_video_questions,
}
msg = f"Modality {args.modality} is not supported."
raise ValueError(msg)
@@ -2439,7 +2610,7 @@ def parse_args():
"--modality",
type=str,
default="image",
choices=["image", "video", "vision_chunk"],
choices=["image", "video", "image+video", "vision_chunk"],
help="Modality of the input.",
)
parser.add_argument(
@@ -2546,23 +2717,42 @@ def main(args):
else req_data.sampling_params
)
def _mm_data(data, modality):
if modality == "image+video":
return {"image": data["image"], "video": data["video"]}
return {modality: data}
def _mm_uuid(uuid, modality):
if modality == "image+video":
return {"image": uuid, "video": uuid + "v"}
return {modality: uuid}
def _mm_empty(modality):
if modality == "image+video":
return {"image": None, "video": None}
return {modality: None}
assert args.num_prompts > 0
if args.num_prompts == 1:
# Single inference
uuid = "uuid_0"
inputs = {
"prompt": prompts[0],
"multi_modal_data": {modality: data},
"multi_modal_uuids": {modality: uuid},
"multi_modal_data": _mm_data(data, modality),
"multi_modal_uuids": _mm_uuid(uuid, modality),
}
inputs_with_empty_media = {
"prompt": prompts[0],
"multi_modal_data": {modality: None},
"multi_modal_uuids": {modality: uuid},
"multi_modal_data": _mm_empty(modality),
"multi_modal_uuids": _mm_uuid(uuid, modality),
}
else:
# Batch inference
if args.image_repeat_prob is not None:
if modality == "image+video":
raise ValueError(
"--image-repeat-prob is not supported for 'image+video' modality"
)
# Repeat images with specified probability of "image_repeat_prob"
inputs, inputs_with_empty_media = apply_image_repeat(
args.image_repeat_prob,
@@ -2572,7 +2762,7 @@ def main(args):
modality,
)
else:
# Use the same image for all prompts
# Use the same image/video for all prompts
inputs = []
inputs_with_empty_media = []
for i in range(args.num_prompts):
@@ -2580,15 +2770,15 @@ def main(args):
inputs.append(
{
"prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: data},
"multi_modal_uuids": {modality: uuid},
"multi_modal_data": _mm_data(data, modality),
"multi_modal_uuids": _mm_uuid(uuid, modality),
}
)
inputs_with_empty_media.append(
{
"prompt": prompts[i % len(prompts)],
"multi_modal_data": {modality: None},
"multi_modal_uuids": {modality: uuid},
"multi_modal_data": _mm_empty(modality),
"multi_modal_uuids": _mm_uuid(uuid, modality),
}
)