# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import pytest
from defs.common import convert_weights, venv_check_call, venv_mpi_check_call
from defs.conftest import get_device_memory, skip_pre_ada
from defs.trt_test_alternative import check_call
@pytest.fixture(scope="module")
def multimodal_example_root(llm_root):
"Get multimodal example root"
example_root = os.path.join(llm_root, "examples", "multimodal")
return example_root
@pytest.fixture(scope="function")
def recover_transformers(llm_venv, llm_root):
"Recover transformers"
yield
llm_venv.run_cmd([
"-m", "pip", "install", "-r",
os.path.join(llm_root, "requirements.txt")
])


def _call_run_cmd(llm_venv, llm_root, cmd, world_size):
if world_size == 1:
venv_check_call(llm_venv, cmd)
else:
venv_mpi_check_call(
llm_venv, ["mpirun", "-n",
str(world_size), "--allow-run-as-root"], cmd)


dataset_path_mapping = {
'cnn_dailymail': 'cnn_dailymail',
'scienceqa': 'derek-thomas___science_qa',
}


def _test_llm_multimodal_general(llm_venv,
llm_root,
llm_datasets_root,
cmodel_dir,
engine_dir,
batch_size,
data_type,
tp_size,
pp_size,
multimodal_example_root,
multimodal_model_root,
recover_transformers,
calibration_dataset=None,
qformat=None,
kv_cache_dtype=None,
cpp_e2e=False,
num_beams=1):
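    """End-to-end multimodal test helper.

    Converts the HF checkpoint into a TRT-LLM checkpoint, builds the LLM and
    vision engines, runs run.py as a smoke test and, for selected
    configurations, runs eval.py as an accuracy check.
    """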
world_size = tp_size * pp_size
print("Locate model checkpoints in test storage...")
tllm_model_name, model_ckpt_path = multimodal_model_root
if "neva-22b" in tllm_model_name and get_device_memory() < 80000:
pytest.skip("GPU memory is insufficient.")
print("Converting huggingface model into binary format...")
# ckpt from llm_models/<model_name> --> cmodels/<model_name>/<dtype>
model_name = tllm_model_name
model_name = "pix2struct" if model_name == "deplot" else model_name
opt_example_root = multimodal_example_root + "/../models/contrib/opt"
enc_dec_example_root = multimodal_example_root + "/../enc_dec"
llama_example_root = multimodal_example_root + "/../llama"
cogvlm_example_root = multimodal_example_root + "/../cogvlm"
gpt_example_root = multimodal_example_root + "/../gpt"
nemotron_example_root = multimodal_example_root + "/../nemotron"
phi_example_root = multimodal_example_root + "/../phi"
mllama_example_root = multimodal_example_root + "/../mllama"
qwen_example_root = multimodal_example_root + "/../qwen"
internlm_example_root = multimodal_example_root + "/../internlm2"
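    # Classify the model family from the checkpoint name; these flags select the converter, builder and runtime options below.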
opt_model = "opt" in model_name
nougat_model = "nougat" in model_name
gpt_model = "fuyu" in model_name or "neva-22b" in model_name or "kosmos" in model_name
pix2struct_model = "pix2struct" in model_name
enc_dec_model = "t5" in model_name or nougat_model or pix2struct_model
llava_model = "llava" in model_name
llava_next_model = "llava-v1.6" in model_name
llava_next_vision_trtllm_engine_model = "vision-trtllm" in model_name and llava_next_model
llava_onevision_model = "llava-onevision" in model_name
llava_onevision_video_model = "video" in model_name and llava_onevision_model
vila_model = "VILA" in model_name
cogvlm_model = "cogvlm" in model_name
nemotron_model = "video-neva" in model_name
phi_model = "phi" in model_name.lower()
mllama_model = 'Llama-3.2' in model_name
qwen2_vl_model = 'Qwen2-VL' in model_name
internlm_model = 'internlm-xcomposer2' in model_name
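    # Pick the example root used for checkpoint conversion and the TRT-LLM model_type for each model family.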
if enc_dec_model:
builder_root = enc_dec_example_root
if nougat_model:
model_type = "bart"
if pix2struct_model:
model_type = "pix2struct"
if "t5" in model_name:
model_type = "blip2"
elif gpt_model:
builder_root, model_type = gpt_example_root, "gpt"
elif llava_onevision_model:
builder_root, model_type = qwen_example_root, "qwen"
elif qwen2_vl_model:
builder_root, model_type = qwen_example_root, "qwen"
elif internlm_model:
builder_root, model_type = internlm_example_root, "internlm"
elif llava_model or vila_model:
builder_root, model_type = llama_example_root, "llama"
elif cogvlm_model:
builder_root, model_type = cogvlm_example_root, "cogvlm"
elif nemotron_model:
builder_root, model_type = nemotron_example_root, "nemotron"
elif phi_model:
model_name = model_name.split('/')[-1] # Remove HF directory name
builder_root, model_type = phi_example_root, "phi-3-vision"
elif opt_model:
builder_root, model_type = opt_example_root, "blip2"
elif mllama_model:
builder_root, model_type = mllama_example_root, "mllama"
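    # Weight-only (INT4/INT8) quantization still converts from an FP16 checkpoint, so reset data_type and keep the precision in weight_only_precision.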
use_weight_only = (not enc_dec_model) and (data_type in [
'int4_weight_only', 'int8_weight_only'
])
weight_only_precision = data_type.split('_')[0] if use_weight_only else None
if use_weight_only: data_type = 'float16'
if vila_model:
        print(
            "VILA model depends on a specific HuggingFace Transformers version. Install its pinned requirements until this limitation is removed."
        )
check_call(
f"pip install -r {multimodal_example_root}/requirements-vila.txt",
shell=True,
env=llm_venv._new_env)
elif llava_onevision_model:
check_call(
f"pip install -r {multimodal_example_root}/requirements-llava_onevision.txt",
shell=True,
env=llm_venv._new_env)
elif qwen2_vl_model:
check_call(
f"pip install -r {multimodal_example_root}/requirements-qwen2vl.txt",
shell=True,
env=llm_venv._new_env)
elif internlm_model:
check_call(
f"pip install -r {multimodal_example_root}/requirements-internlm-xcomposer2.txt",
shell=True,
env=llm_venv._new_env)
elif mllama_model:
check_call(f"pip install -r {mllama_example_root}/requirements.txt",
shell=True,
env=llm_venv._new_env)
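    # FP8 checkpoints are produced by quantize.py with a calibration dataset; all other configurations go through convert_weights().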
if qformat == 'fp8':
convert_cmd = [
f"{multimodal_example_root}/../quantization/quantize.py",
f"--model_dir={model_ckpt_path}",
f"--calib_dataset={llm_datasets_root}/{dataset_path_mapping[calibration_dataset]}",
f"--dtype={data_type}",
f"--qformat={qformat}",
f"--kv_cache_dtype={kv_cache_dtype}",
f"--output_dir={cmodel_dir}",
f"--calib_size=16",
]
venv_check_call(llm_venv, convert_cmd)
converted_weight_dir = cmodel_dir
else:
converted_weight_dir = convert_weights(
llm_venv,
builder_root,
cmodel_dir,
model_name,
model_ckpt_path,
data_type=data_type,
gpus=tp_size,
model_type=model_type,
use_weight_only=use_weight_only,
weight_only_precision=weight_only_precision,
tp_size=tp_size,
pp_size=pp_size,
batch_size=batch_size,
multimodal=True)
print("Build LLM engines...")
model_name = model_name.split('/')[-1] # Remove HF directory name
llm_engine_dir = f"{engine_dir}/{model_name}/{world_size}-gpu"
if "opt" in model_name or llava_model or vila_model or gpt_model or nemotron_model or phi_model or qwen2_vl_model:
max_input_len_text = 1024
max_output_len = 200
if llava_next_model:
multimodal_len = 4096
elif llava_onevision_model:
multimodal_len = 7300
elif llava_model:
multimodal_len = 576
elif vila_model:
multimodal_len = 196
elif phi_model:
multimodal_len = 5120
elif "fuyu" in model_name:
multimodal_len = 2640
elif "neva-22b" in model_name:
multimodal_len = 729
elif "video-neva" in model_name:
multimodal_len = 3072
elif "kosmos" in model_name:
multimodal_len = 64
elif "Qwen2-VL" in model_name:
multimodal_len = 3552
else:
multimodal_len = 32
max_input_len = max_input_len_text + batch_size * multimodal_len
max_seq_len = max_input_len + max_output_len
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={converted_weight_dir}",
f"--output_dir={llm_engine_dir}/llm",
f"--gpt_attention_plugin {data_type}",
f"--gemm_plugin={data_type}",
f"--max_batch_size={batch_size}",
f"--max_multimodal_len={batch_size * multimodal_len}",
f"--max_input_len={max_input_len}",
f"--max_seq_len={max_seq_len}",
f"--max_num_tokens={max_input_len}",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
elif internlm_model:
max_input_len_text = 1536
max_output_len = 200
multimodal_len = 1225
max_input_len = max_input_len_text + batch_size * multimodal_len
max_seq_len = max_input_len + max_output_len
max_lora_rank = 256
lora_dir = "."
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={converted_weight_dir}",
f"--output_dir={llm_engine_dir}",
f"--gpt_attention_plugin {data_type}",
f"--gemm_plugin={data_type}",
f"--lora_plugin={data_type}",
f"--lora_dir={lora_dir}",
f"--max_lora_rank={max_lora_rank}",
f"--max_batch_size={batch_size}",
f"--max_multimodal_len={batch_size * multimodal_len}",
f"--max_input_len={max_input_len}",
f"--max_seq_len={max_seq_len}",
f"--max_num_tokens={max_input_len}",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
elif enc_dec_model:
components = ["decoder"] if nougat_model or pix2struct_model else [
"encoder", "decoder"
]
for component in components:
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={converted_weight_dir}/{component}",
f"--output_dir={llm_engine_dir}/{data_type}/llm/{component}",
"--paged_kv_cache=enable",
"--moe_plugin=disable",
f"--max_batch_size={batch_size}",
"--max_seq_len=412",
f"--gemm_plugin={data_type}",
f"--bert_attention_plugin={data_type}",
f"--gpt_attention_plugin={data_type}",
"--remove_input_padding=enable",
f"--max_beam_width={num_beams}",
]
            # context FMHA is only supported for non-T5 models in FP16/BF16, so disable it for T5 or FP32 builds
if model_type == "t5" or data_type == "float32":
build_cmd.append("--context_fmha=disable")
if "t5" in model_name:
if component == "encoder":
build_cmd.append(f"--max_multimodal_len={32 * batch_size}")
build_cmd.append("--max_input_len=412")
else:
build_cmd.append("--max_encoder_input_len=412")
build_cmd.append(f"--max_input_len=1")
else: # Nougat
assert nougat_model or pix2struct_model
if component == "encoder":
build_cmd.append(f"--max_multimodal_len={588 * batch_size}")
# only decoder for nougat
if nougat_model:
build_cmd.append(
f"--max_encoder_input_len={588 * batch_size}")
else:
build_cmd.append(
f"--max_encoder_input_len={2048 * batch_size}")
build_cmd.append(f"--max_input_len=1")
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
elif cogvlm_model:
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={converted_weight_dir}",
f"--output_dir={llm_engine_dir}/llm",
f"--gemm_plugin={data_type}",
f"--gpt_attention_plugin={data_type}",
f"--remove_input_padding=enable",
f"--max_batch_size={batch_size}",
f"--max_input_len=2048",
f"--max_seq_len=2048",
f"--paged_kv_cache=enable",
f"--bert_attention_plugin=disable",
f"--moe_plugin=disable",
f"--max_multimodal_len=61440",
f"--max_beam_width={num_beams}",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
elif mllama_model:
        # Set max_encoder_input_len = 6404 so that both the non-instruct and instruct models can run.
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={converted_weight_dir}",
f"--output_dir={llm_engine_dir}/llm",
f"--gemm_plugin={data_type}",
f"--max_num_tokens=4096",
f"--max_seq_len=2048",
f"--max_batch_size={batch_size}",
f"--max_encoder_input_len=6404",
f"--max_beam_width={num_beams}",
]
if kv_cache_dtype == 'fp8':
build_cmd.extend([
"--use_fp8_context_fmha=enable",
"--use_paged_context_fmha=enable",
])
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
print("Build visual engines...")
vision_model_type = model_name
if 'llava' in model_name: vision_model_type = 'llava'
if 'llava-v1.6' in model_name: vision_model_type = 'llava_next'
elif llava_onevision_model: vision_model_type = 'llava_onevision'
elif 'VILA' in model_name: vision_model_type = 'vila'
elif nougat_model: vision_model_type = 'nougat'
elif pix2struct_model: vision_model_type = 'pix2struct'
elif 'cogvlm' in model_name: vision_model_type = 'cogvlm'
elif 'fuyu' in model_name: vision_model_type = 'fuyu'
elif 'neva-22b' in model_name: vision_model_type = 'neva'
elif 'video-neva' in model_name: vision_model_type = 'video-neva'
elif phi_model: vision_model_type = "phi-3-vision"
elif 'blip2' in model_name: vision_model_type = 'blip2'
elif 'Llama-3.2' in model_name: vision_model_type = 'mllama'
elif "Qwen2-VL" in model_name: vision_model_type = 'qwen2_vl'
elif 'internlm' in model_name: vision_model_type = 'internlm-xcomposer2'
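    # llava_next and llava_onevision feed multiple image patches / video frames per sample, so the ViT engine needs a larger max batch size.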
vit_batch_size = batch_size
if vision_model_type == "llava_next":
vit_batch_size = vit_batch_size * 5
elif vision_model_type == 'llava_onevision':
vit_batch_size = vit_batch_size * 32
llm_engine_subdir = f"{data_type}" if enc_dec_model else ""
build_cmd = [
f"{multimodal_example_root}/build_multimodal_engine.py",
f"--output_dir={os.path.join(llm_engine_dir, llm_engine_subdir, 'vision')}",
f"--model_type={vision_model_type}",
f"--model_path={model_ckpt_path}",
f"--max_batch_size={vit_batch_size}",
]
if vision_model_type == "vila":
vila_path = model_ckpt_path + "/../VILA"
build_cmd.extend([f"--vila_path={vila_path}"])
if llava_next_vision_trtllm_engine_model:
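        # For the vision-trtllm variant, build the ViT itself as a TRT-LLM engine: convert the HF vision weights, then run trtllm-build on them.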
script_root = f"{multimodal_example_root}/../vit"
convert_cmd = [
f"{script_root}/convert_checkpoint.py",
f"--model_dir={model_ckpt_path}",
f"--output_dir={os.path.join(cmodel_dir, model_name, data_type, 'vision')}",
f"--dtype={data_type}",
f"--vision_tp_size={tp_size}",
]
venv_check_call(llm_venv, convert_cmd)
build_cmd = [
"trtllm-build",
f"--checkpoint_dir={os.path.join(cmodel_dir, model_name, data_type, 'vision')}",
f"--output_dir={os.path.join(llm_engine_dir, llm_engine_subdir, 'vision')}",
f"--max_batch_size={vit_batch_size}",
f"--remove_input_padding disable",
f"--bert_attention_plugin disable",
]
check_call(" ".join(build_cmd), shell=True, env=llm_venv._new_env)
else:
venv_check_call(llm_venv, build_cmd)
if llava_next_vision_trtllm_engine_model:
cp_cmd = [
"cp",
f"{os.path.join(cmodel_dir, model_name, data_type, 'vision', 'image_newlines.safetensors')}",
f"{os.path.join(llm_engine_dir, llm_engine_subdir, 'vision')}",
]
check_call(" ".join(cp_cmd), shell=True, env=llm_venv._new_env)
print("Run inference...")
hf_model_dir = model_ckpt_path + "/../vicuna-7b-v1.5" if cogvlm_model else model_ckpt_path
hf_model_dir = converted_weight_dir if "neva" in model_name else hf_model_dir
video_path = os.path.join(
os.path.dirname(model_ckpt_path), "test_video",
"video_test.mp4") if "video-neva" in model_name else ""
run_cmd = [
f"{multimodal_example_root}/run.py",
f"--engine_dir={llm_engine_dir}/{llm_engine_subdir}",
f"--hf_model_dir={hf_model_dir}", "--max_new_tokens=30",
f"--batch_size={batch_size}", "--check_accuracy",
"--enable_context_fmha_fp32_acc"
]
if vision_model_type in ['llava', 'vila'] and batch_size > 1:
# batch inference test
if vision_model_type == 'vila':
input_text = [
'"<image>\n Please elaborate what you see in the images?"'
] * batch_size
else:
input_text = ['"\\n Which city is this? Answer:"'] * batch_size
run_cmd.append("--input_text")
run_cmd.extend(input_text)
if enc_dec_model:
run_cmd.extend(["--cross_kv_cache_fraction", "0.5"])
if vision_model_type == "neva" and not cpp_e2e:
# randomly pick one to test the python runtime
run_cmd.extend(["--session", "python"])
if vision_model_type == "video-neva":
run_cmd.extend(["--video_path", video_path])
if llava_onevision_video_model:
run_cmd.extend(["--video_path", 'llava-onevision-accuracy'])
if phi_model:
run_cmd.extend(["--kv_cache_free_gpu_memory_fraction", "0.4"])
if cpp_e2e:
run_cmd.extend(["--session", "cpp"])
if num_beams > 1:
run_cmd.extend(["--num_beams", str(num_beams)])
if mllama_model:
if qformat is None:
run_cmd_vision = run_cmd.copy()
run_cmd_vision.extend([
"--cross_kv_cache_fraction=0.5", # mllama uses cross attention
"--image_path",
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/0052a70beed5bf71b92610a43a52df6d286cd5f3/diffusers/rabbit.jpg",
"--input_text",
"If I had to write a haiku for this one"
])
print("Run mllama vision test...")
_call_run_cmd(llm_venv, llm_root, run_cmd_vision, world_size)
run_cmd_text = run_cmd.copy()
run_cmd_text.extend([
"--cross_kv_cache_fraction=0.5", # mllama uses cross attention
"--input_text",
"The key to life is",
])
print("Run mllama text test...")
_call_run_cmd(llm_venv, llm_root, run_cmd_text, world_size)
else:
_call_run_cmd(llm_venv, llm_root, run_cmd, world_size)
    # Run the evaluation test only for batch_size == 1 with FP16 or FP8 engines.
if batch_size == 1 and (data_type == "float16" or qformat == 'fp8'):
print(f"prepare to run eval test")
# for blip2-t5, ref: https://github.com/huggingface/transformers/issues/25491
if "t5" in model_name:
check_call("pip uninstall -y apex",
shell=True,
env=llm_venv._new_env)
        # Thresholds are set based on HF correctness over 20 iterations
threshold_map = {
'blip2-opt-2.7b': 35,
'blip2-flan-t5-xl': 55,
'llava-1.5-7b-hf': 65,
'llava-v1.6-mistral-7b-hf': 65,
'llava-onevision-qwen2-7b-ov-hf': 80,
'VILA1.5-3b': 75, # from local TRT-LLM run
'fuyu-8b': 70,
'kosmos-2': 60,
'Phi-3-vision-128k-instruct': 75,
'Phi-3.5-vision-instruct': 85,
'Llama-3.2-11B-Vision': 60, # The expected score is 62
'Llama-3.2-11B-Vision-Instruct': 75, # The expected score is 77
'Qwen2-VL-7B-Instruct': 80,
}
if model_name not in threshold_map:
print(f"Skip {model_name} evaluation test.")
return
# TODO: Delete these lines after resolving the issues
# For llava - input tokens are not parsed correctly with '<image>\n'
# For llava_next - correctness lower than HF, and needs lower transformer version built
# For Phi-3 - correctness lower than HF
# For qwen_vl - runtime issue with eval.py -- need to unify prompt generation logics
# For internvl - not added to the test
if llava_model or llava_next_model or phi_model or qwen2_vl_model:
return
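        # mllama is scored on the AI2D task; all other models use a 2000-sample VQAv2 validation subset.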
eval_task = "lmms-lab/ai2d" if mllama_model else "lmms-lab/VQAv2"
eval_cmd = [
f"{multimodal_example_root}/eval.py",
f"--model_type={vision_model_type}",
f"--engine_dir={llm_engine_dir}/{llm_engine_subdir}",
f"--hf_model_dir={hf_model_dir}", "--enable_context_fmha_fp32_acc",
"--test_trtllm",
f"--accuracy_threshold={threshold_map[model_name]}",
f"--eval_task={eval_task}"
]
if mllama_model:
eval_cmd.extend([
f"--dataset_dir={llm_datasets_root}/lmms-lab___ai2d/",
"--cross_kv_cache_fraction=0.5", "--max_ite=100"
])
else:
eval_cmd.extend([
f"--dataset_dir={llm_datasets_root}/lmms-lab__VQAv2_valid_2000samples/"
])
if phi_model:
eval_cmd.extend(["--kv_cache_free_gpu_memory_fraction", "0.4"])
elif enc_dec_model:
eval_cmd.extend(["--cross_kv_cache_fraction", "0.5"])
print(f"Run {model_name} evaluation test...")
_call_run_cmd(llm_venv, llm_root, eval_cmd, world_size)
@pytest.mark.parametrize("num_beams", [1, 4],
ids=lambda num_beams: f'nb:{num_beams}')
@pytest.mark.parametrize('cpp_e2e', [False, True],
ids=lambda cpp_e2e: f'cpp_e2e:{cpp_e2e}')
@pytest.mark.parametrize("batch_size", [1, 8],
ids=lambda batch_size: f'bs:{batch_size}')
@pytest.mark.parametrize(
"data_type",
['float16', 'bfloat16', 'int4_weight_only', 'int8_weight_only'])
@pytest.mark.parametrize("tp_size", [1, 2], ids=lambda tp_size: f'tp:{tp_size}')
@pytest.mark.parametrize("pp_size", [1, 2], ids=lambda pp_size: f'pp:{pp_size}')
@pytest.mark.parametrize("multimodal_model_root", [
'blip2-opt-2.7b',
'blip2-flan-t5-xl',
'llava-1.5-7b-hf',
'llava-v1.6-mistral-7b-hf',
'llava-v1.6-mistral-7b-hf-vision-trtllm',
'llava-onevision-qwen2-7b-ov-hf',
'llava-onevision-qwen2-7b-ov-hf-video',
'nougat-base',
'VILA1.5-3b',
'cogvlm-chat',
'fuyu-8b',
'deplot',
'neva-22b',
'kosmos-2',
'video-neva',
'Phi-3-vision-128k-instruct',
'Phi-3.5-vision-instruct',
'Llama-3.2-11B-Vision',
'Qwen2-VL-7B-Instruct',
'internlm-xcomposer2-vl-7b',
],
indirect=True)
def test_llm_multimodal_general(llm_venv, llm_root, llm_datasets_root,
cmodel_dir, engine_dir, batch_size, data_type,
tp_size, pp_size, multimodal_example_root,
multimodal_model_root, recover_transformers,
cpp_e2e, num_beams):
_test_llm_multimodal_general(llm_venv,
llm_root,
llm_datasets_root,
cmodel_dir,
engine_dir,
batch_size,
data_type,
tp_size,
pp_size,
multimodal_example_root,
multimodal_model_root,
recover_transformers,
cpp_e2e=cpp_e2e,
num_beams=num_beams)


@skip_pre_ada
@pytest.mark.parametrize('cpp_e2e', [False, True],
ids=lambda cpp_e2e: f'cpp_e2e:{cpp_e2e}')
@pytest.mark.parametrize("batch_size", [1, 8],
ids=lambda batch_size: f'bs:{batch_size}')
@pytest.mark.parametrize("data_type", ['float16', 'bfloat16'])
@pytest.mark.parametrize("tp_size", [1, 2], ids=lambda tp_size: f'tp:{tp_size}')
@pytest.mark.parametrize("pp_size", [1, 2], ids=lambda pp_size: f'pp:{pp_size}')
@pytest.mark.parametrize("multimodal_model_root", [
'blip2-opt-2.7b',
'blip2-flan-t5-xl',
'llava-1.5-7b-hf',
'llava-v1.6-mistral-7b-hf',
'llava-onevision-qwen2-7b-ov-hf',
'llava-onevision-qwen2-7b-ov-hf-video',
'nougat-base',
'VILA1.5-3b',
'cogvlm-chat',
'fuyu-8b',
'deplot',
'neva-22b',
'kosmos-2',
'video-neva',
'Phi-3-vision-128k-instruct',
'Phi-3.5-vision-instruct',
'Llama-3.2-11B-Vision-Instruct',
'Llama-3.2-11B-Vision',
'Qwen2-VL-7B-Instruct',
],
indirect=True)
@pytest.mark.parametrize('calibration_dataset', ['scienceqa', 'cnn_dailymail'])
@pytest.mark.parametrize('qformat', ['fp8'])
@pytest.mark.parametrize('kv_cache_dtype', ['fp8'])
def test_llm_fp8_multimodal_general(
llm_venv, llm_root, llm_datasets_root, cmodel_dir, engine_dir,
batch_size, data_type, tp_size, pp_size, multimodal_example_root,
multimodal_model_root, recover_transformers, calibration_dataset,
qformat, kv_cache_dtype, cpp_e2e):
_test_llm_multimodal_general(llm_venv,
llm_root,
llm_datasets_root,
cmodel_dir,
engine_dir,
batch_size,
data_type,
tp_size,
pp_size,
multimodal_example_root,
multimodal_model_root,
recover_transformers,
calibration_dataset,
qformat,
kv_cache_dtype,
cpp_e2e=cpp_e2e)