[https://nvbugs/5669097][tests] Add MMMU test for mistral small (#10530)

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>

parent 38f249b479
commit ff7eb93f31
@@ -29,3 +29,5 @@ mistral/Mistral-Large-3-675B:
   - accuracy: 47
 Qwen/Qwen3-VL-8B-Instruct:
   - accuracy: 55.11
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 57.0
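The new reference entry follows the file's model-name to list-of-records layout, with the MMMU score for Mistral Small pinned at 57.0. A minimal sketch of reading such an entry, assuming PyYAML is available; REFERENCES and lookup_reference are hypothetical names for illustration, not the accuracy harness's actual API:

    import yaml

    # Same shape as the reference file: each model key maps to a list of
    # records, and "accuracy" holds the reference score for that task.
    REFERENCES = yaml.safe_load("""
    mistralai/Mistral-Small-3.1-24B-Instruct-2503:
      - accuracy: 57.0
    """)

    def lookup_reference(model_name: str) -> float:
        # Take the first record's reference accuracy for the model.
        return REFERENCES[model_name][0]["accuracy"]

    assert lookup_reference("mistralai/Mistral-Small-3.1-24B-Instruct-2503") == 57.0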
@@ -345,3 +345,28 @@ class TestQwen3VL(LlmapiAccuracyTestHarness):
         ) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestMistralSmall24B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503"
+    MAX_NUM_TOKENS = 16384
+
+    # NOTE: MMMU adds <|endoftext|> to the stop token.
+    sampling_params = SamplingParams(
+        max_tokens=MMMU.MAX_OUTPUT_LEN,
+        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+        stop="<|endoftext|>",
+    )
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        with LLM(
+                self.MODEL_PATH,
+                kv_cache_config=kv_cache_config,
+                enable_chunked_prefill=True,
+                max_num_tokens=self.MAX_NUM_TOKENS,
+        ) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
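For a local run, the new test can be selected by its pytest node id, which matches the l0_h100 test-list entry added further below. A minimal sketch, assuming the repository layout referenced elsewhere in this commit (tests/integration/defs) and invocation from the repository root:

    import pytest

    # Selects only the new accuracy test. Note the skip_less_device_memory(80000)
    # marker above: the test expects at least 80 GB of device memory, and the
    # model weights must be available under llm_models_root().
    pytest.main([
        "tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py"
        "::TestMistralSmall24B::test_auto_dtype",
        "-v",
    ])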
@@ -2536,9 +2536,6 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
 @pytest.mark.parametrize("use_cuda_graph", [False, True])
 @pytest.mark.parametrize("modality", ["image", "video", "mixture_text_image"])
 @pytest.mark.parametrize("model_name,model_path", [
-    pytest.param("mistral-small-3.1-24b-instruct",
-                 "Mistral-Small-3.1-24B-Instruct-2503",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param(
         "Nano-v2-VLM",
         "Nano-v2-VLM",
@@ -2588,21 +2585,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         }
     }
 
-    expected_keywords = {
-        "mistral-small-3.1-24b-instruct": {
-            "image": [
-                ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "monolith", "formation"],
-                [
-                    "multi-lane", "highway", "moderate", "traffic", "flow",
-                    "vehicles", "congestion"
-                ],
-            ],
-            "mixture_text_image":
-            [["invention", "person", "scientists", "Lick", "engineers"],
-             ["landscape", "trees", "road", "depicts", "scenic"]]
-        },
-    }
+    # TODO: remove this entire test if there are no plans to extend them for Nano v2 VL.
+    expected_keywords = {}
+
+    if modality not in expected_keywords[model_name]:
+        pytest.skip(f"{modality=} not supported for {model_name}")
 
     cmd = [
         str(example_root / "quickstart_multimodal.py"),
@@ -2620,19 +2607,13 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
     if use_cuda_graph:
         cmd.append("--use_cuda_graph")
 
-    output = llm_venv.run_cmd(cmd, caller=check_output)
+    _ = llm_venv.run_cmd(cmd, caller=check_output)
 
-    match_ratio = 4.0 / 5
-    parsed_outputs = parse_output(output)
-    for prompt_output, prompt_keywords in zip(
-            parsed_outputs, expected_keywords[model_name][modality]):
-        matches = [
-            keyword in prompt_output.lower() for keyword in prompt_keywords
-        ]
-        obs_match_ratio = 1. * sum(matches) / len(matches)
-        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
-
-    print("All answers are correct!")
+    # NOTE: we deliberately do not check the LLM outputs with keyword matching ratios as in the
+    # other tests, as it can be brittle and cause flakiness in CI.
+    # This test now becomes a smoke / functional test.
+    # Proper accuracy tests should be added to
+    # `tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py`.
 
 
 @pytest.mark.parametrize("modality", ["image", "video"])
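For context on the NOTE above: the removed check counted how many expected keywords appear in each generated answer and asserted a minimum match ratio of 4/5. A standalone sketch of that pattern (hypothetical helper, not repo code) shows why small wording drift in model output can flip the assertion, which is the flakiness the commit avoids by moving coverage to the MMMU accuracy test:

    def keywords_match(output: str, keywords: list[str],
                       threshold: float = 4.0 / 5) -> bool:
        # A prompt passes only if enough of its keywords occur verbatim
        # (case-insensitively) in the generated text.
        matches = [keyword in output.lower() for keyword in keywords]
        return sum(matches) / len(matches) >= threshold

    # 5 of 6 keywords present (no "ocean"): 0.83 >= 0.80, so this passes,
    # but one more paraphrased word would drop it below the threshold.
    assert keywords_match(
        "A dramatic seascape with turbulent waves under a dark sky.",
        ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
    )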
@@ -741,9 +741,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
@@ -280,9 +280,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
 test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8]
 test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
@@ -258,9 +258,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]
@@ -72,6 +72,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
   - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
+  - accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -284,8 +285,6 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_3B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_7B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
-  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
   - test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
   - test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
   - examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]
@@ -317,8 +317,6 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
@@ -357,7 +355,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
 accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193)
 accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193)