[https://nvbugs/5669097][tests] Add MMMU test for mistral small (#10530)

Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
William Zhang 2026-01-09 16:09:28 -08:00 committed by GitHub
parent 38f249b479
commit ff7eb93f31
8 changed files with 39 additions and 44 deletions

View File

@@ -29,3 +29,5 @@ mistral/Mistral-Large-3-675B:
   - accuracy: 47
 Qwen/Qwen3-VL-8B-Instruct:
   - accuracy: 55.11
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 57.0
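For context on how an entry like this is consumed: each model name in the reference file maps to a minimum expected score that the accuracy harness checks the evaluated MMMU result against. The sketch below illustrates that lookup under stated assumptions: "mmmu.yaml" is a placeholder path, and the real harness's lookup and tolerance handling are not reproduced here.

import yaml  # PyYAML

# Minimal sketch (not harness code): read a reference entry such as the one
# added above and compare an evaluated score against it.
with open("mmmu.yaml") as f:  # placeholder path
    references = yaml.safe_load(f)

# Each model maps to a list of reference dicts, e.g. [{"accuracy": 57.0}].
threshold = references["mistralai/Mistral-Small-3.1-24B-Instruct-2503"][0]["accuracy"]
evaluated = 58.2  # stand-in for the score the MMMU evaluation reports
assert evaluated >= threshold, f"MMMU score {evaluated} below reference {threshold}"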

View File

@@ -345,3 +345,28 @@ class TestQwen3VL(LlmapiAccuracyTestHarness):
         ) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestMistralSmall24B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503"
+    MAX_NUM_TOKENS = 16384
+
+    # NOTE: MMMU adds <|endoftext|> as a stop token.
+    sampling_params = SamplingParams(
+        max_tokens=MMMU.MAX_OUTPUT_LEN,
+        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+        stop="<|endoftext|>",
+    )
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        with LLM(
+                self.MODEL_PATH,
+                kv_cache_config=kv_cache_config,
+                enable_chunked_prefill=True,
+                max_num_tokens=self.MAX_NUM_TOKENS,
+        ) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
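To make the test configuration concrete, here is a standalone sketch of the same LLM-API settings exercised outside the pytest harness. It is illustrative only, not part of the commit: the model path is a placeholder, the prompt is arbitrary, and the MMMU token limits are replaced by a small constant.

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheConfig

# Mirror the MMMU stop token noted in the test above.
sampling_params = SamplingParams(max_tokens=128, stop="<|endoftext|>")

llm = LLM(
    "/models/Mistral-Small-3.1-24B-Instruct-2503",  # placeholder local path
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.75),
    enable_chunked_prefill=True,  # prefill long multimodal prompts in chunks
    max_num_tokens=16384,
)
with llm:
    outputs = llm.generate(["What does the MMMU benchmark measure?"],
                           sampling_params)
    print(outputs[0].outputs[0].text)

The 0.75 KV-cache memory fraction (rather than a more aggressive 0.9) presumably leaves headroom for the vision encoder; that rationale is an inference, not something the commit states.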

View File

@@ -2536,9 +2536,6 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
 @pytest.mark.parametrize("use_cuda_graph", [False, True])
 @pytest.mark.parametrize("modality", ["image", "video", "mixture_text_image"])
 @pytest.mark.parametrize("model_name,model_path", [
-    pytest.param("mistral-small-3.1-24b-instruct",
-                 "Mistral-Small-3.1-24B-Instruct-2503",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param(
         "Nano-v2-VLM",
         "Nano-v2-VLM",
@@ -2588,21 +2585,11 @@ test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         }
     }
-    expected_keywords = {
-        "mistral-small-3.1-24b-instruct": {
-            "image": [
-                ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "monolith", "formation"],
-                [
-                    "multi-lane", "highway", "moderate", "traffic", "flow",
-                    "vehicles", "congestion"
-                ],
-            ],
-            "mixture_text_image":
-                [["invention", "person", "scientists", "Lick", "engineers"],
-                 ["landscape", "trees", "road", "depicts", "scenic"]]
-        },
-    }
+    # TODO: remove this entire test if there are no plans to extend it for Nano v2 VL.
+    expected_keywords = {}
     if modality not in expected_keywords[model_name]:
         pytest.skip(f"{modality=} not supported for {model_name}")
     cmd = [
         str(example_root / "quickstart_multimodal.py"),
@@ -2620,19 +2607,13 @@ test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
     if use_cuda_graph:
         cmd.append("--use_cuda_graph")
-    output = llm_venv.run_cmd(cmd, caller=check_output)
-    match_ratio = 4.0 / 5
-    parsed_outputs = parse_output(output)
-    for prompt_output, prompt_keywords in zip(
-            parsed_outputs, expected_keywords[model_name][modality]):
-        matches = [
-            keyword in prompt_output.lower() for keyword in prompt_keywords
-        ]
-        obs_match_ratio = 1. * sum(matches) / len(matches)
-        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
-    print("All answers are correct!")
+    _ = llm_venv.run_cmd(cmd, caller=check_output)
+    # NOTE: We deliberately do not check the LLM outputs with keyword-matching
+    # ratios as in the other tests, since those checks can be brittle and cause
+    # flakiness in CI. This test is now a smoke / functional test. Proper
+    # accuracy tests should be added to
+    # `tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py`.


 @pytest.mark.parametrize("modality", ["image", "video"])
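For reference, the removed verification computed a per-prompt keyword match ratio and asserted it stayed at or above 4/5. The self-contained sketch below (a hypothetical helper, not repository code) illustrates the idea:

# Hypothetical helper mirroring the removed check: the fraction of expected
# keywords found in a generated answer must meet a threshold (4/5 above).
def keyword_match_ratio(output: str, keywords: list[str]) -> float:
    matches = [kw in output.lower() for kw in keywords]
    return sum(matches) / len(matches)

answer = "A dramatic seascape: dark, turbulent waves under a stormy ocean sky."
expected = ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"]
assert keyword_match_ratio(answer, expected) >= 4.0 / 5

An answer that paraphrases two of the six keywords drops the ratio to 4/6 and fails even when it is correct, which is the brittleness the commit's note cites; accuracy checking moves to the MMMU harness instead.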

View File

@@ -741,9 +741,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]

View File

@@ -280,9 +280,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
 test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8]
 test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]

View File

@@ -258,9 +258,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]

View File

@@ -72,6 +72,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
   - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
+  - accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -284,8 +285,6 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_3B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_7B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
-  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-  - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
   - test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
   - test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
   - examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]

View File

@@ -317,8 +317,6 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
@@ -357,7 +355,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
 accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193)
 accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193)