diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml
index c612e0182c..b6e8cb1321 100644
--- a/tests/integration/defs/accuracy/references/mmmu.yaml
+++ b/tests/integration/defs/accuracy/references/mmmu.yaml
@@ -29,3 +29,5 @@ mistral/Mistral-Large-3-675B:
   - accuracy: 47
 Qwen/Qwen3-VL-8B-Instruct:
   - accuracy: 55.11
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 57.0
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
index 6c657f9521..6d53fa892a 100644
--- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
+++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py
@@ -345,3 +345,28 @@ class TestQwen3VL(LlmapiAccuracyTestHarness):
         ) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestMistralSmall24B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503"
+    MAX_NUM_TOKENS = 16384
+
+    # NOTE: MMMU adds <|endoftext|> to the stop tokens.
+    sampling_params = SamplingParams(
+        max_tokens=MMMU.MAX_OUTPUT_LEN,
+        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+        stop="<|endoftext|>",
+    )
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        with LLM(
+                self.MODEL_PATH,
+                kv_cache_config=kv_cache_config,
+                enable_chunked_prefill=True,
+                max_num_tokens=self.MAX_NUM_TOKENS,
+        ) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index b55eeb8359..a5acfabb43 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -2536,9 +2536,6 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
 @pytest.mark.parametrize("use_cuda_graph", [False, True])
 @pytest.mark.parametrize("modality", ["image", "video", "mixture_text_image"])
 @pytest.mark.parametrize("model_name,model_path", [
-    pytest.param("mistral-small-3.1-24b-instruct",
-                 "Mistral-Small-3.1-24B-Instruct-2503",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param(
         "Nano-v2-VLM",
         "Nano-v2-VLM",
@@ -2588,21 +2585,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         }
     }
 
-    expected_keywords = {
-        "mistral-small-3.1-24b-instruct": {
-            "image": [
-                ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "monolith", "formation"],
-                [
-                    "multi-lane", "highway", "moderate", "traffic", "flow",
-                    "vehicles", "congestion"
-                ],
-            ],
-            "mixture_text_image":
-            [["invention", "person", "scientists", "Lick", "engineers"],
-             ["landscape", "trees", "road", "depicts", "scenic"]]
-        },
-    }
+    # TODO: remove this entire test if there are no plans to extend it for Nano v2 VL.
+    expected_keywords = {}
+
+    if modality not in expected_keywords.get(model_name, {}):
+        pytest.skip(f"{modality=} not supported for {model_name}")
 
     cmd = [
         str(example_root / "quickstart_multimodal.py"),
@@ -2620,19 +2607,13 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
     if use_cuda_graph:
         cmd.append("--use_cuda_graph")
 
-    output = llm_venv.run_cmd(cmd, caller=check_output)
+    _ = llm_venv.run_cmd(cmd, caller=check_output)
 
-    match_ratio = 4.0 / 5
-    parsed_outputs = parse_output(output)
-    for prompt_output, prompt_keywords in zip(
-            parsed_outputs, expected_keywords[model_name][modality]):
-        matches = [
-            keyword in prompt_output.lower() for keyword in prompt_keywords
-        ]
-        obs_match_ratio = 1. * sum(matches) / len(matches)
-        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
-
-    print("All answers are correct!")
+    # NOTE: we deliberately do not check the LLM outputs against keyword-matching ratios as in
+    # the other tests, since keyword matching can be brittle and cause flakiness in CI.
+    # This test is now a smoke / functional test.
+    # Proper accuracy tests should be added to
+    # `tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py`.
 
 
 @pytest.mark.parametrize("modality", ["image", "video"])
diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt
index d7aedd6be4..3ced7b395f 100644
--- a/tests/integration/test_lists/qa/llm_function_core.txt
+++ b/tests/integration/test_lists/qa/llm_function_core.txt
@@ -741,9 +741,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
index 58a4436a70..3d2f6fde86 100644
--- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt
+++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt
@@ -280,9 +280,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
 test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8]
 test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index cc8cbef8c3..0b26975d26 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -258,9 +258,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]
diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml
index fd5cda5779..adae47f626 100644
--- a/tests/integration/test_lists/test-db/l0_h100.yml
+++ b/tests/integration/test_lists/test-db/l0_h100.yml
@@ -72,6 +72,7 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
   - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
   - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
+  - accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -284,8 +285,6 @@ l0_h100:
   - accuracy/test_llm_api_pytorch.py::TestStarcoder2_3B::test_auto_dtype
  - accuracy/test_llm_api_pytorch.py::TestStarcoder2_7B::test_auto_dtype
  - accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
- - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
- - test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
  - test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
  - test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
  - examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index 2f24805aea..dcef1eabea 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -317,8 +317,6 @@ examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-f
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
@@ -357,7 +355,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
 accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193)
 accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193)