diff --git a/tests/integration/defs/accuracy/references/mmmu.yaml b/tests/integration/defs/accuracy/references/mmmu.yaml index b6e8cb1321..a46c77ec2e 100644 --- a/tests/integration/defs/accuracy/references/mmmu.yaml +++ b/tests/integration/defs/accuracy/references/mmmu.yaml @@ -3,10 +3,17 @@ google/gemma-3-27b-it: - quant_algo: FP8 kv_cache_quant_algo: FP8 accuracy: 50.0 + - quant_algo: NVFP4 + kv_cache_quant_algo: FP8 + accuracy: 48.0 Qwen/Qwen2-VL-7B-Instruct: - accuracy: 48.44 Qwen/Qwen2.5-VL-7B-Instruct: - accuracy: 51.22 + - quant_algo: FP8 + accuracy: 45.44 + - quant_algo: NVFP4 + accuracy: 40.67 nvidia/Nano-v2-VLM: - accuracy: 43.78 llava-hf/llava-v1.6-mistral-7b-hf: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py index 399fa66a4f..ef8b0ad6e8 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py @@ -2,8 +2,9 @@ import pytest from tensorrt_llm import LLM from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig, SamplingParams +from tensorrt_llm.quantization import QuantAlgo -from ..conftest import llm_models_root, skip_post_blackwell, skip_pre_blackwell, skip_pre_hopper +from ..conftest import llm_models_root, skip_pre_blackwell, skip_pre_hopper from .accuracy_core import MMMU, LlmapiAccuracyTestHarness @@ -54,6 +55,30 @@ class TestQwen2_5_VL_7B(LlmapiAccuracyTestHarness): task = MMMU(self.MODEL_NAME) task.evaluate(llm, sampling_params=self.sampling_params) + @skip_pre_hopper + def test_fp8(self): + model_path = f"{llm_models_root()}/multimodals/Qwen2.5-VL-7B-Instruct-FP8" + with LLM( + model_path, + max_num_tokens=self.MAX_NUM_TOKENS, + kv_cache_config=self.kv_cache_config, + ) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8 + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + + @skip_pre_blackwell + def test_nvfp4(self): + model_path = f"{llm_models_root()}/multimodals/Qwen2.5-VL-7B-Instruct-FP4" + with LLM( + model_path, + max_num_tokens=self.MAX_NUM_TOKENS, + kv_cache_config=self.kv_cache_config, + ) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + class TestNano_V2_VLM(LlmapiAccuracyTestHarness): MODEL_NAME = "nvidia/Nano-v2-VLM" @@ -217,7 +242,6 @@ class TestPhi4MMFusedVisionLora(LlmapiAccuracyTestHarness): @skip_pre_hopper -@skip_post_blackwell class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness): MODEL_NAME = "google/gemma-3-27b-it" # Note: This has only the LLM part quantized. Vision part is in bfloat16. @@ -236,17 +260,28 @@ class TestGemma3_27BInstruct(LlmapiAccuracyTestHarness): dtype="fp8", ) - def test_fp8_prequantized(self): + def _make_llm(self, model_path: str): # Gemma3 VLM needs FlashInfer attention backend for custom mask support. - with LLM( - self.MODEL_PATH, + return LLM( + model_path, max_batch_size=16, max_num_tokens=self.MAX_NUM_TOKENS, max_seq_len=8704, # 8192 + 512. 
kv_cache_config=self.kv_cache_config, attn_backend="FLASHINFER", enable_chunked_prefill=False, - ) as llm: + ) + + def test_fp8_prequantized(self): + with self._make_llm(self.MODEL_PATH) as llm: + task = MMMU(self.MODEL_NAME) + task.evaluate(llm, sampling_params=self.sampling_params) + + @skip_pre_blackwell + def test_nvfp4_prequantized(self): + model_path = f"{llm_models_root()}/gemma/gemma-3-27b-it-FP4" + with self._make_llm(model_path) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4 task = MMMU(self.MODEL_NAME) task.evaluate(llm, sampling_params=self.sampling_params) diff --git a/tests/integration/defs/perf/pytorch_model_config.py b/tests/integration/defs/perf/pytorch_model_config.py index baf05df2fa..1b88bc524b 100644 --- a/tests/integration/defs/perf/pytorch_model_config.py +++ b/tests/integration/defs/perf/pytorch_model_config.py @@ -17,6 +17,8 @@ Model pytorch/TRT yaml config for trtllm-bench perf tests """ +from ..conftest import llm_models_root + def recursive_update(d, u): for k, v in u.items(): @@ -295,6 +297,32 @@ def get_model_yaml_config(model_label: str, 'num_postprocess_workers': 4 } }, + # GPT-OSS 120B speculative decoding (Eagle3 draft) + { + 'patterns': [ + 'gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1', + ], + 'config': { + 'enable_attention_dp': False, + 'disable_overlap_scheduler': False, + 'enable_autotuner': False, + 'enable_chunked_prefill': True, + 'cuda_graph_config': { + 'enable_padding': True, + }, + 'speculative_config': { + 'decoding_type': + 'Eagle', + 'max_draft_len': + 5, + 'speculative_model_dir': + f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3", + }, + 'kv_cache_config': { + 'enable_block_reuse': False, + }, + } + }, # Phi-4-multimodal-instruct with chunked prefill and kv_cache_reuse { 'patterns': [ diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py index a4ed2b60de..d70d1c9f40 100644 --- a/tests/integration/defs/perf/test_perf.py +++ b/tests/integration/defs/perf/test_perf.py @@ -169,6 +169,7 @@ MODEL_PATH_DICT = { "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503", "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b", "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b", + "gpt_oss_120b_eagle3": "gpt_oss/gpt-oss-120b-Eagle3", "nemotron_nano_3_30b_fp8": "Nemotron-Nano-3-30B-A3.5B-FP8-KVFP8-dev", "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2", "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4", diff --git a/tests/integration/test_lists/qa/llm_spark_core.txt b/tests/integration/test_lists/qa/llm_spark_core.txt index 2da9bbb00d..54f177b68b 100644 --- a/tests/integration/test_lists/qa/llm_spark_core.txt +++ b/tests/integration/test_lists/qa/llm_spark_core.txt @@ -37,3 +37,5 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutl accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4 accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype + +test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b] diff --git a/tests/integration/test_lists/qa/llm_spark_func.txt b/tests/integration/test_lists/qa/llm_spark_func.txt index 05a2e5e1b7..fade1ddf59 100644 --- a/tests/integration/test_lists/qa/llm_spark_func.txt +++ b/tests/integration/test_lists/qa/llm_spark_func.txt @@ -37,8 +37,38 @@ test_e2e.py::test_ptp_quickstart_advanced_eagle3[GPT-OSS-120B-Eagle3-gpt_oss/gpt 
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_fp8
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_nvfp4
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
+accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_nvfp4_prequantized
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
+
+test_e2e.py::test_trtllm_benchmark_serving[gpt_oss/gpt-oss-20b]
+test_e2e.py::test_openai_health
+test_e2e.py::test_openai_chat_guided_decoding
+test_e2e.py::test_trtllm_multimodal_benchmark_serving
+test_e2e.py::test_openai_completions_example[pytorch]
+test_e2e.py::test_openai_reasoning[pytorch]
+test_e2e.py::test_openai_chat_harmony
+test_e2e.py::test_trtllm_benchmark_serving[llama-3.1-model/Meta-Llama-3.1-8B]
+
+examples/serve/test_serve_negative.py::test_invalid_max_tokens
+examples/serve/test_serve_negative.py::test_invalid_temperature
+examples/serve/test_serve_negative.py::test_invalid_top_p[-0.1]
+examples/serve/test_serve_negative.py::test_invalid_top_p[1.1]
+examples/serve/test_serve_negative.py::test_empty_messages_array
+examples/serve/test_serve_negative.py::test_missing_message_role
+examples/serve/test_serve_negative.py::test_invalid_token_ids
+examples/serve/test_serve_negative.py::test_extremely_large_token_id
+examples/serve/test_serve_negative.py::test_server_stability_under_invalid_requests
+examples/serve/test_serve_negative.py::test_concurrent_invalid_requests
+examples/serve/test_serve_negative.py::test_mixed_valid_invalid_requests
+examples/serve/test_serve_negative.py::test_health_check_during_errors
+examples/serve/test_serve_negative.py::test_request_exceeds_context_length
+examples/serve/test_serve_negative.py::test_malformed_json_request
+examples/serve/test_serve_negative.py::test_missing_content_type_header
+examples/serve/test_serve_negative.py::test_extremely_large_batch
diff --git a/tests/integration/test_lists/qa/llm_spark_perf.yml b/tests/integration/test_lists/qa/llm_spark_perf.yml
index 8447702dff..d9f6755179 100644
--- a/tests/integration/test_lists/qa/llm_spark_perf.yml
+++ b/tests/integration/test_lists/qa/llm_spark_perf.yml
@@ -7,7 +7,10 @@ llm_spark_perf:
      lte: 1
  tests:
  - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  # GPT-OSS 120B normal case (no spec dec)
  - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  # GPT-OSS 120B spec dec case (Eagle3)
+  - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-maxnt:4096-input_output_len:2048,128-reqs:1-con:1]
  - perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
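
For context, the new gpt_oss_120b_fp4 perf entry with maxnt:4096 is driven by the speculative-decoding config dict added to pytorch_model_config.py above. The sketch below is a rough, hand-written rendering of that dict as a standalone extra LLM-API options YAML of the kind trtllm-bench can consume; it is not part of the patch, and the speculative_model_dir value is a placeholder for the gpt-oss-120b-Eagle3 checkpoint under llm_models_root().

# Sketch only, not part of the patch: mirrors the perf-test config dict above.
# speculative_model_dir is a placeholder path.
enable_attention_dp: false
disable_overlap_scheduler: false
enable_autotuner: false
enable_chunked_prefill: true
cuda_graph_config:
  enable_padding: true
speculative_config:
  decoding_type: Eagle
  max_draft_len: 5
  speculative_model_dir: /models/gpt_oss/gpt-oss-120b-Eagle3
kv_cache_config:
  enable_block_reuse: false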