diff --git a/tests/integration/defs/perf/test_perf.py b/tests/integration/defs/perf/test_perf.py
index 90931b2da3..a0db924269 100644
--- a/tests/integration/defs/perf/test_perf.py
+++ b/tests/integration/defs/perf/test_perf.py
@@ -69,6 +69,8 @@ MODEL_PATH_DICT = {
         "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1",
     "llama_v3.3_nemotron_super_49b_fp8":
         "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8",
+    "llama_v3.3_nemotron_super_49b_v1.5_fp8":
+        "nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8",
     "llama_v3.1_nemotron_ultra_253b":
         "nemotron-nas/Llama-3_1-Nemotron-Ultra-253B-v1",
     "llama_v3.1_nemotron_ultra_253b_fp8":
@@ -90,11 +92,16 @@ MODEL_PATH_DICT = {
         "modelopt-hf-model-hub/Mixtral-8x7B-Instruct-v0.1-fp4",
     "mistral_nemo_12b_base": "Mistral-Nemo-Base-2407",
     "deepseek_r1_distill_qwen_32b": "DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B",
+    "deepseek_r1_distill_llama_70b":
+        "DeepSeek-R1/DeepSeek-R1-Distill-Llama-70B/",
     "mixtral_8x22b_v0.1": "Mixtral-8x22B-v0.1",
     "mistral_7b_v0.1": "mistral-7b-v0.1",
     "ministral_8b": "Ministral-8B-Instruct-2410",
     "ministral_8b_fp8": "Ministral-8B-Instruct-2410-FP8",
     "gemma_3_1b_it": "gemma/gemma-3-1b-it",
+    "gemma_3_27b_it": "gemma/gemma-3-27b-it",
+    "gemma_3_27b_it_fp8": "gemma/gemma-3-27b-it-fp8",
+    "gemma_3_27b_it_fp4": "gemma/gemma-3-27b-it-FP4",
     "deepseek_r1_fp8": "DeepSeek-R1/DeepSeek-R1",
     "deepseek_r1_nvfp4": "DeepSeek-R1/DeepSeek-R1-FP4",
     "deepseek_r1_0528_fp8": "DeepSeek-R1/DeepSeek-R1-0528/",
@@ -106,8 +113,21 @@ MODEL_PATH_DICT = {
     "qwen_14b_chat": "Qwen-14B-Chat",
     "qwen3_0.6b": "Qwen3/Qwen3-0.6B",
     "qwen3_4b_eagle3": "Qwen3/Qwen3-4B",
+    "qwen3_8b": "Qwen3/Qwen3-8B",
+    "qwen3_8b_fp8": "Qwen3/nvidia-Qwen3-8B-FP8",
+    "qwen3_8b_fp4": "Qwen3/nvidia-Qwen3-8B-NVFP4",
+    "qwen3_14b": "Qwen3/Qwen3-14B",
+    "qwen3_14b_fp8": "Qwen3/nvidia-Qwen3-14B-FP8",
+    "qwen3_14b_fp4": "Qwen3/nvidia-Qwen3-14B-NVFP4",
+    "qwen3_30b_a3b": "Qwen3/Qwen3-30B-A3B",
+    "qwen3_30b_a3b_fp4": "Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf",
+    "qwen3_32b": "Qwen3/Qwen3-32B",
+    "qwen3_32b_fp4": "Qwen3/nvidia-Qwen3-32B-NVFP4",
     "qwen3_235b_a22b_fp8": "Qwen3/saved_models_Qwen3-235B-A22B_fp8_hf",
     "qwen3_235b_a22b_fp4": "Qwen3/saved_models_Qwen3-235B-A22B_nvfp4_hf",
+    "qwen2_5_vl_7b_instruct": "multimodals/Qwen2.5-VL-7B-Instruct",
+    "qwen2_5_vl_7b_instruct_fp8": "multimodals/Qwen2.5-VL-7B-Instruct-FP8",
+    "qwen2_5_vl_7b_instruct_fp4": "multimodals/Qwen2.5-VL-7B-Instruct-FP4",
     "starcoder2_3b": "starcoder2-3b",
     "starcoder2_7b": "starcoder2-7b",
     "starcoder2_15b": "starcoder2-15b",
@@ -126,9 +146,14 @@ MODEL_PATH_DICT = {
     "gpt_20b": "gpt-neox-20b",
     "gpt_350m_moe": "gpt2-medium",
     "phi_4_mini_instruct": "Phi-4-mini-instruct",
+    "phi_4_reasoning_plus": "Phi-4-reasoning-plus",
+    "phi_4_reasoning_plus_fp8": "nvidia-Phi-4-reasoning-plus-FP8",
+    "phi_4_reasoning_plus_fp4": "nvidia-Phi-4-reasoning-plus-NVFP4",
     "phi_4_multimodal_instruct": "multimodals/Phi-4-multimodal-instruct",
     "phi_4_multimodal_instruct_image": "multimodals/Phi-4-multimodal-instruct",
     "phi_4_multimodal_instruct_audio": "multimodals/Phi-4-multimodal-instruct",
+    "phi_4_multimodal_instruct_fp4":
+        "multimodals/Phi-4-multimodal-instruct-FP4",
     "phi_4_multimodal_instruct_fp4_image":
         "multimodals/Phi-4-multimodal-instruct-FP4",
     "phi_4_multimodal_instruct_fp4_audio":
@@ -137,12 +162,15 @@ MODEL_PATH_DICT = {
         "multimodals/Phi-4-multimodal-instruct-FP8",
     "phi_4_multimodal_instruct_fp8_audio":
         "multimodals/Phi-4-multimodal-instruct-FP8",
+    "phi_4_multimodal_instruct_fp8":
+        "multimodals/Phi-4-multimodal-instruct-FP8",
     "bielik_11b_v2.2_instruct": "Bielik-11B-v2.2-Instruct",
     "bielik_11b_v2.2_instruct_fp8": "Bielik-11B-v2.2-Instruct-FP8",
     "mistral_small_v3.1_24b": "Mistral-Small-3.1-24B-Instruct-2503",
     "gpt_oss_120b_fp4": "gpt_oss/gpt-oss-120b",
     "gpt_oss_20b_fp4": "gpt_oss/gpt-oss-20b",
-    "nemotron_nano_9b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
+    "nemotron_nano_12b_v2": "NVIDIA-Nemotron-Nano-12B-v2",
+    "nvidia_nemotron_nano_9b_v2_nvfp4": "NVIDIA-Nemotron-Nano-9B-v2-NVFP4",
     "starcoder2_7b": "starcoder2-7b",
     "kimi_k2_nvfp4": "Kimi-K2-Thinking-NVFP4",
 }
diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py
index a5acfabb43..b27eb86da4 100644
--- a/tests/integration/defs/test_e2e.py
+++ b/tests/integration/defs/test_e2e.py
@@ -1902,11 +1902,47 @@ def test_ptp_quickstart(llm_root, llm_venv):
         marks=skip_pre_blackwell),
     pytest.param(
         'GPT-OSS-120B', 'gpt_oss/gpt-oss-120b', marks=skip_pre_blackwell),
+    ("Llama3.1-8B-bf16-instruct", "llama-3.1-model/Llama-3.1-8B-Instruct"),
+    pytest.param('Llama3.1-8B-FP4',
+                 'modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4',
+                 marks=skip_pre_blackwell),
+    pytest.param(
+        'Qwen3-8b-fp8', 'Qwen3/nvidia-Qwen3-8B-FP8', marks=skip_pre_hopper),
+    pytest.param('Qwen3-8b-nvfp4',
+                 'Qwen3/nvidia-Qwen3-8B-NVFP4',
+                 marks=skip_pre_blackwell),
+    ("Qwen3-8B-bf16", "Qwen3/Qwen3-8B"),
+    pytest.param(
+        'Qwen3-14b-fp8', 'Qwen3/nvidia-Qwen3-14B-FP8', marks=skip_pre_hopper),
+    pytest.param('Qwen3-14b-nvfp4',
+                 'Qwen3/nvidia-Qwen3-14B-NVFP4',
+                 marks=skip_pre_blackwell),
+    ("Qwen3-14B-bf16", "Qwen3/Qwen3-14B"),
+    pytest.param('Qwen3-32b-nvfp4',
+                 'Qwen3/nvidia-Qwen3-32B-NVFP4',
+                 marks=skip_pre_blackwell),
+    ("Qwen3-32B-bf16", "Qwen3/Qwen3-32B"),
+    pytest.param('Phi4-Reasoning-Plus-fp8',
+                 'nvidia-Phi-4-reasoning-plus-FP8',
+                 marks=skip_pre_hopper),
+    pytest.param('Phi4-Reasoning-Plus-nvfp4',
+                 'nvidia-Phi-4-reasoning-plus-NVFP4',
+                 marks=skip_pre_blackwell),
+    ("Phi-4-reasoning-plus-bf16", "Phi-4-reasoning-plus"),
+    pytest.param('Nemotron-Super-49B-v1.5-FP8',
+                 'nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8',
+                 marks=skip_pre_hopper),
+    pytest.param('Llama-4-Scout-17B-16E-FP4',
+                 'llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4',
+                 marks=skip_pre_blackwell),
+    pytest.param('Nemotron-Nano-9B-v2-nvfp4',
+                 'NVIDIA-Nemotron-Nano-9B-v2-NVFP4',
+                 marks=skip_pre_blackwell),
 ])
 def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
     print(f"Testing {model_name}.")
     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
-    if model_name == "Nemotron-H-8B":
+    if model_name in ("Nemotron-H-8B", "Nemotron-Nano-9B-v2-nvfp4"):
         llm_venv.run_cmd([
             str(example_root / "quickstart_advanced.py"),
             "--disable_kv_cache_reuse",
@@ -1934,7 +1970,7 @@ def test_ptp_quickstart_advanced(llm_root, llm_venv, model_name, model_path):
         ]
         if "Qwen3" in model_name:
             cmds.append(f"--kv_cache_fraction=0.6")
-        if "Llama3.1-70B" in model_name:
+        if "Llama3.1-70B" in model_name or "Llama3.3-70B" in model_name:
             cmds.append(f"--max_num_tokens=1024")
         llm_venv.run_cmd(cmds, stdout=running_log)
     if model_name in mapping:
@@ -2053,11 +2089,22 @@ def test_ptp_quickstart_advanced_deepseek_multi_nodes(llm_root, llm_venv,
 @pytest.mark.parametrize("model_name,model_path,eagle_model_path", [
     ("Llama-3.1-8b-Instruct", "llama-3.1-model/Llama-3.1-8B-Instruct",
      "EAGLE3-LLaMA3.1-Instruct-8B"),
+    pytest.param('GPT-OSS-120B-Eagle3',
+                 'gpt_oss/gpt-oss-120b',
+                 'gpt_oss/gpt-oss-120b-Eagle3',
+                 marks=skip_pre_blackwell),
 ])
 def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
                                         model_path, eagle_model_path):
     print(f"Testing {model_name}.")
     example_root = Path(os.path.join(llm_root, "examples", "llm-api"))
+
+    # Set expected memory based on model size
+    if "GPT-OSS-120B" in model_name:
+        expected_mem = [106.71, 0, 0, 0]  # Memory for 120B model with Eagle3
+    else:
+        expected_mem = [25.2, 0, 0, 0]  # Memory for Llama-3.1-8B with Eagle3
+
     with tempfile.NamedTemporaryFile(mode='w+t',
                                      suffix=f".{model_name}.log",
                                      dir="./",
@@ -2077,7 +2124,7 @@ def test_ptp_quickstart_advanced_eagle3(llm_root, llm_venv, model_name,
             "--disable_overlap_scheduler",
         ],
                          stdout=running_log)
-        _check_mem_usage(running_log, [25.2, 0, 0, 0])
+        _check_mem_usage(running_log, expected_mem)
 
 
 @pytest.mark.parametrize("model_name,model_path,eagle_model_path", [
diff --git a/tests/integration/test_lists/qa/llm_digits_core.txt b/tests/integration/test_lists/qa/llm_digits_core.txt
new file mode 100644
index 0000000000..2da9bbb00d
--- /dev/null
+++ b/tests/integration/test_lists/qa/llm_digits_core.txt
@@ -0,0 +1,39 @@
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP4-modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-bf16-instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
+test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b]
+test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-fp8-Qwen3/nvidia-Qwen3-8B-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-nvfp4-Qwen3/nvidia-Qwen3-8B-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8B-bf16-Qwen3/Qwen3-8B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-fp8-Qwen3/nvidia-Qwen3-14B-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-nvfp4-Qwen3/nvidia-Qwen3-14B-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14B-bf16-Qwen3/Qwen3-14B]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio]
+test_e2e.py::test_ptp_quickstart_advanced[Phi4-Reasoning-Plus-fp8-nvidia-Phi-4-reasoning-plus-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Phi4-Reasoning-Plus-nvfp4-nvidia-Phi-4-reasoning-plus-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Phi-4-reasoning-plus-bf16-Phi-4-reasoning-plus]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-32B-bf16-Qwen3/Qwen3-32B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-32b-nvfp4-Qwen3/nvidia-Qwen3-32B-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Nano-9B-v2-nvfp4-NVIDIA-Nemotron-Nano-9B-v2-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1.5-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP8-modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP4-modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4]
+
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
diff --git a/tests/integration/test_lists/qa/llm_digits_func.txt b/tests/integration/test_lists/qa/llm_digits_func.txt
index 30e3f22384..05a2e5e1b7 100644
--- a/tests/integration/test_lists/qa/llm_digits_func.txt
+++ b/tests/integration/test_lists/qa/llm_digits_func.txt
@@ -1,25 +1,44 @@
-test_e2e.py::test_ptp_quickstart
-test_e2e.py::test_ptp_quickstart_advanced_mixed_precision #Llama-3_1-8B-Instruct_fp8_nvfp4_hf
-test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
-test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]
-test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
+test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b]
+test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-bf16-instruct-llama-3.1-model/Llama-3.1-8B-Instruct]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
-test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8]
-test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-NVFP4-nvfp4-quantized/Mixtral-8x7B-Instruct-v0.1]
-test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-FP8-Mixtral-8x7B-Instruct-v0.1-fp8]
-test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP4-modelopt-hf-model-hub/Llama-3.1-8B-Instruct-fp4]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-fp8-Qwen3/nvidia-Qwen3-8B-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8b-nvfp4-Qwen3/nvidia-Qwen3-8B-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-8B-bf16-Qwen3/Qwen3-8B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-fp8-Qwen3/nvidia-Qwen3-14B-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14b-nvfp4-Qwen3/nvidia-Qwen3-14B-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-14B-bf16-Qwen3/Qwen3-14B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-32B-bf16-Qwen3/Qwen3-32B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-32b-nvfp4-Qwen3/nvidia-Qwen3-32B-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
+test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf]
+test_e2e.py::test_ptp_quickstart_advanced[Phi4-Reasoning-Plus-fp8-nvidia-Phi-4-reasoning-plus-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Phi4-Reasoning-Plus-nvfp4-nvidia-Phi-4-reasoning-plus-NVFP4]
+test_e2e.py::test_ptp_quickstart_advanced[Phi-4-reasoning-plus-bf16-Phi-4-reasoning-plus]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP8-modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp8]
 test_e2e.py::test_ptp_quickstart_advanced[Llama3.3-70B-FP4-modelopt-hf-model-hub/Llama-3.3-70B-Instruct-fp4]
-test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-BF16-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1]
-test_e2e.py::test_ptp_quickstart_advanced[Mixtral-8x7B-BF16-Mixtral-8x7B-Instruct-v0.1]
-test_e2e.py::test_ptp_quickstart_advanced[Mistral-Nemo-12b-Base-Mistral-Nemo-Base-2407]
+test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-70B-FP8-llama-3.1-model/Llama-3.1-70B-Instruct-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1.5-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1_5-FP8]
+test_e2e.py::test_ptp_quickstart_advanced[Llama-4-Scout-17B-16E-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4]
 test_e2e.py::test_ptp_quickstart_advanced[DeepSeek-R1-Distill-Qwen-32B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-32B]
+test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Nano-9B-v2-nvfp4-NVIDIA-Nemotron-Nano-9B-v2-NVFP4]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-audio]
+test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio]
+test_e2e.py::test_ptp_quickstart_advanced_eagle3[GPT-OSS-120B-Eagle3-gpt_oss/gpt-oss-120b-gpt_oss/gpt-oss-120b-Eagle3]
 
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=False-overlap_scheduler=False]
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=False-eagle3_one_model=True-overlap_scheduler=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3[sampler_async_worker=True-eagle3_one_model=True-overlap_scheduler=True]
-accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_ngram
-
-accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
-accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
+accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False]
+accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=True]
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
+accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
diff --git a/tests/integration/test_lists/qa/llm_digits_perf.txt b/tests/integration/test_lists/qa/llm_digits_perf.txt
deleted file mode 100644
index a216f04c30..0000000000
--- a/tests/integration/test_lists/qa/llm_digits_perf.txt
+++ /dev/null
@@ -1,28 +0,0 @@
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-streaming-bfloat16-input_output_len:512,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-bfloat16-input_output_len:512,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-float8-input_output_len:128,2048]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128]
-perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-float4-input_output_len:128,2048]
-
-perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b-bench-pytorch-bfloat16-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_fp8-bench-pytorch-float8-input_output_len:512,128]
-
-perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-float8-input_output_len:512,128]
-perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:128,128]
-perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-float4-input_output_len:512,128]
-
-perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct-bench-pytorch-float16-input_output_len:128,128]
-perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp8-bench-pytorch-float8-input_output_len:128,128]
-perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128]
-perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:128,128]
-perf/test_perf.py::test_perf[mixtral_8x7b_v0.1_instruct_fp4-bench-pytorch-float4-input_output_len:512,128]
-
-perf/test_perf.py::test_perf[mistral_nemo_12b_base-bench-pytorch-float16-input_output_len:128,128]
-perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-float16-input_output_len:128,128]
diff --git a/tests/integration/test_lists/qa/llm_digits_perf.yml b/tests/integration/test_lists/qa/llm_digits_perf.yml
new file mode 100644
index 0000000000..7dfd4594fe
--- /dev/null
+++ b/tests/integration/test_lists/qa/llm_digits_perf.yml
@@ -0,0 +1,47 @@
+llm_digits_perf:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 1
+        lte: 1
+    wildcards:
+      gpu:
+      - '*gb10*'
+      linux_distribution_name: ubuntu*
+      cpu: aarch64
+    terms:
+      backend: pytorch
+  tests:
+  - perf/test_perf.py::test_perf[gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[gpt_oss_120b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[nvidia_nemotron_nano_9b_v2_nvfp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_8b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_8b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_8b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_14b_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_14b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_14b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.3_70b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_30b_a3b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_30b_a3b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v4_scout_17b_16e_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[llama_v3.3_nemotron_super_49b_v1.5_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[phi_4_reasoning_plus_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[phi_4_reasoning_plus-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_32b_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen3_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[deepseek_r1_distill_qwen_32b-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[phi_4_multimodal_instruct-bench-pytorch-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp4-bench-pytorch-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[phi_4_multimodal_instruct_fp8-bench-pytorch-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[qwen2_5_vl_7b_instruct_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[gemma_3_27b_it-bench-pytorch-streaming-bfloat16-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[gemma_3_27b_it_fp8-bench-pytorch-streaming-float8-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
+  - perf/test_perf.py::test_perf[gemma_3_27b_it_fp4-bench-pytorch-streaming-float4-maxbs:1-input_output_len:2048,128-reqs:1-con:1]
diff --git a/tests/integration/test_lists/qa/llm_perf_sanity.yml b/tests/integration/test_lists/qa/llm_perf_sanity.yml
index 258341983b..a45be4bcad 100644
--- a/tests/integration/test_lists/qa/llm_perf_sanity.yml
+++ b/tests/integration/test_lists/qa/llm_perf_sanity.yml
@@ -38,7 +38,7 @@ llm_perf_sanity:
   # Ministral-8B
   - perf/test_perf.py::test_perf[ministral_8b-bench-pytorch-bfloat16-input_output_len:500,2000-reqs:500-con:250]
   - perf/test_perf.py::test_perf[phi_4_mini_instruct-bench-pytorch-bfloat16-input_output_len:500,2000]
-  - perf/test_perf.py::test_perf[nemotron_nano_9b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
+  - perf/test_perf.py::test_perf[nemotron_nano_12b_v2-bench-pytorch-bfloat16-input_output_len:512,512]
  - perf/test_perf.py::test_perf[qwen3_4b_eagle3-bench-pytorch-streaming-bfloat16-maxbs:4-kv_frac:0.6-input_output_len:500,100-reqs:200-con:4]
   - perf/test_perf.py::test_perf[llama_v3.1_8b-bench-pytorch-bfloat16-maxbs:256-input_output_len:128,128-gpus:2]
   - perf/test_perf.py::test_perf[llama_v3.1_8b_instruct-bench-pytorch-streaming-bfloat16-input_output_len:128,128-gpus:2]
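
Note on the bracketed IDs in llm_digits_core.txt and llm_digits_func.txt: pytest derives a parametrized case's ID by joining the parameter values with "-", so every (model_name, model_path) pair added to the parametrize lists above must appear verbatim in the .txt lists in that joined form. A minimal standalone sketch, not part of the patch, illustrating the ID scheme:

# sketch_ids.py -- illustrates pytest's default ID generation for string params
import pytest

@pytest.mark.parametrize("model_name,model_path", [
    ("Qwen3-8b-fp8", "Qwen3/nvidia-Qwen3-8B-FP8"),
])
def test_ptp_quickstart_advanced(model_name, model_path):
    # Collected as:
    #   test_ptp_quickstart_advanced[Qwen3-8b-fp8-Qwen3/nvidia-Qwen3-8B-FP8]
    # which is exactly the string the QA test lists must reference.
    assert model_path.startswith("Qwen3/")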
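
The perf IDs in llm_digits_perf.yml encode their benchmark options inline: "maxbs:1 ... reqs:1 ... con:1" pins a single-request, concurrency-1 streaming run with 2048 input / 128 output tokens, i.e. a single-user latency measurement. The helper below is invented for illustration (it is not part of the test harness) and simply splits an ID into its flags and key:value options:

# sketch_perf_id.py -- hypothetical decoder for the perf test ID format
def decode_perf_id(test_id: str) -> dict:
    # Segments are "-"-separated; "key:value" segments become options,
    # everything else (model key, "bench", backend, dtype, "streaming")
    # is collected as a flag.
    opts = {"flags": []}
    for part in test_id.split("-"):
        if ":" in part:
            key, value = part.split(":", 1)
            opts[key] = value
        else:
            opts["flags"].append(part)
    return opts

# decode_perf_id("gpt_oss_20b_fp4-bench-pytorch-streaming-float4-maxbs:1-"
#                "input_output_len:2048,128-reqs:1-con:1")
# -> {'flags': ['gpt_oss_20b_fp4', 'bench', 'pytorch', 'streaming', 'float4'],
#     'maxbs': '1', 'input_output_len': '2048,128', 'reqs': '1', 'con': '1'}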
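
On the expected_mem change in test_ptp_quickstart_advanced_eagle3: the real _check_mem_usage lives in the test harness and is not shown in this diff. The sketch below is purely illustrative of the assumed contract -- the first entry of expected_mem acts as a per-run GPU-memory ceiling in GiB (25.2 for Llama-3.1-8B, 106.71 for GPT-OSS-120B) and zero entries are placeholders; the parsing logic and tolerance here are invented for the sketch:

# sketch_mem_check.py -- assumed semantics only, not the repo implementation
import re

MEM_TOL_GIB = 2.0  # invented tolerance for the sketch

def parse_peak_gib(log_text: str) -> float:
    # Take the largest "<number> GiB" figure mentioned anywhere in the log.
    peaks = [float(m) for m in re.findall(r"([0-9]+(?:\.[0-9]+)?)\s*GiB", log_text)]
    return max(peaks, default=0.0)

def check_mem_usage_sketch(log_text: str, expected_mem: list) -> None:
    ceiling = expected_mem[0]
    peak = parse_peak_gib(log_text)
    assert peak <= ceiling + MEM_TOL_GIB, (
        f"peak memory {peak:.2f} GiB exceeded ceiling {ceiling:.2f} GiB")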