diff --git a/tests/integration/defs/accuracy/test_llm_api.py b/tests/integration/defs/accuracy/test_llm_api.py
index e019572ada..76bb0497ed 100644
--- a/tests/integration/defs/accuracy/test_llm_api.py
+++ b/tests/integration/defs/accuracy/test_llm_api.py
@@ -512,6 +512,7 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness):
         task.evaluate(llm)
 
 
+@pytest.mark.skip_device_not_contain(["A100", "H100"])
 class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "bigcode/starcoder2-7b"
     MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
diff --git a/tests/integration/defs/examples/test_llama.py b/tests/integration/defs/examples/test_llama.py
index 4839f1f647..a68aaf7dd8 100644
--- a/tests/integration/defs/examples/test_llama.py
+++ b/tests/integration/defs/examples/test_llama.py
@@ -3203,6 +3203,7 @@ def test_llm_llama_v3_2_smoothquant_1node_single_gpu(
 
 
 @pytest.mark.timeout(7200)
+@pytest.mark.skip_device_not_contain(["A100", "H100"])
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(4)
 @skip_post_blackwell_ultra
diff --git a/tests/integration/test_lists/qa/llm_function_nim.txt b/tests/integration/test_lists/qa/llm_function_nim.txt
index 6df3dc1484..cc8cbef8c3 100644
--- a/tests/integration/test_lists/qa/llm_function_nim.txt
+++ b/tests/integration/test_lists/qa/llm_function_nim.txt
@@ -1,9 +1,12 @@
-# TRT Backend Tests
-examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
+# TRT Backend Tests (Llama 3.1/3.3 70B + StarCoder2-7B only)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
-examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
 examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
+accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
+accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
+
 # serve tests
 examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options]
 examples/serve/test_serve.py::test_config_file_loading[--config]
@@ -24,24 +27,6 @@ examples/serve/test_serve_negative.py::test_malformed_json_request
 examples/serve/test_serve_negative.py::test_missing_content_type_header
 examples/serve/test_serve_negative.py::test_extremely_large_batch
 
-# Accuracy test list
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
-accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype
-accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
-accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
-accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
-
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
-accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
 # PyTorch Backend Tests
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
@@ -225,9 +210,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
@@ -294,6 +276,7 @@ llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_tensorrt
 llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_default
 
 # keep test cases associated open bugs
+examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
 examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8]
 examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq]
diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
index b5e54ceb35..ea9ec30eb4 100644
--- a/tests/integration/test_lists/waives.txt
+++ b/tests/integration/test_lists/waives.txt
@@ -316,8 +316,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[t
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697)
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
@@ -372,7 +370,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_aut
 disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
-accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5707145)
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True] SKIP (https://nvbugs/5707145)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
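The functional change in the Python files above is gating `TestStarCoder2_7B` and the 4-GPU Llama test behind the repo's custom `skip_device_not_contain` marker. For reviewers unfamiliar with that marker, the following is a minimal sketch of how such a marker is typically honored from a `conftest.py` hook; it is an assumption for illustration, not the repo's actual implementation, and in particular the `torch.cuda.get_device_name` lookup stands in for however the real harness resolves the device name.

```python
# conftest.py (hypothetical sketch of a skip_device_not_contain marker)
import pytest
import torch


def pytest_configure(config):
    # Register the marker so pytest does not warn about an unknown mark.
    config.addinivalue_line(
        "markers",
        "skip_device_not_contain(keywords): skip unless the GPU name "
        "contains one of the given keywords",
    )


def pytest_runtest_setup(item):
    marker = item.get_closest_marker("skip_device_not_contain")
    if marker is None:
        return
    keywords = marker.args[0]  # e.g. ["A100", "H100"]
    # Assumption: matching on the GPU's marketing name is sufficient,
    # e.g. "NVIDIA A100-SXM4-80GB" contains "A100".
    device_name = torch.cuda.get_device_name(0)
    if not any(keyword in device_name for keyword in keywords):
        pytest.skip(f"device '{device_name}' matches none of {keywords}")
```

Under that reading, the two tests marked in this diff run only on A100/H100 parts, which lines up with the test-list changes trimming the NIM TRT-backend coverage to Llama 3.1/3.3 70B and StarCoder2-7B.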