Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][chore] remove some model support; add device constraint (#10563)

Signed-off-by: Jie Li <lijie@nvidia.com>

Parent: 2b72d33fdc
Commit: 627d306df9
@@ -512,6 +512,7 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness):
        task.evaluate(llm)


@pytest.mark.skip_device_not_contain(["A100", "H100"])
class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
    MODEL_NAME = "bigcode/starcoder2-7b"
    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
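The `skip_device_not_contain` marker applied above is a repository-specific pytest marker, not a built-in. As a hedged illustration only, a `conftest.py` could honor such a marker roughly as sketched below, assuming the GPU name is read via `torch.cuda.get_device_name()`; the actual hook in the TensorRT-LLM test harness may differ.

```python
# Hypothetical conftest.py sketch -- not the repository's actual implementation.
# Skips any test marked @pytest.mark.skip_device_not_contain([...]) when the
# current GPU name does not contain one of the listed substrings.
import pytest
import torch


def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "skip_device_not_contain(keywords): skip unless the GPU name contains one of the keywords",
    )


def pytest_collection_modifyitems(config, items):
    device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else ""
    for item in items:
        marker = item.get_closest_marker("skip_device_not_contain")
        if marker is None:
            continue
        keywords = marker.args[0]  # e.g. ["A100", "H100"]
        if not any(keyword in device_name for keyword in keywords):
            item.add_marker(
                pytest.mark.skip(
                    reason=f"device '{device_name}' does not contain any of {keywords}"
                )
            )
```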
@@ -3203,6 +3203,7 @@ def test_llm_llama_v3_2_smoothquant_1node_single_gpu(


@pytest.mark.timeout(7200)
@pytest.mark.skip_device_not_contain(["A100", "H100"])
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@skip_post_blackwell_ultra
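`skip_less_device(4)` and `skip_less_device_memory(80000)` are likewise repository-specific markers. The sketch below only illustrates their intent under the assumption that device count and per-device memory (in MiB) are queried through `torch.cuda`; the real marker handling in the harness may be implemented differently.

```python
# Hypothetical helper sketch -- illustrates the intent of the markers only.
from typing import Optional

import pytest
import torch


def should_skip_for_hardware(min_devices: int, min_memory_mib: int) -> Optional[str]:
    """Return a skip reason if the current machine is below the requirements."""
    if not torch.cuda.is_available():
        return "CUDA is not available"
    if torch.cuda.device_count() < min_devices:
        return f"requires at least {min_devices} GPUs"
    total_mib = torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)
    if total_mib < min_memory_mib:
        return f"requires at least {min_memory_mib} MiB of GPU memory"
    return None


# Usage mirroring skip_less_device(4) / skip_less_device_memory(80000) above:
reason = should_skip_for_hardware(min_devices=4, min_memory_mib=80000)


@pytest.mark.skipif(reason is not None, reason=reason or "")
def test_needs_four_80gb_gpus():
    ...
```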
@@ -1,9 +1,12 @@
# TRT Backend Tests
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
# TRT Backend Tests (Llama 3.1/3.3 70B + StarCoder2-7B only)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8

# serve tests
examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options]
examples/serve/test_serve.py::test_config_file_loading[--config]
@@ -24,24 +27,6 @@ examples/serve/test_serve_negative.py::test_malformed_json_request
examples/serve/test_serve_negative.py::test_missing_content_type_header
examples/serve/test_serve_negative.py::test_extremely_large_batch

# Accuracy test list
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4

accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
# PyTorch Backend Tests
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
@@ -225,9 +210,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
@@ -294,6 +276,7 @@ llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_tensorrt
llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_default

# keep test cases associated open bugs
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec]
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8]
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq]
@@ -316,8 +316,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[t
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
@@ -372,7 +370,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_aut
disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True] SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
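The waive entries above follow a `<pytest node id> SKIP (<bug URL>)` convention. As a hedged illustration only (this is not the project's actual tooling), such a list could be parsed to separate runnable test IDs from waived ones roughly as follows:

```python
# Hypothetical parser sketch for a test-list file in the format shown above.
# Lines are either "<pytest node id>" or "<pytest node id> SKIP (<bug url>)";
# "#" starts a comment and blank lines are ignored.
import re
from pathlib import Path

WAIVE_PATTERN = re.compile(r"^(?P<test_id>\S+)\s+SKIP\s+\((?P<bug>[^)]+)\)\s*$")


def parse_test_list(path: str):
    runnable, waived = [], {}
    for raw_line in Path(path).read_text().splitlines():
        line = raw_line.split("#", 1)[0].strip()  # drop comments
        if not line:
            continue
        match = WAIVE_PATTERN.match(line)
        if match:
            waived[match.group("test_id")] = match.group("bug")
        else:
            runnable.append(line)
    return runnable, waived


if __name__ == "__main__":
    # "l0_test_list.txt" is a hypothetical file name for a list like the one above.
    tests, waives = parse_test_list("l0_test_list.txt")
    print(f"{len(tests)} runnable tests, {len(waives)} waived")
```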