[None][chore] remove some model support; add device constraint (#10563)

Signed-off-by: Jie Li <lijie@nvidia.com>
Jie Li authored 2026-01-09 22:36:23 +08:00, committed by GitHub
parent 2b72d33fdc
commit 627d306df9
4 changed files with 9 additions and 27 deletions


@@ -512,6 +512,7 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness):
         task.evaluate(llm)
 
+@pytest.mark.skip_device_not_contain(["A100", "H100"])
 class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "bigcode/starcoder2-7b"
     MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
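Note: skip_device_not_contain is a custom marker from the TensorRT-LLM test harness, not a built-in pytest marker. A minimal sketch of how such a marker can be honored from a conftest.py hook follows; the hook body and the use of torch.cuda.get_device_name are assumptions for illustration, not the repo's actual implementation.

# conftest.py -- hypothetical sketch, not the repo's actual hook
import pytest
import torch

def pytest_collection_modifyitems(config, items):
    # Assumes a CUDA host; returns e.g. "NVIDIA A100-SXM4-80GB"
    device = torch.cuda.get_device_name(0)
    for item in items:
        marker = item.get_closest_marker("skip_device_not_contain")
        if marker is None:
            continue
        keywords = marker.args[0]  # e.g. ["A100", "H100"]
        if not any(kw in device for kw in keywords):
            item.add_marker(pytest.mark.skip(
                reason=f"device {device!r} matches none of {keywords}"))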


@@ -3203,6 +3203,7 @@ def test_llm_llama_v3_2_smoothquant_1node_single_gpu(
 @pytest.mark.timeout(7200)
+@pytest.mark.skip_device_not_contain(["A100", "H100"])
 @pytest.mark.skip_less_device_memory(80000)
 @pytest.mark.skip_less_device(4)
 @skip_post_blackwell_ultra
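The neighboring markers gate on GPU count and per-GPU memory. A sketch of the predicates they imply, assuming the memory threshold (80000) is in MiB, which the diff itself does not state:

# Hypothetical checks behind skip_less_device / skip_less_device_memory
import torch

def has_enough_devices(required: int) -> bool:
    # skip_less_device(4): at least 4 visible GPUs
    return torch.cuda.device_count() >= required

def has_enough_device_memory(required_mib: int) -> bool:
    # skip_less_device_memory(80000): roughly 80 GB per GPU
    total_bytes = torch.cuda.get_device_properties(0).total_memory
    return total_bytes / (1 << 20) >= required_mib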


@@ -1,9 +1,12 @@
-# TRT Backend Tests
-examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
+# TRT Backend Tests (Llama 3.1/3.3 70B + StarCoder2-7B only)
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
 examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
 examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
 examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
+accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
+accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
+accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8
 # serve tests
 examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options]
 examples/serve/test_serve.py::test_config_file_loading[--config]
@@ -24,24 +27,6 @@ examples/serve/test_serve_negative.py::test_malformed_json_request
 examples/serve/test_serve_negative.py::test_missing_content_type_header
 examples/serve/test_serve_negative.py::test_extremely_large_batch
 # Accuracy test list
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin]
-accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
-accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype
-accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
-accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
-accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
-accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
-accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
 # PyTorch Backend Tests
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
 accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
@@ -225,9 +210,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
-accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
@@ -294,6 +276,7 @@ llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_tensorrt
 llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_default
 # keep test cases associated open bugs
+examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
 examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec]
 examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8]
 examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq]
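Each non-comment line in the list file above is a pytest node ID, so a list like this can be fed straight to pytest. A hypothetical runner (the repo's CI may consume these lists differently):

# run_test_list.py -- hypothetical helper, not part of the repo
import sys
import pytest

def main(list_path: str) -> int:
    with open(list_path) as f:
        node_ids = [ln.strip() for ln in f
                    if ln.strip() and not ln.lstrip().startswith("#")]
    return pytest.main(node_ids)

if __name__ == "__main__":
    sys.exit(main(sys.argv[1]))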


@@ -316,8 +316,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[t
 examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
 examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
 accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697)
-accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
@@ -372,7 +370,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_aut
 disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
 accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
-accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5707145)
 accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True] SKIP (https://nvbugs/5707145)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
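Entries in this waiver file follow the pattern "<node id> SKIP (<bug URL>)". A hypothetical parser for that pattern, assuming node IDs contain no whitespace (true for every entry above); the real harness may differ:

# Hypothetical waiver-line parser, for illustration only
import re

WAIVER_RE = re.compile(r"^(?P<node>\S+)\s+SKIP\s+\((?P<reason>[^)]+)\)$")

def parse_waiver(line: str):
    # Returns (node_id, reason), or None for non-waiver lines.
    m = WAIVER_RE.match(line.strip())
    return (m.group("node"), m.group("reason")) if m else None

# e.g. parse_waiver("a/b.py::TestX::test_y SKIP (https://nvbugs/123)")
#      -> ("a/b.py::TestX::test_y", "https://nvbugs/123")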