Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)
[None][chore] remove some model support; add device constraint (#10563)

Signed-off-by: Jie Li <lijie@nvidia.com>

Parent: 2b72d33fdc
Commit: 627d306df9
@@ -512,6 +512,7 @@ class TestEagle2Vicuna_7B_v1_3(LlmapiAccuracyTestHarness):
        task.evaluate(llm)


@pytest.mark.skip_device_not_contain(["A100", "H100"])
class TestStarCoder2_7B(LlmapiAccuracyTestHarness):
    MODEL_NAME = "bigcode/starcoder2-7b"
    MODEL_PATH = f"{llm_models_root()}/starcoder2-7b"
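The `skip_device_not_contain` marker applied above is a repository-specific pytest marker, not a built-in. As a hedged illustration only, a `conftest.py` could honor such a marker roughly as sketched below, assuming the GPU name is read via `torch.cuda.get_device_name()`; the actual hook in the TensorRT-LLM test harness may differ.

```python
# Hypothetical conftest.py sketch -- not the repository's actual implementation.
# Skips any test marked @pytest.mark.skip_device_not_contain([...]) when the
# current GPU name does not contain one of the listed substrings.
import pytest
import torch


def pytest_configure(config):
    config.addinivalue_line(
        "markers",
        "skip_device_not_contain(keywords): skip unless the GPU name contains one of the keywords",
    )


def pytest_collection_modifyitems(config, items):
    device_name = torch.cuda.get_device_name(0) if torch.cuda.is_available() else ""
    for item in items:
        marker = item.get_closest_marker("skip_device_not_contain")
        if marker is None:
            continue
        keywords = marker.args[0]  # e.g. ["A100", "H100"]
        if not any(keyword in device_name for keyword in keywords):
            item.add_marker(
                pytest.mark.skip(
                    reason=f"device '{device_name}' does not contain any of {keywords}"
                )
            )
```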
@@ -3203,6 +3203,7 @@ def test_llm_llama_v3_2_smoothquant_1node_single_gpu(


@pytest.mark.timeout(7200)
@pytest.mark.skip_device_not_contain(["A100", "H100"])
@pytest.mark.skip_less_device_memory(80000)
@pytest.mark.skip_less_device(4)
@skip_post_blackwell_ultra
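`skip_less_device(4)` and `skip_less_device_memory(80000)` are likewise repository-specific markers. The sketch below only illustrates their intent under the assumption that device count and per-device memory (in MiB) are queried through `torch.cuda`; the real marker handling in the harness may be implemented differently.

```python
# Hypothetical helper sketch -- illustrates the intent of the markers only.
from typing import Optional

import pytest
import torch


def should_skip_for_hardware(min_devices: int, min_memory_mib: int) -> Optional[str]:
    """Return a skip reason if the current machine is below the requirements."""
    if not torch.cuda.is_available():
        return "CUDA is not available"
    if torch.cuda.device_count() < min_devices:
        return f"requires at least {min_devices} GPUs"
    total_mib = torch.cuda.get_device_properties(0).total_memory // (1024 * 1024)
    if total_mib < min_memory_mib:
        return f"requires at least {min_memory_mib} MiB of GPU memory"
    return None


# Usage mirroring skip_less_device(4) / skip_less_device_memory(80000) above:
reason = should_skip_for_hardware(min_devices=4, min_memory_mib=80000)


@pytest.mark.skipif(reason is not None, reason=reason or "")
def test_needs_four_80gb_gpus():
    ...
```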
@@ -1,9 +1,12 @@
# TRT Backend Tests
examples/test_llama.py::test_llama_3_x_fp8_with_bf16_lora[llama-3.1-8b]
# TRT Backend Tests (Llama 3.1/3.3 70B + StarCoder2-7B only)
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[enable_gemm_allreduce_plugin-llama-3.1-70b-disable_fp8]
examples/test_llama.py::test_llm_llama_v3_1_1node_multi_gpus[disable_gemm_allreduce_plugin-llama-3.1-70b-enable_fp8]
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
examples/test_llama.py::test_llm_llama_2gpu_fp4[llama-3.1-70b-instruct-fp4_plugin]
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_llm_api.py::TestStarCoder2_7B::test_auto_dtype
accuracy/test_llm_api.py::TestStarCoder2_7B::test_fp8

# serve tests
examples/serve/test_serve.py::test_config_file_loading[--extra_llm_api_options]
examples/serve/test_serve.py::test_config_file_loading[--config]
@@ -24,24 +27,6 @@ examples/serve/test_serve_negative.py::test_malformed_json_request
examples/serve/test_serve_negative.py::test_missing_content_type_header
examples/serve/test_serve_negative.py::test_extremely_large_batch

# Accuracy test list
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[disable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_fp8_rowwise_tp4[enable_gemm_allreduce_plugin]
accuracy/test_cli_flow.py::TestLlama3_1_8B::test_autoq
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_cli_flow.py::TestLlama3_1_8BInstruct::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4

accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[xgrammar]
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_gather_generation_logits_cuda_graph
accuracy/test_llm_api.py::TestLlama3_1_8B::test_fp8_rowwise
accuracy/test_llm_api.py::TestLlama3_1_8BInstruct::test_logprobs
# PyTorch Backend Tests
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_1B::test_fp8_prequantized
@@ -225,9 +210,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_auto_dtype[cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestNemotronH::test_reasoning_fp8_prequantized[cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True]
@@ -294,6 +276,7 @@ llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_tensorrt
llmapi/test_llm_api_qa.py::TestLlmDefaultBackend::test_llm_args_type_default

# keep test cases associated open bugs
examples/test_llama.py::test_llm_llama_1gpu_fp4[llama-3.1-70b-instruct-enable_norm_quant_fusion-enable_fused_quant-fp4_plugin-bfloat16]
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-full_prec]
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-fp8]
examples/test_nemotron.py::test_llm_nemotron_4_15b_2gpus[bfloat16-int4_awq]
@@ -316,8 +316,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[t
examples/test_phi.py::test_llm_phi_quantization_1gpu[Phi-3-small-128k-instruct-fp8-bfloat16] SKIP (https://nvbugs/5465143)
examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-Instruct-2503-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5644684)
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
@@ -372,7 +370,6 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_aut
disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0] SKIP (https://nvbugs/5705199)
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_auto_dtype_tp2 SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2 SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestNemotronH_56B_Base::test_auto_dtype[tp8-cuda_graph=True] SKIP (https://nvbugs/5640697)
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestNemotronUltra::test_fp8_prequantized[tp8-cuda_graph=True] SKIP (https://nvbugs/5707145)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto] SKIP (https://nvbugs/5596343)
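The waive entries above follow a `<pytest node id> SKIP (<bug URL>)` convention. As a hedged illustration only (this is not the project's actual tooling), such a list could be parsed to separate runnable test IDs from waived ones roughly as follows:

```python
# Hypothetical parser sketch for a test-list file in the format shown above.
# Lines are either "<pytest node id>" or "<pytest node id> SKIP (<bug url>)";
# "#" starts a comment and blank lines are ignored.
import re
from pathlib import Path

WAIVE_PATTERN = re.compile(r"^(?P<test_id>\S+)\s+SKIP\s+\((?P<bug>[^)]+)\)\s*$")


def parse_test_list(path: str):
    runnable, waived = [], {}
    for raw_line in Path(path).read_text().splitlines():
        line = raw_line.split("#", 1)[0].strip()  # drop comments
        if not line:
            continue
        match = WAIVE_PATTERN.match(line)
        if match:
            waived[match.group("test_id")] = match.group("bug")
        else:
            runnable.append(line)
    return runnable, waived


if __name__ == "__main__":
    # "l0_test_list.txt" is a hypothetical file name for a list like the one above.
    tests, waives = parse_test_list("l0_test_list.txt")
    print(f"{len(tests)} runnable tests, {len(waives)} waived")
```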