Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-02-03 09:41:30 +08:00)
tests: add test_chunked_prefill for llama4 (#5549)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
This commit is contained in:
parent d97419805b
commit 6268a60ab3
@@ -413,6 +413,23 @@ class TestLlama4MaverickInstruct(LlmapiAccuracyTestHarness):
        task = GSM8K(self.MODEL_NAME)
        task.evaluate(llm)

    @skip_pre_blackwell
    @pytest.mark.skip_less_device(8)
    @parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
    def test_chunked_prefill(self, attn_backend):
        pytorch_config = dict(attn_backend=attn_backend,
                              disable_overlap_scheduler=True)
        with LLM(self.MODEL_PATH,
                 tensor_parallel_size=8,
                 pipeline_parallel_size=1,
                 moe_expert_parallel_size=1,
                 max_seq_len=8192,
                 enable_chunked_prefill=True,
                 max_num_tokens=256,
                 **pytorch_config) as llm:
            task = MMLU(self.MODEL_NAME)
            task.evaluate(llm)


class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
    MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
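The new test drives chunked prefill through the LLM API: with enable_chunked_prefill=True, prompt prefill is split into pieces bounded by max_num_tokens (256 here), so a prompt up to max_seq_len=8192 is processed over several scheduler iterations instead of a single pass, and MMLU accuracy is then checked once per attention backend. A minimal standalone sketch of the same knobs, assuming the public tensorrt_llm.LLM entry point; the checkpoint path, prompt, and sampling settings below are illustrative placeholders, not taken from this commit:

# Sketch of the chunked-prefill configuration exercised by the test above,
# assuming the public tensorrt_llm LLM API. Checkpoint path and prompt are
# placeholders, not from the commit.
from tensorrt_llm import LLM, SamplingParams

llm = LLM(
    "meta-llama/Llama-4-Maverick-17B-128E-Instruct",  # placeholder checkpoint
    tensor_parallel_size=8,
    max_seq_len=8192,
    enable_chunked_prefill=True,  # prefill long prompts in chunks
    max_num_tokens=256,           # per-iteration token budget, i.e. chunk size
)
outputs = llm.generate(["Explain chunked prefill in one sentence."],
                       SamplingParams(max_tokens=64))
print(outputs[0].outputs[0].text)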
@@ -383,6 +383,8 @@ accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_pp2
accuracy/test_llm_api.py::TestLlama3_2_1B::test_fp8_rowwise
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_2_3B::test_fp8_prequantized
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_cli_flow.py::TestMistral7B::test_beam_search
accuracy/test_cli_flow.py::TestMistral7B::test_fp8_tp4pp2
accuracy/test_cli_flow.py::TestMistral7B::test_smooth_quant_tp4pp1
@@ -435,6 +437,8 @@ accuracy/test_llm_api.py::TestMixtral8x7B::test_tp2
accuracy/test_llm_api.py::TestMixtral8x7B::test_smooth_quant_tp2pp2
accuracy/test_llm_api.py::TestMixtral8x7BInstruct::test_awq_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
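The bracketed suffixes on these entries are pytest parametrize IDs produced by the parametrize_with_ids helper used in the test code above. As a rough sketch (the helper's real implementation lives in the repo and may differ), plain pytest can generate the same name=value node IDs like this:

import pytest

# Hypothetical stand-in for the repo's parametrize_with_ids helper: it names
# each case "param=value", so node IDs match the list entries, e.g.
# test_chunked_prefill[attn_backend=TRTLLM].
def parametrize_with_ids(name, values):
    return pytest.mark.parametrize(name, values,
                                   ids=[f"{name}={v}" for v in values])

@parametrize_with_ids("attn_backend", ["TRTLLM", "FLASHINFER"])
def test_chunked_prefill(attn_backend):
    assert attn_backend in ("TRTLLM", "FLASHINFER")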
@@ -445,13 +449,13 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_4gpus[llguidance]
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4
accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_fp8_prequantized_tp4
accuracy/test_cli_flow.py::TestLlama3_3_70BInstruct::test_nvfp4_prequantized_tp4
accuracy/test_llm_api_pytorch.py::TestMistral7B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
@@ -2,6 +2,8 @@ accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama4ScoutInstruct::test_auto_dtype[True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestBielik11BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
@@ -18,6 +20,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUT
accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_eagle3
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_llm_sampler
accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_beam_search
@@ -35,9 +38,15 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=FLASHINFER]
accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_chunked_prefill[attn_backend=TRTLLM]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
accuracy/test_llm_api_pytorch.py::TestMinitron4BBaseInstruct::test_fp8_prequantized