mirror of https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-02-06 11:11:36 +08:00
[TRTLLM-7070][feat] add gpt-oss chunked prefill tests (#7779)
Signed-off-by: Xin He (SW-GPU) <200704525+xinhe-nv@users.noreply.github.com>
parent ab26d21620
commit 9c1b75e978
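For context, the new test drives chunked prefill through the LLM API. Below is a minimal standalone sketch of the same configuration; the import paths and checkpoint path are assumptions (the test module's import block is not part of this diff), so treat it as illustrative rather than as the commit's code.

# Sketch only, not part of the commit: mirrors the knobs test_w4_chunked_prefill uses.
# Import locations and the checkpoint path below are assumed, not taken from the diff.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import CudaGraphConfig, KvCacheConfig, MoeConfig

llm = LLM(
    "/path/to/gpt-oss-mxfp4",                  # hypothetical checkpoint path
    tensor_parallel_size=4,
    kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6, dtype="auto"),
    max_seq_len=8192,
    max_num_tokens=512,                        # per-iteration token budget; caps each prefill chunk
    enable_chunked_prefill=True,               # long prompts are prefilled in chunks of max_num_tokens
    moe_config=MoeConfig(backend="CUTLASS"),
    cuda_graph_config=CudaGraphConfig(),
    disable_overlap_scheduler=True,
)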
@@ -3261,6 +3261,42 @@ class TestGPTOSS(LlmapiAccuracyTestHarness):
            task.evaluate(llm,
                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)

    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"])
    @pytest.mark.parametrize(
        "moe_backend",
        ["CUTLASS",
         pytest.param("TRTLLM", marks=skip_pre_blackwell), "TRITON"],
        ids=["cutlass", "trtllm", "triton"])
    def test_w4_chunked_prefill(self, kv_cache_dtype, moe_backend, mocker):
        if moe_backend == "TRITON":
            if not IS_TRITON_KERNELS_AVAILABLE:
                pytest.skip("Triton kernels are not available")

        pytorch_config = dict(disable_overlap_scheduler=True,
                              cuda_graph_config=CudaGraphConfig())
        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
                                        dtype=kv_cache_dtype)

        model_name = "GPT-OSS/MXFP4"
        with LLM(self.MODEL_PATH,
                 tensor_parallel_size=4,
                 pipeline_parallel_size=1,
                 moe_expert_parallel_size=1,
                 kv_cache_config=kv_cache_config,
                 max_seq_len=8192,
                 max_num_tokens=512,
                 enable_chunked_prefill=True,
                 enable_attention_dp=False,
                 moe_config=MoeConfig(backend=moe_backend),
                 **pytorch_config) as llm:
            mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
            mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
                              {"scores_filter": "exact_match,flexible-extract"})
            task = GSM8K(model_name)
            task.evaluate(llm,
                          extra_evaluator_kwargs=self.extra_evaluator_kwargs)


class TestEXAONE4(LlmapiAccuracyTestHarness):
    MODEL_NAME = "LGAI-EXAONE/EXAONE-4.0-32B"
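The parametrization above combines the moe_backend id with the kv_cache_dtype value into node IDs such as cutlass-auto and trtllm-fp8, which is how the entries added to the test lists below are spelled. A sketch of selecting one case programmatically (the path comes from those list entries and is assumed to be relative to the tests directory):

# Sketch only: run a single parametrized case of the new test via pytest.
import pytest

pytest.main([
    "accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]",
    "-v",
])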
@@ -547,6 +547,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[False]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_auto_dtype[True]
accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ngram
@@ -83,6 +83,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestMistralSmall24B::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKanana_Instruct::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
@@ -153,6 +153,10 @@ accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[cutlass-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[triton-auto]
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_chunked_prefill[trtllm-fp8]
accuracy/test_llm_api_pytorch.py::TestLlama3_3NemotronSuper49Bv1::test_fp8_prequantized_tp2
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_auto_dtype
accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized
@@ -56,8 +56,20 @@ def test_trtllm_serve_benchmark(server: RemoteOpenAIServer, benchmark_root: str,
     client_script = os.path.join(benchmark_root, "benchmark_serving.py")
     dataset = dataset_path("sharegpt")
     benchmark_cmd = [
-        "python3", client_script, "--dataset-name", "sharegpt", "--model",
-        model_name, "--dataset-path", dataset, "--tokenizer", model_path
+        "python3",
+        client_script,
+        "--dataset-name",
+        "sharegpt",
+        "--model",
+        model_name,
+        "--dataset-path",
+        dataset,
+        "--tokenizer",
+        model_path,
+        "--temperature",
+        "1.0",
+        "--top-p",
+        "1.0",
     ]

     # CalledProcessError will be raised if any errors occur
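For reference, benchmark_cmd now holds one argument per element and adds the --temperature and --top-p flags. The execution helper sits outside this hunk; the subprocess call below is an assumption consistent with the CalledProcessError comment, with placeholder values standing in for the test's model_name, dataset, and model_path fixtures.

# Sketch only: roughly how benchmark_cmd would be executed; values are placeholders.
import subprocess

benchmark_cmd = [
    "python3", "benchmark_serving.py",
    "--dataset-name", "sharegpt",
    "--model", "my-model",               # placeholder for the model_name fixture
    "--dataset-path", "sharegpt.json",   # placeholder for the dataset path
    "--tokenizer", "/path/to/tokenizer", # placeholder for model_path
    "--temperature", "1.0",
    "--top-p", "1.0",
]
subprocess.run(benchmark_cmd, check=True)  # raises CalledProcessError on failure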