mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-13 22:18:36 +08:00
[None][infra] Enable accuracy test for eagle3 and chunked prefill (#6386)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
This commit is contained in:
parent
4763e94156
commit
a60190836c
@ -8,8 +8,8 @@
|
||||
| Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
|
||||
| Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | |
|
||||
| MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | |
|
||||
| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Untested | No | --- | | | | | | | |
|
||||
| EAGLE-3(Two Model Engine) | No | Yes | Yes | No | Untested | No | No | --- | | | | | | |
|
||||
| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | |
|
||||
| EAGLE-3(Two Model Engine) | No | Yes | Yes | No | Yes | No | No | --- | | | | | | |
|
||||
| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
|
||||
| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
|
||||
| KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
|
||||
|
||||
@ -1955,7 +1955,9 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
|
||||
task = MMLU(self.MODEL_NAME)
|
||||
task.evaluate(llm)
|
||||
|
||||
def test_eagle3(self):
|
||||
@parametrize_with_ids("eagle3_one_model", [True, False])
|
||||
@parametrize_with_ids("enable_chunked_prefill", [False, True])
|
||||
def test_eagle3(self, enable_chunked_prefill, eagle3_one_model):
|
||||
pytorch_config = dict(
|
||||
disable_overlap_scheduler=True,
|
||||
cuda_graph_config=CudaGraphConfig(batch_sizes=[1]),
|
||||
@ -1967,11 +1969,13 @@ class TestQwen3_8B(LlmapiAccuracyTestHarness):
|
||||
|
||||
draft_len = 4
|
||||
spec_config = EagleDecodingConfig(max_draft_len=draft_len,
|
||||
speculative_model_dir=eagle_model_dir)
|
||||
speculative_model_dir=eagle_model_dir,
|
||||
eagle3_one_model=eagle3_one_model)
|
||||
|
||||
llm = LLM(model=target_model_dir,
|
||||
**pytorch_config,
|
||||
kv_cache_config=kv_cache_config,
|
||||
enable_chunked_prefill=enable_chunked_prefill,
|
||||
speculative_config=spec_config,
|
||||
build_config=None)
|
||||
|
||||
|
||||
@ -43,7 +43,10 @@ l0_h100:
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True]
|
||||
- accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=False]
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=0]
|
||||
- accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_cuda_graph_padding[mtp_nextn=2]
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user