Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-14 06:27:45 +08:00)
[None][infra] update feature_combination_matrix of disaggregated and Eagle3 (#6945)
Signed-off-by: leslie-fang25 <leslief@nvidia.com>
parent d6322f70b7 · commit ce0b13ea02
@@ -8,8 +8,8 @@
 | Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
 | Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | |
 | MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | |
-| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | |
-| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Yes | No | No | --- | | | | | | |
+| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | |
+| EAGLE-3(Two Model Engine) | NO | Yes | Yes | Yes | Yes | No | No | --- | | | | | | |
 | Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
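The hunk above flips the Disaggregated Serving column for both EAGLE-3 rows from "No" to "Yes". A minimal sketch of that combination follows; only EagleDecodingConfig and its fields are taken from the test change in this commit, while the LLM entry point and its speculative_config parameter are assumptions about the LLM API, not something this diff confirms:

# Hypothetical sketch: EAGLE-3 speculative decoding configured the way the
# updated matrix now permits under disaggregated serving.
from tensorrt_llm import LLM  # assumed import path for the LLM API
from tensorrt_llm.llmapi import EagleDecodingConfig

spec_config = EagleDecodingConfig(
    speculative_model_dir="EAGLE3-LLaMA3.1-Instruct-8B",  # draft-model checkpoint
    eagle3_one_model=True,  # the "One Model Engine" row of the matrix
    max_draft_len=3)        # same draft length the test below uses

# Each disaggregated worker (context and generation) would be given the same
# speculative config; the worker topology itself is outside this sketch.
llm = LLM(model="Llama-3.1-8B-Instruct", speculative_config=spec_config)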
@@ -349,13 +349,15 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
 @pytest.mark.parametrize("model", ["Llama-3.1-8B-Instruct"])
 @pytest.mark.parametrize("spec_dec_model_path", ["EAGLE3-LLaMA3.1-Instruct-8B"])
 @pytest.mark.parametrize("generation_overlap", [False])
+@pytest.mark.parametrize("eagle3_one_model", [True, False])
 def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
-                                                 generation_overlap):
+                                                 generation_overlap,
+                                                 eagle3_one_model):
     # Test whether the batch slots are properly released when using speculative decoding
     # with disaggregated serving.
     spec_dec_config = EagleDecodingConfig(
         speculative_model_dir=model_path(spec_dec_model_path),
-        eagle3_one_model=False,
+        eagle3_one_model=eagle3_one_model,
         max_draft_len=3)
 
     worker_pytorch_configs = []
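To connect this hunk to the test-list update in the next one: stacked @pytest.mark.parametrize decorators are applied bottom-up, so the new eagle3_one_model value becomes the leading component of each generated test ID. A self-contained sketch (with a hypothetical test name) of that expansion:

import pytest

@pytest.mark.parametrize("generation_overlap", [False])
@pytest.mark.parametrize("eagle3_one_model", [True, False])  # closest to the function, so first in the ID
def test_expansion_sketch(generation_overlap, eagle3_one_model):
    # Collected as: test_expansion_sketch[True-False]
    #          and: test_expansion_sketch[False-False]
    assert generation_overlap is False

This is why the single entry ending in [False-...] below is replaced by two entries ending in [True-False-...] and [False-False-...].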
||||
@@ -85,7 +85,8 @@ l0_h100:
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8]
-  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[True-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
   - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
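Either new parametrization can also be run on its own by passing the full node ID to pytest. A sketch, with the repository-relative path assumed (the YAML above lists entries relative to the integration-test root):

import pytest

# Assumed file location within the repository; the node ID itself is taken
# verbatim from the test list above.
pytest.main([
    "tests/integration/defs/disaggregated/test_disaggregated_single_gpu.py::"
    "test_disaggregated_spec_dec_batch_slot_limit"
    "[True-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]"
])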
||||