[None][infra] update feature_combination_matrix of disaggregated and Eagle3 (#6945)

Signed-off-by: leslie-fang25 <leslief@nvidia.com>
Leslie Fang 2025-08-18 09:18:17 +08:00 committed by GitHub
parent d6322f70b7
commit ce0b13ea02
GPG Key ID: B5690EEEBB952194
3 changed files with 8 additions and 5 deletions


@@ -8,8 +8,8 @@
 | Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | |
 | Chunked Prefill | Yes | Yes | Yes | Untested | --- | | | | | | | | | |
 | MTP | Yes | Yes | Yes | Yes | Untested | --- | | | | | | | | |
-| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | --- | | | | | | | |
-| EAGLE-3(Two Model Engine) | NO | Yes | Yes | No | Yes | No | No | --- | | | | | | |
+| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | |
+| EAGLE-3(Two Model Engine) | NO | Yes | Yes | Yes | Yes | No | No | --- | | | | | | |
 | Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | |
 | TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | |
 | KV Cache Reuse | Yes | Yes | Yes | Untested | Yes | Untested | Yes | No | Yes | Yes | --- | | | |
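
This hunk flips the Disaggregated Serving column for both EAGLE-3 rows from "No" to "Yes": EAGLE-3 speculative decoding is now marked as supported alongside disaggregated serving for both the one-model and two-model engine variants. For orientation, here is a minimal sketch of enabling that combination through the LLM API; the EagleDecodingConfig fields mirror the test diff below, while the LLM(...) wiring and model paths are illustrative assumptions rather than part of this commit:

# Hedged sketch: EAGLE-3 speculative decoding config, mirroring the fields
# used in the test below. Model paths and the LLM(...) call are assumptions.
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import EagleDecodingConfig

spec_dec_config = EagleDecodingConfig(
    speculative_model_dir="/models/EAGLE3-LLaMA3.1-Instruct-8B",  # draft model dir
    eagle3_one_model=True,   # fused draft+target engine; False runs two engines
    max_draft_len=3)         # propose up to 3 draft tokens per step

llm = LLM(model="/models/Llama-3.1-8B-Instruct",
          speculative_config=spec_dec_config)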


@@ -349,13 +349,15 @@ def test_disaggregated_llama_context_capacity(model, enable_cuda_graph,
 @pytest.mark.parametrize("model", ["Llama-3.1-8B-Instruct"])
 @pytest.mark.parametrize("spec_dec_model_path", ["EAGLE3-LLaMA3.1-Instruct-8B"])
 @pytest.mark.parametrize("generation_overlap", [False])
+@pytest.mark.parametrize("eagle3_one_model", [True, False])
 def test_disaggregated_spec_dec_batch_slot_limit(model, spec_dec_model_path,
-                                                 generation_overlap):
+                                                 generation_overlap,
+                                                 eagle3_one_model):
     # Test whether the batch slots are properly released when using speculative decoding
     # with disaggregated serving.
     spec_dec_config = EagleDecodingConfig(
         speculative_model_dir=model_path(spec_dec_model_path),
-        eagle3_one_model=False,
+        eagle3_one_model=eagle3_one_model,
         max_draft_len=3)
     worker_pytorch_configs = []

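The new eagle3_one_model parametrization also explains the renamed entries in the test list below: pytest stacks parametrize marks, takes the cross product of their values, and joins the per-parameter ids with "-", with the decorator closest to the function contributing the leading id component. A self-contained sketch of those id mechanics (the demo test name is hypothetical):

# Collecting this file yields the ids [True-False] and [False-False]:
# the bottom-most mark (eagle3_one_model) leads and generation_overlap
# follows, matching the [True-False-...] / [False-False-...] entries below.
import pytest

@pytest.mark.parametrize("generation_overlap", [False])
@pytest.mark.parametrize("eagle3_one_model", [True, False])
def test_id_demo(generation_overlap, eagle3_one_model):
    assert generation_overlap is False  # the only value parametrized above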

@@ -85,7 +85,8 @@ l0_h100:
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0]
   - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_llama_context_capacity[False-False-DeepSeek-V3-Lite-fp8/fp8]
-  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[True-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
+  - disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[False-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]
   - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_iteration_log[PyTorch-non-streaming-meta-llama/Llama-3.1-8B-llama-3.1-model/Meta-Llama-3.1-8B]
   - test_e2e.py::test_trtllm_bench_request_rate_and_concurrency[enable_concurrency-]
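
Each list entry is a full pytest node id, so a single variant can be reproduced directly, e.g. pytest "disaggregated/test_disaggregated_single_gpu.py::test_disaggregated_spec_dec_batch_slot_limit[True-False-EAGLE3-LLaMA3.1-Instruct-8B-Llama-3.1-8B-Instruct]" (the path is relative to the integration test definitions; the exact invocation depends on the repo's test harness).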