From 92d90fa29a14d0dff6bed491190e49f60931b5ba Mon Sep 17 00:00:00 2001 From: Jhao-Ting Chen Date: Tue, 23 Dec 2025 09:41:31 -0800 Subject: [PATCH] [None][feat] Expose enable_trt_overlap in Triton_backend brings 1.05x OTPS (#10018) Signed-off-by: Jhao-Ting Chen --- .../defs/triton_server/test_triton_llm.py | 4 ++-- .../test_lists/qa/llm_triton_integration.txt | 1 + tests/integration/test_lists/test-db/l0_a30.yml | 1 + .../inflight_batcher_llm/tensorrt_llm/config.pbtxt | 14 +++++++------- .../src/model_instance_state.cc | 5 +++-- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/tests/integration/defs/triton_server/test_triton_llm.py b/tests/integration/defs/triton_server/test_triton_llm.py index e0af741acd..e1886624c5 100644 --- a/tests/integration/defs/triton_server/test_triton_llm.py +++ b/tests/integration/defs/triton_server/test_triton_llm.py @@ -38,8 +38,8 @@ def stop_triton_server(): @pytest.mark.parametrize("BATCH_SCHEDULER_POLICY", ["max_utilization", "guaranteed_no_evict"]) @pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""]) -@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"], - ids=["disableTrtOverlap"]) +@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False", "True"], + ids=["disableTrtOverlap", "enableTrtOverlap"]) @pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"]) @pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"], ids=["enableDecoupleMode", "disableDecoupleMode"]) diff --git a/tests/integration/test_lists/qa/llm_triton_integration.txt b/tests/integration/test_lists/qa/llm_triton_integration.txt index 28e8005540..5af902a715 100644 --- a/tests/integration/test_lists/qa/llm_triton_integration.txt +++ b/tests/integration/test_lists/qa/llm_triton_integration.txt @@ -14,6 +14,7 @@ triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---Fals 
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-True-tensorrt_llm_bls] triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble] triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls] +triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-enableTrtOverlap--guaranteed_no_evict---1-1-1-True-ensemble] triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-ensemble] triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls] triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml index b63ea04b5f..1a6b95fbb6 100644 --- a/tests/integration/test_lists/test-db/l0_a30.yml +++ b/tests/integration/test_lists/test-db/l0_a30.yml @@ -218,6 +218,7 @@ l0_a30: - triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e] - 
triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls] - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls] + - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-enableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls] - triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] - triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble] - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls] diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt index f5f6cb41a4..eb9c527e6d 100644 --- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt +++ b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt @@ -686,13 +686,13 @@ parameters: { string_value: "${kv_cache_onboard_blocks}" } } -# enable_trt_overlap is deprecated and doesn't have any effect on the runtime -# parameters: { -# key: "enable_trt_overlap" -# value: { -# string_value: "${enable_trt_overlap}" -# } -# } +# enable_trt_overlap is an experimental feature used with CUDA Graph Mode. 
+parameters: { + key: "enable_trt_overlap" + value: { + string_value: "${enable_trt_overlap}" + } +} parameters: { key: "exclude_input_in_output" value: { diff --git a/triton_backend/inflight_batcher_llm/src/model_instance_state.cc b/triton_backend/inflight_batcher_llm/src/model_instance_state.cc index 82ee70bc99..0df022436f 100644 --- a/triton_backend/inflight_batcher_llm/src/model_instance_state.cc +++ b/triton_backend/inflight_batcher_llm/src/model_instance_state.cc @@ -491,10 +491,10 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams() + std::to_string(requestStatsMaxIterations)); } + bool enableTrtOverlap = false; try { - model_state_->GetParameter<bool>("enable_trt_overlap"); - TLLM_LOG_WARNING("enable_trt_overlap is deprecated and will be ignored"); + enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap"); } catch (std::exception const& e) { @@ -698,6 +698,7 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams() maxQueueSize, extendedRuntimePerfKnobConfig, /*DebugConfig*/ std::nullopt, recvPollPeriodMs}; execConfig.setSpecDecConfig(specDecConfig); + execConfig.setEnableTrtOverlap(enableTrtOverlap); execConfig.setCacheTransceiverConfig(tle::CacheTransceiverConfig(tle::CacheTransceiverConfig::BackendType::MPI)); if (guidedConfig.has_value()) {