From 92d90fa29a14d0dff6bed491190e49f60931b5ba Mon Sep 17 00:00:00 2001
From: Jhao-Ting Chen <jhaotingc@nvidia.com>
Date: Tue, 23 Dec 2025 09:41:31 -0800
Subject: [PATCH] [None][feat] Expose enable_trt_overlap in triton_backend,
 bringing 1.05x OTPS (#10018)

Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
---
 .../defs/triton_server/test_triton_llm.py          |  4 ++--
 .../test_lists/qa/llm_triton_integration.txt       |  1 +
 tests/integration/test_lists/test-db/l0_a30.yml    |  1 +
 .../inflight_batcher_llm/tensorrt_llm/config.pbtxt | 14 +++++++-------
 .../src/model_instance_state.cc                    |  5 +++--
 5 files changed, 14 insertions(+), 11 deletions(-)

diff --git a/tests/integration/defs/triton_server/test_triton_llm.py b/tests/integration/defs/triton_server/test_triton_llm.py
index e0af741acd..e1886624c5 100644
--- a/tests/integration/defs/triton_server/test_triton_llm.py
+++ b/tests/integration/defs/triton_server/test_triton_llm.py
@@ -38,8 +38,8 @@ def stop_triton_server():
 @pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
                          ["max_utilization", "guaranteed_no_evict"])
 @pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
-@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
-                         ids=["disableTrtOverlap"])
+@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False", "True"],
+                         ids=["disableTrtOverlap", "enableTrtOverlap"])
 @pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
 @pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
                          ids=["enableDecoupleMode", "disableDecoupleMode"])
diff --git a/tests/integration/test_lists/qa/llm_triton_integration.txt b/tests/integration/test_lists/qa/llm_triton_integration.txt
index 28e8005540..5af902a715 100644
--- a/tests/integration/test_lists/qa/llm_triton_integration.txt
+++ b/tests/integration/test_lists/qa/llm_triton_integration.txt
@@ -14,6 +14,7 @@ triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---Fals
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-True-tensorrt_llm_bls]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
+triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-enableTrtOverlap--guaranteed_no_evict---1-1-1-True-ensemble]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-ensemble]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
 triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
diff --git a/tests/integration/test_lists/test-db/l0_a30.yml b/tests/integration/test_lists/test-db/l0_a30.yml
index b63ea04b5f..1a6b95fbb6 100644
--- a/tests/integration/test_lists/test-db/l0_a30.yml
+++ b/tests/integration/test_lists/test-db/l0_a30.yml
@@ -218,6 +218,7 @@ l0_a30:
   - triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
   - triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
+  - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-enableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
   - triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
   - triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
diff --git a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
index f5f6cb41a4..eb9c527e6d 100644
--- a/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
+++ b/triton_backend/all_models/inflight_batcher_llm/tensorrt_llm/config.pbtxt
@@ -686,13 +686,13 @@ parameters: {
     string_value: "${kv_cache_onboard_blocks}"
   }
 }
-# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
-# parameters: {
-#   key: "enable_trt_overlap"
-#   value: {
-#     string_value: "${enable_trt_overlap}"
-#   }
-# }
+# enable_trt_overlap is an experimental feature used with CUDA Graph Mode.
+parameters: {
+  key: "enable_trt_overlap"
+  value: {
+    string_value: "${enable_trt_overlap}"
+  }
+}
 parameters: {
   key: "exclude_input_in_output"
   value: {
diff --git a/triton_backend/inflight_batcher_llm/src/model_instance_state.cc b/triton_backend/inflight_batcher_llm/src/model_instance_state.cc
index 82ee70bc99..0df022436f 100644
--- a/triton_backend/inflight_batcher_llm/src/model_instance_state.cc
+++ b/triton_backend/inflight_batcher_llm/src/model_instance_state.cc
@@ -491,10 +491,10 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams()
             + std::to_string(requestStatsMaxIterations));
     }
 
+    bool enableTrtOverlap = false;
     try
     {
-        model_state_->GetParameter<bool>("enable_trt_overlap");
-        TLLM_LOG_WARNING("enable_trt_overlap is deprecated and will be ignored");
+        enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
     }
     catch (std::exception const& e)
     {
@@ -698,6 +698,7 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams()
         maxQueueSize, extendedRuntimePerfKnobConfig,
         /*DebugConfig*/ std::nullopt, recvPollPeriodMs};
     execConfig.setSpecDecConfig(specDecConfig);
+    execConfig.setEnableTrtOverlap(enableTrtOverlap);
     execConfig.setCacheTransceiverConfig(tle::CacheTransceiverConfig(tle::CacheTransceiverConfig::BackendType::MPI));
     if (guidedConfig.has_value())
     {