mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[None][feat] Expose enable_trt_overlap in Triton_backend, bringing 1.05x OTPS (#10018)
Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
This commit is contained in:
parent
0027a01ad5
commit
92d90fa29a
@ -38,8 +38,8 @@ def stop_triton_server():
|
|||||||
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
@pytest.mark.parametrize("BATCH_SCHEDULER_POLICY",
|
||||||
["max_utilization", "guaranteed_no_evict"])
|
["max_utilization", "guaranteed_no_evict"])
|
||||||
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
@pytest.mark.parametrize("KV_CACHE_FREE_GPU_MEM_FRACTION", [""])
|
||||||
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False"],
|
@pytest.mark.parametrize("ENABLE_TRT_OVERLAP", ["False", "True"],
|
||||||
ids=["disableTrtOverlap"])
|
ids=["disableTrtOverlap", "enableTrtOverlap"])
|
||||||
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
@pytest.mark.parametrize("BATCHING_STRATEGY", ["inflight_fused_batching"])
|
||||||
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
@pytest.mark.parametrize("DECOUPLED_MODE", ["True", "False"],
|
||||||
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
ids=["enableDecoupleMode", "disableDecoupleMode"])
|
||||||
|
|||||||
@ -14,6 +14,7 @@ triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---Fals
|
|||||||
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-True-tensorrt_llm_bls]
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-True-tensorrt_llm_bls]
|
||||||
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-ensemble]
|
||||||
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-disableTrtOverlap--guaranteed_no_evict---1-1-1-False-tensorrt_llm_bls]
|
||||||
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_basic-False-1---False-True-False-0-128-disableDecoupleMode-inflight_fused_batching-enableTrtOverlap--guaranteed_no_evict---1-1-1-True-ensemble]
|
||||||
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-ensemble]
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-ensemble]
|
||||||
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
||||||
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
|
triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
|
||||||
|
|||||||
@ -218,6 +218,7 @@ l0_a30:
|
|||||||
- triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
|
- triton_server/test_triton_llm.py::test_gpt_350m_python_backend[e2e]
|
||||||
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
- triton_server/test_triton_llm.py::test_gpt_350m_ifb[test_basic-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
||||||
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
||||||
|
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[batched_inputs-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-enableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
||||||
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
|
- triton_server/test_triton_llm.py::test_medusa_vicuna_7b_ifb[False-1-medusa--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
|
||||||
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
|
- triton_server/test_triton_llm.py::test_eagle_vicuna_7b_ifb[False-1-eagle--False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-False-ensemble]
|
||||||
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
- triton_server/test_triton_llm.py::test_llama_v2_7b_ifb[test_stop_words-False-1---False-True-False-0-128-enableDecoupleMode-inflight_fused_batching-disableTrtOverlap--max_utilization---1-1-1-True-tensorrt_llm_bls]
|
||||||
|
|||||||
@ -686,13 +686,13 @@ parameters: {
|
|||||||
string_value: "${kv_cache_onboard_blocks}"
|
string_value: "${kv_cache_onboard_blocks}"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
# enable_trt_overlap is deprecated and doesn't have any effect on the runtime
|
# enable_trt_overlap is an experimental feature used with CUDA Graph Mode.
|
||||||
# parameters: {
|
parameters: {
|
||||||
# key: "enable_trt_overlap"
|
key: "enable_trt_overlap"
|
||||||
# value: {
|
value: {
|
||||||
# string_value: "${enable_trt_overlap}"
|
string_value: "${enable_trt_overlap}"
|
||||||
# }
|
}
|
||||||
# }
|
}
|
||||||
parameters: {
|
parameters: {
|
||||||
key: "exclude_input_in_output"
|
key: "exclude_input_in_output"
|
||||||
value: {
|
value: {
|
||||||
|
|||||||
@ -491,10 +491,10 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams()
|
|||||||
+ std::to_string(requestStatsMaxIterations));
|
+ std::to_string(requestStatsMaxIterations));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool enableTrtOverlap = false;
|
||||||
try
|
try
|
||||||
{
|
{
|
||||||
model_state_->GetParameter<bool>("enable_trt_overlap");
|
enableTrtOverlap = model_state_->GetParameter<bool>("enable_trt_overlap");
|
||||||
TLLM_LOG_WARNING("enable_trt_overlap is deprecated and will be ignored");
|
|
||||||
}
|
}
|
||||||
catch (std::exception const& e)
|
catch (std::exception const& e)
|
||||||
{
|
{
|
||||||
@ -698,6 +698,7 @@ executor::ExecutorConfig ModelInstanceState::getExecutorConfigFromParams()
|
|||||||
maxQueueSize, extendedRuntimePerfKnobConfig,
|
maxQueueSize, extendedRuntimePerfKnobConfig,
|
||||||
/*DebugConfig*/ std::nullopt, recvPollPeriodMs};
|
/*DebugConfig*/ std::nullopt, recvPollPeriodMs};
|
||||||
execConfig.setSpecDecConfig(specDecConfig);
|
execConfig.setSpecDecConfig(specDecConfig);
|
||||||
|
execConfig.setEnableTrtOverlap(enableTrtOverlap);
|
||||||
execConfig.setCacheTransceiverConfig(tle::CacheTransceiverConfig(tle::CacheTransceiverConfig::BackendType::MPI));
|
execConfig.setCacheTransceiverConfig(tle::CacheTransceiverConfig(tle::CacheTransceiverConfig::BackendType::MPI));
|
||||||
if (guidedConfig.has_value())
|
if (guidedConfig.has_value())
|
||||||
{
|
{
|
||||||
|
|||||||
Loading…
Reference in New Issue
Block a user