From 8257b67ea57135d5173b085883e01f64da98838f Mon Sep 17 00:00:00 2001
From: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
Date: Fri, 16 Jan 2026 11:18:24 +0800
Subject: [PATCH] [https://nvbugs/5791936][fix] Add warning for gen-only paused
 (#10664)

Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
---
 .../tensorrt_llm/batch_manager/llmRequest.h | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
index d3c967b7eb..2d6e792281 100644
--- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
+++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h
@@ -841,6 +841,20 @@ public:
         // for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase
         mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT
                                                                      : LlmRequestState::kCONTEXT_INIT;
+
+        if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
+        {
+
+            // If a generation-only server is configured with the MAX_UTILIZATION scheduler, a running
+            // generation-only request may be paused and rescheduled in the context-init state, re-running the
+            // context phase and degrading performance. There is no clean way to avoid this: if the scheduler
+            // were changed to never pause generation-only requests, no KV cache might ever become available,
+            // leaving those requests unscheduled indefinitely. So we only issue a warning here.
+            TLLM_LOG_WARNING(
+                "Pausing generation-only request, request_id: %lu, changes it to context init state, which may degrade "
+                "performance.",
+                mRequestId);
+        }
         mContextCurrentPositionTarget = 0;
         mContextCurrentPositionDraft = 0;
         mPrepopulatedPromptLenTarget = 0;
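
Note (not part of the patch): for readers without the surrounding file, the following is a minimal standalone C++ sketch of the state transition the hunk instruments. The types below are hypothetical, simplified stand-ins that mirror the names in the patch, not the real TensorRT-LLM classes; pausing rewinds a request to an init state, so a generation-only request repeats its context phase.

// Simplified model of LlmRequest::pause(); all types here are illustrative.
#include <cstdint>
#include <cstdio>
#include <optional>
#include <vector>

enum class LlmRequestState { kENCODER_INIT, kCONTEXT_INIT, kGENERATION_IN_PROGRESS };
enum class LlmRequestType { LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, LLMREQUEST_TYPE_GENERATION_ONLY };

struct LlmRequest {
    std::uint64_t mRequestId{};
    LlmRequestType mLlmRequestType{LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION};
    std::optional<std::vector<std::int32_t>> mEncoderTokens; // set only for enc-dec models
    LlmRequestState mState{LlmRequestState::kGENERATION_IN_PROGRESS};

    // Pausing rewinds the request to an init state so the scheduler can pick it
    // up again later; a generation-only request then needlessly repeats the
    // context phase, which is what the new warning calls out.
    void pause() {
        mState = mEncoderTokens.has_value() ? LlmRequestState::kENCODER_INIT
                                            : LlmRequestState::kCONTEXT_INIT;
        if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY) {
            std::printf("[WARNING] Pausing generation-only request, request_id: %llu, "
                        "changes it to context init state, which may degrade performance.\n",
                        static_cast<unsigned long long>(mRequestId));
        }
    }
};

int main() {
    LlmRequest req;
    req.mRequestId = 42;
    req.mLlmRequestType = LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY;
    req.pause(); // emits the warning: the request falls back to kCONTEXT_INIT
    return req.mState == LlmRequestState::kCONTEXT_INIT ? 0 : 1;
}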