[https://nvbugs/5791936][fix] Add warning for gen-only paused (#10664)

Signed-off-by: Chuang Zhu <111838961+chuangz0@users.noreply.github.com>
This commit is contained in:
Chuang Zhu 2026-01-16 11:18:24 +08:00 committed by GitHub
parent 6541e41c74
commit 8257b67ea5
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -841,6 +841,20 @@ public:
// For enc-dec models, pausing saves the generated tokens into the prompt, but the encoder phase must be re-run.
mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT
: LlmRequestState::kCONTEXT_INIT;
if (mLlmRequestType == LlmRequestType::LLMREQUEST_TYPE_GENERATION_ONLY)
{
// If a generation-only server is configured with the MAX_UTILIZATION scheduler, a running generation-only
// request may be paused and rescheduled in the context-init state. It will then re-run the context phase,
// which degrades performance. There is no known way to avoid this: if the max-utilization scheduler were
// changed to never pause generation-only requests, the KV cache could be exhausted, leaving requests
// unscheduled indefinitely. For now, we only issue a warning here.
TLLM_LOG_WARNING(
"Pausing generation-only request, request_id: %lu, changes it to context init state, which may degrade "
"performance.",
mRequestId);
}
mContextCurrentPositionTarget = 0;
mContextCurrentPositionDraft = 0;
mPrepopulatedPromptLenTarget = 0;