/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/batch_manager/llmRequest.h"
#include "tensorrt_llm/executor/serializeUtils.h"
#include "tensorrt_llm/kernels/beamSearchKernels.h"

namespace tensorrt_llm::batch_manager
{
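
// Returns the beam width to use at the current decoding iteration, or at the next one when
// `forNextIteration` is true. With Variable-Beam-Width-Search, the width is read from
// `beamWidthArray`, indexed by the iteration clamped to [0, kMaxBeamWidthArrayLength - 1].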
template <typename TTensor, typename TStream>
runtime::SizeType32 GenericLlmRequest<TTensor, TStream>::getBeamWidthByIter(bool const forNextIteration)
{
    runtime::SizeType32 beamWidth = mSamplingConfig.beamWidth; // For non-Variable-Beam-Width-Search
    auto const& beamWidthArray = mSamplingConfig.beamWidthArray;
    if (beamWidthArray.has_value())
    {
        auto const iter = mDecodingIter + (forNextIteration ? 1 : 0);
        // Clamp `decodingIter` into [0, kMaxBeamWidthArrayLength - 1] to use as the index
        int const index
            = std::max(std::min(iter, static_cast<int>(tensorrt_llm::kernels::kMaxBeamWidthArrayLength)) - 1, 0);
        beamWidth = beamWidthArray.value()[0][index];
    }
    return beamWidth;
}
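
// Explicit instantiation for the tensor type used by the runtime.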
template class GenericLlmRequest<runtime::ITensor::SharedPtr>;
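
// Wraps the current result (if any) into an executor::Response, reporting child requests under
// their parent's request id.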
std::optional<executor::Response> LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank)
{
    auto requestId = isChild() ? mParentRequestId : mRequestId;
    auto result = createResult(useFastLogits, mpiWorldRank);
    if (result.has_value())
    {
        return executor::Response(requestId, result.value(), mClientId);
    }
    return std::nullopt;
}
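
// Serializes the current result (if any) into `serializedResult` and reports via `isFinal`
// whether this is the last response for the request.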
void LlmRequest::createSerializedResult(
    std::vector<char>& serializedResult, bool& isFinal, bool useFastLogits, int32_t mpiWorldRank)
{
    auto result = createResult(useFastLogits, mpiWorldRank);
    if (result.has_value())
    {
        std::ostringstream oStream;
        executor::serialize_utils::serialize(result.value(), oStream);
        auto str = oStream.str();
        serializedResult.resize(str.size());
        std::copy(str.begin(), str.end(), serializedResult.begin());
        isFinal = result.value().isFinal;
    }
}
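
/// Builds an executor::Result snapshot for this request: per-beam output tokens, optional logits,
/// log probabilities, perf metrics and additional outputs. Returns std::nullopt when there is
/// nothing new to report.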
/// Note that there is some dependency on the order of operations in this method. Modify with care!
std::optional<executor::Result> LlmRequest::createResult(bool useFastLogits, int32_t mpiWorldRank)
{
    TLLM_CHECK(!isDisaggContextCompleteState());
    if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)))
    {
        return std::nullopt;
    }

    TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);

    executor::Result result;
    result.sequenceIndex = mSequenceIndex;

    result.isSequenceFinal = isFinished();
    mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal;

    result.isFinal = std::all_of(
        mSequenceFinalVec->begin(), mSequenceFinalVec->end(), [](bool isSequenceFinal) { return isSequenceFinal; });

    auto const maxNbTokens = getMaxBeamNumTokens();
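
    // For disaggregated serving, a context-only request returns the first generated token per beam
    // together with its context-phase state (and any draft tokens) in `contextPhaseParams`.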
    if (isDisaggContextTransmissionState() && isContextOnlyRequest())
    {
        auto const reqBeamWidth = mSamplingConfig.beamWidth;
        std::vector<TokenIdType> firstGenTokens;
        for (SizeType32 beam = 0; beam < reqBeamWidth; ++beam)
        {
            firstGenTokens.push_back(getTokens().at(beam).back());
        }
        if (!hasDraftTokens())
        {
            result.contextPhaseParams = executor::ContextPhaseParams{
                std::move(firstGenTokens), mRequestId, mContextPhaseParams.value().releaseState(), std::nullopt};
        }
        else
        {
            result.contextPhaseParams = executor::ContextPhaseParams{
                std::move(firstGenTokens), mRequestId, mContextPhaseParams.value().releaseState(), *getDraftTokens()};
        }
    }

    auto const calculateNbTokensOut = [this](SizeType32 maxNbTokens)
    {
        if (!mIsStreaming)
        {
            return maxNbTokens - (mExcludeInputFromOutput ? getOrigPromptLen() : 0);
        }
        return mReturnAllGeneratedTokens ? maxNbTokens - getOrigPromptLen() : maxNbTokens - getMaxSentTokenLen();
    };

    auto const maxNbTokensOut = calculateNbTokensOut(maxNbTokens);

    auto const nbBeams = mSamplingConfig.getNumReturnBeams();

    result.outputTokenIds.resize(nbBeams);

    auto const startTokenPos = maxNbTokens - maxNbTokensOut;

    auto const shouldSendResponse = isFinished() || (mIsStreaming && maxNbTokens > getMaxSentTokenLen());

    if (!shouldSendResponse)
    {
        return std::nullopt;
    }
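
    // Collect the output token ids for each returned beam, starting at `startTokenPos`
    // (tokens before it were either part of the prompt or already sent).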
    for (SizeType32 beam = 0; beam < nbBeams; ++beam)
    {
        auto const& tokens = getTokens(beam);
        auto const nbTokensOut = calculateNbTokensOut(tokens.size());

        if (nbTokensOut > 0)
        {
            auto const first = tokens.data() + startTokenPos;
            result.outputTokenIds.at(beam).assign(first, first + nbTokensOut);
        }
    }

    auto sliceBeams = [&nbBeams](auto beams)
    { return std::vector<typename decltype(beams)::value_type>(beams.begin(), beams.begin() + nbBeams); };

    if (returnLogProbs())
    {
        result.cumLogProbs = sliceBeams(getCumLogProbs());
        result.logProbs = sliceBeams(getLogProbs());
    }

    if (getReturnContextLogits())
    {
        result.contextLogits = executor::detail::ofITensor(getContextLogitsHost());
    }
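
    // Generation logits: when streaming without draft tokens, only the logits of the newly produced
    // tokens are returned; with fast logits, only a lightweight reference (request id and MPI rank)
    // is attached instead of the logits tensor.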
    if (getReturnGenerationLogits())
    {
        bool hasDraftTokens = mDraftTokens && !mDraftTokens->empty();
        if (isStreaming() && !hasDraftTokens)
        {
            auto startGenTokenPos = startTokenPos - getOrigPromptLen();
            TensorPtr generationLogitsHostCurrentStep
                = runtime::ITensor::slice(getGenerationLogitsHost(), startGenTokenPos, maxNbTokensOut);
            result.generationLogits = executor::detail::ofITensor(generationLogitsHostCurrentStep);
        }
        else if (useFastLogits)
        {
            result.specDecFastLogitsInfo = executor::SpeculativeDecodingFastLogitsInfo{mRequestId, mpiWorldRank};
        }
        else
        {
            result.generationLogits
                = executor::detail::ofITensor(runtime::ITensor::slice(getGenerationLogitsHost(), 0, nbBeams));
        }
    }

    if (getReturnEncoderOutput())
    {
        result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
    }

    if (getReturnPerfMetrics())
    {
        mPerfMetrics.kvCacheMetrics.kvCacheHitRate = getKVCacheHitRatePerRequest();

        auto& specDecMetrics = mPerfMetrics.speculativeDecoding;
        if (specDecMetrics.totalDraftTokens != 0)
        {
            specDecMetrics.acceptanceRate
                = static_cast<float>(specDecMetrics.totalAcceptedDraftTokens) / specDecMetrics.totalDraftTokens;
        }

        result.requestPerfMetrics = mPerfMetrics;
    }

    result.finishReasons = sliceBeams(mFinishReasons);
    result.decodingIter = mDecodingIter;
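
    // Attach any additional context/generation output tensors, prefixing their names with
    // "context_" or "generation_" respectively.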
    if (hasAdditionalOutputs())
    {
        std::string prefix = "context_";
        for (auto const& outputTensorMap : {mAdditionalContextOutputTensors, mAdditionalGenerationOutputTensors})
        {
            for (auto const& outputTensor : outputTensorMap)
            {
                TLLM_LOG_DEBUG("Adding tensor %s with shape %s to result.", outputTensor.first.c_str(),
                    runtime::ITensor::toString(outputTensor.second->getShape()).c_str());
                result.additionalOutputs.emplace_back(
                    prefix + outputTensor.first, executor::detail::ofITensor(outputTensor.second));
            }
            prefix = "generation_";
        }
    }

    // Update position of last sent response
    setMaxSentTokenLen(maxNbTokens);
    return result;
}
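
// Checks the request against engine limits (end id range, encoder/prompt length, draft-token count)
// and clamps the number of draft tokens and requested output tokens when they would exceed the
// maximum input or sequence length.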
void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
    SizeType32 vocabSizePadded, std::optional<SizeType32> maxEncoderInputLen, bool enableKVCacheReuse)
{
    if (mEndId.has_value())
    {
        TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
            "EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
    }
    if (getEncoderInputFeatures()
        && getEncoderInputFeatures()->getShape().nbDims < 4) // skip encoder shape validation for image inputs
    {
        TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
            "Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
            maxEncoderInputLen.value());
    }

    if (mPromptLen > maxInputLen)
    {
        TLLM_THROW(
            "Prompt length (%d) exceeds maximum input length (%d). Set log level to info and check "
            "TRTGptModel logs for how maximum input length is set.",
            mPromptLen, maxInputLen);
    }

    // Maximum number of draft tokens per request we pass to the engine for a single runtime iteration.
    // It depends on the speculative decoding mode.
    auto draftLenPerEngineStep = maxDraftLen;
    auto const& draftTokens = getDraftTokens();
    if (draftTokens && !draftTokens->empty())
    {
        auto const inputDraftTokensLen = static_cast<SizeType32>(draftTokens->size());
        if (inputDraftTokensLen > maxDraftLen)
        {
            TLLM_THROW(
                "Draft tokens length (%d) exceeds maximum draft tokens length (%d).", inputDraftTokensLen, maxDraftLen);
        }
        draftLenPerEngineStep = inputDraftTokensLen;

        if (mPromptLen + draftLenPerEngineStep > maxInputLen)
        {
            auto const newDraftLenPerEngineStep = maxInputLen - mPromptLen;
            TLLM_LOG_WARNING(
                "Prompt length + number of draft tokens (%d + %d) exceeds maximum input length (%d). "
                "Number of draft tokens is changed to (%d)",
                mPromptLen, draftLenPerEngineStep, maxInputLen, newDraftLenPerEngineStep);
            draftLenPerEngineStep = newDraftLenPerEngineStep;
            mDraftTokens->resize(draftLenPerEngineStep);
        }
    }

    if (mPromptLen + mMaxNewTokens + draftLenPerEngineStep > maxSequenceLen)
    {
        auto const maxNewTokens = maxSequenceLen - mPromptLen - draftLenPerEngineStep;
        TLLM_LOG_WARNING(
            "Prompt length + number of requested output tokens + draft tokens per step (%d + %d + %d) exceeds "
            "maximum sequence length (%d). "
            "Number of requested output tokens is changed to (%d).",
            mPromptLen, mMaxNewTokens, draftLenPerEngineStep, maxSequenceLen, maxNewTokens);
        mMaxNewTokens = maxNewTokens;
    }

    TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config");

    // Validate extra ids when enabling KV cache reuse with a prompt table
    if (enableKVCacheReuse && mPromptEmbeddingTable.has_value() && mPromptVocabSize.has_value())
    {
        TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.has_value() && mInputTokenExtraIds.value(),
            "Input token extra ids must be provided when enabling kv cache reuse with prompt table");
        TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.value()->size() == static_cast<size_t>(mOrigPromptLen),
            "inputTokenExtraIds vector size (%lu) must be the same as input token vector size (%lu).",
            mInputTokenExtraIds.value()->size(), static_cast<size_t>(mOrigPromptLen));
    }
}
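
// Creates a child request that shares this request's prompt but produces an independent output
// sequence (used when multiple return sequences are requested). The child gets its own request id,
// sequence index, and a random seed offset by that index.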
std::shared_ptr<LlmRequest> LlmRequest::createChildRequest(RequestIdType requestId)
{
    TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot create its own child.");
    TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumSubRequests()),
        "Cannot create more child requests than the number of return sequences (%d)", getNumSubRequests());
    auto childReq = std::make_shared<LlmRequest>(*this);
    childReq->mRequestId = requestId;
    childReq->mSequenceIndex = mChildRequests.size() + 1;
    childReq->mParentRequestId = this->mRequestId;
    childReq->mSequenceFinalVec = this->mSequenceFinalVec;
    childReq->mSeqSlot.reset();

    // To ensure different randomness across children, assign a unique random seed to each child
    // by adding its sequence index to the base seed. If no seed is provided, the parent's seed defaults to 0.
    using RandomSeedType = tensorrt_llm::executor::RandomSeedType;
    if (childReq->mSamplingConfig.randomSeed.has_value())
    {
        childReq->mSamplingConfig.randomSeed->at(0) += static_cast<RandomSeedType>(childReq->mSequenceIndex);
    }
    else
    {
        RandomSeedType defaultSeed{0};
        mSamplingConfig.randomSeed = std::vector<RandomSeedType>(1, defaultSeed);
        childReq->mSamplingConfig.randomSeed
            = std::vector<RandomSeedType>(1, defaultSeed + static_cast<RandomSeedType>(childReq->mSequenceIndex));
    }

    mChildRequests.push_back(childReq);
    return childReq;
}
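
// Copies the prompt-tuning embedding table to GPU memory if it is not already resident there.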
void LlmRequest::movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager)
{
    if (!mPromptEmbeddingTable.has_value()
        || mPromptEmbeddingTable.value()->getMemoryType() == runtime::MemoryType::kGPU)
    {
        return;
    }

    TensorPtr gpuPromptEmbeddingTable = manager.copyFrom(*mPromptEmbeddingTable.value(), runtime::MemoryType::kGPU);
    mPromptEmbeddingTable = gpuPromptEmbeddingTable;
}
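
// Copies the LoRA weights to GPU memory if they are not already resident there.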
void LlmRequest::moveLoraWeightsToGpu(runtime::BufferManager const& manager)
{
    if (!mLoraWeights.has_value() || mLoraWeights.value()->getMemoryType() == runtime::MemoryType::kGPU)
    {
        return;
    }
    // TODO: for TP / PP models we only need to move the bits that belong on the local device
    TensorPtr gpuLoraWeights = manager.copyFrom(*mLoraWeights.value(), runtime::MemoryType::kGPU);
    mLoraWeights = gpuLoraWeights;
}

} // namespace tensorrt_llm::batch_manager