Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-14 06:27:45 +08:00
* refactor: Update ExecutorConfig to use AdditionalModelOutput type
  - Changed function signatures and member variables across multiple files to replace std::optional<std::vector<std::string>> with std::optional<std::vector<executor::AdditionalModelOutput>>, adding a gatherContext flag for each additional output.
  - Updated the related serialization and deserialization methods to accommodate the new type.
  - Adjusted tests to reflect the changes in the output handling structure.
  This refactor enhances the flexibility and maintainability of the output configuration in the executor and batch manager components.
* refactor: Remove equality operator from TrtGptModelOptionalParams
  - Deleted the operator== implementation from TrtGptModelOptionalParams to simplify the class.
  - Updated the pybind11 bindings to remove the exposure of the equality operator to Python.
  This change streamlines the class definition and reduces unnecessary complexity in the bindings.
* refactor: Enhance copyAdditionalOutputs to utilize AdditionalModelOutput
  - Updated the copyAdditionalOutputs function to accept a vector of AdditionalModelOutput, allowing for the inclusion of the gatherContext flag.
  - Adjusted the logic to handle context and non-context outputs separately, improving the output handling mechanism.
  - Modified the related unit tests to incorporate the new gatherContext parameter.
  This refactor improves the flexibility and clarity of output management in the batch processing workflow.
* refactor: Introduce findOutputTensor utility function for output tensor retrieval
  - Added a new utility function, findOutputTensor, to encapsulate the logic for finding output tensors and checking their validity.
  - Refactored copyAdditionalOutputs to utilize findOutputTensor, reducing code duplication and improving clarity.
  - Enhanced error checking for additional context and generation output tensors.
  This change streamlines the output tensor retrieval process, enhancing maintainability and readability in the batch processing workflow.
* refactor: Check final indices of additional output tensors and update tests
  - Added checks to verify the final indices of additional output tensors for context and generation outputs.
  - Updated unit tests to verify the changes.
  - Added the lastTokenIds input tensor to test engines.
  - Logits output depends on the gatherContextLogits parameter.
  - Removed the gatherContextOutputs parameter from the validate method in LlmRequest.
  - Context outputs do not depend on the computeContextLogits parameter.
* fixup! refactor: Check final indices of additional output tensors and update tests
* fixup! refactor: Update ExecutorConfig to use AdditionalModelOutput type
* fixup! refactor: Remove equality operator from TrtGptModelOptionalParams
* docs: Update executor.md
* chore: Clean up includes

Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
313 lines · 12 KiB · C++
/*
 * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/batch_manager/llmRequest.h"

#include "tensorrt_llm/runtime/utils/mpiUtils.h"

namespace tensorrt_llm::batch_manager
{

/// Note that there is some dependency on the order of operations in this method. Modify with care!
std::optional<executor::Response> LlmRequest::createResponse(bool useFastLogits, int32_t mpiWorldRank)
{
    TLLM_CHECK(!isDisaggContextCompleteState());
    if (!(isFinished() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)))
    {
        return std::nullopt;
    }

    TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);

    executor::Result result;
    result.sequenceIndex = mSequenceIndex;

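    // Record whether this sequence has finished; the response is final only once all sequences of the request are done.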
    result.isSequenceFinal = isFinished();
    mSequenceFinalVec->at(mSequenceIndex) = result.isSequenceFinal;

    result.isFinal = std::all_of(
        mSequenceFinalVec->begin(), mSequenceFinalVec->end(), [](bool isSequenceFinal) { return isSequenceFinal; });

    auto const maxNbTokens = getMaxBeamNumTokens();

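    // For context-only requests in disaggregated serving, return the first generated token of each beam
    // (and any draft tokens) through the context phase parameters.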
    if (isDisaggContextTransmissionState() && isContextOnlyRequest())
    {
        auto const reqBeamWidth = mSamplingConfig.beamWidth;
        std::vector<TokenIdType> firstGenTokens;
        for (SizeType32 beam = 0; beam < reqBeamWidth; ++beam)
        {
            firstGenTokens.push_back(getTokens().at(beam).back());
        }
        if (!hasDraftTokens())
        {
            result.contextPhaseParams = executor::ContextPhaseParams{
                std::move(firstGenTokens), mRequestId, mContextPhaseParams.value().releaseState(), std::nullopt};
        }
        else
        {
            result.contextPhaseParams = executor::ContextPhaseParams{
                std::move(firstGenTokens), mRequestId, mContextPhaseParams.value().releaseState(), *getDraftTokens()};
        }
    }

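    // Number of tokens to return: for non-streaming requests, all tokens (minus the prompt if it is excluded
    // from the output); for streaming requests, either all generated tokens or only those not sent yet.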
    auto const calculateNbTokensOut = [this](SizeType32 maxNbTokens)
    {
        if (!mIsStreaming)
        {
            return maxNbTokens - (mExcludeInputFromOutput ? getOrigPromptLen() : 0);
        }
        return mReturnAllGeneratedTokens ? maxNbTokens - getOrigPromptLen() : maxNbTokens - getMaxSentTokenLen();
    };

    auto const maxNbTokensOut = calculateNbTokensOut(maxNbTokens);

    auto const nbBeams = mSamplingConfig.getNumReturnBeams();

    result.outputTokenIds.resize(nbBeams);

    auto const startTokenPos = maxNbTokens - maxNbTokensOut;

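    // When streaming, only respond if new tokens have been produced since the last response.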
    auto const shouldSendResponse = isFinished() || (mIsStreaming && maxNbTokens > getMaxSentTokenLen());

    if (!shouldSendResponse)
    {
        return std::nullopt;
    }

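    // Copy the slice of output tokens to be returned for each of the returned beams.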
    for (SizeType32 beam = 0; beam < nbBeams; ++beam)
    {
        auto const& tokens = getTokens(beam);
        auto const nbTokensOut = calculateNbTokensOut(tokens.size());

        if (nbTokensOut > 0)
        {
            auto const first = tokens.data() + startTokenPos;
            result.outputTokenIds.at(beam).assign(first, first + nbTokensOut);
        }
    }

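    // Helper that keeps only the first nbBeams entries of a per-beam container.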
    auto sliceBeams = [&nbBeams](auto beams)
    { return std::vector<typename decltype(beams)::value_type>(beams.begin(), beams.begin() + nbBeams); };

    if (returnLogProbs())
    {
        result.cumLogProbs = sliceBeams(getCumLogProbs());
        result.logProbs = sliceBeams(getLogProbs());
    }

    if (getReturnContextLogits())
    {
        result.contextLogits = executor::detail::ofITensor(getContextLogitsHost());
    }

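    // Generation logits are returned in one of three forms: the slice for the tokens of the current step when
    // streaming without draft tokens, a lightweight reference for the speculative-decoding fast-logits path,
    // or the full host tensor sliced to the returned beams.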
    if (getReturnGenerationLogits())
    {
        bool hasDraftTokens = mDraftTokens && !mDraftTokens->empty();
        if (isStreaming() && !hasDraftTokens)
        {
            auto startGenTokenPos = startTokenPos - getOrigPromptLen();
            TensorPtr generationLogitsHostCurrentStep
                = runtime::ITensor::slice(getGenerationLogitsHost(), startGenTokenPos, maxNbTokensOut);
            result.generationLogits = executor::detail::ofITensor(generationLogitsHostCurrentStep);
        }
        else if (useFastLogits)
        {
            result.specDecFastLogitsInfo = executor::SpeculativeDecodingFastLogitsInfo{mRequestId, mpiWorldRank};
        }
        else
        {
            result.generationLogits
                = executor::detail::ofITensor(runtime::ITensor::slice(getGenerationLogitsHost(), 0, nbBeams));
        }
    }

    if (getReturnEncoderOutput())
    {
        result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
    }

    if (getReturnPerfMetrics())
    {
        mPerfMetrics.kvCacheMetrics.kvCacheHitRate = getKVCacheHitRatePerRequest();

        auto& specDecMetrics = mPerfMetrics.speculativeDecoding;
        if (specDecMetrics.totalDraftTokens != 0)
        {
            specDecMetrics.acceptanceRate
                = static_cast<float>(specDecMetrics.totalAcceptedDraftTokens) / specDecMetrics.totalDraftTokens;
        }

        result.requestPerfMetrics = mPerfMetrics;
    }

    result.finishReasons = sliceBeams(mFinishReasons);
    result.decodingIter = mDecodingIter;

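    // Report additional model outputs, prefixing each tensor name with "context_" or "generation_" depending on
    // whether it was captured during the context or the generation phase.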
    if (hasAdditionalOutputs())
    {
        std::string prefix = "context_";
        for (auto const& outputTensorMap : {mAdditionalContextOutputTensors, mAdditionalGenerationOutputTensors})
        {
            for (auto const& outputTensor : outputTensorMap)
            {
                TLLM_LOG_DEBUG("Adding tensor %s with shape %s to result.", outputTensor.first.c_str(),
                    runtime::ITensor::toString(outputTensor.second->getShape()).c_str());
                result.additionalOutputs.emplace_back(
                    prefix + outputTensor.first, executor::detail::ofITensor(outputTensor.second));
            }
            prefix = "generation_";
        }
    }

    // Update position of last sent response
    setMaxSentTokenLen(maxNbTokens);

    auto requestId = isChild() ? mParentRequestId : mRequestId;
    auto response = executor::Response(requestId, std::move(result), mClientId);

    return response;
}

void LlmRequest::validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
    SizeType32 vocabSizePadded, std::optional<SizeType32> maxEncoderInputLen, bool enableKVCacheReuse)
{
    if (mEndId.has_value())
    {
        TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
            "EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
    }
    if (getEncoderInputFeatures()
        && getEncoderInputFeatures()->getShape().nbDims < 4) // skip encoder shape validation for image inputs
    {
        TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
            "Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
            maxEncoderInputLen.value());
    }

    if (mPromptLen > maxInputLen)
    {
        TLLM_THROW(
            "Prompt length (%d) exceeds maximum input length (%d). Set log level to info and check "
            "TRTGptModel logs for how maximum input length is set",
            mPromptLen, maxInputLen);
    }

    // Maximum number of draft tokens per request we pass to the engine for single runtime iteration.
    // It depends on the speculative decoding mode.
    auto draftLenPerEngineStep = maxDraftLen;
    auto const& draftTokens = getDraftTokens();
    if (draftTokens && !draftTokens->empty())
    {
        auto const inputDraftTokensLen = static_cast<SizeType32>(draftTokens->size());
        if (inputDraftTokensLen > maxDraftLen)
        {
            TLLM_THROW(
                "Draft tokens length (%d) exceeds maximum draft tokens length (%d).", inputDraftTokensLen, maxDraftLen);
        }
        draftLenPerEngineStep = inputDraftTokensLen;

        if (mPromptLen + draftLenPerEngineStep > maxInputLen)
        {
            auto const newDraftLenPerEngineStep = maxInputLen - mPromptLen;
            TLLM_LOG_WARNING(
                "Prompt length + number of draft tokens (%d + %d) exceeds maximum input length (%d). "
                "Number of draft tokens is changed to (%d).",
                mPromptLen, draftLenPerEngineStep, maxInputLen, newDraftLenPerEngineStep);
            draftLenPerEngineStep = newDraftLenPerEngineStep;
            mDraftTokens->resize(draftLenPerEngineStep);
        }
    }

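    // Clamp the number of requested output tokens so that prompt, output and draft tokens fit within the
    // maximum sequence length.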
    if (mPromptLen + mMaxNewTokens + draftLenPerEngineStep > maxSequenceLen)
    {
        auto const maxNewTokens = maxSequenceLen - mPromptLen - draftLenPerEngineStep;
        TLLM_LOG_WARNING(
            "Prompt length + number of requested output tokens + draft tokens per step (%d + %d + %d) exceeds "
            "maximum sequence length (%d). "
            "Number of requested output tokens is changed to (%d).",
            mPromptLen, mMaxNewTokens, draftLenPerEngineStep, maxSequenceLen, maxNewTokens);
        mMaxNewTokens = maxNewTokens;
    }

    TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config");

    // validate extra ids when enabling kv cache reuse with prompt table
    if (enableKVCacheReuse && mPromptEmbeddingTable.has_value() && mPromptVocabSize.has_value())
    {
        TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.has_value() && mInputTokenExtraIds.value(),
            "Input token extra ids must be provided when enabling kv cache reuse with prompt table");
        TLLM_CHECK_WITH_INFO(mInputTokenExtraIds.value()->size() == static_cast<size_t>(mOrigPromptLen),
            "inputTokenExtraIds vector size (%lu) must be the same as input token vector size (%lu).",
            mInputTokenExtraIds.value()->size(), static_cast<size_t>(mOrigPromptLen));
    }
}

std::shared_ptr<LlmRequest> LlmRequest::createChildRequest(RequestIdType requestId)
{
    TLLM_CHECK_WITH_INFO(!isChild(), "A child request cannot create its own child.");
    TLLM_CHECK_WITH_INFO(mChildRequests.size() + 1 < static_cast<size_t>(getNumSubRequests()),
        "Cannot create child requests more than the number of return sequences (%d)", getNumSubRequests());
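    // The child request is a copy of the parent with its own request id and sequence index; it shares the
    // parent's per-sequence completion flags.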
    auto childReq = std::make_shared<LlmRequest>(*this);
    childReq->mRequestId = requestId;
    childReq->mSequenceIndex = mChildRequests.size() + 1;
    childReq->mParentRequestId = this->mRequestId;
    childReq->mSequenceFinalVec = this->mSequenceFinalVec;
    childReq->mSeqSlot.reset();

    // To ensure different randomness across children, assign a unique random seed to each child
    // by adding its sequence index to the base seed. If no seed is provided, the parent's seed defaults to 0.
    using RandomSeedType = tensorrt_llm::executor::RandomSeedType;
    if (childReq->mSamplingConfig.randomSeed.has_value())
    {
        childReq->mSamplingConfig.randomSeed->at(0) += static_cast<RandomSeedType>(childReq->mSequenceIndex);
    }
    else
    {
        RandomSeedType defaultSeed{0};
        mSamplingConfig.randomSeed = std::vector<RandomSeedType>(1, defaultSeed);
        childReq->mSamplingConfig.randomSeed
            = std::vector<RandomSeedType>(1, defaultSeed + static_cast<RandomSeedType>(childReq->mSequenceIndex));
    }

    mChildRequests.push_back(childReq);
    return childReq;
}

void LlmRequest::movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager)
{
    if (!mPromptEmbeddingTable.has_value()
        || mPromptEmbeddingTable.value()->getMemoryType() == runtime::MemoryType::kGPU)
    {
        return;
    }

    TensorPtr gpuPromptEmbeddingTable = manager.copyFrom(*mPromptEmbeddingTable.value(), runtime::MemoryType::kGPU);
    mPromptEmbeddingTable = gpuPromptEmbeddingTable;
}

void LlmRequest::moveLoraWeightsToGpu(runtime::BufferManager const& manager)
{
    if (!mLoraWeights.has_value() || mLoraWeights.value()->getMemoryType() == runtime::MemoryType::kGPU)
    {
        return;
    }
    // TODO for tp / pp models we only need to move the bits that belong on the local device
    TensorPtr gpuLoraWeights = manager.copyFrom(*mLoraWeights.value(), runtime::MemoryType::kGPU);
    mLoraWeights = gpuLoraWeights;
}

} // namespace tensorrt_llm::batch_manager