/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/logger.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/samplingConfig.h"

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstring>
#include <functional>
#include <list>
#include <memory>
#include <optional>
#include <utility>
#include <vector>

namespace tensorrt_llm::batch_manager
{

/**
 * @brief The state of the request.
 *
 * Enum order must follow chronological order for state dependency check, @see hasReachedState().
 *
 * @todo(rkobus): refactor
 */
enum LlmRequestState_t
{
    REQUEST_STATE_UNKNOWN = 0,                ///< Unknown state
    REQUEST_STATE_ENCODER_INIT = 1,           ///< Encoder phase starts (for encoder-decoder models)
    REQUEST_STATE_CONTEXT_INIT = 2,           ///< Context phase starts
    REQUEST_STATE_GENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
    REQUEST_STATE_GENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
    REQUEST_STATE_GENERATION_COMPLETE = 5,    ///< Generation phase completed
};
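
// Example (illustrative sketch, not part of the original header): because the enum values are
// ordered chronologically, hasReachedState() below reduces to a simple comparison. Assuming a
// populated request object:
//
//     if (request.hasReachedState(REQUEST_STATE_CONTEXT_INIT))
//     {
//         // the encoder phase (if any) has finished; the request is in or past the context phase
//     }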

template <typename TTensor, typename TStream = runtime::BufferManager::CudaStreamPtr>
class GenericLlmRequest
{
public:
    using SizeType32 = runtime::SizeType32;
    using TokenIdType = runtime::TokenIdType;
    using RequestIdType = std::uint64_t;
    using LoraTaskIdType = std::uint64_t;
    using VecTokens = std::vector<TokenIdType>;
    using VecLogProbs = std::vector<float>;
    using BeamTokens = std::vector<VecTokens>;
    using TensorPtr = TTensor;
    using LogitsPostProcessor = std::function<void(
        RequestIdType, TensorPtr&, BeamTokens const&, TStream const&, std::optional<RequestIdType>)>;

    GenericLlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
        runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
        std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
        std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
        std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
        std::optional<SizeType32> promptVocabSize = std::nullopt,
        std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
        std::optional<TensorPtr> loraConfig = std::nullopt, bool returnLogProbs = false,
        bool returnContextLogits = false, bool returnGenerationLogits = false,
        std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
        std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
        std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
        bool applyLogitsPostProcessorBatched = false,
        std::optional<std::shared_ptr<VecTokens>> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
        std::optional<RequestIdType> clientId = std::nullopt,
        executor::PriorityType priority = executor::Request::kDefaultPriority)
        : mRequestId(requestId)
        , mPromptLen(inputTokens->size())
        , mMaxNewTokens(maxNewTokens)
        , mSamplingConfig(samplingConfig)
        , mState(REQUEST_STATE_CONTEXT_INIT)
        , mEndId(endId)
        , mPadId(padId)
        , mLogitsPostProcessor(logitsPostProcessor)
        , mApplyLogitsPostProcessorBatched(applyLogitsPostProcessorBatched)
        , mClientId(clientId)
        , mIsStreaming(isStreaming)
        , mOrigPromptLen(mPromptLen)
        , mMaxSentTokenLen(mPromptLen)
        , mEmbeddingBias(std::move(embeddingBias))
        , mBadWordsList(std::move(badWordsList))
        , mStopWordsList(std::move(stopWordsList))
        , mPromptEmbeddingTable(std::move(promptEmbeddingTable))
        , mPromptVocabSize(promptVocabSize)
        , mLoraTaskId(loraTaskId)
        , mLoraWeights(std::move(loraWeights))
        , mLoraConfig(std::move(loraConfig))
        , mContextChunkSize(std::nullopt)
        , mContextCurrentPosition(0)
        , mLogProbs(samplingConfig.beamWidth)
        , mCumLogProbs(samplingConfig.beamWidth)
        , mDraftTokens(draftTokens.value_or(std::make_shared<VecTokens>()))
        , mDraftLogits(draftLogits)
        , mNumTokensPerIteration(1)
        , mReturnAllGeneratedTokens(isStreaming && (samplingConfig.beamWidth > 1))
        , mReturnContextLogits(returnContextLogits)
        , mReturnGenerationLogits(returnGenerationLogits)
        , mExcludeInputFromOutput(excludeInputFromOutput)
        , mEncoderTokens(std::move(encoderInputTokens))
        , mReturnEncoderOutput(returnEncoderOutput)
        , mDecodingIter(0)
        , mPriority(priority)
    {
        if (mEncoderTokens.has_value())
        {
            mState = REQUEST_STATE_ENCODER_INIT;
        }
        initialize(*inputTokens, returnLogProbs);
    }

    GenericLlmRequest(RequestIdType requestId, executor::Request const& req)
        : mRequestId(requestId)
        , mPromptLen(req.getInputTokenIds().size())
        , mMaxNewTokens(req.getMaxNewTokens())
        , mSamplingConfig(req.getSamplingConfig(), req.getExternalDraftTokensConfig())
        , mState(REQUEST_STATE_CONTEXT_INIT)
        , mEndId(req.getEndId())
        , mPadId(req.getPadId())
        , mClientId(req.getClientId())
        , mIsStreaming(req.getStreaming())
        , mOrigPromptLen(mPromptLen)
        , mMaxSentTokenLen(mPromptLen)
        , mEmbeddingBias(std::nullopt)
        , mBadWordsList(std::nullopt)
        , mStopWordsList(std::nullopt)
        , mPromptEmbeddingTable(std::nullopt)
        , mPromptVocabSize(std::nullopt)
        , mLoraTaskId(std::nullopt)
        , mLoraWeights(std::nullopt)
        , mLoraConfig(std::nullopt)
        , mContextChunkSize(std::nullopt)
        , mContextCurrentPosition(0)
        , mLogProbs(mSamplingConfig.beamWidth)
        , mCumLogProbs(mSamplingConfig.beamWidth)
        , mDraftTokens(std::make_shared<VecTokens>())
        , mDraftLogits(std::nullopt)
        , mNumTokensPerIteration(1)
        , mReturnAllGeneratedTokens(req.getReturnAllGeneratedTokens())
        , mReturnContextLogits(req.getOutputConfig().returnContextLogits)
        , mReturnGenerationLogits(req.getOutputConfig().returnGenerationLogits)
        , mExcludeInputFromOutput(req.getOutputConfig().excludeInputFromOutput)
        , mEncoderTokens(std::nullopt)
        , mReturnEncoderOutput(req.getOutputConfig().returnEncoderOutput)
        , mDecodingIter(0)
        , mPriority(req.getPriority())
    {
        if (mIsStreaming && mSamplingConfig.beamWidth > 1 && mReturnAllGeneratedTokens == false)
        {
            TLLM_LOG_WARNING(
                "Setting mReturnAllGeneratedTokens to True since streaming AND beam search are done simultaneously. "
                "Returning the full beams at each streaming step is needed because beam search + streaming can change "
                "previous outputs. Initialize request with mReturnAllGeneratedTokens = True to dismiss this warning. "
                "WARNING: using this option may increase network usage significantly (quadratically w.r.t. output "
                "length).");
            mReturnAllGeneratedTokens = true;
        }
        if (req.getEncoderInputTokenIds())
        {
            mState = REQUEST_STATE_ENCODER_INIT;
            mEncoderTokens = std::make_shared<VecTokens>(req.getEncoderInputTokenIds().value());
        }
        if (req.getEmbeddingBias())
        {
            mEmbeddingBias = executor::detail::toITensor(req.getEmbeddingBias().value());
            // Add leading 1 dimension since that's what IFB code expects
            mEmbeddingBias.value()->unsqueeze(0);
        }
        if (req.getBadWords())
        {
            mBadWordsList = createListTensor(req.getBadWords().value());
        }
        if (req.getStopWords())
        {
            mStopWordsList = createListTensor(req.getStopWords().value());
        }

        auto pTuningConfig = req.getPromptTuningConfig();
        if (pTuningConfig)
        {
            mPromptEmbeddingTable = executor::detail::toITensor(pTuningConfig.value().getEmbeddingTable());
            TLLM_CHECK(mPromptEmbeddingTable.value()->getShape().nbDims == 2);
            mPromptVocabSize = mPromptEmbeddingTable.value()->getShape().d[0];
            mPromptEmbeddingTable.value()->unsqueeze(0);
        }

        auto loraConfig = req.getLoraConfig();
        if (loraConfig)
        {
            mLoraTaskId = loraConfig->getTaskId();
            auto optWeights = loraConfig->getWeights();
            if (loraConfig.value().getWeights())
            {
                mLoraWeights = executor::detail::toITensor(loraConfig.value().getWeights().value());
                mLoraWeights.value()->unsqueeze(0);
            }

            if (loraConfig.value().getConfig())
            {
                mLoraConfig = executor::detail::toITensor(loraConfig.value().getConfig().value());
                mLoraConfig.value()->unsqueeze(0);
            }
        }

        auto externalDraftTokensConfig = req.getExternalDraftTokensConfig();
        if (externalDraftTokensConfig)
        {
            mDraftTokens = std::make_shared<VecTokens>(externalDraftTokensConfig.value().getTokens());

            if (externalDraftTokensConfig.value().getLogits())
            {
                mDraftLogits = executor::detail::toITensor(externalDraftTokensConfig.value().getLogits().value());
            }

            // NOTE: Draft acceptance threshold is stored in mSamplingConfig
        }

        initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
    }

    void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
        std::optional<SizeType32> maxEncoderInputLen = std::nullopt)
    {
        TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderLen() > maxEncoderInputLen.value()),
            "Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderLen(),
            maxEncoderInputLen.value());

        if (mPromptLen > maxInputLen)
        {
            TLLM_THROW(
                "Prompt length (%d) exceeds maximum input length (%d). Set log level to info and check "
                "TRTGptModel logs for how maximum input length is set",
                mPromptLen, maxInputLen);
        }

        // Maximum number of draft tokens per request we pass to the engine for single runtime iteration.
        // It depends on the speculative decoding mode.
        auto draftLenPerEngineStep = maxDraftLen;
        auto const& draftTokens = getDraftTokens();
        if (draftTokens && !draftTokens->empty())
        {
            auto const inputDraftTokensLen = static_cast<SizeType32>(draftTokens->size());
            if (inputDraftTokensLen > maxDraftLen)
            {
                TLLM_THROW("Draft tokens length (%d) exceeds maximum draft tokens length (%d).", inputDraftTokensLen,
                    maxDraftLen);
            }
            draftLenPerEngineStep = inputDraftTokensLen;

            if (mPromptLen + draftLenPerEngineStep > maxInputLen)
            {
                auto const newDraftLenPerEngineStep = maxInputLen - mPromptLen;
                TLLM_LOG_WARNING(
                    "Prompt length + number of draft tokens (%d + %d) exceeds maximum input length (%d). "
                    "Number of draft tokens is changed to (%d)",
                    mPromptLen, draftLenPerEngineStep, maxInputLen, newDraftLenPerEngineStep);
                draftLenPerEngineStep = newDraftLenPerEngineStep;
                mDraftTokens->resize(draftLenPerEngineStep);
            }
        }

        if (mPromptLen + mMaxNewTokens + draftLenPerEngineStep > maxSequenceLen)
        {
            auto const maxNewTokens = maxSequenceLen - mPromptLen - draftLenPerEngineStep;
            TLLM_LOG_WARNING(
                "Prompt length + number of requested output tokens + draft tokens per step (%d + %d + %d) exceeds "
                "maximum sequence length (%d). "
                "Number of requested output tokens is changed to (%d).",
                mPromptLen, mMaxNewTokens, draftLenPerEngineStep, maxSequenceLen, maxNewTokens);
            mMaxNewTokens = maxNewTokens;
        }

        TLLM_CHECK_WITH_INFO(mSamplingConfig.validate(), "Incorrect sampling config");
    }
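
    // Example (illustrative sketch, not part of the original header): validate() clamps rather than
    // rejects when the requested output budget is too large. Assuming model limits maxInputLen = 2048,
    // maxSequenceLen = 4096, maxDraftLen = 0 and a request with mPromptLen = 1000, mMaxNewTokens = 5000:
    //
    //     request.validate(2048, 4096, 0);
    //     // mMaxNewTokens is reduced to 4096 - 1000 - 0 = 3096 and a warning is logged;
    //     // only a prompt longer than maxInputLen throws.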

    void setExcludeInputFromOutput(bool exclude)
    {
        mExcludeInputFromOutput = exclude;
    }

    /// @brief Get total number of tokens for this req (prompt + generated)
    /// @param beam The beam index
    /// @return The number of tokens
    [[nodiscard]] SizeType32 getNumTokens(SizeType32 beam) const
    {
        return mTokens.at(beam).size();
    }

    /// @brief Get max number of tokens across all beams
    /// @return The number of tokens
    [[nodiscard]] SizeType32 getMaxBeamNumTokens() const
    {
        SizeType32 maxTokens = 0;
        for (SizeType32 beam = 0; beam < mSamplingConfig.beamWidth; ++beam)
        {
            maxTokens = std::max(maxTokens, static_cast<SizeType32>(mTokens.at(beam).size()));
        }
        return maxTokens;
    }

    /// @brief Get a token at a given position and beam index
    /// @param beam The beam index
    /// @param pos The position of the token relative to beginning of the prompt
    /// @return The token index
    [[nodiscard]] TokenIdType getToken(SizeType32 beam, SizeType32 pos) const
    {
        return mTokens.at(beam).at(pos);
    }

    /// @brief Get the tokens at a given beam index
    /// @param beam The beam index
    /// @return A vector of tokens for this beam index, includes the prompt
    [[nodiscard]] VecTokens const& getTokens(SizeType32 beam) const
    {
        return mTokens.at(beam);
    }

    /// @brief Get all tokens (input+output) for all beams
    /// @return A vector of vector of tokens.
    [[nodiscard]] BeamTokens const& getTokens() const
    {
        return mTokens;
    }

    /// @brief Get input tokens to encoder
    /// @return A vector of tokens.
    [[nodiscard]] std::optional<std::shared_ptr<VecTokens>> const& getEncoderTokens() const
    {
        return mEncoderTokens;
    }

    /// @brief Get the number of input tokens to encoder
    /// @return The number of encoder input tokens.
    [[nodiscard]] SizeType32 getEncoderLen() const
    {
        TLLM_CHECK_WITH_INFO(getEncoderTokens().has_value(), "Encoder tokens are not given");
        return getEncoderTokens().value()->size();
    }

    /// @brief Get the draft tokens
    /// @return shared_ptr to vector of draft tokens
    [[nodiscard]] std::shared_ptr<VecTokens> const& getDraftTokens() const
    {
        return mDraftTokens;
    }

    /// @brief Get the logits for the draft tokens
    /// @return Tensor of draft logits
    [[nodiscard]] std::optional<TensorPtr> getDraftLogits() const
    {
        return mDraftLogits;
    }

    /// @brief Returns true if request has draft tokens
    /// @return flag
    [[nodiscard]] bool hasDraftTokens() const
    {
        return mDraftTokens && !mDraftTokens->empty();
    }

    /// @brief Get the maximum number of generated tokens across all beams
    /// @return The number of generated tokens (doesn't include the prompt tokens)
    [[nodiscard]] SizeType32 getMaxNumGeneratedTokens() const
    {
        return getMaxBeamNumTokens() - mPromptLen;
    }

    /// @brief Add new generated tokens to the vector of tokens and set mLastTokens
    /// @param token The token to add
    /// @param beam The beam to which to add the new token
    void addNewToken(TokenIdType token, SizeType32 beam)
    {
        mLastTokens[beam] = token;
        mTokens.at(beam).push_back(token);
    }

    /// @brief Add new generated tokens to the vector of tokens and set mLastTokens
    /// @param beamTokens A vector containing the tokens to add for each beam index
    ///        beamTokens is expected to be of size beamWidth
    void addNewTokens(VecTokens const& beamTokens)
    {
        assert(static_cast<size_t>(mSamplingConfig.beamWidth) == beamTokens.size());
        mLastTokens = beamTokens;
        for (std::size_t beam = 0; beam < beamTokens.size(); ++beam)
        {
            auto const outputId = beamTokens[beam];
            mTokens.at(beam).push_back(outputId);
        }
    }
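
    // Example (illustrative numbers, not part of the original header): with beamWidth == 2, a decoder
    // step that produced token 17 for beam 0 and token 23 for beam 1 would be recorded as
    //
    //     request.addNewTokens({17, 23});
    //     // mTokens[0] and mTokens[1] each grow by one token, and getLastTokens() returns {17, 23}.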

    /// @brief Sets the generated tokens for all beams. Erases all previous generated tokens.
    /// @param generatedBeamTokens The generated tokens for all beams (vector of vector of tokens)
    void setGeneratedTokens(BeamTokens const& generatedBeamTokens)
    {
        assert(generatedBeamTokens.size() == static_cast<size_t>(mSamplingConfig.beamWidth));
        for (std::size_t beam = 0; beam < generatedBeamTokens.size(); ++beam)
        {
            auto& beamTokens = mTokens[beam];
            beamTokens.resize(mPromptLen);
            beamTokens.insert(beamTokens.end(), generatedBeamTokens[beam].begin(), generatedBeamTokens[beam].end());
        }
    }

    /// @brief Return a vector of the last-generated tokens of shape [num_beams]
    [[nodiscard]] VecTokens const& getLastTokens()
    {
        return mLastTokens;
    }

    /// @brief Return the last-generated token from a particular beam
    [[nodiscard]] TokenIdType const& getLastTokens(SizeType32 beam)
    {
        return mLastTokens[beam];
    }

    /// @brief Pause a request by moving the generated tokens to the prompt
    /// @param maxInputLen The maximum prompt len.
    void pause(SizeType32 maxInputLen)
    {
        // TODO: For beamWidth > 1, we would need to support swapping to avoid
        // recomputing from the start
        // As a temporary solution, we currently reset the tokens to the prompt
        if (mSamplingConfig.beamWidth > 1)
        {
            for (std::size_t beam = 0; beam < mTokens.size(); ++beam)
            {
                auto& beamTokens = mTokens.at(beam);
                beamTokens.resize(mPromptLen);
                if (returnLogProbs())
                {
                    mLogProbs.at(beam).clear();
                }
            }
        }
        else
        {
            SizeType32 newPromptLen = std::min(maxInputLen, mPromptLen + getMaxNumGeneratedTokens());
            for (std::size_t beam = 0; beam < mTokens.size(); ++beam)
            {
                auto& beamTokens = mTokens.at(beam);
                beamTokens.resize(newPromptLen);

                if (returnLogProbs())
                {
                    auto& logProb = mLogProbs.at(beam);
                    logProb.resize(newPromptLen - mPromptLen);
                }
            }
            mMaxNewTokens -= (newPromptLen - mPromptLen);
            mPromptLen = newPromptLen;
        }

        // for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase
        mState = mEncoderTokens.has_value() ? REQUEST_STATE_ENCODER_INIT : REQUEST_STATE_CONTEXT_INIT;
        mContextCurrentPosition = 0;
        mContextChunkSize = std::nullopt;
        mSeqSlot.reset();
    }

    /// @brief Get the maximum length of the tokens returned to the client. Used to ensure we don't
    /// return duplicated tokens to the client.
    /// @return The maximum length of the tokens sent to the client.
    [[nodiscard]] SizeType32 getMaxSentTokenLen() const
    {
        return mMaxSentTokenLen;
    }

    /// @brief Sets the maximum length of the tokens returned to the client. Used to ensure we don't
    /// return duplicated tokens to the client.
    /// @param maxSentLength The new maximum length.
    void setMaxSentTokenLen(SizeType32 maxSentLength)
    {
        mMaxSentTokenLen = maxSentLength;
    }

    [[nodiscard]] std::optional<TensorPtr> getPromptEmbeddingTable() const
    {
        return mPromptEmbeddingTable;
    }

    [[nodiscard]] std::optional<SizeType32> getPromptVocabSize() const
    {
        return mPromptVocabSize;
    }

    [[nodiscard]] std::optional<LoraTaskIdType> getLoraTaskId() const
    {
        return mLoraTaskId;
    }

    void setLoraTaskId(LoraTaskIdType taskId)
    {
        mLoraTaskId = taskId;
    }

    [[nodiscard]] std::optional<TensorPtr> getLoraWeights() const
    {
        return mLoraWeights;
    }

    void setLoraWeights(TensorPtr weights)
    {
        mLoraWeights = weights;
    }

    void clearLoraWeights()
    {
        mLoraWeights = std::nullopt;
    }

    [[nodiscard]] std::optional<TensorPtr> getLoraConfig() const
    {
        return mLoraConfig;
    }

    void setLoraConfig(TensorPtr config)
    {
        mLoraConfig = config;
    }

    void clearLoraConfig()
    {
        mLoraConfig = std::nullopt;
    }

    [[nodiscard]] std::optional<TensorPtr> getEmbeddingBias() const
    {
        return mEmbeddingBias;
    }

    [[nodiscard]] std::optional<TensorPtr> getBadWordsList() const
    {
        return mBadWordsList;
    }

    [[nodiscard]] std::optional<TensorPtr> getStopWordsList() const
    {
        return mStopWordsList;
    }

    [[nodiscard]] bool returnLogProbs() const
    {
        return mSamplingConfig.outputLogProbs.has_value() ? mSamplingConfig.outputLogProbs->at(0) : false;
    }

    void setReturnLogProbs(bool returnLogProbs)
    {
        mSamplingConfig.outputLogProbs = {{returnLogProbs}};
        mSamplingConfig.cumLogProbs = {{returnLogProbs}};
    }

    [[nodiscard]] std::vector<VecLogProbs> const& getLogProbs() const
    {
        return mLogProbs;
    }

    [[nodiscard]] VecLogProbs const& getLogProbs(SizeType32 beam) const
    {
        return mLogProbs.at(beam);
    }

    void setLogProbs(VecLogProbs const& logProbs, SizeType32 beam)
    {
        mLogProbs.at(beam).resize(mPromptLen - mOrigPromptLen);
        mLogProbs.at(beam).insert(mLogProbs.at(beam).end(), logProbs.begin(), logProbs.end());
    }

    [[nodiscard]] VecLogProbs const& getCumLogProbs() const
    {
        return mCumLogProbs;
    }

    void setCumLogProb(float cumLogProb, SizeType32 beam)
    {
        mCumLogProbs.at(beam) = cumLogProb;
    }

    [[nodiscard]] SizeType32 getOrigPromptLen() const
    {
        return mOrigPromptLen;
    }

    void setPrepopulatedPromptLen(SizeType32 prepopulatedPromptLen)
    {
        mPrepopulatedPromptLen = prepopulatedPromptLen;
    }

    [[nodiscard]] SizeType32 getPrepopulatedPromptLen() const
    {
        return mPrepopulatedPromptLen;
    }

    void setDraftTokens(std::shared_ptr<VecTokens> const& draftTokens)
    {
        mDraftTokens = draftTokens;
    }

    void setDraftLogits(std::optional<TensorPtr> const& draftLogits)
    {
        mDraftLogits = draftLogits;
    }

    [[nodiscard]] SizeType32 getNumDraftTokens() const
    {
        return mDraftTokens->size();
    }

    void discardDraftTokens(SizeType32 numTokensToDiscard)
    {
        TLLM_CHECK_WITH_INFO(
            numTokensToDiscard > 0, "Can only discard a positive amount of draft tokens, got %d", numTokensToDiscard);
        TLLM_CHECK_WITH_INFO(numTokensToDiscard <= getNumDraftTokens(),
            "Can't discard more draft tokens (%d) than exist (%d).", numTokensToDiscard, getNumDraftTokens());
        mDraftTokens->resize(getNumDraftTokens() - numTokensToDiscard);
    }

    void setNumTokensPerIteration(SizeType32 numTokensPerIteration)
    {
        mNumTokensPerIteration = std::max(1, numTokensPerIteration);
    }

    [[nodiscard]] SizeType32 getNumTokensPerIteration() const
    {
        return mNumTokensPerIteration;
    }

    void setReturnEncoderOutput(bool const returnEncoderOutput)
    {
        mReturnEncoderOutput = returnEncoderOutput;
    }

    [[nodiscard]] bool getReturnEncoderOutput() const
    {
        return mReturnEncoderOutput;
    }

    [[nodiscard]] TensorPtr const& getEncoderOutputHost() const
    {
        return mEncoderOutputHost;
    }

    void setEncoderOutputHost(TensorPtr encoderOutputHost)
    {
        mEncoderOutputHost = std::move(encoderOutputHost);
    }

    void allocEncoderOutputHost(SizeType32 encoderHiddenSize, nvinfer1::DataType dataType)
    {
        mEncoderOutputHost = runtime::BufferManager::pinned(
            runtime::ITensor::makeShape({getEncoderLen(), encoderHiddenSize}), dataType);
    }

    [[nodiscard]] TensorPtr const& getEncoderOutput() const noexcept
    {
        return mEncoderOutput;
    }

    [[nodiscard]] TensorPtr const& getEncoderHiddenStates() const noexcept
    {
        return mEncoderHiddenStates;
    }

    void allocEncoderOutput(runtime::BufferManager const& manager, nvinfer1::DataType dataType)
    {
        // unique_ptr --> shared_ptr ownership move
        mEncoderOutput = std::move(manager.emptyTensor(runtime::MemoryType::kGPU, dataType));
    }

    void allocEncoderHiddenStates(runtime::BufferManager const& manager, nvinfer1::DataType dataType)
    {
        // unique_ptr --> shared_ptr ownership move
        mEncoderHiddenStates = std::move(manager.emptyTensor(runtime::MemoryType::kGPU, dataType));
    }

    void freeEncoderOutputBuffers()
    {
        TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

        TLLM_LOG_DEBUG(
            "Encoder output buffers use count: %u, %u", mEncoderOutput.use_count(), mEncoderHiddenStates.use_count());

        // TODO: better ways to free shared_ptr buffers
        mEncoderOutput.reset();
        mEncoderHiddenStates.reset();

        TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
    }

    [[nodiscard]] bool constexpr isStreaming() const noexcept
    {
        return mIsStreaming;
    }

    void constexpr setStreaming(bool isStreaming) noexcept
    {
        mIsStreaming = isStreaming;
    }

    void setPriority(executor::PriorityType priority) noexcept
    {
        mPriority = priority;
    }

    void setReturnAllGeneratedTokens(bool const returnAllGeneratedTokens)
    {
        TLLM_CHECK_WITH_INFO(!mIsStreaming || mSamplingConfig.beamWidth == 1 || returnAllGeneratedTokens,
            "returnAllGeneratedTokens must be true if streaming AND beam search are used.");
        mReturnAllGeneratedTokens = returnAllGeneratedTokens;
    }

    void setReturnContextLogits(bool const returnContextLogits)
    {
        mReturnContextLogits = returnContextLogits;
    }

    [[nodiscard]] bool getReturnContextLogits() const
    {
        return mReturnContextLogits;
    }

    void setReturnGenerationLogits(bool const returnGenerationLogits)
    {
        mReturnGenerationLogits = returnGenerationLogits;
    }

    [[nodiscard]] bool getReturnGenerationLogits() const
    {
        return mReturnGenerationLogits;
    }

    [[nodiscard]] TensorPtr const& getContextLogitsHost() const
    {
        return mContextLogitsHost;
    }

    /// @param contextLogitsHost Expected shape [promptLen, vocabSizePadded]
    void setContextLogitsHost(TensorPtr contextLogitsHost)
    {
        mContextLogitsHost = std::move(contextLogitsHost);
    }

    void allocContextLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType)
    {
        mContextLogitsHost = runtime::BufferManager::pinnedPool(
            runtime::ITensor::makeShape({mPromptLen, vocabSizePadded}), logitsDataType);
    }

    [[nodiscard]] TensorPtr const& getGenerationLogitsHost() const
    {
        return mGenerationLogitsHost;
    }

    /// @param generationLogitsHost Expected shape
    /// * [beamWidth, maxNewTokens, vocabSizePadded] for non-speculative decoding
    /// * [1, numDraftTokens + 1, vocabSizePadded] for speculative decoding
    void setGenerationLogitsHost(TensorPtr generationLogitsHost)
    {
        mGenerationLogitsHost = std::move(generationLogitsHost);
    }

    void allocGenerationLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType)
    {
        mGenerationLogitsHost = runtime::BufferManager::pinnedPool(
            runtime::ITensor::makeShape({mSamplingConfig.beamWidth, mMaxNewTokens, vocabSizePadded}), logitsDataType);
    }

    void allocTargetModelAcceptedTokenLogitsHost(SizeType32 vocabSizePadded, nvinfer1::DataType logitsDataType)
    {
        mGenerationLogitsHost = runtime::BufferManager::pinnedPool(
            runtime::ITensor::makeShape({1, getNumDraftTokens() + 1, vocabSizePadded}), logitsDataType);
    }

    [[nodiscard]] std::vector<TensorPtr> const& getGenerationLogitsFragments() const
    {
        return mGenerationLogitsFragments;
    }

    void addGenerationLogitsFragment(TensorPtr& genLogits)
    {
        mGenerationLogitsFragments.push_back(genLogits);
    }

    SizeType32 getGenerationLogitsFragmentsSize()
    {
        return mGenerationLogitsFragments.size();
    }

    void clearGenerationLogitsFragments()
    {
        mGenerationLogitsFragments.clear();
    }

    [[nodiscard]] bool hasReachedState(LlmRequestState_t state) const noexcept
    {
        return mState >= state;
    }

    [[nodiscard]] bool isEncoderInitState() const noexcept
    {
        return mState == REQUEST_STATE_ENCODER_INIT;
    }

    [[nodiscard]] bool isContextInitState() const noexcept
    {
        return mState == REQUEST_STATE_CONTEXT_INIT;
    }

    [[nodiscard]] bool isGenerationInProgressState() const noexcept
    {
        return mState == REQUEST_STATE_GENERATION_IN_PROGRESS || mState == REQUEST_STATE_GENERATION_TO_COMPLETE;
    }

    [[nodiscard]] bool isGenerationCompleteState() const noexcept
    {
        return mState == REQUEST_STATE_GENERATION_COMPLETE;
    }

    /// Determines whether the context is unchunked. Note that a context which has been chunked, even
    /// into a single chunk, is still distinct from the unchunked state, which indicates the initial status.
    [[nodiscard]] bool isFullContextRequest() const noexcept
    {
        return isContextInitState() && !mContextChunkSize;
    }

    void setContextCurrentPosition(SizeType32 contextCurrentPosition)
    {
        mContextCurrentPosition = contextCurrentPosition;
    }

    /// When chunked, the position of the current chunk is returned. Otherwise, only the beginning
    /// or end of the context is returned.
    [[nodiscard]] SizeType32 getContextCurrentPosition() const noexcept
    {
        return mContextCurrentPosition;
    }

    /// Return the length of the context that has not yet been processed.
    [[nodiscard]] SizeType32 getContextRemainingLength() const noexcept
    {
        return mPromptLen - getContextCurrentPosition();
    }

    /// Retrieves the context chunk size; throws an exception when the context is not chunked.
    [[nodiscard]] SizeType32 getContextChunkSize() const
    {
        TLLM_CHECK_WITH_INFO(
            isContextInitState() && mContextChunkSize, "The current request is not in context chunking state.");
        return mContextChunkSize.value();
    }

    /// Sets the context chunk size; throws an exception when the chunk size is negative. If the chunk
    /// size is greater than the remaining length of the context, the size will be reduced to fit the
    /// remaining length.
    void setContextChunkSize(SizeType32 size)
    {
        TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase.");
        TLLM_CHECK_WITH_INFO(size >= 0, "The chunk size of context (%d) can't be negative.", size);
        mContextChunkSize = std::min(size, getContextRemainingLength());
    }

    /// Determines whether the current position is only one chunk away from the end of the context.
    /// It will return true when the context is not chunked.
    [[nodiscard]] bool isLastContextChunk() const noexcept
    {
        return isFullContextRequest()
            || (isContextInitState() && getContextCurrentPosition() + getContextChunkSize() == mPromptLen);
    }

    /// Returns whether the position is at the beginning of the context. It will return true when the
    /// context is not chunked.
    [[nodiscard]] bool isFirstContextChunk() const noexcept
    {
        return isFullContextRequest() || getContextCurrentPosition() == 0;
    }

    [[nodiscard]] executor::PriorityType priority() const noexcept
    {
        return mPriority;
    }

    /// Move the cursor forward one chunk. When not chunked, move forward to the end of the context.
    void moveToNextContextChunk()
    {
        TLLM_CHECK_WITH_INFO(isContextInitState(), "Chunking is only possible during the context phase.");
        if (mContextChunkSize)
        {
            mContextCurrentPosition += getContextChunkSize();
            setContextChunkSize(0);
        }
        else
        {
            TLLM_CHECK_WITH_INFO(mContextCurrentPosition == 0, "Full context out of bounds.");
            mContextCurrentPosition = mPromptLen;
        }
    }
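
    // Example (illustrative sketch, not part of the original header): a scheduler that processes the
    // context phase in fixed-size chunks could drive the API above roughly like this, assuming a
    // hypothetical runContextChunk() that consumes the range starting at getContextCurrentPosition():
    //
    //     while (request.getContextRemainingLength() > 0)
    //     {
    //         request.setContextChunkSize(kvCacheBlockSize); // clamped to the remaining length
    //         runContextChunk(request);
    //         request.moveToNextContextChunk();
    //     }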

    /// Increment the counter of decoding iterations.
    void advanceDecodingIter()
    {
        mDecodingIter++;
    }

    /// @brief Return the average number of decoded tokens per iteration. For a standard model it is 1.
    /// For speculative decoding models it is >= 1: the number of draft tokens accepted per step plus 1.
    [[nodiscard]] float getAvgDecodedTokensPerIter() const noexcept
    {
        if (mDecodingIter == 0)
        {
            return 0.f;
        }
        return static_cast<float>(getMaxNumGeneratedTokens()) / mDecodingIter;
    }
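
    // Example (illustrative numbers, not part of the original header): if speculative decoding accepted
    // enough draft tokens that 40 tokens were generated over 10 decoding iterations, this returns
    // 40 / 10 = 4.0; a standard autoregressive request yields 1.0.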

    /// @brief Create a Response from the current state of the request
    /// @return An optional Response
    std::optional<executor::Response> createResponse()
    {
        if (isGenerationCompleteState() || (mIsStreaming && isGenerationInProgressState()))
        {
            TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);

            executor::Result result;
            result.isFinal = isGenerationCompleteState();

            auto const nbBeams = mSamplingConfig.beamWidth;
            auto const maxNbTokens = getMaxBeamNumTokens();

            auto const calculateNbTokensOut = [this](SizeType32 maxNbTokens)
            {
                if (!mIsStreaming)
                {
                    return maxNbTokens - (mExcludeInputFromOutput ? getOrigPromptLen() : 0);
                }
                return mReturnAllGeneratedTokens ? maxNbTokens - getOrigPromptLen()
                                                 : maxNbTokens - getMaxSentTokenLen();
            };

            auto const maxNbTokensOut = calculateNbTokensOut(maxNbTokens);

            result.outputTokenIds.resize(nbBeams);

            auto const startTokenPos = maxNbTokens - maxNbTokensOut;

            auto const shouldSendResponse
                = isGenerationCompleteState() || (mIsStreaming && maxNbTokens > getMaxSentTokenLen());

            if (!shouldSendResponse)
            {
                return std::nullopt;
            }
            else
            {
                for (SizeType32 beam = 0; beam < nbBeams; ++beam)
                {
                    auto const& tokens = getTokens(beam);
                    auto const nbTokensOut = calculateNbTokensOut(tokens.size());

                    if (nbTokensOut > 0)
                    {
                        auto const first = tokens.data() + startTokenPos;
                        result.outputTokenIds.at(beam).assign(first, first + nbTokensOut);
                    }
                }

                if (returnLogProbs())
                {
                    result.cumLogProbs = getCumLogProbs();
                    result.logProbs = getLogProbs();
                }

                if (getReturnContextLogits())
                {
                    result.contextLogits = executor::detail::ofITensor(getContextLogitsHost());
                }

                if (getReturnGenerationLogits())
                {
                    result.generationLogits = executor::detail::ofITensor(getGenerationLogitsHost());
                }

                if (getReturnEncoderOutput())
                {
                    result.encoderOutput = executor::detail::ofITensor(getEncoderOutputHost());
                }

                // Update position of last sent response
                setMaxSentTokenLen(maxNbTokens);

                auto response = executor::Response(mRequestId, std::move(result));
                return response;
            }
        }
        else
        {
            return std::nullopt;
        }
    }
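
    // Example (illustrative sketch, not part of the original header): a serving loop that streams
    // partial results could poll createResponse() after every decoder step; only tokens generated
    // past getMaxSentTokenLen() are included unless mReturnAllGeneratedTokens is set.
    //
    //     if (auto response = request.createResponse())
    //     {
    //         sendToClient(*response); // hypothetical transport call
    //     }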

    RequestIdType mRequestId;
    SizeType32 mPromptLen;
    SizeType32 mMaxNewTokens;
    // Tokens [beam_size, mPromptLen + getMaxNumGeneratedTokens()]
    runtime::SamplingConfig mSamplingConfig;
    LlmRequestState_t mState;
    std::optional<TokenIdType> mEndId;
    std::optional<TokenIdType> mPadId;
    std::optional<SizeType32> mSeqSlot;
    std::optional<LogitsPostProcessor> mLogitsPostProcessor;
    bool mApplyLogitsPostProcessorBatched;
    std::optional<RequestIdType> mClientId;
    // Position of mask token in GLM model inputs
    SizeType32 mMaskPosition{0};

protected:
    bool mIsStreaming;

    // A list of tokens generated at the current step.
    // Used to pass the decoded tokens as the input to the next step.
    // `mLastTokens[beam] != mTokens.back()[beam]` for streaming + beam search
    // as `mTokens` will be overwritten by the gathered tokens.
    VecTokens mLastTokens;
    BeamTokens mTokens;
    SizeType32 mOrigPromptLen;
    // Number of tokens already in KV cache before context phase.
    // A value > 0 indicates cached KV cache blocks were reused.
    // Up to inputLen - 1 tokens can be reused.
    SizeType32 mPrepopulatedPromptLen{0};
    SizeType32 mMaxSentTokenLen;

    std::optional<TensorPtr> mEmbeddingBias;
    std::optional<TensorPtr> mBadWordsList;
    std::optional<TensorPtr> mStopWordsList;

    std::optional<TensorPtr> mPromptEmbeddingTable;
    std::optional<SizeType32> mPromptVocabSize;

    std::optional<LoraTaskIdType> mLoraTaskId;
    std::optional<TensorPtr> mLoraWeights;
    std::optional<TensorPtr> mLoraConfig;

    // To enable chunked context, the FMHA paged kv-cache also needs to be enabled. Except for the last one,
    // the size of each context chunk needs to be an integer multiple of the kv-cache block size. A null
    // value means that the context is not chunked.
    std::optional<SizeType32> mContextChunkSize;
    SizeType32 mContextCurrentPosition;

    std::vector<VecLogProbs> mLogProbs; // [beamSize, seqLen]
    VecLogProbs mCumLogProbs;           // [beamSize]
    std::shared_ptr<VecTokens> mDraftTokens;
    std::optional<TensorPtr> mDraftLogits;
    SizeType32 mNumTokensPerIteration;

    // whether to return the full beams on each iteration. True when doing streaming + beam search
    bool mReturnAllGeneratedTokens;
    // Save logits
    bool mReturnContextLogits;
    bool mReturnGenerationLogits;
    bool mReturnLogProbs;
    TensorPtr mContextLogitsHost;    // [mPromptLen, vocab_size_padded]
    TensorPtr mGenerationLogitsHost; // [beam_size, mMaxNewTokens, vocab_size_padded]
    std::vector<TensorPtr> mGenerationLogitsFragments;

    bool mExcludeInputFromOutput;

    // Encoder-only and Encoder-Decoder models
    // Encoder input tokens
    std::optional<std::shared_ptr<VecTokens>> mEncoderTokens;
    bool mReturnEncoderOutput;
    // Encoder output, used to compute cross attention KV Cache
    TensorPtr mEncoderOutput;       // [numTokens, hidden_size]
    TensorPtr mEncoderHiddenStates; // for pipeline parallelism, [numTokens, hiddenSize]
    TensorPtr mEncoderOutputHost;

    SizeType32 mDecodingIter;
    executor::PriorityType mPriority;

private:
    void initialize(VecTokens const& inputTokens, bool outputLogProbs)
    {
        // Scatter the input tokens to other beam
        mTokens = BeamTokens(mSamplingConfig.beamWidth, inputTokens);
        mLastTokens = VecTokens(mSamplingConfig.beamWidth);

        if ((mPromptEmbeddingTable.has_value() && !mPromptVocabSize.has_value())
            || (!mPromptEmbeddingTable.has_value() && mPromptVocabSize.has_value()))
        {
            std::string errStr
                = "Prompt embedding table and prompt vocab size tensors must both be provided for requests with "
                  "prompt tuning enabled.";
            TLLM_THROW(errStr);
        }

        if (mDraftLogits.has_value() && mDraftTokens->empty())
        {
            TLLM_THROW("Draft tokens must be specified when draft logits are given.");
        }

        setReturnLogProbs(outputLogProbs);
    }

    TensorPtr createListTensor(std::list<VecTokens> const& wordsList)
    {
        std::vector<SizeType32> offsets;
        VecTokens words;
        SizeType32 offsetCnt = 0;
        for (auto const& tokens : wordsList)
        {
            offsetCnt += tokens.size();
            offsets.push_back(offsetCnt);
            words.insert(words.end(), tokens.begin(), tokens.end());
        }
        offsets.resize(words.size(), -1);

        SizeType32 numWords = static_cast<SizeType32>(words.size());
        auto shape = runtime::ITensor::makeShape({2, numWords});
        auto tensor = runtime::BufferManager::pinnedPool(shape, nvinfer1::DataType::kINT32);
        auto data = runtime::bufferCast<int32_t>(*tensor);
        std::memcpy(data, words.data(), numWords * sizeof(int32_t));
        std::memcpy(data + numWords, offsets.data(), numWords * sizeof(int32_t));

        // Add leading dim of 1
        tensor->unsqueeze(0);

        return tensor;
    }
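
    // Worked example (illustrative, not part of the original header): for a bad-words list
    // {{5, 6}, {7}}, createListTensor produces words = [5, 6, 7] and cumulative offsets = [2, 3, -1],
    // i.e. a [1, 2, 3] tensor whose first row holds the flattened token ids and whose second row holds
    // the end offset of each word, padded with -1.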
};

class LlmRequest : public GenericLlmRequest<runtime::ITensor::SharedPtr>
{
public:
    using Base = GenericLlmRequest<runtime::ITensor::SharedPtr>;
    using TensorPtr = Base::TensorPtr;
    using SizeType32 = Base::SizeType32;
    using TokenIdType = Base::TokenIdType;
    using RequestIdType = Base::RequestIdType;
    using VecLogProbs = Base::VecLogProbs;
    using BeamTokens = Base::BeamTokens;
    using VecTokens = Base::VecTokens;

    LlmRequest(RequestIdType requestId, SizeType32 maxNewTokens, std::shared_ptr<VecTokens> inputTokens,
        runtime::SamplingConfig const& samplingConfig, bool isStreaming, std::optional<SizeType32> endId = std::nullopt,
        std::optional<SizeType32> padId = std::nullopt, std::optional<TensorPtr> embeddingBias = std::nullopt,
        std::optional<TensorPtr> badWordsList = std::nullopt, std::optional<TensorPtr> stopWordsList = std::nullopt,
        std::optional<TensorPtr> promptEmbeddingTable = std::nullopt,
        std::optional<SizeType32> promptVocabSize = std::nullopt,
        std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
        std::optional<TensorPtr> loraConfig = std::nullopt, bool returnLogProbs = false,
        bool returnContextLogits = false, bool returnGenerationLogits = false,
        std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
        std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
        std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
        bool applyLogitsPostProcessorBatched = false,
        std::optional<std::shared_ptr<VecTokens>> encoderInputTokens = std::nullopt, bool returnEncoderOutput = false,
        std::optional<RequestIdType> clientId = std::nullopt,
        executor::PriorityType priority = executor::Request::kDefaultPriority)
        : Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
            std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList),
            std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
            returnLogProbs, returnContextLogits, returnGenerationLogits, std::move(draftTokens), std::move(draftLogits),
            excludeInputFromOutput, std::move(logitsPostProcessor), applyLogitsPostProcessorBatched,
            std::move(encoderInputTokens), returnEncoderOutput, clientId, priority)
    {
    }

    LlmRequest(RequestIdType requestId, executor::Request const& request,
        std::optional<Base::LogitsPostProcessor> logitsPostProcessor = std::nullopt,
        bool applyLogitsPostProcessorBatched = false)
        : Base(requestId, request)
    {
        mLogitsPostProcessor = std::move(logitsPostProcessor);
        mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched;
    }
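
    // Example (illustrative sketch, not part of the original header): the typical path from the
    // executor API into the batch manager wraps an executor::Request into an LlmRequest, assuming a
    // hypothetical request-id allocator and model limits:
    //
    //     auto executorRequest = executor::Request({1, 2, 3}, /* maxNewTokens */ 64);
    //     auto llmRequest = std::make_shared<LlmRequest>(nextRequestId(), executorRequest);
    //     llmRequest->validate(maxInputLen, maxSequenceLen, maxDraftLen);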

    void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager)
    {
        if (!mPromptEmbeddingTable.has_value()
            || mPromptEmbeddingTable.value()->getMemoryType() == runtime::MemoryType::kGPU)
        {
            return;
        }
        else
        {
            TensorPtr gpuPromptEmbeddingTable
                = manager.copyFrom(*mPromptEmbeddingTable.value(), runtime::MemoryType::kGPU);
            mPromptEmbeddingTable = gpuPromptEmbeddingTable;
        }
    }

    void moveLoraWeightsToGpu(runtime::BufferManager const& manager)
    {
        if (!mLoraWeights.has_value() || mLoraWeights.value()->getMemoryType() == runtime::MemoryType::kGPU)
        {
            return;
        }
        // TODO for tp / pp models we only need to move the bits that belong on the local device
        TensorPtr gpuLoraWeights = manager.copyFrom(*mLoraWeights.value(), runtime::MemoryType::kGPU);
        mLoraWeights = gpuLoraWeights;
    }
};

} // namespace tensorrt_llm::batch_manager