Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None] [refactor] Minor cleanup and improvements (#7619)
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
This commit is contained in: parent ba3dbb6c94, commit e2f69c5c23
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
@@ -28,11 +28,7 @@

namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;

namespace decoder
{
@@ -56,10 +52,6 @@ public:
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;

@@ -70,7 +62,7 @@ public:
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
@@ -78,8 +70,7 @@ public:
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
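Note on the OptionalRef<MedusaBuffers const> parameter above: tensorrt_llm::common::OptionalRef (from common/optionalRef.h) acts as an optional, non-owning reference, so Medusa buffers can be omitted when speculative decoding is not in use. The actual header is not shown in this diff; the snippet below is only a minimal, illustrative stand-in for how such a wrapper typically behaves.

    #include <cstdio>

    // Minimal stand-in for an optional non-owning reference wrapper
    // (illustrative only; the real tensorrt_llm::common::OptionalRef may differ).
    template <typename T>
    class OptionalRefSketch
    {
    public:
        OptionalRefSketch() = default;            // empty: no referenced object
        OptionalRefSketch(T& ref) : mPtr(&ref) {} // bind to an existing object

        explicit operator bool() const { return mPtr != nullptr; }
        T& operator*() const { return *mPtr; }
        T* operator->() const { return mPtr; }

    private:
        T* mPtr = nullptr; // non-owning: the caller guarantees the lifetime
    };

    struct MedusaBuffersSketch { int numPaths = 4; }; // hypothetical payload

    int main()
    {
        MedusaBuffersSketch buffers;
        OptionalRefSketch<MedusaBuffersSketch const> maybeBuffers(buffers);
        OptionalRefSketch<MedusaBuffersSketch const> noBuffers; // e.g. when Medusa is disabled

        if (maybeBuffers)
        {
            std::printf("numPaths = %d\n", maybeBuffers->numPaths);
        }
        std::printf("noBuffers bound: %s\n", noBuffers ? "yes" : "no");
        return 0;
    }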
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed

// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
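The reordering above moves kGENERATION_TO_COMPLETE (14) so that it sits before the "// schedulable states ends" marker. If schedulability is decided by comparing a state against a boundary value, the enum layout is load-bearing; the helper below is a hypothetical sketch of such a check built only from the values shown in this hunk, not the actual TensorRT-LLM predicate.

    #include <cstdint>
    #include <cstdio>

    // Trimmed-down copy of the state values visible in the diff.
    enum class LlmRequestState : int32_t
    {
        kGENERATION_IN_PROGRESS = 13,
        kGENERATION_TO_COMPLETE = 14, // now listed among the schedulable states
        kGENERATION_COMPLETE = 20,
        kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21,
    };

    // Hypothetical boundary check: treat everything below kGENERATION_COMPLETE as schedulable.
    constexpr bool isSchedulableSketch(LlmRequestState state) noexcept
    {
        return static_cast<int32_t>(state) < static_cast<int32_t>(LlmRequestState::kGENERATION_COMPLETE);
    }

    static_assert(isSchedulableSketch(LlmRequestState::kGENERATION_TO_COMPLETE), "14 is below the boundary");
    static_assert(!isSchedulableSketch(LlmRequestState::kGENERATION_COMPLETE), "20 is the first non-schedulable value");

    int main()
    {
        std::printf("kGENERATION_TO_COMPLETE schedulable: %d\n",
            isSchedulableSketch(LlmRequestState::kGENERATION_TO_COMPLETE));
        return 0;
    }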
@@ -1075,7 +1077,6 @@ public:
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);

auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1116,9 +1117,9 @@ public:
mDraftLogits = draftLogits;
}

[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}

void discardDraftTokens(SizeType32 numTokensToDiscard)
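The getNumDraftTokens() change adds noexcept and an explicit cast: mDraftTokens->size() returns std::size_t, and returning it as a 32-bit SizeType32 otherwise relies on an implicit narrowing conversion that many compilers warn about. A minimal, self-contained illustration of the same pattern, assuming SizeType32 is a 32-bit signed integer as elsewhere in the codebase:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using SizeType32 = std::int32_t;

    // Without the cast, returning vec.size() (std::size_t) as SizeType32 narrows implicitly
    // and typically triggers -Wconversion / C4267 style warnings.
    [[nodiscard]] SizeType32 countTokens(std::vector<int> const& vec) noexcept
    {
        return static_cast<SizeType32>(vec.size());
    }

    int main()
    {
        std::vector<int> draftTokens{101, 102, 103};
        std::printf("num draft tokens: %d\n", countTokens(draftTokens));
        return 0;
    }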
@@ -1379,17 +1380,17 @@ public:
mGenerationLogitsFragments.push_back(genLogits);
}

SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}

void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}

bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}

@@ -1478,7 +1478,8 @@ private:
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;
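The rewritten kDefaultMaxSeqIdleMicroseconds expresses the same default through std::chrono instead of a bare 180000000, which makes the unit and the intent (3 minutes) explicit. A small compile-time check, mirroring the expression in the diff, confirms the two forms are equal:

    #include <chrono>
    #include <cstdint>

    // Same default value as before, but the unit (microseconds) and the intent (3 minutes) are explicit.
    static constexpr std::uint64_t kDefaultMaxSeqIdleMicroseconds
        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

    static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000ULL,
        "3 minutes == 180,000,000 microseconds, matching the previous literal");

    int main()
    {
        return 0;
    }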
@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>

namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}

@@ -43,7 +41,7 @@ public:
mExecutionConfig = config;
}

executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
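The getExecutionConfig() change is a small API cleanup: returning executor::LookaheadDecodingConfig const& instead of a const value avoids copying the config on every call (top-level const on a by-value return adds nothing), and [[nodiscard]] flags accidentally discarded results. A generic sketch of the pattern, using a hypothetical config struct rather than the real LookaheadDecodingConfig:

    #include <cstdio>
    #include <string>
    #include <utility>

    struct ConfigSketch // hypothetical stand-in for LookaheadDecodingConfig
    {
        int windowSize = 4;
        std::string name = "lookahead";
    };

    class ModuleSketch
    {
    public:
        // Returns a reference to the stored config: no copy, and the caller
        // cannot silently drop the result thanks to [[nodiscard]].
        [[nodiscard]] ConfigSketch const& getExecutionConfig() const noexcept
        {
            return mExecutionConfig;
        }

        void setExecutionConfig(ConfigSketch config) { mExecutionConfig = std::move(config); }

    private:
        ConfigSketch mExecutionConfig;
    };

    int main()
    {
        ModuleSketch module;
        module.setExecutionConfig(ConfigSketch{8, "lookahead-8"});
        ConfigSketch const& cfg = module.getExecutionConfig(); // binds to the member, no copy
        std::printf("%s window=%d\n", cfg.name.c_str(), cfg.windowSize);
        return 0;
    }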
@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <NvInferRuntime.h>
#include <array>

@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;

namespace tensorrt_llm::batch_manager

@@ -39,8 +39,8 @@ public:

if (offset + size > mBuffer->getSize())
{
throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
+ std::to_string(mBuffer->getSize()));
throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+ std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
}
}
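The updated out_of_range message reports offset and size separately instead of only their sum, which makes a failed slice easier to diagnose. A standalone sketch of the same guard; checkSliceSketch and its arguments are illustrative, not the actual TensorRT-LLM buffer class:

    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Illustrative guard: reject a slice [offset, offset + size) that runs past the buffer end.
    void checkSliceSketch(std::size_t offset, std::size_t size, std::size_t bufferSize)
    {
        if (offset + size > bufferSize)
        {
            throw std::out_of_range(std::string("offset ") + std::to_string(offset) + " + size "
                + std::to_string(size) + " exceeds buffer size " + std::to_string(bufferSize));
        }
    }

    int main()
    {
        try
        {
            checkSliceSketch(8, 16, 32);  // fine: 8 + 16 <= 32
            checkSliceSketch(24, 16, 32); // throws: 24 + 16 > 32
        }
        catch (std::out_of_range const& e)
        {
            std::printf("caught: %s\n", e.what());
        }
        return 0;
    }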
@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
}

// Pick a different endId at random from one of the expected tokens
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
std::vector<SizeType32> const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, std::vector<SizeType32> const& givenInputLengths,
SizeType32 const maxNewTokens, bool replaceLogits)
{
auto const nbGivenInputs = testData.nbGivenInputs;
auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelT
return endIds;
}

TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
bool const replaceLogits, BufferManager& manager)
TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
{
auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelTy

if (useRandomEndId)
{
testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
}
else
{
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelTy
}

std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> loadTestData(ModelSpec const& modelSpec,
TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
{
// Map between beam width, and expected results for that beam width
std::unordered_map<SizeType32, TestData> beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
beamWidths.push_back(beamWidth);

auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
useRandomEndId, replaceLogits, manager);
auto testData = loadTestData(
modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
beamWidthTestData.emplace(beamWidth, std::move(testData));
}

@@ -435,9 +434,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
RequestList runGptModelInference(std::shared_ptr<TrtGptModel>& trtGptModel, std::vector<SizeType32> const& beamWidths,
std::unordered_map<SizeType32, TestData> const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
SizeType32 maxInputLength, SizeType32 padId, std::vector<SizeType32> const& givenInputLengths,
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
bool enableBlockReuse)
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
{
// Fill the requests using givenInput
// requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds

auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
// Load expected outputs for each beam width value
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
*givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);

int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
// Prepopulate KV cache for speculative decoding test
bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);

if (prepopulateKVCache)
{
// Call the 2nd time with prefilled KV cache
finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
}

@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
EXPECT_FALSE(llmReq.mSeqSlot);
// No speculative decoding config, draft tokens should be empty
EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
EXPECT_FALSE(llmReq.getBadWordsList().has_value());
EXPECT_FALSE(llmReq.getStopWordsList().has_value());