Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
[None] [refactor] Minor cleanup and improvements (#7619)
Signed-off-by: Robin Kobus <19427718+Funatiq@users.noreply.github.com>
This commit is contained in: parent ba3dbb6c94, commit e2f69c5c23
@@ -1,5 +1,5 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-FileCopyrightText: Copyright (c) 2022-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -20,7 +20,7 @@
#include "tensorrt_llm/batch_manager/common.h"
#include "tensorrt_llm/common/algorithm.h"
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
@@ -28,11 +28,7 @@

namespace tensorrt_llm::runtime
{
class DecodingInput;
class DecodingOutput;
class GptDecoderBatched;
class SamplingConfig;
class SpeculativeDecodingMode;

namespace decoder
{
@@ -56,10 +52,6 @@ public:
using CudaStream = tensorrt_llm::runtime::CudaStream;
using TensorPtr = runtime::ITensor::SharedPtr;
using SharedConstPtr = runtime::ITensor::SharedConstPtr;
using DecodingInput = runtime::DecodingInput;
using DecodingOutput = runtime::DecodingOutput;
using SpeculativeDecodingMode = runtime::SpeculativeDecodingMode;
using GptDecoderBatched = runtime::GptDecoderBatched;
template <typename T>
using OptionalRef = tensorrt_llm::common::OptionalRef<T>;

@@ -70,7 +62,7 @@ public:
{
}

std::tuple<TensorPtr, std::vector<runtime::SamplingConfig>, std::vector<runtime::ITensor::SharedConstPtr>,
[[nodiscard]] std::tuple<TensorPtr, std::vector<SamplingConfig>, std::vector<SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
operator()(runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, RequestVector const& contextRequests,
@@ -78,8 +70,7 @@ public:
CudaStream const& runtimeStream, CudaStream const& decoderStream, SizeType32 maxSequenceLength,
SizeType32 beamWidth, OptionalRef<MedusaBuffers const> medusaBuffers) const;

[[nodiscard]] std::tuple<std::vector<runtime::ITensor::SharedConstPtr>,
std::vector<executor::LookaheadDecodingConfig>>
[[nodiscard]] std::tuple<std::vector<SharedConstPtr>, std::vector<executor::LookaheadDecodingConfig>>
createDecoderRequests(RequestVector const& finishedContextRequests, TensorPtr const& inputIds,
executor::DecodingConfig const& decodingConfig, runtime::decoder::DecoderState& decoderState,
nvinfer1::DataType logitsType, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
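Note on the OptionalRef<MedusaBuffers const> parameter above: tensorrt_llm::common::OptionalRef (from common/optionalRef.h) acts as an optional, non-owning reference, so Medusa buffers can be omitted when speculative decoding is not in use. The actual header is not shown in this diff; the snippet below is only a minimal, illustrative stand-in for how such a wrapper typically behaves.

    #include <cstdio>

    // Minimal stand-in for an optional non-owning reference wrapper
    // (illustrative only; the real tensorrt_llm::common::OptionalRef may differ).
    template <typename T>
    class OptionalRefSketch
    {
    public:
        OptionalRefSketch() = default;            // empty: no referenced object
        OptionalRefSketch(T& ref) : mPtr(&ref) {} // bind to an existing object

        explicit operator bool() const { return mPtr != nullptr; }
        T& operator*() const { return *mPtr; }
        T* operator->() const { return mPtr; }

    private:
        T* mPtr = nullptr; // non-owning: the caller guarantees the lifetime
    };

    struct MedusaBuffersSketch { int numPaths = 4; }; // hypothetical payload

    int main()
    {
        MedusaBuffersSketch buffers;
        OptionalRefSketch<MedusaBuffersSketch const> maybeBuffers(buffers);
        OptionalRefSketch<MedusaBuffersSketch const> noBuffers; // e.g. when Medusa is disabled

        if (maybeBuffers)
        {
            std::printf("numPaths = %d\n", maybeBuffers->numPaths);
        }
        std::printf("noBuffers bound: %s\n", noBuffers ? "yes" : "no");
        return 0;
    }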
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
* Copyright (c) 2022-2025, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
#include <cassert>
#include <chrono>
#include <cstdint>
#include <cstring>
#include <list>
#include <memory>
#include <optional>
#include <utility>
@@ -56,9 +58,9 @@ enum class LlmRequestState : int32_t
/// used in layer-wise transmission
kDISAGG_GENERATION_TRANS_COMPLETE = 12, ///< Kv cache transmission are finished
kGENERATION_IN_PROGRESS = 13, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed

// schedulable states ends
kGENERATION_TO_COMPLETE = 14, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 20, ///< Generation phase completed
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21, ///< Waiting context-only request transmitting the kv cache,
/// after computation finished
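The reordering above moves kGENERATION_TO_COMPLETE (14) so that it sits before the "// schedulable states ends" marker. If schedulability is decided by comparing a state against a boundary value, the enum layout is load-bearing; the helper below is a hypothetical sketch of such a check built only from the values shown in this hunk, not the actual TensorRT-LLM predicate.

    #include <cstdint>
    #include <cstdio>

    // Trimmed-down copy of the state values visible in the diff.
    enum class LlmRequestState : int32_t
    {
        kGENERATION_IN_PROGRESS = 13,
        kGENERATION_TO_COMPLETE = 14, // now listed among the schedulable states
        kGENERATION_COMPLETE = 20,
        kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 21,
    };

    // Hypothetical boundary check: treat everything below kGENERATION_COMPLETE as schedulable.
    constexpr bool isSchedulableSketch(LlmRequestState state) noexcept
    {
        return static_cast<int32_t>(state) < static_cast<int32_t>(LlmRequestState::kGENERATION_COMPLETE);
    }

    static_assert(isSchedulableSketch(LlmRequestState::kGENERATION_TO_COMPLETE), "14 is below the boundary");
    static_assert(!isSchedulableSketch(LlmRequestState::kGENERATION_COMPLETE), "20 is the first non-schedulable value");

    int main()
    {
        std::printf("kGENERATION_TO_COMPLETE schedulable: %d\n",
            isSchedulableSketch(LlmRequestState::kGENERATION_TO_COMPLETE));
        return 0;
    }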
@@ -1075,7 +1077,6 @@ public:
TLLM_CHECK_WITH_INFO(prepopulatedPromptLen < promptLen,
"Invalid state: prepopulatedPromptLen (%d) >= promptLen (%d) for request %lu", prepopulatedPromptLen,
promptLen, mRequestId);
TLLM_CHECK(prepopulatedPromptLen < promptLen);

auto& prePromptLen = mUseDraftModel ? mPrepopulatedPromptLenDraft : mPrepopulatedPromptLenTarget;
auto& contextCurrentPosition = mUseDraftModel ? mContextCurrentPositionDraft : mContextCurrentPositionTarget;
@@ -1116,9 +1117,9 @@ public:
mDraftLogits = draftLogits;
}

[[nodiscard]] SizeType32 getNumDraftTokens() const
[[nodiscard]] SizeType32 getNumDraftTokens() const noexcept
{
return hasDraftTokens() ? mDraftTokens->size() : 0;
return hasDraftTokens() ? static_cast<SizeType32>(mDraftTokens->size()) : 0;
}

void discardDraftTokens(SizeType32 numTokensToDiscard)
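The getNumDraftTokens() change adds noexcept and an explicit cast: mDraftTokens->size() returns std::size_t, and returning it as a 32-bit SizeType32 otherwise relies on an implicit narrowing conversion that many compilers warn about. A minimal, self-contained illustration of the same pattern, assuming SizeType32 is a 32-bit signed integer as elsewhere in the codebase:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    using SizeType32 = std::int32_t;

    // Without the cast, returning vec.size() (std::size_t) as SizeType32 narrows implicitly
    // and typically triggers -Wconversion / C4267 style warnings.
    [[nodiscard]] SizeType32 countTokens(std::vector<int> const& vec) noexcept
    {
        return static_cast<SizeType32>(vec.size());
    }

    int main()
    {
        std::vector<int> draftTokens{101, 102, 103};
        std::printf("num draft tokens: %d\n", countTokens(draftTokens));
        return 0;
    }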
@@ -1379,17 +1380,17 @@ public:
mGenerationLogitsFragments.push_back(genLogits);
}

SizeType32 getGenerationLogitsFragmentsSize()
[[nodiscard]] SizeType32 getGenerationLogitsFragmentsSize() const noexcept
{
return mGenerationLogitsFragments.size();
return static_cast<SizeType32>(mGenerationLogitsFragments.size());
}

void clearGenerationLogitsFragments()
void clearGenerationLogitsFragments() noexcept
{
mGenerationLogitsFragments.clear();
}

bool hasAdditionalOutputs()
[[nodiscard]] bool hasAdditionalOutputs() const noexcept
{
return !mAdditionalContextOutputTensors.empty() || !mAdditionalGenerationOutputTensors.empty();
}

@@ -1478,7 +1478,8 @@ private:
class ExecutorConfig
{
public:
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds = 180000000;
static constexpr uint64_t kDefaultMaxSeqIdleMicroseconds
= std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

static constexpr SizeType32 kDefaultIterStatsMaxIterations = 1000;
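The rewritten kDefaultMaxSeqIdleMicroseconds expresses the same default through std::chrono instead of a bare 180000000, which makes the unit and the intent (3 minutes) explicit. A small compile-time check, mirroring the expression in the diff, confirms the two forms are equal:

    #include <chrono>
    #include <cstdint>

    // Same default value as before, but the unit (microseconds) and the intent (3 minutes) are explicit.
    static constexpr std::uint64_t kDefaultMaxSeqIdleMicroseconds
        = std::chrono::duration_cast<std::chrono::microseconds>(std::chrono::minutes(3)).count();

    static_assert(kDefaultMaxSeqIdleMicroseconds == 180000000ULL,
        "3 minutes == 180,000,000 microseconds, matching the previous literal");

    int main()
    {
        return 0;
    }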
@@ -19,7 +19,6 @@
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
#include <memory>

namespace tensorrt_llm::runtime
{
@@ -29,7 +28,6 @@ class LookaheadModule : public SpeculativeDecodingModule
public:
explicit LookaheadModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxDecodingDraftTokens)
, mExecutionConfig()
{
}

@@ -43,7 +41,7 @@ public:
mExecutionConfig = config;
}

executor::LookaheadDecodingConfig const getExecutionConfig() const
[[nodiscard]] executor::LookaheadDecodingConfig const& getExecutionConfig() const
{
return mExecutionConfig;
}
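The getExecutionConfig() change is a small API cleanup: returning executor::LookaheadDecodingConfig const& instead of a const value avoids copying the config on every call (top-level const on a by-value return adds nothing), and [[nodiscard]] flags accidentally discarded results. A generic sketch of the pattern, using a hypothetical config struct rather than the real LookaheadDecodingConfig:

    #include <cstdio>
    #include <string>
    #include <utility>

    struct ConfigSketch // hypothetical stand-in for LookaheadDecodingConfig
    {
        int windowSize = 4;
        std::string name = "lookahead";
    };

    class ModuleSketch
    {
    public:
        // Returns a reference to the stored config: no copy, and the caller
        // cannot silently drop the result thanks to [[nodiscard]].
        [[nodiscard]] ConfigSketch const& getExecutionConfig() const noexcept
        {
            return mExecutionConfig;
        }

        void setExecutionConfig(ConfigSketch config) { mExecutionConfig = std::move(config); }

    private:
        ConfigSketch mExecutionConfig;
    };

    int main()
    {
        ModuleSketch module;
        module.setExecutionConfig(ConfigSketch{8, "lookahead-8"});
        ConfigSketch const& cfg = module.getExecutionConfig(); // binds to the member, no copy
        std::printf("%s window=%d\n", cfg.name.c_str(), cfg.windowSize);
        return 0;
    }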
@@ -21,6 +21,7 @@
#include "tensorrt_llm/runtime/lookaheadModule.h"
#include "tensorrt_llm/runtime/loraModule.h"
#include "tensorrt_llm/runtime/speculativeDecodingMode.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

#include <NvInferRuntime.h>
#include <array>

@@ -39,7 +39,6 @@ using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace te = tensorrt_llm::executor;
namespace tk = tensorrt_llm::kernels;
namespace tr = tensorrt_llm::runtime;

namespace tensorrt_llm::batch_manager

@@ -39,8 +39,8 @@ public:

if (offset + size > mBuffer->getSize())
{
throw std::out_of_range(std::string("slice ") + std::to_string(offset + size) + " exceeds buffer size "
+ std::to_string(mBuffer->getSize()));
throw std::out_of_range(std::string("offset ") + std::to_string(offset) + std::string(" + size ")
+ std::to_string(size) + " exceeds buffer size " + std::to_string(mBuffer->getSize()));
}
}
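The updated out_of_range message reports offset and size separately instead of only their sum, which makes a failed slice easier to diagnose. A standalone sketch of the same guard; checkSliceSketch and its arguments are illustrative, not the actual TensorRT-LLM buffer class:

    #include <cstddef>
    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Illustrative guard: reject a slice [offset, offset + size) that runs past the buffer end.
    void checkSliceSketch(std::size_t offset, std::size_t size, std::size_t bufferSize)
    {
        if (offset + size > bufferSize)
        {
            throw std::out_of_range(std::string("offset ") + std::to_string(offset) + " + size "
                + std::to_string(size) + " exceeds buffer size " + std::to_string(bufferSize));
        }
    }

    int main()
    {
        try
        {
            checkSliceSketch(8, 16, 32);  // fine: 8 + 16 <= 32
            checkSliceSketch(24, 16, 32); // throws: 24 + 16 > 32
        }
        catch (std::out_of_range const& e)
        {
            std::printf("caught: %s\n", e.what());
        }
        return 0;
    }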
@@ -284,8 +284,8 @@ void verifyOutput(RequestList const& finishedRequestList,
}

// Pick a different endId at random from one of the expected tokens
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelType const& modelType,
std::vector<SizeType32> const& givenInputLengths, SizeType32 const maxNewTokens, bool replaceLogits)
std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, std::vector<SizeType32> const& givenInputLengths,
SizeType32 const maxNewTokens, bool replaceLogits)
{
auto const nbGivenInputs = testData.nbGivenInputs;
auto const beamWidth = testData.beamWidth;
@@ -328,9 +328,9 @@ std::vector<TokenIdType> pickRandomEndIds(TestData const& testData, TrtGptModelT
return endIds;
}

TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelType const& modelType, ModelIds const modelIds,
BeamResult const& beamResult, ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId,
bool const replaceLogits, BufferManager& manager)
TestData loadTestData(ModelSpec const& modelSpec, ModelIds const modelIds, BeamResult const& beamResult,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
{
auto const [givenInputLengths, nbGivenInputs, maxInputLength] = getGivenInputLengths(givenInput, modelIds.padId);
auto const& [beamWidth, resultsFile, contextLogitsFile, genLogitsFile, cumLogProbsFile, logProbsFile] = beamResult;
@@ -353,7 +353,7 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelTy

if (useRandomEndId)
{
testData.endIds = pickRandomEndIds(testData, modelType, givenInputLengths, maxNewTokens, replaceLogits);
testData.endIds = pickRandomEndIds(testData, givenInputLengths, maxNewTokens, replaceLogits);
}
else
{
@@ -409,9 +409,8 @@ TestData loadTestData(ModelSpec const& modelSpec, TrtGptModelTy
}

std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> loadTestData(ModelSpec const& modelSpec,
TrtGptModelType const& modelType, ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths,
ITensor const& givenInput, SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits,
BufferManager& manager)
ModelIds const modelIds, BeamResults const& resultsFilesBeamWidths, ITensor const& givenInput,
SizeType32 const maxBeamWidth, bool const useRandomEndId, bool const replaceLogits, BufferManager& manager)
{
// Map between beam width, and expected results for that beam width
std::unordered_map<SizeType32, TestData> beamWidthTestData;
@@ -424,8 +423,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
EXPECT_EQ(std::find(beamWidths.begin(), beamWidths.end(), beamWidth), beamWidths.end());
beamWidths.push_back(beamWidth);

auto testData = loadTestData(modelSpec, modelType, modelIds, beamResult, givenInput, maxBeamWidth,
useRandomEndId, replaceLogits, manager);
auto testData = loadTestData(
modelSpec, modelIds, beamResult, givenInput, maxBeamWidth, useRandomEndId, replaceLogits, manager);
beamWidthTestData.emplace(beamWidth, std::move(testData));
}

@@ -435,9 +434,8 @@ std::tuple<std::vector<SizeType32>, std::unordered_map<SizeType32, TestData>> lo
RequestList runGptModelInference(std::shared_ptr<TrtGptModel>& trtGptModel, std::vector<SizeType32> const& beamWidths,
std::unordered_map<SizeType32, TestData> const& beamWidthTestData, SizeType32 batchSize, SizeType32 nbGivenInputs,
SizeType32 maxInputLength, SizeType32 padId, std::vector<SizeType32> const& givenInputLengths,
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType,
TrtGptModelType modelType, int maxReqPerStep, bool prepopulateKVCache, bool enableStreamingMode,
bool enableBlockReuse)
TokenIdType const* givenInputData, ModelSpec const& modelSpec, TrtGptModelIfbTestType testType, int maxReqPerStep,
bool prepopulateKVCache, bool enableStreamingMode, bool enableBlockReuse)
{
// Fill the requests using givenInput
// requestList will have batchSize requests
@@ -641,8 +639,8 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds

auto const maxBeamWidth = executorConfig.getMaxBeamWidth();
// Load expected outputs for each beam width value
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelType, modelIds, resultsFilesBeamWidths,
*givenInput, maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);
auto [beamWidths, beamWidthTestData] = loadTestData(modelSpec, modelIds, resultsFilesBeamWidths, *givenInput,
maxBeamWidth, useRandomEndId, modelSpec.mReplaceLogits, manager);

int const worldSize = modelSpec.mTPSize * modelSpec.mPPSize * modelSpec.mCPSize;
auto const worldConfig = WorldConfig::mpi(worldSize, modelSpec.mTPSize, modelSpec.mPPSize, modelSpec.mCPSize);
@@ -663,14 +661,14 @@ void runIfbTest(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds
// Prepopulate KV cache for speculative decoding test
bool const prepopulateKVCache = modelSpec.mMaxDraftTokens > 0;
auto finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
maxReqPerStep, prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, maxReqPerStep,
prepopulateKVCache, enableStreamingMode, modelSpec.mKVCacheReuse);

if (prepopulateKVCache)
{
// Call the 2nd time with prefilled KV cache
finishedRequestList = runGptModelInference(trtGptModel, beamWidths, beamWidthTestData, batchSize,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType, modelType,
nbGivenInputs, maxInputLength, padId, givenInputLengths, givenInputData, modelSpec, testType,
maxReqPerStep, false, enableStreamingMode, modelSpec.mKVCacheReuse);
}

@@ -56,7 +56,7 @@ TEST_F(LlmRequestTest, fromExecutorRequest)
EXPECT_EQ(llmReq.getState(), tb::LlmRequestState::kCONTEXT_INIT);
EXPECT_FALSE(llmReq.mSeqSlot);
// No speculative decoding config, draft tokens should be empty
EXPECT_EQ(llmReq.getDraftTokens()->size(), 0);
EXPECT_EQ(llmReq.getNumDraftTokens(), 0);
EXPECT_FALSE(llmReq.getEmbeddingBias().has_value());
EXPECT_FALSE(llmReq.getBadWordsList().has_value());
EXPECT_FALSE(llmReq.getStopWordsList().has_value());