//
// Created by martinma on 5/24/23.
//
/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "tensorrt_llm/runtime/gptSession.h"

#include "common.h"
#include "iBuffer.h"
#include "tensorrt_llm/batch_manager/kvCacheManager.h"
#include "tensorrt_llm/common/customAllReduceUtils.h"
#include "tensorrt_llm/common/stringUtils.h"
#include "tensorrt_llm/runtime/gptDecoderBatch.h"
#include "tensorrt_llm/runtime/ipcUtils.h"
#include "tensorrt_llm/runtime/ncclCommunicator.h"
#include "tensorrt_llm/runtime/runtimeBuffers.h"
#include "tensorrt_llm/runtime/runtimeKernels.h"
#include "tensorrt_llm/runtime/statefulGptDecoder.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/utils/sessionUtils.h"

#include <algorithm>
#include <cstdlib> // std::getenv
#include <cuda_profiler_api.h>
#include <memory>
#include <numeric>
#include <sstream>
#include <unordered_set>

using namespace tensorrt_llm::runtime;

namespace tc = tensorrt_llm::common;
namespace bmkv = tensorrt_llm::batch_manager::kv_cache_manager;

namespace
{

std::unordered_set<std::int32_t> populateMicrobatchIndexes()
{
    auto const* profileMbIdxChar = std::getenv("TLLM_GPTS_PROFILE_START_STOP");
    std::unordered_set<std::int32_t> idxSet;
    if (profileMbIdxChar != nullptr)
    {
        std::istringstream iss{profileMbIdxChar};
        std::int32_t idx;
        char c;
        while (iss >> idx)
        {
            idxSet.insert(idx);
            iss >> c;
        }
    }
    return idxSet;
}

auto const kProfileMbIdxs = populateMicrobatchIndexes();

} // namespace

GptSession::GptSession(Config const& sessionConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig,
    void const* engineBuffer, std::size_t engineSize, LoggerPtr logger)
    : mModelConfig{modelConfig}
    , mWorldConfig{worldConfig}
    , mDevice{utils::initDevice(worldConfig)}
    , mLogger{logger ? std::move(logger) : std::make_shared<TllmLogger>()}
    , mRuntime{std::make_shared<TllmRuntime>(engineBuffer, engineSize, *mLogger)}
{
    if (mWorldConfig.isPipelineParallel())
    {
        mPipelineComm = std::make_shared<NcclCommunicator>(mWorldConfig);
        mCommStream = std::make_shared<CudaStream>();
    }

    TLLM_CHECK_WITH_INFO(!(mModelConfig.usePromptTuning() && !mModelConfig.useGptAttentionPlugin()),
        "Prompt tuning is only enabled with GPT attention plugin.");

    // TODO compare expected and runtime tensor names?
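    // setup() below creates the TRT execution contexts and runtime buffers and, depending on the
    // configuration, the decoders, the paged KV cache manager, and the custom all-reduce workspace.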
    setup(sessionConfig);
}

nvinfer1::ILogger& GptSession::getLogger() const
{
    return *mLogger;
}

BufferManager const& GptSession::getBufferManager() const
{
    return mRuntime->getBufferManager();
}

nvinfer1::DataType GptSession::getLogitDataType() const
{
    return mRuntime->getEngine().getTensorDataType("logits");
}

void GptSession::createContexts()
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    mRuntime->clearContexts();

    auto const numProfiles = mRuntime->getNbProfiles();
    TLLM_CHECK_WITH_INFO(numProfiles == 1 || numProfiles == 2,
        "GPT only expects one optimization profile or two optimization profiles");
    // Instantiate 1 execution context for each profile
    for (auto contextId = 0; contextId < numProfiles; ++contextId)
    {
        mRuntime->addContext(contextId);
    }
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::createBuffers(SizeType numMicroBatches)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    mBuffers.clear();

    for (SizeType i = 0; i < numMicroBatches; ++i)
    {
        mBuffers.emplace_back(std::make_shared<RuntimeBuffers>());
        mBuffers.back()->create(*mRuntime, mModelConfig, mWorldConfig);
    }
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow,
    SizeType sinkTokenLength, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest,
    SizeType numMicroBatches, DecodingMode const& decodingMode)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto const vocabSize = mModelConfig.getVocabSize();
    auto const vocabSizePadded = mModelConfig.getVocabSizePadded(mWorldConfig.getSize());
    auto const& stream = mRuntime->getStreamPtr();

    mDecoders.clear();

    for (SizeType i = 0; i < numMicroBatches; ++i)
    {
        if (decoderPerRequest)
        {
            mDecoders.emplace_back(std::make_shared<GptDecoderBatch>(vocabSize, vocabSizePadded, stream));
        }
        else
        {
            mDecoders.emplace_back(std::make_shared<StatefulGptDecoder>(vocabSize, vocabSizePadded, stream));
        }
        constexpr SizeType maxTokensPerStep = 1;
        mDecoders.back()->setup(decodingMode, batchSize, beamWidth, maxAttentionWindow, sinkTokenLength,
            maxSequenceLength, maxTokensPerStep, /* fusedDecoder */ false, logitsType, mModelConfig);
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::createKvCacheManager(SizeType batchSize, SizeType beamWidth, SizeType maxAttentionWindow,
    SizeType sinkTokenLength, SizeType maxSequenceLength, KvCacheConfig const& kvCacheConfig)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto const tokensPerBlock = mModelConfig.getTokensPerBlock();
    auto const kvDtype = mModelConfig.getKvDataType();

    auto const [blocksInPrimaryPool, blocksInSecondaryPool] = bmkv::KVCacheManager::calculateMaxNumBlocks(
        kvCacheConfig, kvDtype, mModelConfig, mWorldConfig, getBufferManager());

    // If beamWidth > 1, use one more block for each sequence in the paged kv cache to avoid dropping the needed
    // tokens, when enabling cyclic kv cache.
    auto const useOneMoreBlock = beamWidth > 1 && maxSequenceLength > maxAttentionWindow;

    auto const localNbLayers = mModelConfig.getNbLayers(mWorldConfig.getPipelineParallelism());
    auto const nbKvHeads = mModelConfig.getNbKvHeads();
    auto const sizePerHead = mModelConfig.getSizePerHead();
    bool constexpr enableBlockReuse{false};
    mKvCacheManager = std::make_shared<bmkv::KVCacheManager>(localNbLayers, nbKvHeads, sizePerHead, tokensPerBlock,
        blocksInPrimaryPool, blocksInSecondaryPool, batchSize, beamWidth, maxAttentionWindow, sinkTokenLength,
        useOneMoreBlock, kvDtype, mRuntime->getStreamPtr(), enableBlockReuse, kvCacheConfig.useUvm,
        kvCacheConfig.onboardBlocks);
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::createCustomAllReduceWorkspace(
    SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    setPeerAccess(mWorldConfig, true);

    mIpcMemoryHandles.clear();
    const std::size_t bufferSize = std::min(
        static_cast<std::size_t>(maxBatchSize) * maxBeamWidth * maxSequenceLength * mModelConfig.getHiddenSize()
            * mWorldConfig.getTensorParallelism() * sizeof(float),
        ::tensorrt_llm::utils::customAllReduceUtils::getMaxRequiredWorkspaceSize(mWorldConfig.getTensorParallelism()));
    mIpcMemoryHandles.emplace_back(std::make_shared<IpcMemory>(mWorldConfig, bufferSize));
    mIpcMemoryHandles.emplace_back(std::make_shared<IpcMemory>(mWorldConfig, bufferSize));
    mIpcMemoryHandles.emplace_back(std::make_shared<IpcMemory>(mWorldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t)));
    mIpcMemoryHandles.emplace_back(std::make_shared<IpcMemory>(mWorldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t)));

    mCommPtrs = BufferManager::cpu(
        ITensor::makeShape({static_cast<SizeType>(mIpcMemoryHandles.size()) * mWorldConfig.getTensorParallelism()}),
        nvinfer1::DataType::kINT64);
    auto* const commPtrsData = bufferCast<void*>(*mCommPtrs);

    for (size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++)
    {
        auto const& memCommPtrs = mIpcMemoryHandles[memIdx]->getCommPtrsTensor();
        for (SizeType tpIdx = 0; tpIdx < mWorldConfig.getTensorParallelism(); tpIdx++)
        {
            commPtrsData[memIdx * mWorldConfig.getTensorParallelism() + tpIdx] = memCommPtrs[tpIdx];
        }
    }
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

GptSession::MicroBatchConfig::MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism,
    std::optional<SizeType> genMicroBatchSize, std::optional<SizeType> ctxMicroBatchSize)
{
    if (genMicroBatchSize || ctxMicroBatchSize)
    {
        genBatchSize = genMicroBatchSize.value_or(maxBatchSize);
        TLLM_CHECK(genBatchSize <= maxBatchSize);
        ctxBatchSize = ctxMicroBatchSize.value_or(genBatchSize);
        TLLM_CHECK_WITH_INFO(genBatchSize % ctxBatchSize == 0,
            "Generation batch size (%d) must be divisible by context batch size (%d)", genBatchSize, ctxBatchSize);
        numGenBatches = tc::ceilDiv(maxBatchSize, genBatchSize);
        numCtxBatches = numGenBatches * (genBatchSize / ctxBatchSize);
    }
    else
    {
        numCtxBatches = numGenBatches = pipelineParallelism;
        ctxBatchSize = genBatchSize = tc::ceilDiv(maxBatchSize, numGenBatches);
    }
}

void GptSession::setup(Config const& sessionConfig)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    mCudaGraphMode = sessionConfig.cudaGraphMode;

    auto const maxBatchSize = sessionConfig.maxBatchSize;
    auto const maxBeamWidth = sessionConfig.maxBeamWidth;
    auto const maxSequenceLength = sessionConfig.maxSequenceLength;

    if (sessionConfig.kvCacheConfig.maxAttentionWindow.has_value()
        && sessionConfig.kvCacheConfig.maxAttentionWindow.value() > maxSequenceLength)
    {
        TLLM_LOG_WARNING(
            "The value of maxAttentionWindow cannot exceed maxSequenceLength. "
            "Therefore, it has been adjusted to match the value of maxSequenceLength.");
    }
    auto const maxAttentionWindow = sessionConfig.kvCacheConfig.maxAttentionWindow.has_value()
        ? std::min(sessionConfig.kvCacheConfig.maxAttentionWindow.value(), maxSequenceLength)
        : maxSequenceLength;
    auto const sinkTokenLength = sessionConfig.kvCacheConfig.sinkTokenLength.has_value()
        ? sessionConfig.kvCacheConfig.sinkTokenLength.value()
        : 0;

    mMicroBatchConfig = MicroBatchConfig(maxBatchSize, mWorldConfig.getPipelineParallelism(),
        sessionConfig.genMicroBatchSize, sessionConfig.ctxMicroBatchSize);

    if (sessionConfig.cudaGraphMode)
    {
        // Instantiate 2 graph instances for flip-flopping of each generation batch
        mCudaGraphInstances.resize(2 * mMicroBatchConfig.numGenBatches);
    }
    createContexts();
    createBuffers(mMicroBatchConfig.numGenBatches);

    mNormalizeLogProbs = sessionConfig.normalizeLogProbs;

    // Store these params, which are related to the decoder buffer sizes and the kv cache manager,
    // to check against the input shapes given in generate().
    // gptDecoderBatch does not resize buffers, but allows smaller batchSize and beamWidth.
    // TODO refactor batch manager to remove dependency on maxSequenceLength.
    mDecoderMaxSequenceLength = maxSequenceLength;
    mDecoderMaxAttentionWindow = maxAttentionWindow;
    mDecoderSinkTokenLength = sinkTokenLength;

    if (mWorldConfig.isLastPipelineParallelRank())
    {
        auto const logitsType = mRuntime->getEngine().getTensorDataType("logits");

        DecodingMode decodingMode = sessionConfig.decodingMode.value_or(
            maxBeamWidth == 1 ? DecodingMode::TopKTopP() : DecodingMode::BeamSearch());
        createDecoders(mMicroBatchConfig.genBatchSize, maxBeamWidth, maxAttentionWindow, sinkTokenLength,
            maxSequenceLength, logitsType, sessionConfig.decoderPerRequest, mMicroBatchConfig.numGenBatches,
            decodingMode);
    }

    if (mWorldConfig.isPipelineParallel() || mMicroBatchConfig.numGenBatches > 1)
    {
        mReceivedEvents.clear();
        for (SizeType i = 0; i < mMicroBatchConfig.numGenBatches; ++i)
        {
            mReceivedEvents.emplace_back();
        }
    }

    if (mWorldConfig.isTensorParallel() && mModelConfig.useCustomAllReduce())
    {
        createCustomAllReduceWorkspace(mMicroBatchConfig.genBatchSize, maxBeamWidth, maxSequenceLength);
    }

    if (mModelConfig.isTransformerBased() && mModelConfig.usePagedKvCache())
    {
        createKvCacheManager(maxBatchSize, maxBeamWidth, maxAttentionWindow, sinkTokenLength, maxSequenceLength,
            sessionConfig.kvCacheConfig);
    }

    auto* kvCacheManager = mModelConfig.usePagedKvCache() ? mKvCacheManager.get() : nullptr;
    for (auto& buffers : mBuffers)
    {
        // we don't know maxInputLength yet and ignore it for pre-allocation
        buffers->generationConfig = GenerationConfig{
            mMicroBatchConfig.genBatchSize, maxBeamWidth, 0, maxAttentionWindow, sinkTokenLength, maxSequenceLength};
        buffers->reshape(kvCacheManager, mModelConfig, mWorldConfig);
    }
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx)
{
    if (mModelConfig.usePagedKvCache())
    {
        TLLM_CHECK(mKvCacheManager);
        auto contextLengthsHost = mBuffers.at(microBatchId)->contextLengthsHost;
        TLLM_CHECK(contextLengthsHost);
        auto const* const contextLengthsPtr = bufferCast<SizeType>(*contextLengthsHost);
        auto const contextLengthsSize = static_cast<SizeType>(contextLengthsHost->getSize());
        for (SizeType batchIdx = 0; batchIdx < contextLengthsSize; ++batchIdx)
        {
            mKvCacheManager->addSequence(firstBatchIdx + batchIdx, contextLengthsPtr[batchIdx], beamWidth);
        }
    }
}

ITensor::SharedPtr GptSession::initDecoder(ITensor& outputIds, GenerationInput const& inputs,
    GenerationOutput const& outputs, SamplingConfig const& samplingConfig, SizeType microBatchId) const
{
    if (mWorldConfig.isLastPipelineParallelRank())
    {
        auto& decoder = *mDecoders.at(microBatchId);
        decoder.newBatch(inputs, outputs, samplingConfig);
        return decoder.getNewTokens();
    }
    else if (mWorldConfig.isFirstPipelineParallelRank())
    {
        auto& manager = mRuntime->getBufferManager();
        auto const& stream = mRuntime->getStreamPtr();

        auto const inputLengths = inputs.lengths;
        auto const batchSize = static_cast<SizeType>(inputLengths->getSize());

        auto const inputLengthsHost = manager.copyFrom(*inputLengths, MemoryType::kCPU);
        auto const* inputLengthsData = bufferCast<SizeType>(*inputLengthsHost);
        SizeType const maxInputLength
            = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize());

        ITensor::SharedPtr inputOffsets = manager.emptyTensor(MemoryType::kGPU, TRTDataType<SizeType>::value);
        if (inputs.packed)
        {
            inputOffsets->reshape(ITensor::makeShape({batchSize + 1}));
            manager.setZero(*inputOffsets);
            kernels::invokeInclusiveSum(*ITensor::slice(inputOffsets, 1), *inputLengths, manager, *stream);
        }

        kernels::initOutputIds(outputIds, *inputs.ids, *inputLengths, *inputOffsets, inputs.padId, inputs.endId,
            maxInputLength, inputs.packed, *stream);

        auto const beamWidth = samplingConfig.beamWidth;
        return manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32);
    }
    else
    {
        return ITensor::SharedPtr{};
    }
}

namespace
{

std::tuple<std::vector<ITensor::SharedPtr>, std::vector<ITensor::SharedPtr>, std::vector<SizeType>> splitInputIds(
    GenerationInput const& inputs, SizeType microBatchSize, BufferManager& manager)
{
    auto const numRequests = inputs.lengths->getShape().d[0];

    std::vector<ITensor::SharedPtr> inputIds;
    std::vector<ITensor::SharedPtr> inputLengths;
    std::vector<SizeType> microBatchOffsets(1, 0);
    if (inputs.packed)
    {
        auto const contextLengthsHost = manager.copyFrom(*inputs.lengths, MemoryType::kCPU);
        ITensor::SharedPtr inputIdsView = ITensor::view(inputs.ids);
        if (inputIdsView->getShape().nbDims == 2)
        {
            // For users that still pass inputs.ids with shape [1, num_tokens], squeeze the leading dimension for them.
            inputIdsView->squeeze(0);
        }
        TLLM_CHECK(inputIdsView->getShape().nbDims == 1);
        auto const contextLengthsRange = BufferRange<SizeType>(*contextLengthsHost);

        auto tokensBegin = 0;
        for (auto offset = 0; offset < numRequests; offset += microBatchSize)
        {
            auto const batchSize = std::min(microBatchSize, numRequests - offset);
            auto const numTokens = std::accumulate(
                contextLengthsRange.begin() + offset, contextLengthsRange.begin() + offset + batchSize, 0);

            ITensor::SharedPtr batchInputs = ITensor::slice(inputIdsView, tokensBegin, numTokens);
            TLLM_CHECK(batchInputs->getShape().nbDims == 1);
            TLLM_CHECK(batchInputs->getShape().d[0] == numTokens);

            inputIds.emplace_back(std::move(batchInputs));
            inputLengths.emplace_back(ITensor::slice(inputs.lengths, offset, batchSize));
            microBatchOffsets.emplace_back(offset + batchSize);

            tokensBegin += numTokens;
        }
    }
    else
    {
        for (auto offset = 0; offset < numRequests; offset += microBatchSize)
        {
            auto const batchSize = std::min(microBatchSize, numRequests - offset);

            inputIds.emplace_back(ITensor::slice(inputs.ids, offset, batchSize));
            inputLengths.emplace_back(ITensor::slice(inputs.lengths, offset, batchSize));
            microBatchOffsets.emplace_back(offset + batchSize);
        }
    }

    return {inputIds, inputLengths, microBatchOffsets};
}

std::vector<GenerationInput> splitInputs(GenerationInput const& inputs, SizeType microBatchSize, BufferManager& manager)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto [inputIds, inputLengths, microBatchOffsets] = splitInputIds(inputs, microBatchSize, manager);

    std::vector<GenerationInput> inputBatches;
    for (std::size_t batchId = 0; batchId < inputIds.size(); ++batchId)
    {
        inputBatches.emplace_back(
            inputs.endId, inputs.padId, std::move(inputIds[batchId]), std::move(inputLengths[batchId]), inputs.packed);
    }

    for (std::size_t batchId = 0; batchId < inputBatches.size(); ++batchId)
    {
        auto& batch = inputBatches[batchId];
        auto const offset = microBatchOffsets[batchId];
        auto const batchSize = microBatchOffsets[batchId + 1] - offset;

        if (inputs.embeddingBias)
        {
            batch.embeddingBias = inputs.embeddingBias;
        }
        if (inputs.badWordsList)
        {
            auto const& shape = inputs.badWordsList->getShape();
            if (shape.nbDims == 2)
            {
                batch.badWordsList = inputs.badWordsList;
            }
            else
            {
                assert(shape.nbDims == 3);
                batch.badWordsList = ITensor::slice(inputs.badWordsList, offset, batchSize);
            }
        }
        if (inputs.stopWordsList)
        {
            batch.stopWordsList = ITensor::slice(inputs.stopWordsList, offset, batchSize);
        }
        if (inputs.maxNewTokens)
        {
            batch.maxNewTokens = inputs.maxNewTokens;
        }
        if (inputs.promptTuningParams.embeddingTable)
        {
            batch.promptTuningParams.embeddingTable = inputs.promptTuningParams.embeddingTable;
        }
        if (inputs.promptTuningParams.tasks)
        {
            batch.promptTuningParams.tasks = ITensor::slice(inputs.promptTuningParams.tasks, offset, batchSize);
        }
        if (inputs.promptTuningParams.vocabSize)
        {
            batch.promptTuningParams.vocabSize = inputs.promptTuningParams.vocabSize;
        }
    }
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);

    return inputBatches;
}

std::vector<GenerationOutput> splitOutputs(GenerationOutput& outputs, SizeType microBatchSize)
{
    auto const numRequests = outputs.ids->getShape().d[0];

    std::vector<GenerationOutput> outputBatches;
    for (auto batchOffset = 0; batchOffset < numRequests; batchOffset += microBatchSize)
    {
        auto const batchSize = std::min(microBatchSize, numRequests - batchOffset);

        outputBatches.emplace_back(ITensor::slice(outputs.ids, batchOffset, batchSize),
            ITensor::slice(outputs.lengths, batchOffset, batchSize));

        if (outputs.cumLogProbs)
        {
            outputBatches.back().cumLogProbs = ITensor::slice(outputs.cumLogProbs, batchOffset, batchSize);
        }
        if (outputs.logProbs)
        {
            outputBatches.back().logProbs = ITensor::slice(outputs.logProbs, batchOffset, batchSize);
        }
        if (outputs.contextLogits)
        {
            outputBatches.back().contextLogits = ITensor::slice(outputs.contextLogits, batchOffset, batchSize);
        }
        if (outputs.generationLogits)
        {
            outputBatches.back().generationLogits = ITensor::slice(outputs.generationLogits, batchOffset, batchSize);
        }
    }

    return outputBatches;
}

void updateOutputIds(ITensor::SharedPtr const& outputIds, ITensor::SharedPtr const& newTokens, SizeType decoderStep,
    CudaStream const& stream)
{
    // assemble outputIds of all micro batches
    auto const& newTokensShape = newTokens->getShape();
    auto newTokensView = ITensor::view(newTokens, ITensor::makeShape({1, newTokensShape.d[0] * newTokensShape.d[1]}));
    auto const& outputIdsShape = outputIds->getShape();
    auto outputIdsView = ITensor::view(
        outputIds, ITensor::makeShape({outputIdsShape.d[0] * outputIdsShape.d[1], outputIdsShape.d[2]}));
    kernels::invokeTransposeWithOutputOffset(*outputIdsView, *newTokensView, decoderStep, stream);
    sync_check_cuda_error();
}

} // namespace

void GptSession::generate(GenerationOutput& outputs, GenerationInput const& inputs,
    SamplingConfig const& samplingConfig, std::shared_ptr<GenerationProfiler> const generationProfiler)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    TLLM_CHECK_WITH_INFO(inputs.packed == mModelConfig.usePackedInput(),
        "The chosen model requires a packed input tensor (did you set packed?).");
    auto const& inputLengths = inputs.lengths;
    TLLM_CHECK_WITH_INFO(inputLengths->getShape().nbDims == 1, "Input lengths tensor must be one-dimensional.");

    auto& manager = mRuntime->getBufferManager();

    auto const batchSize = static_cast<SizeType>(inputLengths->getSize());
    auto const beamWidth = samplingConfig.beamWidth;
    outputs.ids->reshape(ITensor::makeShape({batchSize, beamWidth, mDecoderMaxSequenceLength}));
    outputs.lengths->reshape(ITensor::makeShape({batchSize, beamWidth}));

    if (mWorldConfig.isLastPipelineParallelRank())
    {
        if (outputs.cumLogProbs)
        {
            TLLM_CHECK_WITH_INFO(outputs.cumLogProbs,
                "outputs.cumLogProbs is nullptr. It must be allocated when computeLogProbs is true");
            outputs.cumLogProbs->reshape(ITensor::makeShape({batchSize, beamWidth}));
        }
        if (outputs.logProbs)
        {
            TLLM_CHECK_WITH_INFO(
                outputs.logProbs, "outputs.logProbs is nullptr. It must be allocated when computeLogProbs is true");
            outputs.logProbs->reshape(ITensor::makeShape({batchSize, beamWidth, mDecoderMaxSequenceLength}));
        }
        if (mModelConfig.computeContextLogits() || mModelConfig.computeGenerationLogits())
        {
            auto const vocabSizePadded = mModelConfig.getVocabSizePadded(mWorldConfig.getSize());
            auto const inputLengthsHost = manager.copyFrom(*inputLengths, MemoryType::kCPU);
            auto const inputLengthsRange = BufferRange<SizeType>(*inputLengthsHost);
            auto const maxInputLength = *std::max_element(inputLengthsRange.begin(), inputLengthsRange.end());

            if (mModelConfig.computeContextLogits())
            {
                // outputs.contextLogits = mBuffers.back()->cacheContextLogits;
                if (!outputs.contextLogits)
                {
                    outputs.contextLogits = manager.emptyTensor(MemoryType::kGPU, getLogitDataType());
                }
                outputs.contextLogits->reshape(ITensor::makeShape({batchSize, maxInputLength, vocabSizePadded}));
            }

            // Initialize the output generation logits buffer
            if (mModelConfig.computeGenerationLogits())
            {
                SizeType maxNewTokens = 0;
                if (inputs.maxNewTokens)
                {
                    maxNewTokens = inputs.maxNewTokens.value();
                }
                else
                {
                    for (auto iter : inputLengthsRange)
                    {
                        maxNewTokens = std::max(maxNewTokens, mDecoderMaxSequenceLength - iter);
                    }
                }
                TLLM_CHECK_WITH_INFO(maxNewTokens, "maxNewTokens is null");

                if (!outputs.generationLogits)
                {
                    outputs.generationLogits = manager.emptyTensor(MemoryType::kGPU, getLogitDataType());
                }
                outputs.generationLogits->reshape(
                    ITensor::makeShape({batchSize, beamWidth, maxNewTokens, vocabSizePadded}));

                auto const generationLogitsShape = outputs.generationLogits->getShape();
                TLLM_CHECK_WITH_INFO(generationLogitsShape.d[0] == batchSize, "Invalid dim[0]");
                TLLM_CHECK_WITH_INFO(generationLogitsShape.d[1] == beamWidth, "Invalid dim[1]");
                TLLM_CHECK_WITH_INFO(generationLogitsShape.d[2] == maxNewTokens, "Invalid dim[2]");
                TLLM_CHECK_WITH_INFO(generationLogitsShape.d[3] == vocabSizePadded, "Invalid dim[3]");
            }
        }
    }

    // callbacks
    auto const onTokenGenerated = createOnTokenGeneratedCallback(outputs);

    if (batchSize <= mMicroBatchConfig.genBatchSize)
    {
        std::vector<GenerationInput> microBatchesInputs{inputs};
        std::vector<GenerationOutput> microBatchesOutputs{outputs};
        generateBatched(microBatchesOutputs, microBatchesInputs, samplingConfig, onTokenGenerated, generationProfiler);
    }
    else
    {
        auto const microBatchesInputs = splitInputs(inputs, mMicroBatchConfig.genBatchSize, manager);
        auto microBatchesOutputs = splitOutputs(outputs, mMicroBatchConfig.genBatchSize);
        generateBatched(microBatchesOutputs, microBatchesInputs, samplingConfig, onTokenGenerated, generationProfiler);
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

GptSession::TokenGeneratedCallback GptSession::createOnTokenGeneratedCallback(GenerationOutput& outputs)
{
    if (outputs.onTokenGenerated && mWorldConfig.isFirstPipelineParallelRank())
    {
        ITensor::SharedPtr outputIds{mWorldConfig.isPipelineParallel() || mMicroBatchConfig.numGenBatches > 1
                ? outputs.ids
                : mDecoders.front()->getOutputIds()};
        return [onTokenGenerated = outputs.onTokenGenerated, outputIds = std::move(outputIds)](
                   SizeType step, bool finished) { onTokenGenerated(outputIds, step, finished); };
    }
    else
    {
        return [](SizeType step, bool finished) {};
    }
}

void GptSession::generateBatched(std::vector<GenerationOutput>& microBatchesOutputs,
    std::vector<GenerationInput> const& microBatchesInputs, SamplingConfig const& samplingConfig,
    TokenGeneratedCallback const& onTokenGenerated, std::shared_ptr<GenerationProfiler> const generationProfiler)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);

    auto& manager = mRuntime->getBufferManager();
    TLLM_CHECK(microBatchesInputs.size() == microBatchesOutputs.size());
    auto const numMicroBatches = static_cast<SizeType>(microBatchesInputs.size());
    TLLM_CHECK(numMicroBatches > 0);
    TLLM_CHECK(numMicroBatches <= mMicroBatchConfig.numGenBatches);
    SizeType const beamWidth{samplingConfig.beamWidth};

    auto* kvCacheManager = mModelConfig.usePagedKvCache() ? mKvCacheManager.get() : nullptr;

    // Initialize and reshape buffers
    for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId)
    {
        auto const& microBatchInputs = microBatchesInputs.at(microBatchId);
        auto& buffers = *mBuffers.at(microBatchId);
        buffers.initFromInput(*microBatchInputs.ids, microBatchInputs.lengths, microBatchInputs.packed, beamWidth,
            mDecoderMaxAttentionWindow, mDecoderSinkTokenLength, mDecoderMaxSequenceLength, manager);
        buffers.reshape(kvCacheManager, mModelConfig, mWorldConfig);
        buffers.reset(manager);
    }

    std::vector<SizeType> microBatchOffsets(1, 0);
    microBatchOffsets.reserve(numMicroBatches + 1);
    for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId)
    {
        auto const& generationConfig = mBuffers.at(microBatchId)->generationConfig;
        microBatchOffsets.emplace_back(microBatchOffsets.back() + generationConfig.batchSize);
    }

    for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId)
    {
        auto& buffers = *mBuffers.at(microBatchId);
        auto const batchOffset = microBatchOffsets.at(microBatchId);
        kvCacheAddSequences(beamWidth, microBatchId, batchOffset);
        auto const& microBatchInputs = microBatchesInputs.at(microBatchId);
        auto& microBatchOutputs = microBatchesOutputs.at(microBatchId);
        buffers.outputIds = microBatchOutputs.ids;
        buffers.outputLengths = microBatchOutputs.lengths;
        buffers.newTokens
            = initDecoder(*buffers.outputIds, microBatchInputs, microBatchOutputs, samplingConfig, microBatchId);

        if (mWorldConfig.isLastPipelineParallelRank())
        {
            buffers.cumLogProbs = nullptr;
            if (microBatchOutputs.cumLogProbs)
            {
                buffers.cumLogProbs = microBatchOutputs.cumLogProbs;
            }
            buffers.logProbs = nullptr;
            if (microBatchOutputs.logProbs)
            {
                buffers.logProbs = microBatchOutputs.logProbs;
            }
            if (mModelConfig.computeContextLogits())
            {
                buffers.logits = microBatchOutputs.contextLogits;
            }
        }
        if (mModelConfig.usePromptTuning())
        {
            buffers.promptTuningParams = microBatchInputs.promptTuningParams;
        }
    }

    if (useCudaGraphs())
    {
        for (auto& instance : mCudaGraphInstances)
        {
            instance.clear();
        }
    }

    auto const profileContext = !kProfileMbIdxs.empty() && kProfileMbIdxs.count(0) > 0;
    if (profileContext)
        cudaProfilerStart();
    executeContextStep(microBatchesInputs, microBatchOffsets, kvCacheManager);
    if (profileContext)
        cudaProfilerStop();

    std::vector<bool> microBatchesFinished(numMicroBatches, false);
    SizeType numBatchesFinished{0};
    SizeType step{0};

    if (generationProfiler)
    {
        manager.getStream().record(generationProfiler->getStart());
    }

    while (numBatchesFinished < numMicroBatches)
    {
        ++step;
        auto const profileStep = !kProfileMbIdxs.empty() && kProfileMbIdxs.count(step) > 0;
        if (profileStep)
            cudaProfilerStart();
        numBatchesFinished += executeGenerationStep(
            step, microBatchesInputs, microBatchesOutputs, microBatchOffsets, kvCacheManager, microBatchesFinished);
        onTokenGenerated(step - 1, numBatchesFinished == numMicroBatches);
        if (profileStep)
            cudaProfilerStop();
    }

    if (generationProfiler)
    {
        manager.getStream().record(generationProfiler->getEnd());
    }

    // Collect the results for the last step
    for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId)
    {
        auto const& generationConfig = mBuffers.at(microBatchId)->generationConfig;
        auto const microBatchSize = generationConfig.batchSize;
        auto const firstBatchIdx = microBatchOffsets.at(microBatchId);
        if (mModelConfig.usePagedKvCache())
        {
            for (auto batchIdx = firstBatchIdx; batchIdx < firstBatchIdx + microBatchSize; ++batchIdx)
            {
                kvCacheManager->removeSequence(batchIdx);
            }
        }

        // TODO(micro batching) use mCommStream?
        if (beamWidth > 1)
        {
            finalize(microBatchId);
        }
        else if (!mWorldConfig.isPipelineParallel())
        {
            auto& buffers = *mBuffers.at(microBatchId);
            auto& decoder = *mDecoders.at(microBatchId);
            manager.copy(*decoder.getOutputIds(), *buffers.outputIds);

            auto& cumLogProbs = buffers.cumLogProbs;
            if (cumLogProbs)
            {
                manager.copy(*decoder.getCumLogProbs(), *buffers.cumLogProbs);
            }

            auto& logProbs = buffers.logProbs;
            if (logProbs)
            {
                manager.copy(*decoder.getLogProbs(), *buffers.logProbs);
            }
        }

        // copy generation logits fragments into a single generationLogits tensor
        if (mModelConfig.computeGenerationLogits())
        {
            auto& buffers = *mBuffers.at(microBatchId);
            auto& microBatchOutputs = microBatchesOutputs.at(microBatchId);
            auto const beamWidth = generationConfig.beamWidth;

            TensorPtr cachePointerDevice
                = ITensor::slice(buffers.cacheGenerationFragmentPointerDevice, microBatchId, 1);
            TensorPtr cachePointerHost = ITensor::slice(buffers.cacheGenerationFragmentPointerHost, microBatchId, 1);

            tensorrt_llm::runtime::kernels::mergeLogitsFragments(manager, *microBatchOutputs.generationLogits,
                *buffers.generationLogitsFragments, *cachePointerDevice, *cachePointerHost, 0, microBatchSize,
                beamWidth, manager.getStream(), 0);
            buffers.generationLogitsFragments->clear();
        }
    }
    manager.getStream().synchronize();

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::executeContextStep(std::vector<GenerationInput> const& generationBatchesInputs,
    std::vector<SizeType> const& generationBatchesOffsets, KvCacheManager const* kvCacheManager)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto& manager = mRuntime->getBufferManager();

    auto const numGenerationBatches = static_cast<SizeType>(generationBatchesInputs.size());
    auto constexpr step = 0;
    auto constexpr contextId = 0;
    for (auto generationBatchId = 0; generationBatchId < numGenerationBatches; ++generationBatchId)
    {
        auto const& generationBatchInputs = generationBatchesInputs.at(generationBatchId);
        auto& generationBuffers = *mBuffers.at(generationBatchId);

        auto const contextBatchSize = mMicroBatchConfig.ctxBatchSize;
        auto [inputIds, inputLengths, contextBatchOffsets]
            = splitInputIds(generationBatchInputs, contextBatchSize, manager);
        auto contextBuffers = generationBuffers.split(contextBatchSize, mModelConfig, mWorldConfig);
        TLLM_CHECK(inputIds.size() == contextBuffers.size());
        auto const numContextBatches = static_cast<SizeType>(contextBuffers.size());

        for (auto contextBatchId = 0; contextBatchId < numContextBatches; ++contextBatchId)
        {
            auto batchOffset = generationBatchesOffsets.at(generationBatchId) + contextBatchOffsets.at(contextBatchId);
            auto& buffers = contextBuffers.at(contextBatchId);
            auto& inputBuffer = buffers.inputBuffers[0];
            auto& outputBuffer = buffers.outputBuffers[0];

            buffers.prepareContextStep(inputIds.at(contextBatchId), generationBatchInputs.padId, manager,
                kvCacheManager, batchOffset, mModelConfig, mWorldConfig);
            buffers.getRuntimeBuffers(
                inputBuffer, outputBuffer, step, inputIds.at(contextBatchId), mCommPtrs, mModelConfig, mWorldConfig);
            mRuntime->setInputTensors(contextId, inputBuffer);
            mRuntime->setOutputTensors(contextId, outputBuffer);

            TLLM_CHECK_WITH_INFO(mRuntime->executeContext(contextId), "Executing TRT engine in context step failed!");
            sync_check_cuda_error();
        }

        generationBuffers.postContextStep(contextBuffers, manager, mModelConfig, mWorldConfig);
        sync_check_cuda_error();

        // Save the last token logits of context into generation logits
        if (mModelConfig.computeGenerationLogits())
        {
            auto& buffers = *mBuffers.at(generationBatchId);
            buffers.generationLogitsFragments->push_back(generationBuffers.logits);
        }

        std::swap(generationBuffers.cacheIndirectionDecoderInput, generationBuffers.cacheIndirectionDecoderOutput);

        auto const decoderStep = generationBuffers.generationConfig.maxInputLength + step;
        decoderStepAsync(decoderStep, generationBatchId);

        if (mModelConfig.computeGenerationLogits())
        {
            TensorPtr newLogitBuffer = ITensor::slice(generationBuffers.allGenerationLogits, 1, 1);
            newLogitBuffer->squeeze(0);
            generationBuffers.logits = newLogitBuffer;
        }
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

SizeType GptSession::executeGenerationStep(SizeType step, std::vector<GenerationInput> const& microBatchesInputs,
    std::vector<GenerationOutput>& microBatchesOutputs, std::vector<SizeType> const& microBatchOffsets,
    KvCacheManager* kvCacheManager, std::vector<bool>& microBatchesFinished)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    TLLM_CHECK(microBatchesInputs.size() == microBatchesOutputs.size());
    auto& manager = mRuntime->getBufferManager();
    auto const numMicroBatches = static_cast<SizeType>(microBatchesInputs.size());
    SizeType numBatchesFinished{0};

    auto const flipFlopId = step % 2;
    auto const contextId = mRuntime->getNbProfiles() - 1;
    for (auto generationBatchId = 0; generationBatchId < numMicroBatches; ++generationBatchId)
    {
        if (microBatchesFinished.at(generationBatchId))
            continue;

        auto& buffers = *mBuffers.at(generationBatchId);
        auto const& generationConfig = buffers.generationConfig;

        auto const graphId = mMicroBatchConfig.getGenGraphId(flipFlopId, generationBatchId);
        auto& inputBuffer = buffers.inputBuffers[flipFlopId];
        auto& outputBuffer = buffers.outputBuffers[flipFlopId];

        auto nextInputIds = buffers.prepareNextStep(
            step - 1, manager, kvCacheManager, microBatchOffsets.at(generationBatchId), mModelConfig, mWorldConfig);
        buffers.getRuntimeBuffers(inputBuffer, outputBuffer, step, nextInputIds, mCommPtrs, mModelConfig, mWorldConfig);
        mRuntime->setInputTensors(contextId, inputBuffer);
        mRuntime->setOutputTensors(contextId, outputBuffer);

        if (useCudaGraphs())
        {
            // SSM-based models only create the CUDA graph instance once.
            if (mModelConfig.isSsmBased())
            {
                // step > 3 is a WAR for TRT 9.x, which is fixed in TRT 10.x.
                if (step > 3 && !mCudaGraphInstances.at(graphId).hasInstance())
                {
                    mCudaGraphInstances.at(graphId).prepareNextGraph(*mRuntime, contextId);
                }
            }
            else
            {
                mCudaGraphInstances.at(graphId).prepareNextGraph(*mRuntime, contextId);
            }
        }

        // check decoder result of previous iteration
        if (shouldStopSync(generationConfig.batchSize, generationConfig.beamWidth, generationBatchId))
        {
            mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE,
                tc::fmtstr("GPT decoding finished for step %d and microBatchId %d", step, generationBatchId).c_str());
            microBatchesFinished.at(generationBatchId) = true;
            numBatchesFinished += 1;
            continue;
        }

        if (useCudaGraphs() && mCudaGraphInstances.size() > (size_t) graphId
            && mCudaGraphInstances.at(graphId).hasInstance())
        {
            auto& cudaGraphInstance = mCudaGraphInstances.at(graphId);
            cudaGraphInstance.launch(mRuntime->getStream());
        }
        else
        {
            TLLM_CHECK_WITH_INFO(
                mRuntime->executeContext(contextId), tc::fmtstr("Executing TRT engine in step %d failed!", step));
        }
        sync_check_cuda_error();

        if (mModelConfig.computeGenerationLogits())
        {
            auto& buffers = *mBuffers.at(generationBatchId);
            buffers.generationLogitsFragments->push_back(buffers.logits);
        }
        sync_check_cuda_error();

        std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput);

        auto const decoderStep = generationConfig.maxInputLength + step;
        decoderStepAsync(decoderStep, generationBatchId);

        if (mModelConfig.computeGenerationLogits() && buffers.allGenerationLogits->getShape().d[0] > step + 1)
        {
            TensorPtr newLogitBuffer = ITensor::slice(buffers.allGenerationLogits, step + 1, 1);
            newLogitBuffer->squeeze(0);
            buffers.logits = newLogitBuffer;
        }
    }

    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
    return numBatchesFinished;
}

void GptSession::decoderStepAsync(SizeType decoderStep, SizeType microBatchId)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto const& stream = mRuntime->getStream();
    auto& buffers = *mBuffers.at(microBatchId);
    auto const& outputIds = buffers.outputIds;
    auto const& newTokens = buffers.newTokens;

    if (mWorldConfig.isLastPipelineParallelRank())
    {
        auto& decoder = *mDecoders.at(microBatchId);
        decoder::Input decodingInput{buffers.logits};
        decoder::Output decodingOutput{};
        decodingInput.cacheIndirection = buffers.cacheIndirectionDecoderInput;
        decodingOutput.cacheIndirection = buffers.cacheIndirectionDecoderOutput;
        decodingOutput.sequenceLengths = buffers.sequenceLengths;

        decoder.forwardAsync(decodingOutput, decodingInput);
        if (mWorldConfig.isPipelineParallel())
        {
            // send shouldStop to all previous ranks and newTokens to the first rank
            stream.record(mCommEvent.get());
            mCommStream->wait(mCommEvent.get());
            auto const pipelineGroup = mWorldConfig.getPipelineParallelGroup();

            auto& cacheIndirection = *buffers.cacheIndirectionDecoderOutput;
            auto& sequenceLengths = *buffers.sequenceLengths;
            auto const beamWidth = cacheIndirection.getShape().d[1];
            for (auto peerIdx = 0; peerIdx < mWorldConfig.getPipelineParallelism() - 1; ++peerIdx)
            {
                mPipelineComm->send(*decoder.getNbFinished(), pipelineGroup[peerIdx], *mCommStream);
                if (beamWidth > 1)
                {
                    mPipelineComm->send(cacheIndirection, pipelineGroup[peerIdx], *mCommStream);
                }
                mPipelineComm->send(sequenceLengths, pipelineGroup[peerIdx], *mCommStream);
            }
            mPipelineComm->send(*decoder.getNewTokens(), pipelineGroup.front(), *mCommStream);
        }
    }
    else // pipeline parallel mode
    {
        // receive shouldStop from the last rank on a separate stream
        stream.record(mCommEvent.get());
        mCommStream->wait(mCommEvent.get());
        auto const pipelineGroup = mWorldConfig.getPipelineParallelGroup();
        auto const peer = pipelineGroup.back();
        mPipelineComm->receive(*buffers.nbFinished, peer, *mCommStream);

        auto& cacheIndirection = *buffers.cacheIndirectionDecoderOutput;
        auto& sequenceLengths = *buffers.sequenceLengths;
        auto const beamWidth = cacheIndirection.getShape().d[1];
        if (beamWidth > 1)
        {
            mPipelineComm->receive(cacheIndirection, peer, *mCommStream);
        }
        mPipelineComm->receive(sequenceLengths, peer, *mCommStream);
        if (mWorldConfig.isFirstPipelineParallelRank())
        {
            // receive newTokens from last rank on a separate stream
            mPipelineComm->receive(*newTokens, peer, *mCommStream);
            updateOutputIds(outputIds, newTokens, decoderStep, *mCommStream);
        }
        mCommStream->record(mReceivedEvents.at(microBatchId).get());
    }

    if (!mWorldConfig.isPipelineParallel() && mMicroBatchConfig.numGenBatches > 1)
    {
        updateOutputIds(outputIds, newTokens, decoderStep, stream);
        stream.record(mReceivedEvents.at(microBatchId).get());
    }
    sync_check_cuda_error();
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

bool GptSession::shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType microBatchId)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    SizeType nbFinished = 0;

    if (mWorldConfig.isLastPipelineParallelRank())
    {
        // read the Finished flag from the decoder
        auto& decoder = *mDecoders.at(microBatchId);
        decoder.forwardSync();
        nbFinished = *bufferCast<SizeType>(*decoder.getNbFinished());

        if (!mWorldConfig.isPipelineParallel() && mMicroBatchConfig.numGenBatches > 1)
        {
            // ensure outputIds have been updated
            mReceivedEvents.at(microBatchId).synchronize();
        }
    }
    else
    {
        // ensure all information has been received
        mReceivedEvents.at(microBatchId).synchronize();
        nbFinished = *bufferCast<SizeType>(*mBuffers.at(microBatchId)->nbFinished);
    }
    sync_check_cuda_error();
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
    return nbFinished == batchSize * beamWidth;
}

void GptSession::finalize(SizeType microBatchId)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto& manager = mRuntime->getBufferManager();
    auto& buffers = *mBuffers.at(microBatchId);
    auto& outputIds = buffers.outputIds;
    auto& cumLogProbs = buffers.cumLogProbs;
    auto& logProbs = buffers.logProbs;
    auto& sequenceLengths = buffers.sequenceLengths;

    if (mWorldConfig.isPipelineParallel())
    {
        auto& stream = mRuntime->getStream();
        auto const pipelineGroup = mWorldConfig.getPipelineParallelGroup();

        if (mWorldConfig.isLastPipelineParallelRank())
        {
            // send ids from last to first
            auto& decoder = mDecoders.at(microBatchId); // only the last rank has the decoder
            decoder->finalize();
            auto finalOutputIds = decoder->getOutputIds();
            auto const peer = pipelineGroup.front();
            mPipelineComm->send(*finalOutputIds, peer, stream);
            mPipelineComm->send(*sequenceLengths, peer, stream);
            manager.copy(*finalOutputIds, *outputIds);

            if (cumLogProbs)
            {
                auto finalCumLogProbs = decoder->getCumLogProbs();
                mPipelineComm->send(*finalCumLogProbs, peer, stream);
                manager.copy(*finalCumLogProbs, *cumLogProbs);
            }
            if (logProbs)
            {
                auto finalLogProbs = decoder->getLogProbs();
                mPipelineComm->send(*finalLogProbs, peer, stream);
                manager.copy(*finalLogProbs, *logProbs);
            }
        }
        else if (mWorldConfig.isFirstPipelineParallelRank())
        {
            // receive ids from last on first
            auto const peer = pipelineGroup.back();
            mPipelineComm->receive(*outputIds, peer, stream);
            mPipelineComm->receive(*sequenceLengths, peer, stream);
            if (cumLogProbs)
            {
                mPipelineComm->receive(*cumLogProbs, peer, stream);
            }
            if (logProbs)
            {
                mPipelineComm->receive(*logProbs, peer, stream);
            }
        }
    }
    else
    {
        auto& decoder = mDecoders.at(microBatchId); // all ranks have the decoder
        decoder->finalize();
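        // Without pipeline parallelism every rank owns a decoder, so the final output ids and the
        // optional (cumulative) log-prob tensors are copied directly from the local decoder.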
        auto finalOutputIds = decoder->getOutputIds();
        manager.copy(*finalOutputIds, *outputIds);

        if (cumLogProbs)
        {
            auto finalCumLogProbs = decoder->getCumLogProbs();
            manager.copy(*finalCumLogProbs, *cumLogProbs);
        }
        if (logProbs)
        {
            auto finalLogProbs = decoder->getLogProbs();
            manager.copy(*finalLogProbs, *logProbs);
        }
        // sequenceLengths are already updated by decoder
    }
    sync_check_cuda_error();
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::CudaGraphExecutor::create(cudaGraph_t const& graph)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    assert(mInstance == nullptr);
    TLLM_CUDA_CHECK(cudaGraphInstantiate(&mInstance, graph, nullptr, nullptr, 0));
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::CudaGraphExecutor::uploadToStream(CudaStream const& stream)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    assert(hasInstance());
    TLLM_CUDA_CHECK(cudaGraphUpload(mInstance, stream.get()));
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::CudaGraphExecutor::launch(CudaStream const& stream)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    TLLM_CUDA_CHECK(cudaGraphLaunch(mInstance, stream.get()));
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

bool GptSession::CudaGraphExecutor::update(cudaGraph_t const& graph)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    return cudaGraphExecUpdate(mInstance, graph, nullptr) != cudaSuccess;
}

void GptSession::CudaGraphExecutor::clear()
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    if (mInstance != nullptr)
    {
        TLLM_CUDA_CHECK(cudaGraphExecDestroy(mInstance));
        mInstance = nullptr;
    }
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}

void GptSession::CudaGraphExecutor::prepareNextGraph(TllmRuntime const& runtime, SizeType nextContextId)
{
    TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
    auto& stream = runtime.getStream();

    cudaGraph_t nextGraph;
    TLLM_CUDA_CHECK(cudaStreamBeginCapture(stream.get(), cudaStreamCaptureModeThreadLocal));
    runtime.executeContext(nextContextId);
    TLLM_CUDA_CHECK(cudaStreamEndCapture(stream.get(), &nextGraph));

    if (hasInstance())
    {
        if (update(nextGraph))
        {
            clear();
            create(nextGraph);
        }
    }
    else
    {
        create(nextGraph);
    }

    TLLM_CUDA_CHECK(cudaGraphDestroy(nextGraph));
    uploadToStream(stream);
    TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}