diff --git a/_cpp_gen/executor.html b/_cpp_gen/executor.html index 5ea0292f09..e13198b92a 100644 --- a/_cpp_gen/executor.html +++ b/_cpp_gen/executor.html @@ -83,15 +83,718 @@
C++ API
tensorrt_llm::batch_manager
+tensorrt_llm::executor::RetentionPrioritytensorrt_llm::executor::KVCacheEventDatatensorrt_llm::executor::version()tensorrt_llm::executor::ContextPhaseParamstensorrt_llm::executor::ContextPhaseParams::RequestIdTypetensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams()tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams()tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams()tensorrt_llm::executor::ContextPhaseParams::ContextPhaseParams()tensorrt_llm::executor::ContextPhaseParams::operator=()tensorrt_llm::executor::ContextPhaseParams::operator=()tensorrt_llm::executor::ContextPhaseParams::~ContextPhaseParams()tensorrt_llm::executor::ContextPhaseParams::operator==()tensorrt_llm::executor::ContextPhaseParams::getFirstGenTokens()tensorrt_llm::executor::ContextPhaseParams::popFirstGenTokens()tensorrt_llm::executor::ContextPhaseParams::getReqId()tensorrt_llm::executor::ContextPhaseParams::getState()tensorrt_llm::executor::ContextPhaseParams::getState()tensorrt_llm::executor::ContextPhaseParams::releaseState()tensorrt_llm::executor::ContextPhaseParams::StatePtrtensorrt_llm::executor::ContextPhaseParams::mReqIdtensorrt_llm::executor::ContextPhaseParams::mFirstGenTokenstensorrt_llm::executor::ContextPhaseParams::mStatetensorrt_llm::executor::ContextPhaseParams::deleter()tensorrt_llm::executor::DebugConfigtensorrt_llm::executor::DebugConfig::DebugConfig()tensorrt_llm::executor::DebugConfig::operator==()tensorrt_llm::executor::DebugConfig::getDebugInputTensors()tensorrt_llm::executor::DebugConfig::getDebugOutputTensors()tensorrt_llm::executor::DebugConfig::getDebugTensorNames()tensorrt_llm::executor::DebugConfig::getDebugTensorsMaxIterations()tensorrt_llm::executor::DebugConfig::setDebugInputTensors()tensorrt_llm::executor::DebugConfig::setDebugOutputTensors()tensorrt_llm::executor::DebugConfig::setDebugTensorNames()tensorrt_llm::executor::DebugConfig::setDebugTensorsMaxIterations()tensorrt_llm::executor::DebugConfig::StringVectensorrt_llm::executo
r::DebugConfig::mDebugInputTensorstensorrt_llm::executor::DebugConfig::mDebugOutputTensorstensorrt_llm::executor::DebugConfig::mDebugTensorNamestensorrt_llm::executor::DebugConfig::mDebugTensorsMaxIterationstensorrt_llm::executor::DecodingConfigtensorrt_llm::executor::DecodingConfig::DecodingConfig()tensorrt_llm::executor::DecodingConfig::operator==()tensorrt_llm::executor::DecodingConfig::setDecodingMode()tensorrt_llm::executor::DecodingConfig::getDecodingMode()tensorrt_llm::executor::DecodingConfig::setLookaheadDecoding()tensorrt_llm::executor::DecodingConfig::getLookaheadDecodingConfig()tensorrt_llm::executor::DecodingConfig::setMedusaChoices()tensorrt_llm::executor::DecodingConfig::getMedusaChoices()tensorrt_llm::executor::DecodingConfig::setEagleConfig()tensorrt_llm::executor::DecodingConfig::getEagleConfig()tensorrt_llm::executor::DecodingConfig::mDecodingModetensorrt_llm::executor::DecodingConfig::mLookaheadDecodingConfigtensorrt_llm::executor::DecodingConfig::mMedusaChoicestensorrt_llm::executor::DecodingConfig::mEagleConfigtensorrt_llm::executor::DynamicBatchConfigtensorrt_llm::executor::DynamicBatchConfig::DynamicBatchConfig()tensorrt_llm::executor::DynamicBatchConfig::getDynamicBatchMovingAverageWindow()tensorrt_llm::executor::DynamicBatchConfig::getEnableBatchSizeTuning()tensorrt_llm::executor::DynamicBatchConfig::getEnableMaxNumTokensTuning()tensorrt_llm::executor::DynamicBatchConfig::getBatchSizeTable()tensorrt_llm::executor::DynamicBatchConfig::kDefaultDynamicBatchMovingAverageWindowtensorrt_llm::executor::DynamicBatchConfig::kDefaultBatchSizeTabletensorrt_llm::executor::DynamicBatchConfig::mEnableBatchSizeTuningtensorrt_llm::executor::DynamicBatchConfig::mEnableMaxNumTokensTuningtensorrt_llm::executor::DynamicBatchConfig::mDynamicBatchMovingAverageWindowtensorrt_llm::executor::DynamicBatchConfig::mBatchSizeTabletensorrt_llm::executor::EagleConfigtensorrt_llm::executor::EagleConfig::EagleConfig()tensorrt_llm::executor::EagleConfig::operator==()tensorr
t_llm::executor::EagleConfig::getEagleChoices()tensorrt_llm::executor::EagleConfig::getPosteriorThreshold()tensorrt_llm::executor::EagleConfig::isGreedySampling()tensorrt_llm::executor::EagleConfig::checkPosteriorValue()tensorrt_llm::executor::EagleConfig::mEagleChoicestensorrt_llm::executor::EagleConfig::mGreedySamplingtensorrt_llm::executor::EagleConfig::mPosteriorThresholdtensorrt_llm::executor::Executortensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::~Executor()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::operator=()tensorrt_llm::executor::Executor::Executor()tensorrt_llm::executor::Executor::operator=()tensorrt_llm::executor::Executor::enqueueRequest()tensorrt_llm::executor::Executor::enqueueRequests()tensorrt_llm::executor::Executor::awaitResponses()tensorrt_llm::executor::Executor::awaitResponses()tensorrt_llm::executor::Executor::awaitResponses()tensorrt_llm::executor::Executor::getNumResponsesReady()tensorrt_llm::executor::Executor::cancelRequest()tensorrt_llm::executor::Executor::shutdown()tensorrt_llm::executor::Executor::getLatestIterationStats()tensorrt_llm::executor::Executor::getLatestRequestStats()tensorrt_llm::executor::Executor::getLatestDebugTensors()tensorrt_llm::executor::Executor::canEnqueueRequests()tensorrt_llm::executor::Executor::isParticipant()tensorrt_llm::executor::Executor::getKVCacheEventManager()tensorrt_llm::executor::Executor::mImpltensorrt_llm::executor::ExecutorConfigtensorrt_llm::executor::ExecutorConfig::ExecutorConfig()tensorrt_llm::executor::ExecutorConfig::getMaxBeamWidth()tensorrt_llm::executor::ExecutorConfig::getSchedulerConfig()tensorrt_llm::executor::ExecutorConfig::getKvCacheConfig()tensorrt_llm::executor::ExecutorConfig::getSchedulerConfigRef
()tensorrt_llm::executor::ExecutorConfig::getKvCacheConfigRef()tensorrt_llm::executor::ExecutorConfig::getEnableChunkedContext()tensorrt_llm::executor::ExecutorConfig::getNormalizeLogProbs()tensorrt_llm::executor::ExecutorConfig::getIterStatsMaxIterations()tensorrt_llm::executor::ExecutorConfig::getRequestStatsMaxIterations()tensorrt_llm::executor::ExecutorConfig::getBatchingType()tensorrt_llm::executor::ExecutorConfig::getMaxBatchSize()tensorrt_llm::executor::ExecutorConfig::getMaxNumTokens()tensorrt_llm::executor::ExecutorConfig::getParallelConfig()tensorrt_llm::executor::ExecutorConfig::getPeftCacheConfig()tensorrt_llm::executor::ExecutorConfig::getLogitsPostProcessorConfig()tensorrt_llm::executor::ExecutorConfig::getDecodingConfig()tensorrt_llm::executor::ExecutorConfig::getGpuWeightsPercent()tensorrt_llm::executor::ExecutorConfig::getMaxQueueSize()tensorrt_llm::executor::ExecutorConfig::getExtendedRuntimePerfKnobConfig()tensorrt_llm::executor::ExecutorConfig::getDebugConfig()tensorrt_llm::executor::ExecutorConfig::getRecvPollPeriodMs()tensorrt_llm::executor::ExecutorConfig::getMaxSeqIdleMicroseconds()tensorrt_llm::executor::ExecutorConfig::getSpecDecConfig()tensorrt_llm::executor::ExecutorConfig::getGuidedDecodingConfig()tensorrt_llm::executor::ExecutorConfig::setMaxBeamWidth()tensorrt_llm::executor::ExecutorConfig::setMaxBatchSize()tensorrt_llm::executor::ExecutorConfig::setMaxNumTokens()tensorrt_llm::executor::ExecutorConfig::setSchedulerConfig()tensorrt_llm::executor::ExecutorConfig::setKvCacheConfig()tensorrt_llm::executor::ExecutorConfig::setEnableChunkedContext()tensorrt_llm::executor::ExecutorConfig::setNormalizeLogProbs()tensorrt_llm::executor::ExecutorConfig::setIterStatsMaxIterations()tensorrt_llm::executor::ExecutorConfig::setRequestStatsMaxIterations()tensorrt_llm::executor::ExecutorConfig::setBatchingType()tensorrt_llm::executor::ExecutorConfig::setParallelConfig()tensorrt_llm::executor::ExecutorConfig::setPeftCacheConfig()tensorrt_llm::executor::E
xecutorConfig::setLogitsPostProcessorConfig()tensorrt_llm::executor::ExecutorConfig::setDecodingConfig()tensorrt_llm::executor::ExecutorConfig::setGpuWeightsPercent()tensorrt_llm::executor::ExecutorConfig::setMaxQueueSize()tensorrt_llm::executor::ExecutorConfig::setExtendedRuntimePerfKnobConfig()tensorrt_llm::executor::ExecutorConfig::setDebugConfig()tensorrt_llm::executor::ExecutorConfig::setRecvPollPeriodMs()tensorrt_llm::executor::ExecutorConfig::setMaxSeqIdleMicroseconds()tensorrt_llm::executor::ExecutorConfig::setSpecDecConfig()tensorrt_llm::executor::ExecutorConfig::setGuidedDecodingConfig()tensorrt_llm::executor::ExecutorConfig::kDefaultMaxSeqIdleMicrosecondstensorrt_llm::executor::ExecutorConfig::kDefaultIterStatsMaxIterationstensorrt_llm::executor::ExecutorConfig::kDefaultRequestStatsMaxIterationstensorrt_llm::executor::ExecutorConfig::mMaxBeamWidthtensorrt_llm::executor::ExecutorConfig::mSchedulerConfigtensorrt_llm::executor::ExecutorConfig::mKvCacheConfigtensorrt_llm::executor::ExecutorConfig::mEnableChunkedContexttensorrt_llm::executor::ExecutorConfig::mNormalizeLogProbstensorrt_llm::executor::ExecutorConfig::mIterStatsMaxIterationstensorrt_llm::executor::ExecutorConfig::mRequestStatsMaxIterationstensorrt_llm::executor::ExecutorConfig::mBatchingTypetensorrt_llm::executor::ExecutorConfig::mMaxBatchSizetensorrt_llm::executor::ExecutorConfig::mMaxNumTokenstensorrt_llm::executor::ExecutorConfig::mParallelConfigtensorrt_llm::executor::ExecutorConfig::mPeftCacheConfigtensorrt_llm::executor::ExecutorConfig::mLogitsPostProcessorConfigtensorrt_llm::executor::ExecutorConfig::mDecodingConfigtensorrt_llm::executor::ExecutorConfig::mGpuWeightsPercenttensorrt_llm::executor::ExecutorConfig::mMaxQueueSizetensorrt_llm::executor::ExecutorConfig::mExtendedRuntimePerfKnobConfigtensorrt_llm::executor::ExecutorConfig::mDebugConfigtensorrt_llm::executor::ExecutorConfig::mRecvPollPeriodMstensorrt_llm::executor::ExecutorConfig::mMaxSeqIdleMicrosecondstensorrt_llm::executor::Exec
utorConfig::mSpeculativeDecodingConfigtensorrt_llm::executor::ExecutorConfig::mGuidedDecodingConfigtensorrt_llm::executor::ExtendedRuntimePerfKnobConfigtensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::ExtendedRuntimePerfKnobConfig()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::operator==()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getMultiBlockMode()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getEnableContextFMHAFP32Acc()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getCudaGraphMode()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::getCudaGraphCacheSize()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setMultiBlockMode()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setEnableContextFMHAFP32Acc()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphMode()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::setCudaGraphCacheSize()tensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mMultiBlockModetensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mEnableContextFMHAFP32Acctensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mCudaGraphModetensorrt_llm::executor::ExtendedRuntimePerfKnobConfig::mCudaGraphCacheSizetensorrt_llm::executor::ExternalDraftTokensConfigtensorrt_llm::executor::ExternalDraftTokensConfig::ExternalDraftTokensConfig()tensorrt_llm::executor::ExternalDraftTokensConfig::getTokens()tensorrt_llm::executor::ExternalDraftTokensConfig::getLogits()tensorrt_llm::executor::ExternalDraftTokensConfig::getAcceptanceThreshold()tensorrt_llm::executor::ExternalDraftTokensConfig::getFastLogits()tensorrt_llm::executor::ExternalDraftTokensConfig::mTokenstensorrt_llm::executor::ExternalDraftTokensConfig::mLogitstensorrt_llm::executor::ExternalDraftTokensConfig::mAcceptanceThresholdtensorrt_llm::executor::ExternalDraftTokensConfig::mFastLogitstensorrt_llm::executor::GuidedDecodingConfigtensorrt_llm::executor::GuidedDecodingConfig::GuidedDecodingBackendtensorrt_llm::executor::GuidedDe
codingConfig::GuidedDecodingConfig()tensorrt_llm::executor::GuidedDecodingConfig::operator==()tensorrt_llm::executor::GuidedDecodingConfig::setBackend()tensorrt_llm::executor::GuidedDecodingConfig::getBackend()tensorrt_llm::executor::GuidedDecodingConfig::setEncodedVocab()tensorrt_llm::executor::GuidedDecodingConfig::getEncodedVocab()tensorrt_llm::executor::GuidedDecodingConfig::setTokenizerStr()tensorrt_llm::executor::GuidedDecodingConfig::getTokenizerStr()tensorrt_llm::executor::GuidedDecodingConfig::setStopTokenIds()tensorrt_llm::executor::GuidedDecodingConfig::getStopTokenIds()tensorrt_llm::executor::GuidedDecodingConfig::validate()tensorrt_llm::executor::GuidedDecodingConfig::mBackendtensorrt_llm::executor::GuidedDecodingConfig::mEncodedVocabtensorrt_llm::executor::GuidedDecodingConfig::mTokenizerStrtensorrt_llm::executor::GuidedDecodingConfig::mStopTokenIdstensorrt_llm::executor::GuidedDecodingParamstensorrt_llm::executor::GuidedDecodingParams::GuideTypetensorrt_llm::executor::GuidedDecodingParams::GuidedDecodingParams()tensorrt_llm::executor::GuidedDecodingParams::operator==()tensorrt_llm::executor::GuidedDecodingParams::getGuideType()tensorrt_llm::executor::GuidedDecodingParams::getGuide()tensorrt_llm::executor::GuidedDecodingParams::mGuideTypetensorrt_llm::executor::GuidedDecodingParams::mGuidetensorrt_llm::executor::JsonSerialization
+tensorrt_llm::executor::KvCacheConfigtensorrt_llm::executor::KvCacheConfig::KvCacheConfig()tensorrt_llm::executor::KvCacheConfig::getEnableBlockReuse()tensorrt_llm::executor::KvCacheConfig::getMaxTokens()tensorrt_llm::executor::KvCacheConfig::getMaxAttentionWindowVec()tensorrt_llm::executor::KvCacheConfig::getSinkTokenLength()tensorrt_llm::executor::KvCacheConfig::getFreeGpuMemoryFraction()tensorrt_llm::executor::KvCacheConfig::getCrossKvCacheFraction()tensorrt_llm::executor::KvCacheConfig::getHostCacheSize()tensorrt_llm::executor::KvCacheConfig::getOnboardBlocks()tensorrt_llm::executor::KvCacheConfig::getSecondaryOffloadMinPriority()tensorrt_llm::executor::KvCacheConfig::getEventBufferMaxSize()tensorrt_llm::executor::KvCacheConfig::setEnableBlockReuse()tensorrt_llm::executor::KvCacheConfig::setMaxTokens()tensorrt_llm::executor::KvCacheConfig::setMaxAttentionWindowVec()tensorrt_llm::executor::KvCacheConfig::setSinkTokenLength()tensorrt_llm::executor::KvCacheConfig::setFreeGpuMemoryFraction()tensorrt_llm::executor::KvCacheConfig::setCrossKvCacheFraction()tensorrt_llm::executor::KvCacheConfig::setHostCacheSize()tensorrt_llm::executor::KvCacheConfig::setOnboardBlocks()tensorrt_llm::executor::KvCacheConfig::setSecondaryOffloadMinPriority()tensorrt_llm::executor::KvCacheConfig::setEventBufferMaxSize()tensorrt_llm::executor::KvCacheConfig::fillEmptyFieldsFromRuntimeDefaults()tensorrt_llm::executor::KvCacheConfig::mEnableBlockReusetensorrt_llm::executor::KvCacheConfig::mMaxTokenstensorrt_llm::executor::KvCacheConfig::mMaxAttentionWindowVectensorrt_llm::executor::KvCacheConfig::mSinkTokenLengthtensorrt_llm::executor::KvCacheConfig::mFreeGpuMemoryFractiontensorrt_llm::executor::KvCacheConfig::mCrossKvCacheFractiontensorrt_llm::executor::KvCacheConfig::mHostCacheSizetensorrt_llm::executor::KvCacheConfig::mOnboardBlockstensorrt_llm::executor::KvCacheConfig::mSecondaryOffloadMinPrioritytensorrt_llm::executor::KvCacheConfig::mEventBufferMaxSizetensorrt_llm::executor::KVCacheCre
atedData
+tensorrt_llm::executor::KVCacheEvent
+tensorrt_llm::executor::KVCacheEventDiff
+tensorrt_llm::executor::KVCacheEventManager
+tensorrt_llm::executor::KVCacheRemovedData
+tensorrt_llm::executor::KvCacheRetentionConfigtensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig()tensorrt_llm::executor::KvCacheRetentionConfig::KvCacheRetentionConfig()tensorrt_llm::executor::KvCacheRetentionConfig::getTokenRangeRetentionConfigs()tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeRetentionPriority()tensorrt_llm::executor::KvCacheRetentionConfig::getDecodeDurationMs()tensorrt_llm::executor::KvCacheRetentionConfig::getPerBlockRetentionPriorityDuration()tensorrt_llm::executor::KvCacheRetentionConfig::kMinRetentionPrioritytensorrt_llm::executor::KvCacheRetentionConfig::kMaxRetentionPrioritytensorrt_llm::executor::KvCacheRetentionConfig::kDefaultRetentionPrioritytensorrt_llm::executor::KvCacheRetentionConfig::mTokenRangeRetentionConfigstensorrt_llm::executor::KvCacheRetentionConfig::mDecodeRetentionPrioritytensorrt_llm::executor::KvCacheRetentionConfig::mDecodeDurationMstensorrt_llm::executor::KvCacheRetentionConfig::TokenRangeRetentionConfigtensorrt_llm::executor::KVCacheStoredBlockDatatensorrt_llm::executor::KVCacheStoredBlockData::KVCacheStoredBlockData()tensorrt_llm::executor::KVCacheStoredBlockData::blockHashtensorrt_llm::executor::KVCacheStoredBlockData::tokenstensorrt_llm::executor::KVCacheStoredBlockData::loraIdtensorrt_llm::executor::KVCacheStoredBlockData::cacheLeveltensorrt_llm::executor::KVCacheStoredBlockData::prioritytensorrt_llm::executor::KVCacheStoredData
+tensorrt_llm::executor::KVCacheUpdatedDatatensorrt_llm::executor::KVCacheUpdatedData::KVCacheUpdatedData()tensorrt_llm::executor::KVCacheUpdatedData::cacheLevelUpdated()tensorrt_llm::executor::KVCacheUpdatedData::priorityUpdated()tensorrt_llm::executor::KVCacheUpdatedData::blockHashtensorrt_llm::executor::KVCacheUpdatedData::cacheLeveltensorrt_llm::executor::KVCacheUpdatedData::prioritytensorrt_llm::executor::LogitsPostProcessorConfigtensorrt_llm::executor::LogitsPostProcessorConfig::LogitsPostProcessorConfig()tensorrt_llm::executor::LogitsPostProcessorConfig::getProcessorMap()tensorrt_llm::executor::LogitsPostProcessorConfig::getProcessorBatched()tensorrt_llm::executor::LogitsPostProcessorConfig::getReplicate()tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorMap()tensorrt_llm::executor::LogitsPostProcessorConfig::setProcessorBatched()tensorrt_llm::executor::LogitsPostProcessorConfig::setReplicate()tensorrt_llm::executor::LogitsPostProcessorConfig::mProcessorMaptensorrt_llm::executor::LogitsPostProcessorConfig::mProcessorBatchedtensorrt_llm::executor::LogitsPostProcessorConfig::mReplicatetensorrt_llm::executor::LookaheadDecodingConfigtensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig()tensorrt_llm::executor::LookaheadDecodingConfig::LookaheadDecodingConfig()tensorrt_llm::executor::LookaheadDecodingConfig::operator==()tensorrt_llm::executor::LookaheadDecodingConfig::get()tensorrt_llm::executor::LookaheadDecodingConfig::getWindowSize()tensorrt_llm::executor::LookaheadDecodingConfig::getNgramSize()tensorrt_llm::executor::LookaheadDecodingConfig::getVerificationSetSize()tensorrt_llm::executor::LookaheadDecodingConfig::calculateSpeculativeResource()tensorrt_llm::executor::LookaheadDecodingConfig::isLE()tensorrt_llm::executor::LookaheadDecodingConfig::isLegal()tensorrt_llm::executor::LookaheadDecodingConfig::mWindowSizetensorrt_llm::executor::LookaheadDecodingConfig::mNgramSizetensorrt_llm::executor::LookaheadDecodingConfig::mVerificat
ionSetSizetensorrt_llm::executor::LoraConfigtensorrt_llm::executor::LoraConfig::LoraConfig()tensorrt_llm::executor::LoraConfig::getTaskId()tensorrt_llm::executor::LoraConfig::getWeights()tensorrt_llm::executor::LoraConfig::getConfig()tensorrt_llm::executor::LoraConfig::mTaskIdtensorrt_llm::executor::LoraConfig::mWeightstensorrt_llm::executor::LoraConfig::mConfigtensorrt_llm::executor::MropeConfig
+tensorrt_llm::executor::OrchestratorConfigtensorrt_llm::executor::OrchestratorConfig::OrchestratorConfig()tensorrt_llm::executor::OrchestratorConfig::getIsOrchestrator()tensorrt_llm::executor::OrchestratorConfig::getWorkerExecutablePath()tensorrt_llm::executor::OrchestratorConfig::getOrchLeaderComm()tensorrt_llm::executor::OrchestratorConfig::getSpawnProcesses()tensorrt_llm::executor::OrchestratorConfig::setIsOrchestrator()tensorrt_llm::executor::OrchestratorConfig::setWorkerExecutablePath()tensorrt_llm::executor::OrchestratorConfig::setOrchLeaderComm()tensorrt_llm::executor::OrchestratorConfig::setSpawnProcesses()tensorrt_llm::executor::OrchestratorConfig::mIsOrchestratortensorrt_llm::executor::OrchestratorConfig::mWorkerExecutablePathtensorrt_llm::executor::OrchestratorConfig::mOrchLeaderCommtensorrt_llm::executor::OrchestratorConfig::mSpawnProcessestensorrt_llm::executor::OutputConfigtensorrt_llm::executor::OutputConfig::OutputConfig()tensorrt_llm::executor::OutputConfig::returnLogProbstensorrt_llm::executor::OutputConfig::returnContextLogitstensorrt_llm::executor::OutputConfig::returnGenerationLogitstensorrt_llm::executor::OutputConfig::excludeInputFromOutputtensorrt_llm::executor::OutputConfig::returnEncoderOutputtensorrt_llm::executor::OutputConfig::returnPerfMetricstensorrt_llm::executor::ParallelConfigtensorrt_llm::executor::ParallelConfig::ParallelConfig()tensorrt_llm::executor::ParallelConfig::getCommunicationType()tensorrt_llm::executor::ParallelConfig::getCommunicationMode()tensorrt_llm::executor::ParallelConfig::getDeviceIds()tensorrt_llm::executor::ParallelConfig::getParticipantIds()tensorrt_llm::executor::ParallelConfig::getOrchestratorConfig()tensorrt_llm::executor::ParallelConfig::setCommunicationType()tensorrt_llm::executor::ParallelConfig::setCommunicationMode()tensorrt_llm::executor::ParallelConfig::setDeviceIds()tensorrt_llm::executor::ParallelConfig::setParticipantIds()tensorrt_llm::executor::ParallelConfig::setOrchestratorConfig()tensorrt_llm
::executor::ParallelConfig::mCommTypetensorrt_llm::executor::ParallelConfig::mCommModetensorrt_llm::executor::ParallelConfig::mDeviceIdstensorrt_llm::executor::ParallelConfig::mParticipantIdstensorrt_llm::executor::ParallelConfig::mOrchestratorConfigtensorrt_llm::executor::PeftCacheConfigtensorrt_llm::executor::PeftCacheConfig::PeftCacheConfig()tensorrt_llm::executor::PeftCacheConfig::operator==()tensorrt_llm::executor::PeftCacheConfig::getNumHostModuleLayer()tensorrt_llm::executor::PeftCacheConfig::getNumDeviceModuleLayer()tensorrt_llm::executor::PeftCacheConfig::getOptimalAdapterSize()tensorrt_llm::executor::PeftCacheConfig::getMaxAdapterSize()tensorrt_llm::executor::PeftCacheConfig::getNumPutWorkers()tensorrt_llm::executor::PeftCacheConfig::getNumEnsureWorkers()tensorrt_llm::executor::PeftCacheConfig::getNumCopyStreams()tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockHost()tensorrt_llm::executor::PeftCacheConfig::getMaxPagesPerBlockDevice()tensorrt_llm::executor::PeftCacheConfig::getDeviceCachePercent()tensorrt_llm::executor::PeftCacheConfig::getHostCacheSize()tensorrt_llm::executor::PeftCacheConfig::kDefaultOptimalAdapterSizetensorrt_llm::executor::PeftCacheConfig::kDefaultMaxAdapterSizetensorrt_llm::executor::PeftCacheConfig::kDefaultMaxPagesPerBlockHosttensorrt_llm::executor::PeftCacheConfig::kDefaultMaxPagesPerBlockDevicetensorrt_llm::executor::PeftCacheConfig::mNumHostModuleLayertensorrt_llm::executor::PeftCacheConfig::mNumDeviceModuleLayertensorrt_llm::executor::PeftCacheConfig::mOptimalAdapterSizetensorrt_llm::executor::PeftCacheConfig::mMaxAdapterSizetensorrt_llm::executor::PeftCacheConfig::mNumPutWorkerstensorrt_llm::executor::PeftCacheConfig::mNumEnsureWorkerstensorrt_llm::executor::PeftCacheConfig::mNumCopyStreamstensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockHosttensorrt_llm::executor::PeftCacheConfig::mMaxPagesPerBlockDevicetensorrt_llm::executor::PeftCacheConfig::mDeviceCachePercenttensorrt_llm::executor::PeftCacheConfig::mHos
tCacheSizetensorrt_llm::executor::PromptTuningConfigtensorrt_llm::executor::PromptTuningConfig::PromptTuningConfig()tensorrt_llm::executor::PromptTuningConfig::getEmbeddingTable()tensorrt_llm::executor::PromptTuningConfig::getInputTokenExtraIds()tensorrt_llm::executor::PromptTuningConfig::mEmbeddingTabletensorrt_llm::executor::PromptTuningConfig::mInputTokenExtraIdstensorrt_llm::executor::Requesttensorrt_llm::executor::Request::Request()tensorrt_llm::executor::Request::Request()tensorrt_llm::executor::Request::Request()tensorrt_llm::executor::Request::operator=()tensorrt_llm::executor::Request::operator=()tensorrt_llm::executor::Request::~Request()tensorrt_llm::executor::Request::getInputTokenIds()tensorrt_llm::executor::Request::getMaxTokens()tensorrt_llm::executor::Request::getMaxNewTokens()tensorrt_llm::executor::Request::getStreaming()tensorrt_llm::executor::Request::getSamplingConfig()tensorrt_llm::executor::Request::getOutputConfig()tensorrt_llm::executor::Request::getEndId()tensorrt_llm::executor::Request::getPadId()tensorrt_llm::executor::Request::getPositionIds()tensorrt_llm::executor::Request::getBadWords()tensorrt_llm::executor::Request::getStopWords()tensorrt_llm::executor::Request::getEmbeddingBias()tensorrt_llm::executor::Request::getExternalDraftTokensConfig()tensorrt_llm::executor::Request::getPromptTuningConfig()tensorrt_llm::executor::Request::getMropeConfig()tensorrt_llm::executor::Request::getLoraConfig()tensorrt_llm::executor::Request::getLookaheadConfig()tensorrt_llm::executor::Request::getKvCacheRetentionConfig()tensorrt_llm::executor::Request::getLogitsPostProcessorName()tensorrt_llm::executor::Request::getEncoderInputTokenIds()tensorrt_llm::executor::Request::getClientId()tensorrt_llm::executor::Request::getPriority()tensorrt_llm::executor::Request::getReturnAllGeneratedTokens()tensorrt_llm::executor::Request::getContextPhaseParams()tensorrt_llm::executor::Request::getEncoderInputFeatures()tensorrt_llm::executor::Request::getEncoderOutputLen
gth()tensorrt_llm::executor::Request::getCrossAttentionMask()tensorrt_llm::executor::Request::getRequestType()tensorrt_llm::executor::Request::getNumReturnSequences()tensorrt_llm::executor::Request::getEagleConfig()tensorrt_llm::executor::Request::getSkipCrossAttnBlocks()tensorrt_llm::executor::Request::getGuidedDecodingParams()tensorrt_llm::executor::Request::getAllottedTimeMs()tensorrt_llm::executor::Request::setStreaming()tensorrt_llm::executor::Request::setSamplingConfig()tensorrt_llm::executor::Request::setOutputConfig()tensorrt_llm::executor::Request::setEndId()tensorrt_llm::executor::Request::setPadId()tensorrt_llm::executor::Request::setPositionIds()tensorrt_llm::executor::Request::setBadWords()tensorrt_llm::executor::Request::setStopWords()tensorrt_llm::executor::Request::setEmbeddingBias()tensorrt_llm::executor::Request::setExternalDraftTokensConfig()tensorrt_llm::executor::Request::setPromptTuningConfig()tensorrt_llm::executor::Request::setMropeConfig()tensorrt_llm::executor::Request::setLoraConfig()tensorrt_llm::executor::Request::setLookaheadConfig()tensorrt_llm::executor::Request::setKvCacheRetentionConfig()tensorrt_llm::executor::Request::setLogitsPostProcessorName()tensorrt_llm::executor::Request::setEncoderInputTokenIds()tensorrt_llm::executor::Request::setClientId()tensorrt_llm::executor::Request::setPriority()tensorrt_llm::executor::Request::setReturnAllGeneratedTokens()tensorrt_llm::executor::Request::setRequestType()tensorrt_llm::executor::Request::setContextPhaseParams()tensorrt_llm::executor::Request::setEncoderInputFeatures()tensorrt_llm::executor::Request::setEncoderOutputLength()tensorrt_llm::executor::Request::setCrossAttentionMask()tensorrt_llm::executor::Request::setNumReturnSequences()tensorrt_llm::executor::Request::setEagleConfig()tensorrt_llm::executor::Request::setSkipCrossAttnBlocks()tensorrt_llm::executor::Request::setGuidedDecodingParams()tensorrt_llm::executor::Request::setAllottedTimeMs()tensorrt_llm::executor::Request::kDefaul
tPrioritytensorrt_llm::executor::Request::kBatchedPostProcessorNametensorrt_llm::executor::Request::mImpltensorrt_llm::executor::Responsetensorrt_llm::executor::Response::Response()tensorrt_llm::executor::Response::Response()tensorrt_llm::executor::Response::~Response()tensorrt_llm::executor::Response::Response()tensorrt_llm::executor::Response::Response()tensorrt_llm::executor::Response::operator=()tensorrt_llm::executor::Response::operator=()tensorrt_llm::executor::Response::getRequestId()tensorrt_llm::executor::Response::getClientId()tensorrt_llm::executor::Response::hasError()tensorrt_llm::executor::Response::getErrorMsg()tensorrt_llm::executor::Response::getResult()tensorrt_llm::executor::Response::mImpltensorrt_llm::executor::Resulttensorrt_llm::executor::Result::isFinaltensorrt_llm::executor::Result::outputTokenIdstensorrt_llm::executor::Result::cumLogProbstensorrt_llm::executor::Result::logProbstensorrt_llm::executor::Result::contextLogitstensorrt_llm::executor::Result::generationLogitstensorrt_llm::executor::Result::specDecFastLogitsInfotensorrt_llm::executor::Result::encoderOutputtensorrt_llm::executor::Result::finishReasonstensorrt_llm::executor::Result::contextPhaseParamstensorrt_llm::executor::Result::decodingItertensorrt_llm::executor::Result::sequenceIndextensorrt_llm::executor::Result::isSequenceFinaltensorrt_llm::executor::Result::requestPerfMetricstensorrt_llm::executor::RetentionPriorityAndDuration
+tensorrt_llm::executor::SamplingConfigtensorrt_llm::executor::SamplingConfig::SamplingConfig()tensorrt_llm::executor::SamplingConfig::operator==()tensorrt_llm::executor::SamplingConfig::getBeamWidth()tensorrt_llm::executor::SamplingConfig::getNumReturnBeams()tensorrt_llm::executor::SamplingConfig::getTopK()tensorrt_llm::executor::SamplingConfig::getTopP()tensorrt_llm::executor::SamplingConfig::getTopPMin()tensorrt_llm::executor::SamplingConfig::getTopPResetIds()tensorrt_llm::executor::SamplingConfig::getTopPDecay()tensorrt_llm::executor::SamplingConfig::getSeed()tensorrt_llm::executor::SamplingConfig::getRandomSeed()tensorrt_llm::executor::SamplingConfig::getTemperature()tensorrt_llm::executor::SamplingConfig::getMinTokens()tensorrt_llm::executor::SamplingConfig::getMinLength()tensorrt_llm::executor::SamplingConfig::getBeamSearchDiversityRate()tensorrt_llm::executor::SamplingConfig::getRepetitionPenalty()tensorrt_llm::executor::SamplingConfig::getPresencePenalty()tensorrt_llm::executor::SamplingConfig::getFrequencyPenalty()tensorrt_llm::executor::SamplingConfig::getLengthPenalty()tensorrt_llm::executor::SamplingConfig::getEarlyStopping()tensorrt_llm::executor::SamplingConfig::getNoRepeatNgramSize()tensorrt_llm::executor::SamplingConfig::getNumReturnSequences()tensorrt_llm::executor::SamplingConfig::setBeamWidth()tensorrt_llm::executor::SamplingConfig::setTopK()tensorrt_llm::executor::SamplingConfig::setTopP()tensorrt_llm::executor::SamplingConfig::setTopPMin()tensorrt_llm::executor::SamplingConfig::setTopPResetIds()tensorrt_llm::executor::SamplingConfig::setTopPDecay()tensorrt_llm::executor::SamplingConfig::setSeed()tensorrt_llm::executor::SamplingConfig::setRandomSeed()tensorrt_llm::executor::SamplingConfig::setTemperature()tensorrt_llm::executor::SamplingConfig::setMinTokens()tensorrt_llm::executor::SamplingConfig::setMinLength()tensorrt_llm::executor::SamplingConfig::setBeamSearchDiversityRate()tensorrt_llm::executor::SamplingConfig::setRepetitionPenalty()tensor
rt_llm::executor::SamplingConfig::setPresencePenalty()tensorrt_llm::executor::SamplingConfig::setFrequencyPenalty()tensorrt_llm::executor::SamplingConfig::setLengthPenalty()tensorrt_llm::executor::SamplingConfig::setEarlyStopping()tensorrt_llm::executor::SamplingConfig::setNoRepeatNgramSize()tensorrt_llm::executor::SamplingConfig::setNumReturnSequences()tensorrt_llm::executor::SamplingConfig::updateNumReturnBeams()tensorrt_llm::executor::SamplingConfig::mBeamWidthtensorrt_llm::executor::SamplingConfig::mTopKtensorrt_llm::executor::SamplingConfig::mTopPtensorrt_llm::executor::SamplingConfig::mTopPMintensorrt_llm::executor::SamplingConfig::mTopPResetIdstensorrt_llm::executor::SamplingConfig::mTopPDecaytensorrt_llm::executor::SamplingConfig::mSeedtensorrt_llm::executor::SamplingConfig::mTemperaturetensorrt_llm::executor::SamplingConfig::mMinTokenstensorrt_llm::executor::SamplingConfig::mBeamSearchDiversityRatetensorrt_llm::executor::SamplingConfig::mRepetitionPenaltytensorrt_llm::executor::SamplingConfig::mPresencePenaltytensorrt_llm::executor::SamplingConfig::mFrequencyPenaltytensorrt_llm::executor::SamplingConfig::mLengthPenaltytensorrt_llm::executor::SamplingConfig::mEarlyStoppingtensorrt_llm::executor::SamplingConfig::mNoRepeatNgramSizetensorrt_llm::executor::SamplingConfig::mNumReturnSequencestensorrt_llm::executor::SamplingConfig::mNumReturnBeamstensorrt_llm::executor::SamplingConfig::checkBeamWidth()tensorrt_llm::executor::SamplingConfig::checkTopK()tensorrt_llm::executor::SamplingConfig::checkTopP()tensorrt_llm::executor::SamplingConfig::checkTopPMin()tensorrt_llm::executor::SamplingConfig::checkTopPResetIds()tensorrt_llm::executor::SamplingConfig::checkTopPDecay()tensorrt_llm::executor::SamplingConfig::checkTemperature()tensorrt_llm::executor::SamplingConfig::checkRepetitionPenalty()tensorrt_llm::executor::SamplingConfig::checkMinTokens()tensorrt_llm::executor::SamplingConfig::checkNoRepeatNgramSize()tensorrt_llm::executor::SamplingConfig::checkBeamSearchDiver
sityRate()tensorrt_llm::executor::SamplingConfig::checkNumReturnSequences()tensorrt_llm::executor::SchedulerConfigtensorrt_llm::executor::SchedulerConfig::SchedulerConfig()tensorrt_llm::executor::SchedulerConfig::operator==()tensorrt_llm::executor::SchedulerConfig::getCapacitySchedulerPolicy()tensorrt_llm::executor::SchedulerConfig::getContextChunkingPolicy()tensorrt_llm::executor::SchedulerConfig::getDynamicBatchConfig()tensorrt_llm::executor::SchedulerConfig::mCapacitySchedulerPolicytensorrt_llm::executor::SchedulerConfig::mContextChunkingPolicytensorrt_llm::executor::SchedulerConfig::mDynamicBatchConfigtensorrt_llm::executor::SpeculativeDecodingConfig
+tensorrt_llm::executor::SpeculativeDecodingFastLogitsInfo
+tensorrt_llm::mpiPerformance
Public Functions
+Constructs a DisaggExecutorOrchestrator object.
+ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each context executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
+requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue requests in the same context executor. If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests. The id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
+requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests, and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true, enqueue requests in the same generation executor. If false, will try to use a different executor for each request.
Await for context responses.
+timeout – The maximum time to wait for new responses.
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors. If hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Await for generation responses.
+timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors. If hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
+Indicates if the current process is allowed to enqueueRequests.
+Get context executors.
+Get generation executors.
+Private Members
+Public Functions
+Typedefs
Public Functions
-Constructs a DisaggExecutorOrchestrator object.
-ctxEnginePaths – A vector of file paths to context engine files.
genEnginePaths – A vector of file paths to generation engine files.
ctxExecutorConfigs – A vector of ExecutorConfig for context executors.
genExecutorConfigs – A vector of ExecutorConfig for generation executors.
hasContextAwaitThreads – Whether or not there are threads that receive response for each generation executor.
hasGenAwaitThreads – Whether or not there are threads that receive response for each generation executor.
Enqueue context-only requests to context executors.
-requests – A vector of context-only requests.
selectContextId – The index of the context executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same context executor.If false, will try to use a different executor for each request.
A vector of global request ids, corresponding to the order of the requests in requests, the id returned may be different from the request id in each executor.
Enqueue generation-only requests to generation executors.
-requests – A vector of generation-only requests.
globalRequestIds – A vector of global request ids, corresponding to the order of the requests,and must be the ids returned by the enqueueContext function.
selectGenIdx – The index of the generation executor to use. If std::nullopt, the executor that has the smallest number of inflight requests will be used.
batch – If true,enqueue requests in same generation executor.If false, will try to use a different executor for each request.
Await for context responses.
-timeout – The maximum time to wait for new responses
contextIdx – The index of the context executor to use. If std::nullopt, return ready responses in all context executors,if hasContextAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids
-Await for generation responses.
-timeout – The maximum time to wait for new responses.
genIdx – The index of the generation executor to use. If std::nullopt, return ready responses in all generation executors,if hasGenAwaitThreads is true, then this parameter must be std::nullopt.
A vector of responses with corresponding global request ids.
-Indicates if the current process is allowed to enqueueRequests.
-Get context executors.
-Get generation executors.
-Private Members
-Public Functions
-