Merge remote-tracking branch 'origin/main' into feat/b300_cu13

Signed-off-by: Xiwen Yu <13230610+VALLIS-NERIA@users.noreply.github.com>
Xiwen Yu 2025-09-09 14:34:27 +08:00
commit a8b630f178
146 changed files with 6537 additions and 1319 deletions


@ -45,6 +45,10 @@ option(ENABLE_MULTI_DEVICE
option(ENABLE_UCX "Enable building with UCX (Uniform Communication X) support"
ON)
option(NVRTC_DYNAMIC_LINKING "Link against the dynamic NVRTC libraries" OFF)
option(CUBLAS_DYNAMIC_LINKING
"Link against the dynamic cublas/cublasLt libraries" ON)
option(CURAND_DYNAMIC_LINKING "Link against the dynamic curand library" ON)
option(ENABLE_NVSHMEM "Enable building with NVSHMEM support" OFF)
option(USING_OSS_CUTLASS_LOW_LATENCY_GEMM
"Using open sourced Cutlass low latency gemm kernel" ON)
@ -153,26 +157,55 @@ enable_language(C CXX CUDA)
# after that CMake handles it just fine.
setup_cuda_architectures()
find_package(CUDAToolkit 11.2 REQUIRED COMPONENTS cudart_static cuda_driver
cublas cublasLt curand nvml)
set(CUBLAS_LIB CUDA::cublas)
set(CUBLASLT_LIB CUDA::cublasLt)
set(CURAND_LIB CUDA::curand)
set(CUDA_DRV_LIB CUDA::cuda_driver)
set(CUDA_NVML_LIB CUDA::nvml)
set(CUDA_RT_LIB CUDA::cudart_static)
set(NVPTX_LIB CUDA::nvptxcompiler_static)
set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
set(CUDA_TOOLKIT_COMPONENTS cudart_static cuda_driver nvml nvptxcompiler_static)
if(CUBLAS_DYNAMIC_LINKING)
set(CUBLAS_LIB CUDA::cublas)
set(CUBLASLT_LIB CUDA::cublasLt)
list(APPEND CUDA_TOOLKIT_COMPONENTS cublas cublasLt)
else()
if(WIN32)
message(FATAL_ERROR "Static cublas not available on windows")
endif()
message(DEBUG "Linking with static cublas libs")
set(CUBLAS_LIB CUDA::cublas_static)
set(CUBLASLT_LIB CUDA::cublasLt_static)
list(APPEND CUDA_TOOLKIT_COMPONENTS cublas_static cublasLt_static)
endif()
if(CURAND_DYNAMIC_LINKING)
set(CURAND_LIB CUDA::curand)
list(APPEND CUDA_TOOLKIT_COMPONENTS curand)
else()
if(WIN32)
message(FATAL_ERROR "Static curand not available on windows")
endif()
message(DEBUG "Linking with static curand lib")
set(CURAND_LIB CUDA::curand_static)
list(APPEND CUDA_TOOLKIT_COMPONENTS curand_static)
endif()
if(NVRTC_DYNAMIC_LINKING)
set(NVRTC_LIB CUDA::nvrtc)
set(NVRTC_BUILTINS_LIB CUDA::nvrtc_builtins)
list(APPEND CUDA_TOOLKIT_COMPONENTS nvrtc nvrtc_builtins)
else()
set(NVRTC_LIB CUDA::nvrtc_static)
set(NVRTC_BUILTINS_LIB CUDA::nvrtc_builtins_static)
list(APPEND CUDA_TOOLKIT_COMPONENTS nvrtc_static nvrtc_builtins_static)
endif()
find_package(CUDAToolkit 11.2 REQUIRED COMPONENTS ${CUDA_TOOLKIT_COMPONENTS})
set(CMAKE_CUDA_RUNTIME_LIBRARY Static)
resolve_dirs(CUDAToolkit_INCLUDE_DIRS "${CUDAToolkit_INCLUDE_DIRS}")
message(STATUS "CUDA library status:")


@ -72,20 +72,20 @@ class CacheTransceiver : public BaseCacheTransceiver
public:
CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager,
executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
nvinfer1::DataType dataType,
std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
executor::kv_cache::CacheState::AttentionType attentionType
= executor::kv_cache::CacheState::AttentionType::kDEFAULT,
std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt);
CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager, std::vector<SizeType32> numKvHeadsPerLayer,
SizeType32 sizePerHead, SizeType32 tokensPerBlock, runtime::WorldConfig const& worldConfig,
nvinfer1::DataType dataType,
std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
executor::kv_cache::CacheState::AttentionType attentionType
= executor::kv_cache::CacheState::AttentionType::kDEFAULT,
std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig = std::nullopt)
: CacheTransceiver(cacheManager,
executor::kv_cache::CacheState::ModelConfig{numKvHeadsPerLayer, sizePerHead, tokensPerBlock}, worldConfig,
dataType, attentionType, cacheTransceiverConfig)
attentionLayerNumPerPP, dataType, attentionType, cacheTransceiverConfig)
{
}


@ -22,6 +22,7 @@
#include "tensorrt_llm/batch_manager/llmRequest.h" // TODO forward declare
#include "tensorrt_llm/common/optionalRef.h"
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/executor/transferAgent.h"
#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
@ -42,6 +43,8 @@
#include <unordered_map>
#include <vector>
namespace kvc = tensorrt_llm::executor::kv_cache;
namespace tensorrt_llm::batch_manager::eviction_policy
{
class BaseEvictionPolicy;
@ -448,6 +451,16 @@ public:
return mKvCacheRetentionConfig.getDecodeDurationMs();
}
[[nodiscard]] executor::KvCacheTransferMode getTransferMode() const
{
return mKvCacheRetentionConfig.getTransferMode();
}
[[nodiscard]] std::string const& getDirectory() const
{
return mKvCacheRetentionConfig.getDirectory();
}
// @brief Check whether the sequence uses cyclic KV cache.
// @return `true` if we have begun overwriting the beginning of the sequence's KV cache.
// @details If `true`, we cannot store the sequence's KV cache for reuse.
@ -541,7 +554,8 @@ public:
SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream,
bool onboardBlocks, CacheType cacheType, std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager);
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr);
~WindowBlockManager();
@ -705,11 +719,13 @@ public:
//! \brief Bring offloaded block from secondary to primary memory.
//! \details Does nothing if block is already in primary memory.
void onboardBlock(BlockPtr const& offloadBlock);
void onboardBlock(BlockPtr const& offloadBlock,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
//! \brief Bring block from primary to secondary memory.
//! \details Does nothing if block is already in secondary memory.
void offloadBlock(BlockPtr const& block);
void offloadBlock(BlockPtr const& block, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
std::string const& directory = "");
//! \brief Find the first new block that must be allocated for the context phase and return its concatenated token vectors.
//! \details Only full blocks are considered.
@ -763,7 +779,8 @@ private:
//! \param sequence Sequence to which blocks are assigned.
//! \return Number of matched tokens from loaded blocks.
SizeType32 loadOrAllocateBlocks(std::vector<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions);
GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
//! \brief Free block and all its descendants. This makes the block a claimed leaf block.
void freeChildren(BlockPtr const& block, executor::RetentionPriority priority,
@ -772,7 +789,8 @@ private:
//! \brief Find block least likely to be reused, free it if necessary and return.
[[nodiscard]] BlockPtr getFreeBlock(
executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority,
std::optional<std::chrono::milliseconds> durationMs = std::nullopt);
std::optional<std::chrono::milliseconds> durationMs = std::nullopt,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
//! \brief Free block from previous block and claim it from free blocks list.
void claimLeafBlock(BlockPtr const& block, std::optional<executor::RetentionPriority> priority = std::nullopt,
@ -820,6 +838,8 @@ private:
std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
// Event manager
std::shared_ptr<KVCacheEventManager> mEventManager;
// Pointer to parent loopback agent
std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
// Transfer manager
std::shared_ptr<KVCacheTransferManager> mTransferManager;
@ -867,7 +887,8 @@ public:
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enablePartialReuse = true,
bool copyOnPartialReuse = true,
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr);
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager = nullptr,
std::optional<kvc::BaseAgentConfig> agentConfig = std::nullopt);
BlockManager(BlockManager const&) = delete;
BlockManager& operator=(BlockManager const&) = delete;
@ -916,11 +937,13 @@ public:
//! \brief Bring block from primary to secondary memory for window size.
//! \details Does nothing if block is already in primary memory.
void onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize);
void onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
//! \brief Bring block from primary to secondary memory for window size.
//! \details Does nothing if block is already in secondary memory.
void offloadBlock(BlockPtr const& block, SizeType32 windowSize);
void offloadBlock(BlockPtr const& block, SizeType32 windowSize,
executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM, std::string const& directory = "");
void storeBlocks(std::vector<BlockKey> const& blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
SizeType32 windowSize)
@ -1159,6 +1182,7 @@ private:
SizeType32 mNumLayers;
SizeType32 mTokensPerBlock;
std::shared_ptr<KVCacheEventManager> mEventManager;
std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
CudaStreamPtr mStream;
CacheType mCacheType;


@ -20,6 +20,7 @@
#include "tensorrt_llm/runtime/cudaEvent.h"
namespace tr = tensorrt_llm::runtime;
namespace kvc = tensorrt_llm::executor::kv_cache;
#pragma once
@ -32,17 +33,18 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager
class KVCacheTransferManager
{
public:
explicit KVCacheTransferManager(tr::BufferManager const& bufferManager);
explicit KVCacheTransferManager(
tr::BufferManager const& bufferManager, std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent = nullptr);
//! \brief Onboard a block to gpu memory.
void onboard(BlockPtr const& offloadBlock, BlockPtr const& block, std::vector<KVCacheBlockPool> const& pools,
int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
std::optional<std::string> directory = std::nullopt);
std::string const& directory = "");
//! \brief Offload a block to cpu memory.
void offload(BlockPtr const& block, BlockPtr const& offloadBlock, std::vector<KVCacheBlockPool> const& pools,
int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
std::optional<std::string> directory = std::nullopt);
std::string const& directory = "");
//! \brief Synchronize the offload/onboard streams with the bufferManager stream.
void syncTransfers();
@ -67,7 +69,7 @@ private:
*/
void copyBlock(BlockPtr const& src, BlockPtr const& dst, std::vector<KVCacheBlockPool> const& pools, bool isOffload,
int numTokensToCopy = 0, executor::KvCacheTransferMode mode = executor::KvCacheTransferMode::DRAM,
std::optional<std::string> directory = std::nullopt);
std::string const& directory = "");
runtime::BufferManager mBufferManager;
runtime::BufferManager mOnboardManager;
@ -75,6 +77,9 @@ private:
// Track the block ids offloaded in this iteration.
std::unordered_map<int32_t, tr::CudaEvent> mPendingOffloads;
// Reference to parent loopback agent
std::shared_ptr<kvc::BaseLoopbackAgent> mLoopbackAgent;
int mDeviceId;
};
} // namespace tensorrt_llm::batch_manager::kv_cache_manager


@ -48,12 +48,13 @@ public:
kMLA = 1,
};
CacheState(ModelConfig modelConfig, runtime::WorldConfig const& worldConfig, nvinfer1::DataType dataType,
CacheState(ModelConfig modelConfig, runtime::WorldConfig const& worldConfig,
std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2)
: mModelConfig(std::move(modelConfig))
, mParallelConfig{worldConfig.getTensorParallelism(), worldConfig.getPipelineParallelism(),
worldConfig.getContextParallelism(), worldConfig.enableAttentionDP(), worldConfig.getTensorParallelRank(),
worldConfig.getTensorParallelism()}
worldConfig.getTensorParallelism(), attentionLayerNumPerPP}
, mDataType{dataType}
, mAttentionConfig(attentionType, kvFactor)
{
@ -61,10 +62,12 @@ public:
CacheState(std::vector<SizeType32> nbKvHeadPerLayer, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
int DPrank = 0, int DPsize = 0)
: mModelConfig{std::move(nbKvHeadPerLayer), sizePerHead, tokensPerBlock}
, mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
, mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
attentionLayerNumPerPP}
, mDataType{dataType}
, mAttentionConfig(attentionType, kvFactor)
{
@ -72,10 +75,12 @@ public:
CacheState(SizeType32 nbAttentionLayers, SizeType32 nbKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
SizeType32 tensorParallelism, SizeType32 pipelineParallelism, SizeType32 contextParallelism,
nvinfer1::DataType dataType, AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2,
bool enableAttentionDP = false, int DPrank = 0, int DPsize = 0)
std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
AttentionType attentionType = AttentionType::kDEFAULT, int kvFactor = 2, bool enableAttentionDP = false,
int DPrank = 0, int DPsize = 0)
: mModelConfig{std::vector(nbAttentionLayers, nbKvHeads), sizePerHead, tokensPerBlock}
, mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize}
, mParallelConfig{tensorParallelism, pipelineParallelism, contextParallelism, enableAttentionDP, DPrank, DPsize,
attentionLayerNumPerPP}
, mDataType{dataType}
, mAttentionConfig(attentionType, kvFactor)
{
@ -108,12 +113,16 @@ public:
bool mEnableAttentionDP;
SizeType32 mDPrank;
SizeType32 mDPsize;
// Number of attention layers per pipeline-parallel rank; the size of the vector equals the pipeline
// parallelism size.
std::vector<SizeType32> mAttentionLayerNumPerPP;
[[nodiscard]] bool operator==(ParallelConfig const& other) const noexcept
{
return mTensorParallelism == other.mTensorParallelism && mPipelineParallelism == other.mPipelineParallelism
&& mContextParallelism == other.mContextParallelism && mEnableAttentionDP == other.mEnableAttentionDP
&& mDPrank == other.mDPrank && mDPsize == other.mDPsize;
&& mDPrank == other.mDPrank && mDPsize == other.mDPsize
&& mAttentionLayerNumPerPP == other.mAttentionLayerNumPerPP;
}
};
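For illustration, a minimal standalone sketch of what mAttentionLayerNumPerPP holds: one entry per pipeline-parallel rank, counting that rank's attention layers. The even-split helper below is an assumption for illustration only; in this commit the real per-rank counts come from ModelConfig::getNbAttentionLayers(ppSize, ppRank), as done in the CacheTransceiverFactory change later in the diff.

#include <cstdint>
#include <iostream>
#include <vector>

// Illustrative split of numLayers attention layers across ppSize pipeline ranks:
// each rank gets numLayers / ppSize layers, and the first numLayers % ppSize
// ranks get one extra. The real assignment is queried from ModelConfig.
std::vector<int32_t> attentionLayersPerPP(int32_t numLayers, int32_t ppSize)
{
    std::vector<int32_t> layersPerRank(ppSize, numLayers / ppSize);
    for (int32_t r = 0; r < numLayers % ppSize; ++r)
    {
        ++layersPerRank[r];
    }
    return layersPerRank;
}

int main()
{
    for (auto n : attentionLayersPerPP(26, 4)) // prints: 7 7 6 6
    {
        std::cout << n << ' ';
    }
    std::cout << '\n';
    return 0;
}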


@ -582,14 +582,13 @@ public:
explicit KvCacheRetentionConfig(std::vector<TokenRangeRetentionConfig> const& tokenRangeRetentionPriorities,
RetentionPriority decodeRetentionPriority = kDefaultRetentionPriority,
std::optional<std::chrono::milliseconds> decodeDurationMs = std::nullopt,
KvCacheTransferMode transferMode = KvCacheTransferMode::DRAM,
std::optional<std::string> directory = std::nullopt);
KvCacheTransferMode transferMode = KvCacheTransferMode::DRAM, std::string const& directory = "");
[[nodiscard]] std::vector<TokenRangeRetentionConfig> getTokenRangeRetentionConfigs() const;
[[nodiscard]] RetentionPriority getDecodeRetentionPriority() const;
[[nodiscard]] std::optional<std::chrono::milliseconds> getDecodeDurationMs() const;
[[nodiscard]] KvCacheTransferMode getTransferMode() const;
[[nodiscard]] std::optional<std::string> getDirectory() const;
[[nodiscard]] std::string const& getDirectory() const;
/// @brief Convert the token range data into an entry per kv block. Returns a tuple of vectors corresponding to the
/// priorities and durations for each block.
@ -616,7 +615,7 @@ private:
/// @brief The transfer mode for the block.
KvCacheTransferMode mTransferMode;
/// @brief Name of the directory if transfer mode is GDS or POSIX_DEBUG_FALLBACK.
std::optional<std::string> mDirectory;
std::string mDirectory;
};
/// @brief A class that holds information about the request
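A hedged usage sketch of the updated constructor above, showing how the new std::string directory parameter replaces the previous std::optional. It assumes the KvCacheTransferMode::GDS enumerator referenced in the comments; the directory path is hypothetical, and an empty string (the default) keeps DRAM-only behaviour.

#include <optional>

#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

// Sketch: a retention config that routes offloaded blocks through GDS into a
// directory. The path below is hypothetical.
tle::KvCacheRetentionConfig makeGdsRetentionConfig()
{
    return tle::KvCacheRetentionConfig(
        /*tokenRangeRetentionPriorities=*/{},
        tle::KvCacheRetentionConfig::kDefaultRetentionPriority,
        /*decodeDurationMs=*/std::nullopt,
        tle::KvCacheTransferMode::GDS,
        /*directory=*/"/mnt/kv_offload");
}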


@ -17,10 +17,14 @@
#pragma once
#include "tensorrt_llm/common/assert.h"
#include <fcntl.h>
#include <memory>
#include <mutex>
#include <optional>
#include <string>
#include <sys/stat.h>
#include <sys/types.h>
#include <unistd.h>
#include <unordered_map>
#include <vector>
@ -109,6 +113,80 @@ private:
std::vector<MemoryDesc> mDescs;
};
class FileDesc
{
public:
FileDesc(std::string const& filename, int flags, mode_t mode, size_t len)
: mLen{len}
{
int fd = ::open(filename.c_str(), flags, mode);
TLLM_CHECK_WITH_INFO(fd >= 0, "Failed to open '%s' (GDS)", filename.c_str());
this->fd = fd;
}
FileDesc(FileDesc&& other) noexcept
: fd(other.fd)
, mLen(other.mLen)
{
other.fd = -1;
other.mLen = 0;
}
FileDesc& operator=(FileDesc&& other) noexcept
{
if (this != &other)
{
if (fd != -1)
::close(fd);
fd = other.fd;
mLen = other.mLen;
other.fd = -1;
other.mLen = 0;
}
return *this;
}
~FileDesc()
{
if (fd != -1)
::close(fd);
}
[[nodiscard]] uint64_t getFd() const noexcept
{
return fd;
}
[[nodiscard]] size_t getLen() const noexcept
{
return mLen;
}
FileDesc(FileDesc const&) = delete;
FileDesc& operator=(FileDesc const&) = delete;
private:
int fd;
size_t mLen;
};
class FileDescs
{
public:
FileDescs(std::vector<FileDesc>&& descs)
: mDescs(std::move(descs))
{
}
[[nodiscard]] std::vector<FileDesc> const& getDescs() const noexcept
{
return mDescs;
}
private:
std::vector<FileDesc> mDescs;
};
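For reference, a minimal sketch of the intended RAII usage of FileDesc/FileDescs introduced above, assuming this header; the file paths, open flags, and function name are illustrative only, not part of this commit.

#include <fcntl.h>

#include <string>
#include <utility>
#include <vector>

#include "tensorrt_llm/executor/transferAgent.h"

using namespace tensorrt_llm::executor::kv_cache;

// Sketch: open a set of offload files and hand them over as a FileDescs batch.
// FileDesc opens the file in its constructor, closes it in its destructor, and
// is move-only, so ownership transfers cleanly into the container.
FileDescs openOffloadFiles(std::vector<std::string> const& paths, size_t bytesPerFile)
{
    std::vector<FileDesc> descs;
    descs.reserve(paths.size());
    for (auto const& path : paths)
    {
        descs.emplace_back(path, O_CREAT | O_RDWR, 0644, bytesPerFile);
    }
    return FileDescs(std::move(descs));
}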
using TransferDescs = MemoryDescs;
using RegisterDescs = MemoryDescs;
using SyncMessage = std::string;
@ -195,6 +273,7 @@ struct BaseAgentConfig
{
std::string mName;
bool useProgThread;
bool multiThread;
};
class BaseTransferAgent
@ -221,6 +300,13 @@ public:
virtual bool checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) = 0;
};
class BaseLoopbackAgent
{
public:
virtual ~BaseLoopbackAgent() = default;
virtual void executeLoopbackRequest(MemoryDescs const& memoryDescs, FileDescs const& fileDescs, bool isOffload) = 0;
};
class DynLibLoader final
{
public:
@ -264,4 +350,18 @@ template <typename... Args>
TLLM_THROW("Unknown backend name.");
}
template <typename... Args>
[[nodiscard]] std::shared_ptr<BaseLoopbackAgent> makeLoopbackAgent(std::string const& backend, Args&&... args)
{
if (backend == "nixl")
{
auto& loader = DynLibLoader::getInstance();
using CreateNixlFuncType = std::shared_ptr<BaseLoopbackAgent> (*)(BaseAgentConfig const*);
auto* func = loader.getFunctionPointer<CreateNixlFuncType>(
"libtensorrt_llm_nixl_wrapper.so", "createNixlLoopbackAgent");
return func(std::forward<Args>(args)...);
}
TLLM_THROW("Unknown backend name.");
}
} // namespace tensorrt_llm::executor::kv_cache
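A minimal usage sketch of makeLoopbackAgent, mirroring how BlockManager wires it up later in this commit; the agent name and config values are placeholders.

#include <memory>

#include "tensorrt_llm/executor/transferAgent.h"

namespace kvc = tensorrt_llm::executor::kv_cache;

// Sketch: construct the NIXL-backed loopback agent. "nixl" is currently the
// only recognized backend; it resolves createNixlLoopbackAgent from
// libtensorrt_llm_nixl_wrapper.so at runtime.
std::shared_ptr<kvc::BaseLoopbackAgent> createLoopbackAgent()
{
    kvc::BaseAgentConfig config{/*mName=*/"kv_loopback", /*useProgThread=*/false, /*multiThread=*/false};
    return kvc::makeLoopbackAgent("nixl", &config);
}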


@ -61,6 +61,8 @@ def getSMVersion():
def test_trtllm_flash_attention_fmha(d, s, dtype, flag, tiled_kernel):
verbose = 0
sm_version = getSMVersion()
if flag == "-use-attention-sinks" and sm_version != 90:
pytest.skip("use-attention-sinks is only supported on sm90 currently.")
if sm_version == 90 and tiled_kernel == "-force-non-tiled":
pytest.skip(
"Tiled/non-tiled flags only make a difference to ampere-style kernels."


@ -75,7 +75,6 @@ BlockRange getBlockRangeForReceiving(BaseKVCacheManager* cacheManager, LlmReques
bool CacheFormatter::needSendCache(
CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx)
{
// int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism;
auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
if (targetInfo.mDupHeadFactor <= 1)
{
@ -91,15 +90,27 @@ bool CacheFormatter::needSendCache(
selfTpRankInDpGroup = selfTpRank % selfTPNumInDPGroup;
}
// only TP ranks with rank % dupHeadFactor == 0 need to send the cache.
return selfTpRankInDpGroup % targetInfo.mDupHeadFactor == 0;
}
void checkAlternateWindow(BaseKVCacheManager* cacheManager, BaseCacheFormatter::CacheState const& selfConfig,
BaseCacheFormatter::CacheState const& destConfig)
{
// TODO: VSWA does not support an uneven number of layers per PP rank.
// If gen PP and context PP differ, the cache formatter only supports alternating window patterns like gpt-oss,
// where one layer uses sliding window attention and the next uses full attention.
auto numPools = cacheManager->getBlockManager().getNumPools();
auto layerNum = cacheManager->getBlockManager().getNumLayers();
auto selfPPNum = selfConfig.getParallelConfig().mPipelineParallelism;
auto selfAllLayerNum = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
auto destPPNum = destConfig.getParallelConfig().mPipelineParallelism;
auto destAllLayerNum = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
TLLM_CHECK_WITH_INFO(selfAllLayerNum % selfPPNum == 0, "For VSWA selfAllLayerNum must be divisible by selfPPNum");
TLLM_CHECK_WITH_INFO(destAllLayerNum % destPPNum == 0, "For VSWA destAllLayerNum must be divisible by destPPNum");
std::vector<SizeType32> poolIdxs(numPools);
TLLM_CHECK(layerNum >= numPools);
for (int i = 0; i < numPools; i++)
@ -156,6 +167,7 @@ void CacheFormatter::format(TransferSession& session)
auto const& destConfig = session.getOtherState().getCacheState().value();
auto const selfIdx = session.getSelfState().getCommState().value().getSelfIdx();
auto& bufferManager = session.getBufferManager();
// Some TP ranks don't need to send the cache since their duplicated heads are not needed.
if (!needSendCache(selfConfig, destConfig, selfIdx))
{
return;
@ -207,21 +219,22 @@ void CacheFormatter::format(TransferSession& session)
int blockNum = 0;
size_t allCacheBlockSize = 0;
std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> inputKvCacheBlocks;
// gather cache blocks of the request.
std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>> inputKvCacheBlocksPerWindow;
for (auto poolIdx = 0; poolIdx < numPools; poolIdx++)
{
blockRange.updatePoolIdx(poolIdx);
SizeType32 window = mCacheManager->getBlockManager().getPoolWindowSize(poolIdx);
TLLM_CHECK_WITH_INFO(inputKvCacheBlocks.find(window) == inputKvCacheBlocks.end(),
TLLM_CHECK_WITH_INFO(inputKvCacheBlocksPerWindow.find(window) == inputKvCacheBlocksPerWindow.end(),
"window size already exists, which is not supported");
inputKvCacheBlocks.emplace(window, std::vector<runtime::ITensor::SharedPtr>());
inputKvCacheBlocksPerWindow.emplace(window, std::vector<runtime::ITensor::SharedPtr>());
auto maxBlockThisWindow = window / selfConfig.getModelConfig().mTokensPerBlock;
// only blocks within the window will be sent.
SizeType32 blockNumThisWindow = 0;
for (auto it = blockRange.begin(); it != blockRange.end(); ++it)
{
blockNum++;
inputKvCacheBlocks.at(window).push_back(it);
inputKvCacheBlocksPerWindow.at(window).push_back(it);
allCacheBlockSize += it->getSize();
blockNumThisWindow++;
if (blockNumThisWindow >= maxBlockThisWindow)
@ -231,7 +244,7 @@ void CacheFormatter::format(TransferSession& session)
}
}
if (inputKvCacheBlocks.size() > 1)
if (inputKvCacheBlocksPerWindow.size() > 1)
{
if (selfConfig.getParallelConfig().mPipelineParallelism
!= destConfig.getParallelConfig().mPipelineParallelism)
@ -239,15 +252,15 @@ void CacheFormatter::format(TransferSession& session)
checkAlternateWindow(mCacheManager, selfConfig, destConfig);
}
}
TLLM_CHECK(!inputKvCacheBlocks.empty());
TLLM_CHECK(!inputKvCacheBlocksPerWindow.empty());
TLLM_CHECK(blockNum > 0);
int deviceId = mCacheManager->getBlockManager().getStreamDevice();
auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
if (common::getEnvTryZCopyForKVCacheTransfer()
&& (destConfig.getParallelConfig().mPipelineParallelism
<= selfConfig.getParallelConfig().mPipelineParallelism)
&& (destConfig.getParallelConfig().mTensorParallelism <= selfConfig.getParallelConfig().mTensorParallelism))
== selfConfig.getParallelConfig().mPipelineParallelism)
&& (destConfig.getParallelConfig().mTensorParallelism == selfConfig.getParallelConfig().mTensorParallelism))
{
TLLM_LOG_DEBUG("Try using zero-copy for the KV cache.");
NVTX3_SCOPED_RANGE(sendBufferFun);
@ -257,7 +270,7 @@ void CacheFormatter::format(TransferSession& session)
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
for (size_t i = 0; i < connections.size(); i++)
{
for (auto const& [window, blocks] : inputKvCacheBlocks)
for (auto const& [window, blocks] : inputKvCacheBlocksPerWindow)
{
for (auto const& block : blocks)
{
@ -271,80 +284,123 @@ void CacheFormatter::format(TransferSession& session)
return;
}
// formatter flow
// 1. collect cache blocks of the request.
// 2. compute the buffer size for each target.
// 3. prepare the pre-allocated buffer for each target according to the buffer size.
// 4. call splitKVCacheDispatch to split the cache blocks according to the differing parallelism layouts and
// gather the cache blocks into the corresponding buffers.
// 5. send the buffer to the corresponding target. Ideally, we send only once (one buffer) for each target.
auto cacheBufferId = mCacheTransBufferManager->assignBufferIndexForSend();
int peerDuplicateHeadFactor = targetInfo.mPeerDupHeadFactor;
auto targetNum = connections.size();
auto const targetBufferSize = allCacheBlockSize / targetNum * peerDuplicateHeadFactor;
auto bufferTargetNum = targetNum / peerDuplicateHeadFactor;
TLLM_LOG_DEBUG(" formatOutput bufferTargetNum: %d, targetNum: %d, peerDuplicateHeadFactor: %d dupliacete:%d ",
bufferTargetNum, targetNum, peerDuplicateHeadFactor, targetInfo.mDupHeadFactor);
auto ppRank = selfIdx
/ (selfConfig.getParallelConfig().mTensorParallelism * selfConfig.getParallelConfig().mContextParallelism);
int selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
auto getBufferSizeForTarget = [&]()
{
std::vector<size_t> bufferSizeForTarget(targetNum, 0);
// only the first bufferTargetNum buffers are used.
if (inputKvCacheBlocksPerWindow.size() > 1)
{
// for VSWA
for (size_t i = 0; i < targetNum; i++)
{
bufferSizeForTarget[i] = allCacheBlockSize * peerDuplicateHeadFactor / targetNum;
}
return bufferSizeForTarget;
}
for (size_t i = 0; i < targetNum; i++)
{
bufferSizeForTarget[i] = allCacheBlockSize * peerDuplicateHeadFactor / targetInfo.mDomainTPSize
/ selfAttentionLayerNum * targetInfo.getPeerPPDomainLayerNum(i);
}
return bufferSizeForTarget;
};
auto bufferEleSizes = getBufferSizeForTarget();
auto result = mCacheTransBufferManager->getOrAllocateSendBuffers(
cacheBufferId, bufferTargetNum, targetBufferSize, bufferManager);
cacheBufferId, static_cast<int>(bufferTargetNum), bufferEleSizes, bufferManager);
auto& outputSplitCaches = std::get<0>(result);
auto& bufferCoverTargetNum = std::get<1>(result);
auto& onlyUseDynamicBuffer = std::get<2>(result);
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
" format bufferTargetNum: %d, targetNum: %d, peerDuplicateHeadFactor: %d duplicate:%d "
"bufferCoverTargetNum:%d connections.size():%ld",
bufferTargetNum, targetNum, peerDuplicateHeadFactor, targetInfo.mDupHeadFactor, bufferCoverTargetNum,
connections.size());
auto* agentConnnecion = dynamic_cast<executor::kv_cache::AgentConnection const*>(connections[0]);
if (agentConnnecion != nullptr)
{
TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == bufferTargetNum, "Agent need all buffer pre-allocated");
TLLM_CHECK(onlyUseDynamicBuffer == false);
}
// TODO: add parameters for layerNumForEachOutput
tensorrt_llm::executor::kv_cache::splitKVCacheDispatch(
inputKvCacheBlocks, outputSplitCaches, destConfig, selfConfig, selfIdx, bufferManager);
inputKvCacheBlocksPerWindow, outputSplitCaches, destConfig, selfConfig, selfIdx, bufferManager);
bufferManager.getStream().synchronize();
auto preAllocSendBuffer = mCacheTransBufferManager->getSendBuffer(cacheBufferId);
if (preAllocSendBuffer != nullptr)
{
TLLM_CHECK(preAllocSendBuffer->getDataType() == inputKvCacheBlocks.begin()->second.front()->getDataType());
TLLM_CHECK(preAllocSendBuffer->getDataType()
== inputKvCacheBlocksPerWindow.begin()->second.front()->getDataType());
}
auto sendBufferFun = [&](int deviceId, size_t processIdx)
{
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), " send processIdx: %ld", processIdx);
NVTX3_SCOPED_RANGE(sendBufferFun);
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
TLLM_CHECK(connections.size() > (processIdx / peerDuplicateHeadFactor));
TLLM_CHECK(outputSplitCaches.size() > (processIdx / peerDuplicateHeadFactor));
auto startTime = std::chrono::steady_clock::now();
size_t size;
size_t ppDomainSize = targetInfo.mDomainPPSize;
size_t bufferTpRank = (processIdx / ppDomainSize) / peerDuplicateHeadFactor;
size_t bufferIdx = (bufferTpRank * ppDomainSize) + (processIdx % ppDomainSize);
size_t size = outputSplitCaches[bufferIdx]->getSizeInBytes();
if (bufferIdx < bufferCoverTargetNum)
{
size = outputSplitCaches[bufferIdx]->getSizeInBytes();
session.send(processIdx, outputSplitCaches[bufferIdx]->data(), size);
}
else if (bufferCoverTargetNum > 0)
{
// copy buffer allocated by cudaMallocAsync to buffer allocated by cudaMalloc before sending
auto sendBufferIdx = bufferIdx % bufferCoverTargetNum;
bufferManager.copy(*outputSplitCaches[processIdx], *outputSplitCaches.at(sendBufferIdx));
bufferManager.getStream().synchronize();
size = outputSplitCaches.at(sendBufferIdx)->getSizeInBytes();
session.send(processIdx, outputSplitCaches.at(sendBufferIdx)->data(), size);
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), " send processIdx: %d bufferIdx: %d size:%ld",
processIdx, bufferIdx, outputSplitCaches[bufferIdx]->getSizeInBytes());
session.send(
processIdx, outputSplitCaches[bufferIdx]->data(), outputSplitCaches[bufferIdx]->getSizeInBytes());
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), " end send processIdx: %d bufferIdx: %d size:%ld",
processIdx, bufferIdx, outputSplitCaches[bufferIdx]->getSizeInBytes());
}
else
{
// The outputSplitCaches entries beyond bufferCoverTargetNum are allocated by cudaMallocAsync, which cannot be
// transferred by UCX GPU-direct RDMA, so the data must be copied into a pre-allocated cudaMalloc buffer
// before sending.
// bufferCoverTargetNum == 0: the pre-allocated send buffer is smaller than one output slice,
// so send in multiple chunks.
size = targetBufferSize;
size_t remainSendSize = targetBufferSize;
size_t remainSendSize = outputSplitCaches[processIdx]->getSize();
size_t needSendSize = outputSplitCaches[processIdx]->getSize();
auto sendBufferIdx = bufferCoverTargetNum == 0 ? 0 : bufferIdx % bufferCoverTargetNum;
auto sendUseAllocBuffer
= bufferCoverTargetNum == 0 ? preAllocSendBuffer : outputSplitCaches[sendBufferIdx];
while (remainSendSize > 0)
{
TLLM_CHECK(preAllocSendBuffer != nullptr);
auto sendBufferEleSize = preAllocSendBuffer->getSize();
TLLM_CHECK(sendUseAllocBuffer != nullptr);
auto sendBufferEleSize = sendUseAllocBuffer->getSize();
auto sendSize = std::min(remainSendSize, sendBufferEleSize);
auto copySlice = runtime::ITensor::slice(
outputSplitCaches[bufferIdx], targetBufferSize - remainSendSize, sendSize);
outputSplitCaches[bufferIdx], needSendSize - remainSendSize, sendSize);
auto copyTargetSlice = runtime::ITensor::slice(preAllocSendBuffer, 0, sendSize);
auto copyTargetSlice = runtime::ITensor::slice(sendUseAllocBuffer, 0, sendSize);
bufferManager.copy(*copySlice, *copyTargetSlice);
bufferManager.getStream().synchronize();
session.send(processIdx, copyTargetSlice->data(), copyTargetSlice->getSizeInBytes());
@ -376,7 +432,7 @@ void CacheFormatter::format(TransferSession& session)
}
else
{
// concurrency num
// the concurrency level should be <= bufferCoverTargetNum to avoid data races.
auto concurrencyNum
= std::min(std::max(static_cast<size_t>(1), bufferCoverTargetNum), connections.size());
@ -462,6 +518,7 @@ void CacheFormatter::unformat(TransferSession& session)
TLLM_CHECK(!outputBuffersPerWindow.empty());
if (outputBuffersPerWindow.size() > 1)
{
// We only support a limited set of cases for VSWA.
if (selfConfig.getParallelConfig().mPipelineParallelism != destConfig.getParallelConfig().mPipelineParallelism)
{
checkAlternateWindow(mCacheManager, selfConfig, destConfig);
@ -560,13 +617,13 @@ void CacheFormatter::unformat(TransferSession& session)
ctxReqId);
return;
}
// legacyPath: context executor rank only send data to one gen executor rank. it sends multiple cache
// blocks.
auto legacyPath = common::getEnvTryZCopyForKVCacheTransfer()
&& (destConfig.getParallelConfig().mPipelineParallelism
>= selfConfig.getParallelConfig().mPipelineParallelism)
&& (destConfig.getParallelConfig().mTensorParallelism
>= selfConfig.getParallelConfig().mTensorParallelism);
// unformat flow
// 1. collect cache blocks of the request.
// 2. compute the buffer size for each target.
// 3. prepare the pre-allocated buffer for each target according to the buffer size.
// 4. receive the buffer from the corresponding target. Ideally, we receive only once (one buffer) for each
// target.
// 5. call concatKvCacheV2Dispatch to concatenate the cache blocks according to the differing parallelism layouts
runtime::ITensor::SharedPtr recvBufferTemp;
std::vector<runtime::ITensor::SharedPtr> recvSplitCaches;
@ -574,7 +631,44 @@ void CacheFormatter::unformat(TransferSession& session)
auto dataType = outputBuffersPerWindow.begin()->second.front()->getDataType();
auto targetNum = pickUpConnections.size();
TLLM_CHECK(cacheBlockSizeSum % targetNum == 0);
auto targetBufferSize = cacheBlockSizeSum / targetNum;
auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
auto ppRank = selfIdx
/ (selfConfig.getParallelConfig().mTensorParallelism
* selfConfig.getParallelConfig().mContextParallelism);
int selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
auto getTargetBufferEleSize = [&]()
{
if (outputBuffersPerWindow.size() > 1)
{
std::vector<size_t> bufferSizeForTarget(targetNum, 0);
for (size_t i = 0; i < targetNum; i++)
{
bufferSizeForTarget[i] = cacheBlockSizeSum / targetNum;
}
return bufferSizeForTarget;
}
// For duplicated heads, gen will not receive from TP ranks that hold duplicate heads and will not prepare
// buffers for them.
size_t validTpSize = pickUpConnections.size() / targetInfo.mDomainPPSize;
TLLM_CHECK_WITH_INFO(cacheBlockSizeSum % validTpSize == 0,
"cacheBlockSizeSum must be divisible by validTpSize %ld", validTpSize);
TLLM_CHECK_WITH_INFO((cacheBlockSizeSum % (selfAttentionLayerNum * validTpSize)) == 0,
"cacheBlockSizeSum must be divisible by validTpSize %ld * selfAttentionLayerNum %d", validTpSize,
selfAttentionLayerNum);
TLLM_CHECK(targetNum == pickUpConnections.size());
// the sum of buffer size is cacheBlockSizeSum.
size_t cacheBlockSizePerLayer = cacheBlockSizeSum / (validTpSize * selfAttentionLayerNum);
std::vector<size_t> bufferEleSizes(targetNum, 0);
for (size_t i = 0; i < targetNum; i++)
{
auto layerNum = targetInfo.getPeerPPDomainLayerNum(static_cast<SizeType32>(pickUpConnections[i]));
bufferEleSizes[i] = cacheBlockSizePerLayer * layerNum;
}
return bufferEleSizes;
};
auto bufferEleSizes = getTargetBufferEleSize();
size_t remainNoCoverTargetNum = 0;
size_t bufferCoverTargetNum = 0;
@ -583,49 +677,31 @@ void CacheFormatter::unformat(TransferSession& session)
NVTX3_SCOPED_RANGE(formatInputAllocBuffer);
TLLM_CHECK(blockNum > 0);
if (legacyPath)
auto* agentConnnecion
= dynamic_cast<executor::kv_cache::AgentConnection const*>(connections[pickUpConnections[0]]);
if (agentConnnecion != nullptr)
{
TLLM_LOG_DEBUG("formatOutput using legacy path");
auto cacheShape = executor::kv_cache::makeShapeFromCacheState(destConfig);
auto cacheVolume = runtime::ITensor::volume(cacheShape);
size_t bufferNum = blockNum * pickUpConnections.size();
recvBufferTemp = bufferManager.gpu(
runtime::ITensor::makeShape({static_cast<int64_t>(cacheVolume * bufferNum)}), dataType);
recvSplitCaches.resize(bufferNum);
for (size_t i = 0; i < bufferNum; i++)
{
recvSplitCaches[i] = runtime::ITensor::slice(recvBufferTemp, i * cacheVolume, cacheVolume);
}
cacheBufferId = agentConnnecion->getCacheBufferId();
TLLM_CHECK(cacheBufferId.has_value());
}
else
{
auto* agentConnnecion
= dynamic_cast<executor::kv_cache::AgentConnection const*>(connections[pickUpConnections[0]]);
if (agentConnnecion != nullptr)
{
cacheBufferId = agentConnnecion->getCacheBufferId();
TLLM_CHECK(cacheBufferId.has_value());
}
else
{
cacheBufferId = mCacheTransBufferManager->assignBufferIndexForRecv();
}
TLLM_CHECK(cacheBufferId.has_value());
auto [recvSplitCachestmp, bufferCoverTargetNumtmp, onlyUseDynamicBuffer]
= mCacheTransBufferManager->getOrAllocateRecvBuffers(
cacheBufferId, targetNum, targetBufferSize, bufferManager);
bufferCoverTargetNum = bufferCoverTargetNumtmp;
remainNoCoverTargetNum = targetNum > bufferCoverTargetNum ? targetNum - bufferCoverTargetNum : 0;
if (agentConnnecion != nullptr)
{
TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == targetNum, "Agent need buffer pre-allocated");
TLLM_CHECK(onlyUseDynamicBuffer == false);
}
recvSplitCaches = std::move(recvSplitCachestmp);
cacheBufferId = mCacheTransBufferManager->assignBufferIndexForRecv();
}
TLLM_CHECK(cacheBufferId.has_value());
auto [recvSplitCachestmp, bufferCoverTargetNumtmp, onlyUseDynamicBuffer]
= mCacheTransBufferManager->getOrAllocateRecvBuffers(
cacheBufferId, static_cast<int>(targetNum), bufferEleSizes, bufferManager);
bufferCoverTargetNum = bufferCoverTargetNumtmp;
remainNoCoverTargetNum = targetNum > bufferCoverTargetNum ? targetNum - bufferCoverTargetNum : 0;
if (agentConnnecion != nullptr)
{
TLLM_CHECK_WITH_INFO(bufferCoverTargetNum == targetNum, "Agent need buffer pre-allocated");
TLLM_CHECK(onlyUseDynamicBuffer == false);
}
recvSplitCaches = std::move(recvSplitCachestmp);
// sync to alloc buffer
bufferManager.getStream().synchronize();
@ -647,63 +723,45 @@ void CacheFormatter::unformat(TransferSession& session)
TLLM_CHECK(recvSplitCaches.size() > processIdx);
auto startTime = std::chrono::steady_clock::now();
size_t size = 0;
if (legacyPath)
{
size_t idx = processIdx * blockNum;
for (size_t i = 0; i < blockNum; i++)
{
size_t commIdx = idx / (blockNum);
size_t blockIdx = idx % (blockNum);
size_t recvBufferIdx = blockIdx * pickUpConnections.size() + commIdx;
llmRequest.updateKvCacheSize((*recvSplitCaches[recvBufferIdx]).getSizeInBytes());
auto& buffer = recvSplitCaches.at(recvBufferIdx);
size += buffer->getSizeInBytes();
session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
idx++;
}
if (processIdx >= remainNoCoverTargetNum)
{
llmRequest.updateKvCacheSize((*recvSplitCaches.at(processIdx)).getSizeInBytes());
auto& buffer = recvSplitCaches[processIdx];
size = buffer->getSizeInBytes();
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), " start recv bufferIdx: %d size:%ld", processIdx,
buffer->getSizeInBytes());
session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), " recv bufferIdx: %d size:%ld", processIdx,
buffer->getSizeInBytes());
}
else
{
if (processIdx >= remainNoCoverTargetNum)
auto recvBufferIdx
= bufferCoverTargetNum == 0 ? 0 : processIdx % bufferCoverTargetNum + remainNoCoverTargetNum;
// when bufferCoverTargetNum == 0, fall back to the pre-allocated receive buffer
auto recvBufferUsed
= bufferCoverTargetNum == 0 ? preAllocRecvBuffer : recvSplitCaches[recvBufferIdx];
size_t remainRecvSize = recvSplitCaches[processIdx]->getSize();
size_t needRecvSize = recvSplitCaches[processIdx]->getSize();
while (remainRecvSize > 0)
{
llmRequest.updateKvCacheSize((*recvSplitCaches.at(processIdx)).getSizeInBytes());
auto& buffer = recvSplitCaches[processIdx];
size = buffer->getSizeInBytes();
session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
}
else if (bufferCoverTargetNum > 0)
{
auto recvBufferIdx = processIdx % bufferCoverTargetNum
+ remainNoCoverTargetNum; // caches.at(recvBufferIdx) is allocated by cudaMalloc
llmRequest.updateKvCacheSize((*recvSplitCaches.at(recvBufferIdx)).getSizeInBytes());
auto& buffer = recvSplitCaches.at(recvBufferIdx);
size = buffer->getSizeInBytes();
session.recv(pickUpConnections[processIdx], buffer->data(), buffer->getSizeInBytes());
bufferManager.copy(*recvSplitCaches.at(recvBufferIdx), *recvSplitCaches[processIdx]);
TLLM_CHECK(recvBufferUsed != nullptr);
auto recvBufferEleSize = recvBufferUsed->getSize();
auto recvSize = std::min(remainRecvSize, recvBufferEleSize);
auto recvSlice = runtime::ITensor::slice(recvBufferUsed, 0, recvSize);
auto copySlice = runtime::ITensor::slice(
recvSplitCaches[processIdx], needRecvSize - remainRecvSize, recvSize);
size += recvSlice->getSizeInBytes();
llmRequest.updateKvCacheSize((*recvSlice).getSizeInBytes());
session.recv(pickUpConnections[processIdx], recvSlice->data(), recvSlice->getSizeInBytes());
bufferManager.copy(*recvSlice, *copySlice);
bufferManager.getStream().synchronize();
}
else
{
// bufferCoverTargetNum == 0
size_t remainRecvSize = targetBufferSize;
while (remainRecvSize > 0)
{
TLLM_CHECK(preAllocRecvBuffer != nullptr);
auto recvBufferEleSize = preAllocRecvBuffer->getSize();
auto recvSize = std::min(remainRecvSize, recvBufferEleSize);
auto recvSlice = runtime::ITensor::slice(preAllocRecvBuffer, 0, recvSize);
auto copySlice = runtime::ITensor::slice(
recvSplitCaches[processIdx], targetBufferSize - remainRecvSize, recvSize);
size += recvSlice->getSizeInBytes();
llmRequest.updateKvCacheSize((*recvSlice).getSizeInBytes());
session.recv(pickUpConnections[processIdx], recvSlice->data(), recvSlice->getSizeInBytes());
bufferManager.copy(*recvSlice, *copySlice);
bufferManager.getStream().synchronize();
remainRecvSize -= recvSize;
}
remainRecvSize -= recvSize;
}
}
auto endTime = std::chrono::steady_clock::now();
double delay = 0.0;
if (recordDelay)
@ -764,19 +822,9 @@ void CacheFormatter::unformat(TransferSession& session)
{
NVTX3_SCOPED_RANGE(formatInputConcatenate);
if (legacyPath)
{
TLLM_CHECK(outputBuffersPerWindow.size() == 1);
executor::kv_cache::concatKVCacheDispatch(recvSplitCaches.data(), recvSplitCaches.size(),
getCounterparts(selfConfig, selfIdx, destConfig), destConfig,
outputBuffersPerWindow.begin()->second.data(), outputBuffersPerWindow.begin()->second.size(),
selfIdx, selfConfig, bufferManager);
}
else
{
executor::kv_cache::concatKvCacheV2Dispatch(
recvSplitCaches, outputBuffersPerWindow, destConfig, selfConfig, selfIdx, bufferManager);
}
executor::kv_cache::concatKvCacheV2Dispatch(
recvSplitCaches, outputBuffersPerWindow, destConfig, selfConfig, selfIdx, bufferManager);
bufferManager.getStream().synchronize();
if (cacheBufferId.has_value())
{
@ -852,27 +900,6 @@ void CacheFormatter::unformat(TransferSession& session)
destConfig.getModelConfig().mNbKvHeadsPerLayer.size());
return false;
}
int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
if (selfPPSize == destPPSize)
{
return true;
}
if (selfNumLayers % selfPPSize != 0)
{
TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d",
selfNumLayers, selfPPSize);
return false;
}
if (destNumLayers % destPPSize != 0)
{
TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ",
destNumLayers, destPPSize);
return false;
}
return true;
}
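To make the per-target sizing above easier to follow, a standalone sketch of the non-VSWA branch of getBufferSizeForTarget: each peer's buffer is scaled by the number of attention layers its pipeline-parallel domain owns. The function name and parameters here are illustrative, not part of this commit.

#include <cstddef>
#include <vector>

// Illustrative only: per-target buffer element counts for the non-VSWA path.
// allCacheBlockSize is the element count of all gathered blocks on this rank;
// each peer gets a share proportional to its PP-domain layer count.
std::vector<size_t> bufferSizesPerTarget(size_t allCacheBlockSize, size_t peerDupHeadFactor,
    size_t domainTPSize, size_t selfAttentionLayerNum, std::vector<size_t> const& peerPPDomainLayerNum)
{
    std::vector<size_t> sizes(peerPPDomainLayerNum.size(), 0);
    for (size_t i = 0; i < sizes.size(); ++i)
    {
        sizes[i] = allCacheBlockSize * peerDupHeadFactor / domainTPSize / selfAttentionLayerNum
            * peerPPDomainLayerNum[i];
    }
    return sizes;
}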


@ -295,17 +295,19 @@ void CacheTransBufferManager::freeBufferIndexForRecv(std::optional<int> bufferId
}
std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> CacheTransBufferManager::getOrAllocateSendBuffers(
std::optional<int> bufferId, int targetNum, size_t targetBufferSize,
std::optional<int> bufferId, int targetNum, std::vector<size_t> const& targetBufferEleSizes,
runtime::BufferManager const& bufferManagerToUse)
{
return getOrAllocateBuffers(bufferId, targetNum, targetBufferSize, bufferManagerToUse, mConcurrenceSendResource);
return getOrAllocateBuffers(
bufferId, targetNum, targetBufferEleSizes, bufferManagerToUse, mConcurrenceSendResource);
}
std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> CacheTransBufferManager::getOrAllocateRecvBuffers(
std::optional<int> bufferId, int targetNum, size_t targetBufferSize,
std::optional<int> bufferId, int targetNum, std::vector<size_t> const& targetBufferEleSizes,
runtime::BufferManager const& bufferManagerToUse)
{
return getOrAllocateBuffers(bufferId, targetNum, targetBufferSize, bufferManagerToUse, mConcurrenceRecvResource);
return getOrAllocateBuffers(
bufferId, targetNum, targetBufferEleSizes, bufferManagerToUse, mConcurrenceRecvResource);
}
runtime::ITensor::SharedPtr CacheTransBufferManager::getSendBuffer(std::optional<int> bufferId)
@ -332,54 +334,58 @@ runtime::ITensor::SharedPtr CacheTransBufferManager::getRecvBuffer(std::optional
}
std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> CacheTransBufferManager::getOrAllocateBuffers(
std::optional<int> bufferId, int targetNum, size_t targetBufferEleSize,
std::optional<int> bufferId, int targetNum, std::vector<size_t> const& targetBufferEleSizes,
runtime::BufferManager const& bufferManagerToUse, ConcurrenceResource& concurrenceResource)
{
TLLM_CHECK(bufferId.has_value() || mOnlyUseDynamicBuffer);
TLLM_CHECK(targetBufferEleSizes.size() >= static_cast<size_t>(targetNum));
std::vector<runtime::ITensor::SharedPtr> retSplitCaches;
size_t bufferCoverTargetNum = std::min(
static_cast<size_t>(targetNum), mTransferBufferSize / (targetBufferEleSize * common::getDTypeSize(mDataType)));
TLLM_LOG_DEBUG("getOrAllocateBuffers bufferCoverTargetNum:%d", bufferCoverTargetNum);
if (bufferCoverTargetNum < static_cast<size_t>(targetNum))
{
TLLM_LOG_WARNING(
"CacheTransceiver getOrAllocateBuffers: bufferCoverTargetNum:%d < targetNum:%d, may use dynamic buffer, "
"it's better to increase MaxTokensInBuffer in cacheTransceiverConfig, otherwise, the performance may "
"be degraded",
bufferCoverTargetNum, targetNum);
}
size_t bufferCoverTargetNum = 0;
if (bufferId.has_value())
{
TLLM_CHECK(static_cast<size_t>(bufferId.value()) < concurrenceResource.mBuffers.size());
TLLM_CHECK(concurrenceResource.mBufferIndexFlag[bufferId.value()] == 1);
size_t preBufferEleSize = 0;
for (int i = 0; i < targetNum; i++)
{
if (static_cast<size_t>(i) < bufferCoverTargetNum)
// Strict check: only slice from the pre-allocated buffer while the next slice still fits.
if (preBufferEleSize + targetBufferEleSizes[i] <= mBufferEleSize)
{
auto slice = runtime::ITensor::slice(
concurrenceResource.mBuffers[bufferId.value()], i * targetBufferEleSize, targetBufferEleSize);
concurrenceResource.mBuffers[bufferId.value()], preBufferEleSize, targetBufferEleSizes[i]);
preBufferEleSize += targetBufferEleSizes[i];
bufferCoverTargetNum++;
retSplitCaches.push_back(std::move(slice));
}
else
{
retSplitCaches.push_back(bufferManagerToUse.gpu(
runtime::ITensor::makeShape({static_cast<int64_t>(targetBufferEleSize)}), mDataType));
runtime::ITensor::makeShape({static_cast<int64_t>(targetBufferEleSizes[i])}), mDataType));
}
}
TLLM_LOG_DEBUG("getOrAllocateBuffers bufferCoverTargetNum:%d", bufferCoverTargetNum);
if (bufferCoverTargetNum < static_cast<size_t>(targetNum))
{
TLLM_LOG_WARNING(
"CacheTransceiver getOrAllocateBuffers: bufferCoverTargetNum:%d < targetNum:%d, may use dynamic "
"buffer, "
"it's better to increase MaxTokensInBuffer in cacheTransceiverConfig, otherwise, the performance may "
"be degraded",
bufferCoverTargetNum, targetNum);
}
}
else
{
for (int i = 0; i < targetNum; i++)
{
retSplitCaches.push_back(bufferManagerToUse.gpu(
runtime::ITensor::makeShape({static_cast<int64_t>(targetBufferEleSize)}), mDataType));
runtime::ITensor::makeShape({static_cast<int64_t>(targetBufferEleSizes[i])}), mDataType));
}
}
if (mOnlyUseDynamicBuffer)
{
bufferCoverTargetNum = targetNum;
}
return std::make_tuple(retSplitCaches, bufferCoverTargetNum, mOnlyUseDynamicBuffer);
}
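A small standalone sketch of the slicing policy above: targets are carved out of the single pre-allocated buffer at cumulative offsets for as long as the next requested size still fits, and the remaining targets fall back to dynamically allocated buffers. Names here are illustrative.

#include <cstddef>
#include <vector>

// Illustrative only: counts how many per-target buffers fit inside the single
// pre-allocated transfer buffer of bufferEleSize elements. This is the value
// returned as bufferCoverTargetNum; the rest use dynamic buffers.
size_t countCoveredTargets(std::vector<size_t> const& targetEleSizes, size_t bufferEleSize)
{
    size_t used = 0;
    size_t covered = 0;
    for (size_t size : targetEleSizes)
    {
        if (used + size <= bufferEleSize)
        {
            used += size;
            ++covered;
        }
    }
    return covered;
}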


@ -69,11 +69,11 @@ public:
void freeBufferIndexForRecv(std::optional<int> bufferId);
std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> getOrAllocateSendBuffers(
std::optional<int> bufferId, int targetNum, size_t targetBufferSize,
std::optional<int> bufferId, int targetNum, std::vector<size_t> const& targetBufferEleSizes,
runtime::BufferManager const& bufferManagerToUse);
std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> getOrAllocateRecvBuffers(
std::optional<int> bufferId, int targetNum, size_t targetBufferSize,
std::optional<int> bufferId, int targetNum, std::vector<size_t> const& targetBufferEleSizes,
runtime::BufferManager const& bufferManagerToUse);
runtime::ITensor::SharedPtr getSendBuffer(std::optional<int> bufferId);
@ -92,8 +92,8 @@ private:
};
std::tuple<std::vector<runtime::ITensor::SharedPtr>, size_t, bool> getOrAllocateBuffers(std::optional<int> bufferId,
int targetNum, size_t targetBufferEleSize, runtime::BufferManager const& bufferManagerToUse,
ConcurrenceResource& concurrenceResource);
int targetNum, std::vector<size_t> const& targetBufferEleSizes,
runtime::BufferManager const& bufferManagerToUse, ConcurrenceResource& concurrenceResource);
void allocateBuffer();
std::optional<int> assignBufferIndex(ConcurrenceResource& resource, size_t bufferCount, bool onlyUseDynamicBuffer);


@ -96,13 +96,22 @@ std::unique_ptr<BaseCacheTransceiver> CacheTransceiverFactory::createCacheTransc
executor::kv_cache::CacheState::ModelConfig cacheStateCfg{
modelConfig.getNumKvHeadsPerLayer(), modelConfig.getSizePerHead(), modelConfig.getTokensPerBlock()};
return std::make_unique<CacheTransceiver>(
cacheManager, cacheStateCfg, worldConfig, modelConfig.getKvDataType(), attentionType, cacheTransceiverConfig);
auto ppSize = worldConfig.getPipelineParallelism();
std::vector<SizeType32> attentionLayerNumPerPP(ppSize, 0);
for (int ppRank = 0; ppRank < ppSize; ppRank++)
{
attentionLayerNumPerPP[ppRank] = modelConfig.getNbAttentionLayers(ppSize, ppRank);
}
return std::make_unique<CacheTransceiver>(cacheManager, cacheStateCfg, worldConfig, attentionLayerNumPerPP,
modelConfig.getKvDataType(), attentionType, cacheTransceiverConfig);
}
CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheManager,
executor::kv_cache::CacheState::ModelConfig const& cacheStateModelCfg, runtime::WorldConfig const& worldConfig,
nvinfer1::DataType dataType, executor::kv_cache::CacheState::AttentionType attentionType,
std::vector<SizeType32> const& attentionLayerNumPerPP, nvinfer1::DataType dataType,
executor::kv_cache::CacheState::AttentionType attentionType,
std::optional<executor::CacheTransceiverConfig> cacheTransceiverConfig)
: mMpiGroupComm(std::addressof(tensorrt_llm::mpi::MpiComm::session()))
, mCacheTransceiverConfig{cacheTransceiverConfig}
@ -124,7 +133,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
kvFactor = 1;
}
mCacheState = std::make_unique<executor::kv_cache::CacheState>(
cacheStateModelCfg, worldConfig, dataType, attentionType, kvFactor);
cacheStateModelCfg, worldConfig, attentionLayerNumPerPP, dataType, attentionType, kvFactor);
if (mCacheState->getParallelConfig().mEnableAttentionDP)
{
@ -177,7 +186,7 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa
else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL)
{
mManager = std::make_unique<tensorrt_llm::executor::kv_cache::AgentConnectionManager>(
mCacheTransBufferManager.get());
mCacheTransBufferManager.get(), *mCacheState);
TLLM_LOG_INFO("NIXL Connection Manager created");
}
else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MPI)


@ -41,6 +41,7 @@
namespace tc = tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace tle = tensorrt_llm::executor;
using namespace tle::kv_cache;
using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::batch_manager::kv_cache_manager;
using namespace tensorrt_llm::batch_manager::eviction_policy;
@ -515,13 +516,19 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
SizeType32 sinkBubbleLength, bool onboardBlocks, CacheType cacheType,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
std::optional<BaseAgentConfig> agentConfig)
: mNumLayers{static_cast<SizeType32>(numKvHeadsPerLayer.size())}
, mTokensPerBlock{tokensPerBlock}
, mEventManager{std::move(eventManager)}
, mStream{stream}
, mCacheType{cacheType}
{
if (agentConfig.has_value())
mLoopbackAgent = makeLoopbackAgent("nixl", &agentConfig.value());
else
mLoopbackAgent = nullptr;
auto const uniqueWindowSizeToLayers
= BaseKVCacheManager::groupLayersByWindowSize(maxAttentionWindowVec, mNumLayers);
@ -545,7 +552,7 @@ BlockManager::BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, Si
mWindowBlockManagers.try_emplace(windowSize, dtype, windowSize, layersWithWindowSize, numKvHeadsPerLayer,
sizePerHead, tokensPerBlock, allottedPrimaryBlocks, allottedSecondaryBlocks, maxNumSequences, stream,
onboardBlocks, cacheType, secondaryOffloadMinPriority, mEventManager, enablePartialReuse,
copyOnPartialReuse, kvCacheConnectorManager);
copyOnPartialReuse, kvCacheConnectorManager, mLoopbackAgent);
}
auto const numAllPools = getNumPools();
@ -588,7 +595,8 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks, CacheType cacheType,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority,
std::shared_ptr<KVCacheEventManager> eventManager, bool enablePartialReuse, bool copyOnPartialReuse,
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager)
std::shared_ptr<kv_connector::KvCacheConnectorManager> kvCacheConnectorManager,
std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent)
: mDataType{dtype}
, mWindowSize{windowSize}
, mNumPrimaryBlocks{blocksInPrimaryPool}
@ -600,7 +608,8 @@ WindowBlockManager::WindowBlockManager(nvinfer1::DataType dtype, SizeType32 wind
, mCachedBlocksRoot{std::make_shared<KVCacheBlock>(KVCacheBlock::kCachedBlocksRootId, tk::KVCacheIndex{0})}
, mCacheType{cacheType}
, mEventManager(std::move(eventManager))
, mTransferManager{std::make_shared<KVCacheTransferManager>(mBufferManager)}
, mLoopbackAgent{loopbackAgent}
, mTransferManager{std::make_shared<KVCacheTransferManager>(mBufferManager, mLoopbackAgent)}
, mAllocTotalBlocks{0}
, mAllocNewBlocks{0}
, mReusedBlocks{0}
@ -865,8 +874,9 @@ void WindowBlockManager::freeChildren(
claimLeafBlock(block, priority, durationMs);
}
BlockPtr WindowBlockManager::getFreeBlock(
executor::RetentionPriority priority, std::optional<std::chrono::milliseconds> durationMs)
BlockPtr WindowBlockManager::getFreeBlock(executor::RetentionPriority priority,
std::optional<std::chrono::milliseconds> durationMs, executor::KvCacheTransferMode mode,
std::string const& directory)
{
// eviction policy get free primary block
auto [block, canOffload] = mEvictionPolicy->getFreeBlock(kPrimaryLevel);
@ -887,7 +897,7 @@ BlockPtr WindowBlockManager::getFreeBlock(
mEvictionPolicy->claimBlock(block);
// Offload block in primary memory before repurposing
auto offloadBlock = std::get<0>(mEvictionPolicy->getFreeBlock(kSecondaryLevel));
mTransferManager->offload(block, offloadBlock, mPools);
mTransferManager->offload(block, offloadBlock, mPools, 0, mode, directory);
// swap linear block offsets (i.e. make block the offload block)
block->swapMemoryPoolBlockOffset(offloadBlock);
@ -939,17 +949,20 @@ void BlockManager::setOffsets(tk::KVCacheIndex* offsetsPtr, nvinfer1::Dims const
mWindowBlockManagers.at(windowSize).setOffsets(offsetsPtr, offsetsShape, beamIdx, blockIdx, blockId);
}
void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize)
void BlockManager::onboardBlock(BlockPtr const& offloadBlock, SizeType32 windowSize, executor::KvCacheTransferMode mode,
std::string const& directory)
{
mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock);
mWindowBlockManagers.at(windowSize).onboardBlock(offloadBlock, mode, directory);
}
void WindowBlockManager::onboardBlock(BlockPtr const& offloadBlock)
void WindowBlockManager::onboardBlock(
BlockPtr const& offloadBlock, executor::KvCacheTransferMode mode, std::string const& directory)
{
if (mOnboardBlocks && !offloadBlock->isPrimary())
{
auto block = getFreeBlock();
mTransferManager->onboard(offloadBlock, block, mPools);
auto block
= getFreeBlock(executor::KvCacheRetentionConfig::kDefaultRetentionPriority, std::nullopt, mode, directory);
mTransferManager->onboard(offloadBlock, block, mPools, 0, mode, directory);
// swap linear block offsets (i.e. make block the offload block and vice versa)
offloadBlock->swapMemoryPoolBlockOffset(block);
@ -964,12 +977,14 @@ void WindowBlockManager::onboardBlock(BlockPtr const& offloadBlock)
}
}
void BlockManager::offloadBlock(BlockPtr const& block, SizeType32 windowSize)
void BlockManager::offloadBlock(
BlockPtr const& block, SizeType32 windowSize, executor::KvCacheTransferMode mode, std::string const& directory)
{
mWindowBlockManagers.at(windowSize).offloadBlock(block);
mWindowBlockManagers.at(windowSize).offloadBlock(block, mode, directory);
}
void WindowBlockManager::offloadBlock(BlockPtr const& block)
void WindowBlockManager::offloadBlock(
BlockPtr const& block, executor::KvCacheTransferMode mode, std::string const& directory)
{
if (mOnboardBlocks && block->isPrimary())
{
@ -977,7 +992,7 @@ void WindowBlockManager::offloadBlock(BlockPtr const& block)
auto offloadBlock = std::get<0>(mEvictionPolicy->getFreeBlock(kSecondaryLevel));
// If we're swapping a block to secondary memory, maintain the prior priority values.
mEvictionPolicy->claimBlock(offloadBlock);
mTransferManager->offload(block, offloadBlock, mPools);
mTransferManager->offload(block, offloadBlock, mPools, 0, mode, directory);
// swap linear block offsets (i.e. make block the offload block)
block->swapMemoryPoolBlockOffset(offloadBlock);
@ -1031,7 +1046,8 @@ bool WindowBlockManager::blockInRadixTree(BlockPtr const& block)
}
SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions)
GenerationRequest& sequence, std::vector<executor::RetentionPriorityAndDuration> const& perBlockRetentions,
executor::KvCacheTransferMode mode, std::string const& directory)
{
SizeType32 numMatchedTokens{0};
auto searchRoot = mCachedBlocksRoot;
@ -1065,8 +1081,9 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
if (matchingBlock->hasRefs() || !matchingBlock->isLeaf())
{
// Somebody else is using block or it is not a leaf, copy reusable tokens
auto newBlock = getFreeBlock(matchingBlock->getPriority(), matchingBlock->getDurationMs());
mTransferManager->onboard(matchingBlock, newBlock, mPools, numMatched);
auto newBlock
= getFreeBlock(matchingBlock->getPriority(), matchingBlock->getDurationMs(), mode, directory);
mTransferManager->onboard(matchingBlock, newBlock, mPools, numMatched, mode, directory);
// TODO: (optional) Send out event
matchingBlock = newBlock;
TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Copied partially filled block %d", mLogPrefix.c_str(),
@ -1090,7 +1107,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - Matched full block %d", mLogPrefix.c_str(), matchingBlockId);
searchRoot = matchingBlock;
}
onboardBlock(matchingBlock);
onboardBlock(matchingBlock, mode, directory);
addBlockToAllBeams(matchingBlock, sequence);
// TODO: only add once for reused blocks
++mReusedBlocks;
@ -1106,7 +1123,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
// If we haven't set a priority, set it to the default priority level (low)
auto freeBlock = getFreeBlock(perBlockRetentions[bi].retentionPriority.value_or(
executor::KvCacheRetentionConfig::kDefaultRetentionPriority),
perBlockRetentions[bi].durationMs);
perBlockRetentions[bi].durationMs, mode, directory);
addBlockToAllBeams(freeBlock, sequence);
TLLM_LOG_DEBUG("%s::loadOrAllocateBlocks - No match, allocated new block %d for sequence %lu",
mLogPrefix.c_str(), freeBlock->getBlockId(), sequence.getRequestId());
@ -1132,7 +1149,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector<BlockKey> const&
// If we haven't set a priority, set it to the default priority level (low)
auto freeBlock = getFreeBlock(perBlockRetentions[bi].retentionPriority.value_or(
executor::KvCacheRetentionConfig::kDefaultRetentionPriority),
perBlockRetentions[bi].durationMs);
perBlockRetentions[bi].durationMs, mode, directory);
addBlockToBeam(freeBlock, sequence, beamIdx);
if (blockItr != blockKeys.end())
{
@ -1205,9 +1222,20 @@ void WindowBlockManager::addSequence(
auto perBlockRetentions = config.value_or(executor::KvCacheRetentionConfig())
.getPerBlockRetentionPriorityDuration(getTokensPerBlock(), inputLength);
auto mode = config.value_or(executor::KvCacheRetentionConfig()).getTransferMode();
auto directory = config.value_or(executor::KvCacheRetentionConfig()).getDirectory();
if (mode != executor::KvCacheTransferMode::DRAM && directory.empty())
{
TLLM_LOG_WARNING(
"Transfer mode %d specified without directory, falling back to DRAM mode", static_cast<int>(mode));
mode = executor::KvCacheTransferMode::DRAM;
}
TLLM_CHECK(perBlockRetentions.size() == (size_t) numContextBlocks);
auto const prepopulatedPromptLen = loadOrAllocateBlocks(blockKeys, numContextBlocks, sequence, perBlockRetentions);
auto const prepopulatedPromptLen
= loadOrAllocateBlocks(blockKeys, numContextBlocks, sequence, perBlockRetentions, mode, directory);
mReusedTokens += static_cast<double>(prepopulatedPromptLen);
mTotalInputTokens += static_cast<double>(uniqueTokens.size());
@ -1297,7 +1325,8 @@ void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAm
if (shareAmongBeams)
{
// add same block to all beams
auto block = getFreeBlock(sequence.getDecodeRetentionPriority(), sequence.getDecodeDurationMs());
auto block = getFreeBlock(sequence.getDecodeRetentionPriority(), sequence.getDecodeDurationMs(),
sequence.getTransferMode(), sequence.getDirectory());
for (auto beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
addBlockToBeam(block, sequence, beamIdx);
@ -1308,7 +1337,8 @@ void WindowBlockManager::allocateBlock(GenerationRequest& sequence, bool shareAm
// add different block to each beam
for (auto beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
auto block = getFreeBlock(sequence.getDecodeRetentionPriority(), sequence.getDecodeDurationMs());
auto block = getFreeBlock(sequence.getDecodeRetentionPriority(), sequence.getDecodeDurationMs(),
sequence.getTransferMode(), sequence.getDirectory());
addBlockToBeam(block, sequence, beamIdx);
}
}
@ -1409,7 +1439,8 @@ void WindowBlockManager::replaceSharedBlock(GenerationRequest& sequence, SizeTyp
TLLM_CHECK_WITH_INFO(hasFreeBlocks(beamWidth), "Can't allocate new blocks. No free blocks left.");
for (auto beamIdx = 0; beamIdx < beamWidth; ++beamIdx)
{
auto block = getFreeBlock();
auto block = getFreeBlock(executor::KvCacheRetentionConfig::kDefaultRetentionPriority, std::nullopt,
sequence.getTransferMode(), sequence.getDirectory());
block->incRefCount();
if (sequence.getCacheBlockIds(mWindowSize).at(beamIdx).size() == 0)
{

View File

@ -42,6 +42,7 @@
namespace tr = tensorrt_llm::runtime;
namespace tk = tensorrt_llm::kernels;
namespace kvc = tensorrt_llm::executor::kv_cache;
namespace tensorrt_llm::batch_manager::kv_cache_manager
{
@ -86,11 +87,15 @@ static bool fileToGpuPosix(tr::ITensor::SharedPtr const& dstPtr, std::string con
return true;
}
KVCacheTransferManager::KVCacheTransferManager(tr::BufferManager const& bufferManager)
KVCacheTransferManager::KVCacheTransferManager(
tr::BufferManager const& bufferManager, std::shared_ptr<kvc::BaseLoopbackAgent> loopbackAgent)
: mBufferManager{bufferManager}
, mOnboardManager(std::make_shared<tr::CudaStream>())
, mOffloadManager(std::make_shared<tr::CudaStream>())
, mLoopbackAgent{loopbackAgent}
{
TLLM_CUDA_CHECK(cudaGetDevice(&mDeviceId));
TLLM_CHECK(mDeviceId != -1);
}
tr::ITensor::SharedPtr KVCacheTransferManager::computeBlockPointer(
@ -106,7 +111,7 @@ tr::ITensor::SharedPtr KVCacheTransferManager::computeBlockPointer(
void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
std::vector<KVCacheBlockPool> const& pools, bool isOffload, int numTokensToCopy, executor::KvCacheTransferMode mode,
std::optional<std::string> directory)
std::string const& directory)
{
TLLM_LOG_DEBUG("copyBlock entered: srcId=%d, dstId=%d, isOffload=%s, mode=%d", src->getBlockId(), dst->getBlockId(),
(isOffload ? "true" : "false"), static_cast<int>(mode));
@ -159,120 +164,64 @@ void KVCacheTransferManager::copyBlock(BlockPtr const& src, BlockPtr const& dst,
return;
}
std::vector<kvc::FileDesc> fileBlobs;
std::vector<kvc::MemoryDesc> memoryBlobs;
for (size_t poolIdx = 0; poolIdx < pools.size(); ++poolIdx)
{
auto srcPtr = computeBlockPointer(src, pools, poolIdx);
auto dstPtr = computeBlockPointer(dst, pools, poolIdx);
auto ptr = isOffload ? computeBlockPointer(src, pools, poolIdx) : computeBlockPointer(dst, pools, poolIdx);
auto block_id = src->getBlockId();
TLLM_CHECK_WITH_INFO(
directory.has_value(), "Expected a directory path for KVCache offload, but none was provided.");
!directory.empty(), "Expected a directory path for KVCache offload, but none was provided.");
int size = std::snprintf(
nullptr, 0, "%s/block_%d_pool_%zu.bin", directory.value().c_str(), src->getBlockId(), poolIdx);
std::string filename(size + 1, '\0');
std::snprintf(filename.data(), filename.size(), "%s/block_%d_pool_%zu.bin", directory.value().c_str(),
src->getBlockId(), poolIdx);
int size = std::snprintf(nullptr, 0, "%s/block_%d_pool_%zu.bin", directory.c_str(), block_id, poolIdx);
std::string filename;
filename.resize(size + 1);
std::snprintf(
filename.data(), filename.size(), "%s/block_%d_pool_%zu.bin", directory.c_str(), block_id, poolIdx);
if (mode == executor::KvCacheTransferMode::POSIX_DEBUG_FALLBACK)
{
TLLM_LOG_INFO("Forcing POSIX fallback for file: %s", filename.c_str());
if (isOffload)
{
gpuToFilePosix(srcPtr, filename);
gpuToFilePosix(ptr, filename);
}
else
{
fileToGpuPosix(dstPtr, filename);
fileToGpuPosix(ptr, filename);
}
continue;
}
int openFlags = isOffload ? (O_CREAT | O_WRONLY) : O_RDONLY;
int fd = ::open(filename.c_str(), openFlags, 0664);
if (fd < 0)
else if (mode == executor::KvCacheTransferMode::GDS)
{
TLLM_LOG_ERROR(
"Failed to open '%s' for %s; fallback POSIX", filename.c_str(), (isOffload ? "writing" : "reading"));
if (isOffload)
{
gpuToFilePosix(srcPtr, filename);
}
else
{
fileToGpuPosix(dstPtr, filename);
}
continue;
int openFlags = isOffload ? (O_CREAT | O_WRONLY) : O_RDONLY;
fileBlobs.emplace_back(filename, openFlags, 0664, ptr->getSizeInBytes());
memoryBlobs.emplace_back(ptr->data(), ptr->getSizeInBytes(), mDeviceId);
}
}
if (mode == executor::KvCacheTransferMode::GDS)
{
if (mLoopbackAgent == nullptr)
{
TLLM_LOG_DEBUG("KVCacheTransferManager: creating mLoopbackAgent lazily");
kvc::BaseAgentConfig config{std::string("GDSAgent"), true, true};
mLoopbackAgent = kvc::makeLoopbackAgent("nixl", &config);
}
#ifdef ENABLE_CUFILE
CUfileDescr_t cufileDesc = {};
cufileDesc.type = CU_FILE_HANDLE_TYPE_OPAQUE_FD;
cufileDesc.handle.fd = fd;
kvc::FileDescs fileDescs(std::move(fileBlobs));
kvc::MemoryDescs memoryDescs(kvc::MemoryType::kVRAM, memoryBlobs);
CUfileHandle_t cufileHandle;
CUfileError_t status = cuFileHandleRegister(&cufileHandle, &cufileDesc);
if (status.err != CU_FILE_SUCCESS)
{
// Fallback to POSIX
TLLM_LOG_WARN(
"cuFileHandleRegister failed (err=%d). Falling back to POSIX for '%s'", status.err, filename.c_str());
::close(fd);
if (isOffload)
gpuToFilePosix(srcPtr, filename);
else
fileToGpuPosix(dstPtr, filename);
continue;
}
ssize_t numBytes = static_cast<ssize_t>(srcPtr->getSizeInBytes());
if (isOffload)
{
ssize_t written = cuFileWrite(cufileHandle, srcPtr->data(), numBytes, 0, 0);
if (written < 0)
{
TLLM_LOG_ERROR("cuFileWrite error=%zd. Fallback to POSIX", written);
cuFileHandleDeregister(cufileHandle);
::close(fd);
gpuToFilePosix(srcPtr, filename);
continue;
}
}
else
{
ssize_t readCount = cuFileRead(cufileHandle, dstPtr->data(), numBytes, 0, 0);
if (readCount < 0)
{
TLLM_LOG_ERROR("cuFileRead error=%zd. Fallback to POSIX", readCount);
cuFileHandleDeregister(cufileHandle);
::close(fd);
fileToGpuPosix(dstPtr, filename);
continue;
}
}
cuFileHandleDeregister(cufileHandle);
::close(fd);
#else
// If GDS isn't enabled, fallback to POSIX automatically
TLLM_LOG_DEBUG("ENABLE_CUFILE=OFF, so fallback to POSIX for %s", filename.c_str());
::close(fd); // close the file opened for GDS
if (isOffload)
{
gpuToFilePosix(srcPtr, filename);
}
else
{
fileToGpuPosix(dstPtr, filename);
}
#endif
mLoopbackAgent->executeLoopbackRequest(memoryDescs, fileDescs, isOffload);
}
}
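For orientation, a minimal sketch of the GDS loopback path used in copyBlock above; devPtr, numBytes, deviceId, the file path and loopbackAgent are placeholders, not part of this change:
// Pair one VRAM descriptor with one file descriptor and let the loopback agent move the data.
std::vector<kvc::MemoryDesc> memoryBlobs;
memoryBlobs.emplace_back(devPtr, numBytes, deviceId);
std::vector<kvc::FileDesc> fileBlobs;
fileBlobs.emplace_back("/tmp/block_0_pool_0.bin", O_CREAT | O_WRONLY, 0664, numBytes);
kvc::MemoryDescs memoryDescs(kvc::MemoryType::kVRAM, memoryBlobs);
kvc::FileDescs fileDescs(std::move(fileBlobs));
loopbackAgent->executeLoopbackRequest(memoryDescs, fileDescs, /*isOffload=*/true); // GPU -> file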
void KVCacheTransferManager::onboard(BlockPtr const& offloadBlock, BlockPtr const& block,
std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy, executor::KvCacheTransferMode mode,
std::optional<std::string> directory)
std::string const& directory)
{
if (mode != executor::KvCacheTransferMode::DRAM
&& mPendingOffloads.find(offloadBlock->getBlockId()) == mPendingOffloads.end())
@ -291,7 +240,7 @@ void KVCacheTransferManager::onboard(BlockPtr const& offloadBlock, BlockPtr cons
void KVCacheTransferManager::offload(BlockPtr const& block, BlockPtr const& offloadBlock,
std::vector<KVCacheBlockPool> const& pools, int numTokensToCopy, executor::KvCacheTransferMode mode,
std::optional<std::string> directory)
std::string const& directory)
{
mPendingOffloads[block->getBlockId()] = tr::CudaEvent();
copyBlock(block, offloadBlock, pools, true, numTokensToCopy, mode, directory);

View File

@ -153,12 +153,24 @@ void MLACacheFormatter::format(TransferSession& session)
// diff start
auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
size_t const pPDomainSize = targetInfo.mDomainPPSize;
TLLM_CHECK((cacheBlockSize * blockNum) % pPDomainSize == 0);
auto const targetBufferSize = (cacheBlockSize * blockNum) / pPDomainSize;
auto ppRank = selfIdx
/ (selfConfig.getParallelConfig().mTensorParallelism * selfConfig.getParallelConfig().mContextParallelism);
int selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
size_t pPDomainSize = targetInfo.mDomainPPSize;
auto getBufferSizeForTarget = [&]()
{
std::vector<size_t> bufferSizeForTarget(pPDomainSize, 0);
size_t cacheSizePerLayer = cacheBlockSize * blockNum / selfAttentionLayerNum;
for (size_t i = 0; i < pPDomainSize; i++)
{
auto layerNum = targetInfo.getPeerPPDomainLayerNum(i);
bufferSizeForTarget[i] = cacheSizePerLayer * layerNum;
}
return bufferSizeForTarget;
};
auto bufferEleSizes = getBufferSizeForTarget();
auto result = mCacheTransBufferManager->getOrAllocateSendBuffers(
cacheBufferId, pPDomainSize, targetBufferSize, bufferManager);
cacheBufferId, static_cast<int>(pPDomainSize), bufferEleSizes, bufferManager);
auto& outputSplitCaches = std::get<0>(result);
auto& bufferCoverTargetNum = std::get<1>(result);
auto& onlyUseDynamicBuffer = std::get<2>(result);
@ -192,35 +204,30 @@ void MLACacheFormatter::format(TransferSession& session)
TLLM_CUDA_CHECK(cudaSetDevice(deviceId));
auto startTime = std::chrono::steady_clock::now();
auto cacheIdx = processIdx % pPDomainSize;
size_t size;
if (cacheIdx < bufferCoverTargetNum)
{
size = outputSplitCaches.at(cacheIdx)->getSizeInBytes();
size_t size = outputSplitCaches.at(cacheIdx)->getSizeInBytes();
session.send(processIdx, outputSplitCaches.at(cacheIdx)->data(), size);
}
else if (bufferCoverTargetNum > 0)
{
// copy buffer allocated by cudaMallocAsync to buffer allocated by cudaMalloc before sending
auto sendBufferIdx = cacheIdx % bufferCoverTargetNum;
size = outputSplitCaches.at(sendBufferIdx)->getSizeInBytes();
bufferManager.copy(*outputSplitCaches.at(cacheIdx), *outputSplitCaches.at(sendBufferIdx));
bufferManager.getStream().synchronize();
session.send(processIdx, outputSplitCaches.at(sendBufferIdx)->data(), size);
}
else
{
// bufferCoverTargetNum=0, mSendBuffer size < one outputSlice
// send multiple times
size = targetBufferSize;
size_t remainSendSize = targetBufferSize;
// If cacheIdx < bufferCoverTargetNum, outputSplitCaches.at(cacheIdx) is allocated by cudaMallocAsync,
// which cannot be transferred via UCX GPU-direct RDMA. We need to copy the data into a pre-allocated
// cudaMalloc buffer and then send it.
// If bufferCoverTargetNum == 0, the send buffer is smaller than one output slice, so send in multiple chunks.
size_t remainSendSize = outputSplitCaches.at(cacheIdx)->getSize();
size_t needSendSize = outputSplitCaches.at(cacheIdx)->getSize();
auto sendBufferIdx = bufferCoverTargetNum == 0 ? 0 : cacheIdx % bufferCoverTargetNum;
auto sendBufferUsed = bufferCoverTargetNum == 0 ? preAllocSendBuffer : outputSplitCaches.at(sendBufferIdx);
while (remainSendSize > 0)
{
TLLM_CHECK(preAllocSendBuffer != nullptr);
auto sendBufferEleSize = preAllocSendBuffer->getSize();
TLLM_CHECK(sendBufferUsed != nullptr);
auto sendBufferEleSize = sendBufferUsed->getSize();
auto sendSize = std::min(remainSendSize, sendBufferEleSize);
auto copySlice = runtime::ITensor::slice(
outputSplitCaches.at(cacheIdx), targetBufferSize - remainSendSize, sendSize);
auto copyTargetSlice = runtime::ITensor::slice(preAllocSendBuffer, 0, sendSize);
auto copySlice
= runtime::ITensor::slice(outputSplitCaches.at(cacheIdx), needSendSize - remainSendSize, sendSize);
auto copyTargetSlice = runtime::ITensor::slice(sendBufferUsed, 0, sendSize);
bufferManager.copy(*copySlice, *copyTargetSlice);
bufferManager.getStream().synchronize();
session.send(processIdx, copyTargetSlice->data(), copyTargetSlice->getSizeInBytes());
@ -236,7 +243,7 @@ void MLACacheFormatter::format(TransferSession& session)
}
double cacheTransferTime
= std::max(0.0, std::chrono::duration<double, std::milli>(endTime - startTime).count());
session.appendMeasure(delay, cacheTransferTime, size);
session.appendMeasure(delay, cacheTransferTime, outputSplitCaches.at(cacheIdx)->getSizeInBytes());
};
if (connections.size() > 1)
@ -360,10 +367,27 @@ void MLACacheFormatter::unformat(TransferSession& session)
auto cacheBlockSize = outputBuffers.at(0)->getSize();
auto targetNum = pickUpConnections.size();
TLLM_CHECK((cacheBlockSize * blockNum) % targetNum == 0);
auto targetBufferSize = (cacheBlockSize * blockNum) / targetNum;
auto targetInfo = executor::kv_cache::targetIRanks(destConfig, selfConfig, selfIdx);
auto ppRank = selfIdx
/ (selfConfig.getParallelConfig().mTensorParallelism * selfConfig.getParallelConfig().mContextParallelism);
auto selfAttentionLayerNum = selfConfig.getParallelConfig().mAttentionLayerNumPerPP.at(ppRank);
TLLM_CHECK_WITH_INFO(selfAttentionLayerNum != 0, "selfAttentionLayerNum should not be 0");
auto getBufferSizeForTarget = [&]()
{
std::vector<size_t> bufferEleSizes(targetNum, 0);
auto cacheSizePerLayer = cacheBlockSize * blockNum / selfAttentionLayerNum;
for (size_t i = 0; i < targetNum; i++)
{
auto layerNum = targetInfo.getPeerPPDomainLayerNum(static_cast<SizeType32>(pickUpConnections[i]));
bufferEleSizes[i] = cacheSizePerLayer * layerNum;
}
return bufferEleSizes;
};
auto bufferEleSizes = getBufferSizeForTarget();
auto result = mCacheTransBufferManager->getOrAllocateRecvBuffers(
cacheBufferId, targetNum, targetBufferSize, bufferManager);
cacheBufferId, static_cast<int>(targetNum), bufferEleSizes, bufferManager);
auto& recvSplitCaches = std::get<0>(result);
auto& bufferCoverTargetNum = std::get<1>(result);
size_t remainNoCoverTargetNum = targetNum > bufferCoverTargetNum ? targetNum - bufferCoverTargetNum : 0;
@ -394,29 +418,22 @@ void MLACacheFormatter::unformat(TransferSession& session)
size = buffer->getSizeInBytes();
session.recv(pickUpConnections.at(processIdx), buffer->data(), buffer->getSizeInBytes());
}
else if (bufferCoverTargetNum > 0)
{
auto recvBufferIdx = processIdx % bufferCoverTargetNum
+ remainNoCoverTargetNum; // caches.at(recvBufferIdx) is allocated by cudaMalloc
auto& buffer = recvSplitCaches.at(recvBufferIdx);
llmRequest.updateKvCacheSize(buffer->getSizeInBytes());
size = buffer->getSizeInBytes();
session.recv(pickUpConnections.at(processIdx), buffer->data(), buffer->getSizeInBytes());
bufferManager.copy(*recvSplitCaches.at(recvBufferIdx), *recvSplitCaches.at(processIdx));
bufferManager.getStream().synchronize();
}
else
{
auto recvBufferIdx
= bufferCoverTargetNum == 0 ? 0 : processIdx % bufferCoverTargetNum + remainNoCoverTargetNum;
auto recvBufferUsed = bufferCoverTargetNum == 0 ? preAllocRecvBuffer : recvSplitCaches[recvBufferIdx];
// bufferCoverTargetNum==0
size_t remainRecvSize = targetBufferSize;
size_t needRecvSize = recvSplitCaches.at(processIdx)->getSize();
size_t remainRecvSize = needRecvSize;
while (remainRecvSize > 0)
{
TLLM_CHECK(preAllocRecvBuffer != nullptr);
auto recvBufferEleSize = preAllocRecvBuffer->getSize();
TLLM_CHECK(recvBufferUsed != nullptr);
auto recvBufferEleSize = recvBufferUsed->getSize();
auto recvSize = std::min(remainRecvSize, recvBufferEleSize);
auto recvSlice = runtime::ITensor::slice(preAllocRecvBuffer, 0, recvSize);
auto recvSlice = runtime::ITensor::slice(recvBufferUsed, 0, recvSize);
auto copySlice = runtime::ITensor::slice(
recvSplitCaches.at(processIdx), targetBufferSize - remainRecvSize, recvSize);
recvSplitCaches.at(processIdx), needRecvSize - remainRecvSize, recvSize);
llmRequest.updateKvCacheSize(recvSlice->getSizeInBytes());
size += recvSlice->getSizeInBytes();
session.recv(pickUpConnections.at(processIdx), recvSlice->data(), recvSlice->getSizeInBytes());
@ -585,28 +602,6 @@ void MLACacheFormatter::unformat(TransferSession& session)
return false;
}
int selfNumLayers = selfConfig.getModelConfig().mNbKvHeadsPerLayer.size();
int selfPPSize = selfConfig.getParallelConfig().mPipelineParallelism;
int destPPSize = destConfig.getParallelConfig().mPipelineParallelism;
int destNumLayers = destConfig.getModelConfig().mNbKvHeadsPerLayer.size();
if (selfPPSize == destPPSize)
{
return true;
}
if (selfNumLayers % selfPPSize != 0)
{
TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d",
selfNumLayers, selfPPSize);
return false;
}
if (destNumLayers % destPPSize != 0)
{
TLLM_LOG_WARNING("CacheFormatter::inquireSupport: layers %d must be divisible by pipeline parallelism :%d ",
destNumLayers, destPPSize);
return false;
}
return true;
}
} // namespace tensorrt_llm::batch_manager::kv_cache_manager

View File

@ -17,6 +17,7 @@
#include "connection.h"
#include "tensorrt_llm/common/envUtils.h"
#include "tensorrt_llm/executor/cache_transmission/cacheSplitConcat.h"
#include <string>
#include <unistd.h>
@ -34,6 +35,28 @@ std::string genUniqueAgentName()
return std::string(hostname) + "_" + std::to_string(pid) + "_" + std::to_string(counter++);
}
// The NIXL connection differs from the UCX and MPI connections because NIXL only supports one-sided
// communication. The gen side sends its buffer metadata to the context side together with the requestInfo, but it
// does not send the buffer offset, since unformat has not been called yet and the cache size and offset are still
// unknown. We assume recv_size equals send_size and compute the buffer offset from the attention layer count of
// the self PP rank and of the preceding PP ranks, since the buffer size ratio equals the layer count ratio
// (except in the VSWA case).
auto computeSendOffsetRatio(
CacheState const& peerCacheState, int peerIdx, CacheState const& selfCacheState, int validConnectionIdx)
{
auto peerTargetInfo = targetIRanks(selfCacheState, peerCacheState, peerIdx);
// int ppRank = valideConnectionIdx % peerTargetInfo.mDomainPPSize;
size_t offsetLayer = 0;
for (int i = 0; i < validConnectionIdx; i++)
{
offsetLayer += peerTargetInfo.getPeerPPDomainLayerNum(i);
}
size_t selfSendLayer = peerTargetInfo.getPeerPPDomainLayerNum(validConnectionIdx);
return std::make_pair(offsetLayer, selfSendLayer);
}
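The returned pair {offsetLayer, selfSendLayer} is turned into a byte offset in AgentConnection::send below; a one-line sketch, assuming size is the per-connection send size and divides evenly by the sender's layer count:
size_t byteOffset = size / offsetRatio.second * offsetRatio.first; // bytes per layer times layers written by preceding ranks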
AgentConnection::AgentConnection(
std::string mAgentName, std::string mRemoteAgentName, AgentConnectionManager* mAgentConnectionManager)
: mAgentName(mAgentName)
@ -82,7 +105,8 @@ void AgentConnection::send(DataContext const& ctx, void const* data, size_t size
reinterpret_cast<uintptr_t>(data), size, static_cast<uint32_t>(mAgentConnectionManager->getDeviceId())};
MemoryDescs srcDescs{MemoryType::kVRAM, {srcDesc}};
auto dstBaseDesc = mSenderState.mReceiverBufferDesc;
MemoryDesc dstDesc{dstBaseDesc.getAddr() + (mSenderState.validSegmentIdx * size), size, dstBaseDesc.getDeviceId()};
auto offset = size / mSenderState.mOffsetRatio.second * mSenderState.mOffsetRatio.first;
MemoryDesc dstDesc{dstBaseDesc.getAddr() + offset, size, dstBaseDesc.getDeviceId()};
TLLM_LOG_DEBUG(
"send dstDesc: %p, size: %ld ,validSegmentIdx: %ld", dstDesc.getAddr(), size, mSenderState.validSegmentIdx);
MemoryDescs dstDescs{MemoryType::kVRAM, {dstDesc}};
@ -137,10 +161,12 @@ void AgentConnection::sendRequestAndBufferInfo(
mAgentConnectionManager->getAgent()->notifySyncMessage(mRemoteAgentName, ss.str());
}
void AgentConnection::setSenderState(MemoryDesc mReceiverBufferDesc, int validSegmentIdx)
void AgentConnection::setSenderState(
MemoryDesc mReceiverBufferDesc, int validSegmentIdx, std::pair<size_t, size_t> offsetRatio)
{
mSenderState.mReceiverBufferDesc = mReceiverBufferDesc;
mSenderState.validSegmentIdx = validSegmentIdx;
mSenderState.mOffsetRatio = offsetRatio;
}
void AgentConnection::setHasLoadRemoteAgent(bool hasLoadRemoteAgent)
@ -155,8 +181,9 @@ bool AgentConnection::hasLoadRemoteAgent() const
}
AgentConnectionManager::AgentConnectionManager(
batch_manager::kv_cache_manager::CacheTransBufferManager* cacheTransBufferManager)
: mRegMemDescs(MemoryType::kVRAM, {})
batch_manager::kv_cache_manager::CacheTransBufferManager* cacheTransBufferManager, CacheState cacheState)
: mCacheState(std::move(cacheState))
, mRegMemDescs(MemoryType::kVRAM, {})
{
TLLM_CUDA_CHECK(cudaGetDevice(&mDeviceId));
TLLM_CHECK(mDeviceId != -1);
@ -260,7 +287,10 @@ AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo(batc
auto remoteAgentName = requestAndBufferInfo.mAgentName;
TLLM_LOG_DEBUG(" recv Address:%s", address.c_str());
auto connection = connect(remoteAgentName, address, metadataOpt, true);
connection->setSenderState(bufferDesc, validConnectionIdx);
// Compute the layer-based offset ratio so the sender can place its data at the right position in the receiver buffer.
auto offsetRatio = computeSendOffsetRatio(requestInfo.getTransState().getCacheState().value(),
requestInfo.getTransState().getCommState()->getSelfIdx(), mCacheState, validConnectionIdx);
connection->setSenderState(bufferDesc, validConnectionIdx, offsetRatio);
it2 = notifs.erase(it2);
if (notifs.empty())
{
@ -328,7 +358,7 @@ batch_manager::kv_cache_manager::CacheTransBufferManager* AgentConnectionManager
return mCacheTransBufferManager;
}
AgentConnection* AgentConnectionManager::connect(std::string const& remoteAgentName, std::string const& connecitonInfo,
AgentConnection* AgentConnectionManager::connect(std::string const& remoteAgentName, std::string const& connectionInfo,
std::optional<std::string> metadata, bool isSender)
{
@ -369,7 +399,7 @@ AgentConnection* AgentConnectionManager::connect(std::string const& remoteAgentN
TLLM_CHECK_WITH_INFO(!isSender, "Sender shouldn't call connectRemoteAgent");
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), "mAgentName: %s connect to %s with connectRemoteAgent",
mAgentName.c_str(), remoteAgentName.c_str());
m_Agent->connectRemoteAgent(remoteAgentName, connecitonInfo);
m_Agent->connectRemoteAgent(remoteAgentName, connectionInfo);
}
}
else

View File

@ -175,7 +175,7 @@ public:
void recv(DataContext const& ctx, void* data, size_t size) const override;
void sendRequestAndBufferInfo(
batch_manager::RequestInfo& requestInfo, std::optional<size_t> cacheBufferId, int validConnectionIdx);
void setSenderState(MemoryDesc mReceiverBufferDesc, int valideSegmentIdx);
void setSenderState(MemoryDesc mReceiverBufferDesc, int valideSegmentIdx, std::pair<size_t, size_t> offsetRatio);
[[nodiscard]] std::optional<size_t> getCacheBufferId() const;
void setHasLoadRemoteAgent(bool hasLoadRemoteAgent);
[[nodiscard]] bool hasLoadRemoteAgent() const;
@ -188,6 +188,7 @@ private:
{
MemoryDesc mReceiverBufferDesc{nullptr, 0, 0};
int validSegmentIdx{0};
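// {offsetLayer, selfSendLayer}: attention layers covered by the earlier connections in the PP domain and by this
// one; see computeSendOffsetRatio and AgentConnection::send, which scale these into a byte offset.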
std::pair<size_t, size_t> mOffsetRatio;
SenderState() = default;
};
@ -203,7 +204,8 @@ private:
class AgentConnectionManager : public ConnectionManager
{
public:
AgentConnectionManager(batch_manager::kv_cache_manager::CacheTransBufferManager* cacheTransBufferManager);
AgentConnectionManager(
batch_manager::kv_cache_manager::CacheTransBufferManager* cacheTransBufferManager, CacheState cacheState);
~AgentConnectionManager();
AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override;
[[nodiscard]] std::vector<Connection const*> getConnections(CommState const& state) override;
@ -222,6 +224,7 @@ private:
std::map<std::string, std::shared_ptr<AgentConnection>> mConnections;
std::mutex mConnectionsMutex;
CommState mCommState;
CacheState mCacheState;
batch_manager::kv_cache_manager::CacheTransBufferManager* mCacheTransBufferManager;
std::mutex mNotificationMutex;
std::unordered_map<std::string, std::list<std::string>> mUnhandledNotifications;

View File

@ -67,22 +67,49 @@ TargetRanksInfo TargetRanksInfoForDP(
int peerPPRankStart = 0;
int mDomainPPSize = 1;
int peerPPRankEnd = 0;
for (auto val : {peerPPNum, selfPPNum})
{
TLLM_CHECK(isPowerOfTwo(val));
}
if (selfPPNum <= peerPPNum)
{
mDomainPPSize = peerPPNum / selfPPNum;
peerPPRankStart = selfPPRank * mDomainPPSize;
peerPPRankEnd = (selfPPRank + 1) * mDomainPPSize;
}
else
{
peerPPRankStart = selfPPRank / (selfPPNum / peerPPNum);
peerPPRankEnd = peerPPRankStart + mDomainPPSize;
}
std::vector<SizeType32> peerNumLayerPerPP = peerParConfig.mAttentionLayerNumPerPP;
std::vector<SizeType32> selfNumLayerPerPP = selfParConfig.mAttentionLayerNumPerPP;
TLLM_CHECK(peerNumLayerPerPP.size() == peerPPNum);
TLLM_CHECK(selfNumLayerPerPP.size() == selfPPNum);
int selfStartLayerId = 0;
// Global start layer id for the self PP rank: the sum of the layer counts of all preceding PP ranks.
// From the interval [global start layer id, global end layer id) we derive the target PP ranks and the number of
// layers to fetch from each of them.
for (int ppRank = 0; ppRank < selfPPRank; ppRank++)
{
selfStartLayerId += selfNumLayerPerPP[ppRank];
}
int selfEndLayerId = selfStartLayerId + selfNumLayerPerPP[selfPPRank];
int prePeerPPLayerId = 0;
std::vector<int> targetPeerPPRanks;
std::vector<int> targetPeerPPLayerNum;
for (int ppRank = 0; ppRank < peerPPNum; ppRank++)
{
int peerPPStartLayerId = prePeerPPLayerId;
int peerPPEndLayerId = peerPPStartLayerId + peerNumLayerPerPP[ppRank];
prePeerPPLayerId += peerNumLayerPerPP[ppRank];
if (selfStartLayerId < peerPPEndLayerId && selfEndLayerId > peerPPStartLayerId)
{
targetPeerPPRanks.push_back(ppRank);
int layerNumInDomainPP
= std::min(peerPPEndLayerId, selfEndLayerId) - std::max(peerPPStartLayerId, selfStartLayerId);
targetPeerPPLayerNum.push_back(layerNumInDomainPP);
}
}
mDomainPPSize = static_cast<int>(targetPeerPPRanks.size());
peerPPRankStart = targetPeerPPRanks.front();
peerPPRankEnd = peerPPRankStart + mDomainPPSize;
TLLM_CHECK(targetPeerPPLayerNum.size() == mDomainPPSize);
int targetPeerPpLayerNumSum = std::accumulate(targetPeerPPLayerNum.begin(), targetPeerPPLayerNum.end(), 0);
TLLM_CHECK(targetPeerPpLayerNumSum == selfNumLayerPerPP[selfPPRank]);
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
"selfPPRank:%d,selfPPNum:%d,peerPPNum:%d,selfTPNum:%d,peerTPNum:%d,peerPPRankStart:%d,peerPPRankEnd:%d",
selfPPRank, selfPPNum, peerPPNum, selfTPNum, peerTPNum, peerPPRankStart, peerPPRankEnd);
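A worked toy example of the interval-overlap rule above (illustrative numbers, not taken from this change):
// self PP = 2 with mAttentionLayerNumPerPP = {3, 3}; peer PP = 4 with {2, 2, 1, 1}.
// For selfPPRank = 1 the self interval is [3, 6); it overlaps peer ranks 1 ([2, 4)), 2 ([4, 5)) and 3 ([5, 6)),
// so mDomainPPSize = 3, peerPPRankStart = 1, and targetPeerPPLayerNum = {1, 1, 1}, whose sum matches the self
// rank's 3 layers.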
int peerTPRankStart = 0;
int mDomainTPSize = 1;
int peerTPRankEnd = 0;
@ -156,7 +183,27 @@ TargetRanksInfo TargetRanksInfoForDP(
= (peerNbHeadsPerLayer * peerTPSizePerDPGroup) / (selfNbHeadsPerLayer * selfTPSizePerDPGroup);
}
return {mDomainPPSize, mDomainTPSize, mDomainCPSize, std::move(retRanks), mDupHeadFactor, mPeerDupHeadFactor};
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(),
"mDomainPPSize:%d, mDomainTPSize:%d, mDupHeadFactor:%d, mPeerDupHeadFactor:%d, selfPPRank:%d, selfPPNum:%d, "
"peerPPNum:%d, selfTPNum:%d, peerTPNum:%d, selfTPSizePerDPGroup:%d, peerTPSizePerDPGroup:%d, "
"selfNbHeadsPerLayer:%d, peerNbHeadsPerLayer:%d, selfTPrankInDPGroup:%d, peerDpRank:%d, selfRank:%d",
mDomainPPSize, mDomainTPSize, mDupHeadFactor, mPeerDupHeadFactor, selfPPRank, selfPPNum, peerPPNum, selfTPNum,
peerTPNum, selfTPSizePerDPGroup, peerTPSizePerDPGroup, selfNbHeadsPerLayer, peerNbHeadsPerLayer,
selfTPrankInDPGroup, peerDpRank, selfRank);
auto vector_to_string = [](std::vector<int> const& vec)
{
std::stringstream ss;
for (auto val : vec)
{
ss << val << ",";
}
return ss.str();
};
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), "retRanks:%s , targetPeerPPLayerNum:%s",
vector_to_string(retRanks).c_str(), vector_to_string(targetPeerPPLayerNum).c_str());
return {mDomainPPSize, mDomainTPSize, mDomainCPSize, std::move(retRanks), mDupHeadFactor, mPeerDupHeadFactor,
std::move(targetPeerPPLayerNum)};
}
TargetRanksInfo targetIRanks(
@ -496,12 +543,37 @@ nvinfer1::Dims makeShapeFromCacheState(kv_cache::CacheState const& cacheState)
cacheState.getAttentionConfig().mKvFactor, blockSize});
}
__device__ __forceinline__ void getLayerIdInDomainPPandRankInDomainPP(int layerId, int DomainPPSize,
uint64_t* prefixLayerNumDevPtr, int& layerIdInDomainPP, int& rankInDomainPP, int& layerNumInSpecPP)
{
__shared__ int sharedLayerIdInDomainPP;
__shared__ int sharedRankInDomainPP;
__shared__ int sharedLayerNumInSpecPP;
#pragma unroll 1
for (int ppRank = threadIdx.x; ppRank < DomainPPSize; ppRank += blockDim.x)
{
if (layerId >= prefixLayerNumDevPtr[ppRank] && layerId < prefixLayerNumDevPtr[ppRank + 1])
{
sharedLayerIdInDomainPP = layerId - prefixLayerNumDevPtr[ppRank];
sharedRankInDomainPP = ppRank;
sharedLayerNumInSpecPP = prefixLayerNumDevPtr[ppRank + 1] - prefixLayerNumDevPtr[ppRank];
break;
}
}
__syncthreads();
layerIdInDomainPP = sharedLayerIdInDomainPP;
rankInDomainPP = sharedRankInDomainPP;
layerNumInSpecPP = sharedLayerNumInSpecPP;
}
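For clarity, a host-side sketch of the same lookup (illustrative only; the device helper above is what the kernels use):
// prefixLayerNum has DomainPPSize + 1 entries; domain PP rank r owns layers [prefixLayerNum[r], prefixLayerNum[r + 1]).
inline void layerToDomainPP(int layerId, int domainPPSize, uint64_t const* prefixLayerNum, int& layerIdInDomainPP,
    int& rankInDomainPP, int& layerNumInSpecPP)
{
    for (int r = 0; r < domainPPSize; ++r)
    {
        if (layerId >= static_cast<int>(prefixLayerNum[r]) && layerId < static_cast<int>(prefixLayerNum[r + 1]))
        {
            layerIdInDomainPP = layerId - static_cast<int>(prefixLayerNum[r]);
            rankInDomainPP = r;
            layerNumInSpecPP = static_cast<int>(prefixLayerNum[r + 1] - prefixLayerNum[r]);
            return;
        }
    }
}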
// MLA Head 1: One thread block per [(2), tokens, dimsPerHead]
template <typename T, int subWarpSize, int vecSizeByte>
__global__ void splitKVCacheForMLAKernel(T const** __restrict__ inputBlocks, T** __restrict__ outputCaches,
int tokensPerBlock, int numLayers, int headNum, int dimsPerHead, int inputBlockNum, int DomainPPSize,
int DomainTPSize, int layerNumDomainPP, int kvFactor)
int DomainTPSize, int kvFactor, uint64_t* prefixLayerNumDevPtr)
{
int const subWarpId = threadIdx.x / subWarpSize;
int const laneId = threadIdx.x % subWarpSize;
@ -518,19 +590,25 @@ __global__ void splitKVCacheForMLAKernel(T const** __restrict__ inputBlocks, T**
for (int layerId = blockIdx.x; layerId < numLayers; layerId += gridDim.x)
{
int layerIdInDomainPP{};
int rankInDomainPP{};
int layerNumInSpecPP{};
getLayerIdInDomainPPandRankInDomainPP(
layerId, DomainPPSize, prefixLayerNumDevPtr, layerIdInDomainPP, rankInDomainPP, layerNumInSpecPP);
#pragma unroll 1
for (int headId = 0; headId < headNum; headId++)
{
T const* inputBlockPtr = inputBlocks[blockId];
T const* kInputPtr = inputBlockPtr + layerId * kvFactor * headNum * tokensPerBlock * dimsPerHead
+ headId * tokensPerBlock * dimsPerHead;
int const outputCacheIdx = layerId / layerNumDomainPP;
int outputCacheIdx = rankInDomainPP;
T* outputCachePtr = outputCaches[outputCacheIdx];
int const layerIdInDomainPP = layerId % layerNumDomainPP;
int const headIdInDomainTP = headId;
T* kOutputPtr = outputCachePtr
+ blockId * (layerNumDomainPP * kvFactor * headNum * tokensPerBlock * dimsPerHead)
+ blockId * (layerNumInSpecPP * kvFactor * headNum * tokensPerBlock * dimsPerHead)
+ layerIdInDomainPP * kvFactor * headNum * tokensPerBlock * dimsPerHead
+ headIdInDomainTP * tokensPerBlock * dimsPerHead;
int const kvOffset = headNum * tokensPerBlock * dimsPerHead;
@ -565,7 +643,7 @@ __global__ void splitKVCacheForMLAKernel(T const** __restrict__ inputBlocks, T**
template <typename T, int subWarpSize, int subWarpNumInGroup, int vecSizeByte>
__global__ void splitKVCacheKernel(T const** __restrict__ inputBlocks, T** __restrict__ outputCaches,
int tokensPerBlock, int numLayers, int headNum, int dimsPerHead, int inputBlockNum, int DomainPPSize,
int DomainTPSize, int layerNumDomainPP, int headNumDomainTP)
int DomainTPSize, int headNumDomainTP, uint64_t* prefixLayerNumDevPtr)
{
// layerNumDomainPP = numLayers/DomainPPSize
@ -587,6 +665,13 @@ __global__ void splitKVCacheKernel(T const** __restrict__ inputBlocks, T** __res
for (int layerId = blockIdx.x; layerId < numLayers; layerId += gridDim.x)
{
int layerIdInDomainPP{};
int rankInDomainPP{};
int layerNumInSpecPP{};
getLayerIdInDomainPPandRankInDomainPP(
layerId, DomainPPSize, prefixLayerNumDevPtr, layerIdInDomainPP, rankInDomainPP, layerNumInSpecPP);
#pragma unroll 1
for (int headId = subWarpGroupId; headId < headNum; headId += subWarpGroupNum)
@ -598,13 +683,12 @@ __global__ void splitKVCacheKernel(T const** __restrict__ inputBlocks, T** __res
T const* vInputPtr = inputBlockPtr + (layerId * 2 + 1) * headNum * tokensPerBlock * dimsPerHead
+ headId * tokensPerBlock * dimsPerHead;
int outputCacheIdx = headId / headNumDomainTP * DomainPPSize + layerId / layerNumDomainPP;
int outputCacheIdx = headId / headNumDomainTP * DomainPPSize + rankInDomainPP;
T* outputCachePtr = outputCaches[outputCacheIdx];
int layerIdInDomainPP = layerId % layerNumDomainPP;
int headIdInDomainTP = headId % headNumDomainTP;
T* kOutputPtr = outputCachePtr
+ blockId * (layerNumDomainPP * 2 * headNumDomainTP * tokensPerBlock * dimsPerHead)
+ blockId * (layerNumInSpecPP * 2 * headNumDomainTP * tokensPerBlock * dimsPerHead)
+ layerIdInDomainPP * 2 * headNumDomainTP * tokensPerBlock * dimsPerHead
+ headIdInDomainTP * tokensPerBlock * dimsPerHead;
@ -746,7 +830,7 @@ __global__ void splitKVCacheForWindowKernel(T const** __restrict__ inputBlocks,
template <typename T, int subWarpSize, int vecSizeByte>
__global__ void concatKVCacheForMLAKernel(T const** __restrict__ inputCaches, T** __restrict__ outputBlocks,
int tokensPerBlock, int numLayers, int headNum, int dimsPerHead, int outputBlockNum, int DomainPPSize,
int DomainTPSize, int layerNumDomainPP, int kvFactor)
int DomainTPSize, int kvFactor, uint64_t* prefixLayerNumDevPtr)
{
int const subWarpId = threadIdx.x / subWarpSize;
@ -761,7 +845,11 @@ __global__ void concatKVCacheForMLAKernel(T const** __restrict__ inputCaches, T*
#pragma unroll 1
for (int layerId = blockIdx.x; layerId < numLayers; layerId += gridDim.x)
{
int layerIdInDomainPP{};
int rankInDomainPP{};
int layerNumInSpecPP{};
getLayerIdInDomainPPandRankInDomainPP(
layerId, DomainPPSize, prefixLayerNumDevPtr, layerIdInDomainPP, rankInDomainPP, layerNumInSpecPP);
#pragma unroll 1
for (int headId = 0; headId < headNum; headId++)
@ -769,13 +857,12 @@ __global__ void concatKVCacheForMLAKernel(T const** __restrict__ inputCaches, T*
T* outputBlockPtr = outputBlocks[blockId];
T* kOutputPtr = outputBlockPtr + layerId * kvFactor * headNum * tokensPerBlock * dimsPerHead
+ headId * tokensPerBlock * dimsPerHead;
int inputCacheIdx = layerId / layerNumDomainPP;
int inputCacheIdx = rankInDomainPP;
T const* inputCachePtr = inputCaches[inputCacheIdx];
int layerIdInDomainPP = layerId % layerNumDomainPP;
int headIdInDomainTP = headId;
T const* kInputPtr = inputCachePtr
+ blockId * (layerNumDomainPP * kvFactor * headNum * tokensPerBlock * dimsPerHead)
+ blockId * (layerNumInSpecPP * kvFactor * headNum * tokensPerBlock * dimsPerHead)
+ layerIdInDomainPP * kvFactor * headNum * tokensPerBlock * dimsPerHead
+ headIdInDomainTP * tokensPerBlock * dimsPerHead;
int const kvOffset = headNum * tokensPerBlock * dimsPerHead;
@ -804,7 +891,7 @@ __global__ void concatKVCacheForMLAKernel(T const** __restrict__ inputCaches, T*
template <typename T, int subWarpSize, int subWarpNumInGroup, int vecSizeByte>
__global__ void concatKVCacheKernel(T const** __restrict__ inputCaches, T** __restrict__ outputBlocks,
int tokensPerBlock, int numLayers, int headNum, int dimsPerHead, int outputBlockNum, int DomainPPSize,
int DomainTPSize, int layerNumDomainPP, int headNumDomainTP)
int DomainTPSize, int headNumDomainTP, uint64_t* prefixLayerNumDevPtr)
{
int const subWarpId = threadIdx.x / subWarpSize;
int const laneId = threadIdx.x % subWarpSize;
@ -821,6 +908,11 @@ __global__ void concatKVCacheKernel(T const** __restrict__ inputCaches, T** __re
#pragma unroll 1
for (int layerId = blockIdx.x; layerId < numLayers; layerId += gridDim.x)
{
int layerIdInDomainPP{};
int rankInDomainPP{};
int layerNumInSpecPP{};
getLayerIdInDomainPPandRankInDomainPP(
layerId, DomainPPSize, prefixLayerNumDevPtr, layerIdInDomainPP, rankInDomainPP, layerNumInSpecPP);
#pragma unroll 1
for (int headId = subWarpGroupId; headId < headNum; headId += subWarpGroupNum)
@ -832,13 +924,12 @@ __global__ void concatKVCacheKernel(T const** __restrict__ inputCaches, T** __re
T* vOutputPtr = outputBlockPtr + (layerId * 2 + 1) * headNum * tokensPerBlock * dimsPerHead
+ headId * tokensPerBlock * dimsPerHead;
int inputCacheIdx = headId / headNumDomainTP * DomainPPSize + layerId / layerNumDomainPP;
int inputCacheIdx = headId / headNumDomainTP * DomainPPSize + rankInDomainPP;
T const* inputCachePtr = inputCaches[inputCacheIdx];
int layerIdInDomainPP = layerId % layerNumDomainPP;
int headIdInDomainTP = headId % headNumDomainTP;
T const* kInputPtr = inputCachePtr
+ blockId * (layerNumDomainPP * 2 * headNumDomainTP * tokensPerBlock * dimsPerHead)
+ blockId * (layerNumInSpecPP * 2 * headNumDomainTP * tokensPerBlock * dimsPerHead)
+ layerIdInDomainPP * 2 * headNumDomainTP * tokensPerBlock * dimsPerHead
+ headIdInDomainTP * tokensPerBlock * dimsPerHead;
@ -942,7 +1033,9 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
}
TLLM_CHECK(outputCacheNum == outputSplitBlocks.size());
TLLM_CHECK(inputBlockNumSum > 0);
std::vector<T*> cachePtrs;
// To reduce the number of cudaMemcpyAsync H2D calls, cachePtrs packs both the cache block pointers and the prefix
// layer counts into a single buffer that is copied to the device once.
std::vector<uint64_t> cachePtrs;
std::vector<SizeType32> windowSizes;
std::vector<SizeType32> blockNumInwindow;
std::vector<SizeType32> layersInWindow;
@ -965,7 +1058,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
TLLM_CHECK(kvCacheBlock->getDataType() == cacheDataType);
TLLM_CHECK(kvCacheBlock->getSize() == cacheBlockSize);
cacheBlockSizeSum += kvCacheBlock->getSize();
cachePtrs.push_back(static_cast<T*>(kvCacheBlock->data()));
cachePtrs.push_back(reinterpret_cast<uint64_t>((kvCacheBlock->data())));
inputBlockLayerNumSum += layersNum;
}
}
@ -973,10 +1066,15 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
for (auto&& outputSplitBlock : outputSplitBlocks)
{
TLLM_CHECK(outputSplitBlock->getDataType() == cacheDataType);
TLLM_CHECK(outputSplitBlock->getSize() == cacheBlockSizeSum / outputCacheNum);
cachePtrs.push_back(static_cast<T*>(outputSplitBlock->data()));
cachePtrs.push_back(reinterpret_cast<uint64_t>(outputSplitBlock->data()));
}
std::vector<uint64_t> prefixLayerNum(targetRankInfo.mDomainPPSize + 1, 0);
prefixLayerNum[0] = 0;
for (int i = 0; i < targetRankInfo.mDomainPPSize; i++)
{
prefixLayerNum[i + 1] = prefixLayerNum[i] + targetRankInfo.mPeerAttentionLayerNumInDomainPP[i];
}
cachePtrs.insert(cachePtrs.end(), prefixLayerNum.begin(), prefixLayerNum.end());
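// Resulting single H2D payload: [inputBlockPtrs... | outputCachePtrs... | prefixLayerNum[0..DomainPPSize]],
// unpacked below via inputBlockPtrsDev, outputCachePtrsDev and prefixLayerNumDevPtr.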
bool const isWindow = windowSizes.size() > 1;
runtime::BufferManager::IBufferPtr PtrsDeviceBuffer
@ -1037,23 +1135,25 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
int const sizePerHead = selfModelConfig.mSizePerHead;
T const** inputBlockPtrsDev = static_cast<T const**>(PtrsDeviceBuffer->data());
T** outputCachePtrsDev = static_cast<T**>(PtrsDeviceBuffer->data()) + inputBlockNumSum;
uint64_t* prefixLayerNumDevPtr
= static_cast<uint64_t*>(PtrsDeviceBuffer->data()) + inputBlockNumSum + outputSplitBlocks.size();
int const tokensPerBlock = selfModelConfig.mTokensPerBlock;
int const numLayers = selfModelConfig.mNbKvHeadsPerLayer.size() / oPPNum;
int selfPPRank = selfIdx / (selfParallelConfig.mTensorParallelism * selfParallelConfig.mContextParallelism);
int const numLayers = selfParallelConfig.mAttentionLayerNumPerPP.at(selfPPRank);
int const headNum = selfModelConfig.mNbKvHeadsPerLayer[0];
int const dimsPerHead = selfModelConfig.mSizePerHead;
int const DomainPPSize = targetRankInfo.mDomainPPSize;
int const DomainTPSize = targetRankInfo.mDomainTPSize;
int const layerNumDomainPP = numLayers / DomainPPSize;
int const headNumDomainTP
= headNum / (DomainTPSize / targetRankInfo.mPeerDupHeadFactor); // TODO: duplicate head factor
int const headNumDomainTP = headNum / (DomainTPSize / targetRankInfo.mPeerDupHeadFactor);
int const kvFactor = selfAttentionConfig.mKvFactor;
bool const isMLA = selfAttentionConfig.mAttentionType == CacheState::AttentionType::kMLA;
constexpr int mlaSubWarpSize = 16;
TLLM_LOG_DEBUG(
"splitKVCache - numLayers: %d, headNum: %d, domainPPSize: %d, domainTPSize: %d, "
"layersPerDomainPP: %d, headsPerDomainTP: %d",
numLayers, headNum, DomainPPSize, DomainTPSize, layerNumDomainPP, headNumDomainTP);
"headsPerDomainTP: %d",
numLayers, headNum, DomainPPSize, DomainTPSize, headNumDomainTP);
int const remainder = sizePerHead * sizeof(T) % 16;
switch (remainder)
@ -1064,7 +1164,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
{
splitKVCacheForMLAKernel<T, mlaSubWarpSize, 16><<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(
inputBlockPtrsDev, outputCachePtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead,
inputBlockNumSum, DomainPPSize, DomainTPSize, layerNumDomainPP, kvFactor);
inputBlockNumSum, DomainPPSize, DomainTPSize, kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1078,7 +1178,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 16>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1088,7 +1188,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
{
splitKVCacheForMLAKernel<T, mlaSubWarpSize, 8><<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(
inputBlockPtrsDev, outputCachePtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead,
inputBlockNumSum, DomainPPSize, DomainTPSize, layerNumDomainPP, kvFactor);
inputBlockNumSum, DomainPPSize, DomainTPSize, kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1102,7 +1202,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 8>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1116,7 +1216,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheForMLAKernel<T, mlaSubWarpSize, 4>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, kvFactor);
kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1131,7 +1231,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 4>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1149,7 +1249,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheForMLAKernel<T, mlaSubWarpSize, 2>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, kvFactor);
kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1164,7 +1264,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 2>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1178,7 +1278,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheForMLAKernel<T, mlaSubWarpSize, 1>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, kvFactor);
kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1193,7 +1293,7 @@ void splitKVCache(std::map<SizeType32, std::vector<runtime::ITensor::SharedPtr>>
splitKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 1>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputBlockPtrsDev, outputCachePtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, inputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1274,7 +1374,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
TLLM_CHECK(inputCacheNum == inputSplitBlocks.size());
TLLM_CHECK(outputBlockNumSum > 0);
std::vector<T*> cachePtrs;
std::vector<uint64_t> cachePtrs;
std::vector<SizeType32> windowSizes;
std::vector<SizeType32> blockNumInwindow;
std::vector<SizeType32> layersInWindow;
@ -1294,7 +1394,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
{
TLLM_CHECK(kvCacheBlock->getDataType() == cacheDataType);
TLLM_CHECK(kvCacheBlock->getSize() == cacheBlockSize);
cachePtrs.push_back(static_cast<T*>(kvCacheBlock->data()));
cachePtrs.push_back(reinterpret_cast<uint64_t>(kvCacheBlock->data()));
cacheBlockSizeSum += kvCacheBlock->getSize();
}
outputBlockLayerNumSum += layersNum * blocks.size();
@ -1302,12 +1402,23 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
for (auto&& inputSplitBlock : inputSplitBlocks)
{
TLLM_CHECK(inputSplitBlock->getDataType() == cacheDataType);
TLLM_CHECK(inputSplitBlock->getSize() == cacheBlockSizeSum / inputCacheNum);
cachePtrs.push_back(static_cast<T*>(inputSplitBlock->data()));
cachePtrs.push_back(reinterpret_cast<uint64_t>(inputSplitBlock->data()));
}
// prefixLayerNum stores the cumulative attention layer counts of the preceding PP ranks, which lets the kernel
// recover per-rank layer information; see getLayerIdInDomainPPandRankInDomainPP.
std::vector<uint64_t> prefixLayerNum(targetRankInfo.mDomainPPSize + 1, 0);
prefixLayerNum[0] = 0;
for (int i = 0; i < targetRankInfo.mDomainPPSize; i++)
{
prefixLayerNum[i + 1] = prefixLayerNum[i] + targetRankInfo.mPeerAttentionLayerNumInDomainPP[i];
}
cachePtrs.insert(cachePtrs.end(), prefixLayerNum.begin(), prefixLayerNum.end());
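// Same packed single-copy layout as in splitKVCache, with outputs first here:
// [outputBlockPtrs... | inputSplitBlockPtrs... | prefixLayerNum[0..DomainPPSize]].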
runtime::BufferManager::IBufferPtr PtrsDeviceBuffer
= bufferManager.gpu(cachePtrs.size(), nvinfer1::DataType::kINT64);
TLLM_CHECK(PtrsDeviceBuffer->getSizeInBytes() == cachePtrs.size() * sizeof(T*));
TLLM_CHECK(PtrsDeviceBuffer->getSizeInBytes() == cachePtrs.size() * sizeof(uint64_t));
bufferManager.copy(cachePtrs.data(), *PtrsDeviceBuffer, runtime::MemoryType::kCPU);
bool const isWindow = windowSizes.size() > 1;
runtime::BufferManager::IBufferPtr windowInfoDeviceBuffer;
@ -1350,14 +1461,17 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
int const endLayerId = selfModelConfig.mNbKvHeadsPerLayer.size() / oPPNum;
T** ouptutBlockPtrsDev = static_cast<T**>(PtrsDeviceBuffer->data());
T const** inputSplitBlockPtrsDev = static_cast<T const**>(PtrsDeviceBuffer->data()) + outputBlockNumSum;
uint64_t* prefixLayerNumDevPtr
= static_cast<uint64_t*>(PtrsDeviceBuffer->data()) + outputBlockNumSum + inputSplitBlocks.size();
int const tokensPerBlock = selfModelConfig.mTokensPerBlock;
int const numLayers = selfModelConfig.mNbKvHeadsPerLayer.size() / oPPNum;
int selfPPRank = selfIdx / (selfParallelConfig.mTensorParallelism * selfParallelConfig.mContextParallelism);
int const numLayers = selfParallelConfig.mAttentionLayerNumPerPP.at(selfPPRank);
TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), "concatKVCache numLayers:%d", numLayers);
int const headNum = selfModelConfig.mNbKvHeadsPerLayer[0];
int const dimsPerHead = selfModelConfig.mSizePerHead;
int const DomainPPSize = targetRankInfo.mDomainPPSize;
int const DomainTPSize = targetRankInfo.mDomainTPSize;
int const layerNumDomainPP = numLayers / DomainPPSize;
int const headNumDomainTP
= headNum / (DomainTPSize / targetRankInfo.mPeerDupHeadFactor); // TODO: duplicate head factor
int const kvFactor = selfAttentionConfig.mKvFactor;
@ -1365,8 +1479,8 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
bool isMLA = selfAttentionConfig.mAttentionType == CacheState::AttentionType::kMLA;
TLLM_LOG_DEBUG(
"concatKVCache - numLayers: %d, headNum: %d, domainPPSize: %d, domainTPSize: %d, "
"layersPerDomainPP: %d, headsPerDomainTP: %d",
numLayers, headNum, DomainPPSize, DomainTPSize, layerNumDomainPP, headNumDomainTP);
"headsPerDomainTP: %d",
numLayers, headNum, DomainPPSize, DomainTPSize, headNumDomainTP);
int const remainder = sizePerHead * sizeof(T) % 16;
@ -1380,7 +1494,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheForMLAKernel<T, mlaSubWarpSize, 16>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev, ouptutBlockPtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, kvFactor);
kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1394,7 +1508,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 16>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev, ouptutBlockPtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1404,7 +1518,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
{
concatKVCacheForMLAKernel<T, mlaSubWarpSize, 8><<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(
inputSplitBlockPtrsDev, ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead,
outputBlockNumSum, DomainPPSize, DomainTPSize, layerNumDomainPP, kvFactor);
outputBlockNumSum, DomainPPSize, DomainTPSize, kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1418,7 +1532,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 8>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev, ouptutBlockPtrsDev,
tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum, DomainPPSize, DomainTPSize,
layerNumDomainPP, headNumDomainTP);
headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1432,7 +1546,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheForMLAKernel<T, mlaSubWarpSize, 4>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev,
ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum,
DomainPPSize, DomainTPSize, layerNumDomainPP, kvFactor);
DomainPPSize, DomainTPSize, kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1447,7 +1561,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 4>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev,
ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum,
DomainPPSize, DomainTPSize, layerNumDomainPP, headNumDomainTP);
DomainPPSize, DomainTPSize, headNumDomainTP, prefixLayerNumDevPtr);
}
break;
@ -1465,7 +1579,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheForMLAKernel<T, mlaSubWarpSize, 2>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev,
ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum,
DomainPPSize, DomainTPSize, layerNumDomainPP, kvFactor);
DomainPPSize, DomainTPSize, kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1480,7 +1594,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 2>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev,
ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum,
DomainPPSize, DomainTPSize, layerNumDomainPP, headNumDomainTP);
DomainPPSize, DomainTPSize, headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}
@ -1494,7 +1608,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheForMLAKernel<T, mlaSubWarpSize, 1>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev,
ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum,
DomainPPSize, DomainTPSize, layerNumDomainPP, kvFactor);
DomainPPSize, DomainTPSize, kvFactor, prefixLayerNumDevPtr);
}
else if (isWindow)
{
@ -1509,7 +1623,7 @@ void concatKVCache(std::vector<runtime::ITensor::SharedPtr> const& inputSplitBlo
concatKVCacheKernel<T, subWarpSize, subWarpNumInGroup, 1>
<<<gridDim, blockDimx, 0, bufferManager.getStream().get()>>>(inputSplitBlockPtrsDev,
ouptutBlockPtrsDev, tokensPerBlock, numLayers, headNum, dimsPerHead, outputBlockNumSum,
DomainPPSize, DomainTPSize, layerNumDomainPP, headNumDomainTP);
DomainPPSize, DomainTPSize, headNumDomainTP, prefixLayerNumDevPtr);
}
break;
}

View File

@ -40,6 +40,18 @@ struct TargetRanksInfo
std::vector<int> mIRanks;
int mDupHeadFactor;
int mPeerDupHeadFactor;
// The size of this vector equals mDomainPPSize; each entry is the number of attention layers that should be
// fetched from the corresponding target PP rank in the PP domain.
std::vector<int> mPeerAttentionLayerNumInDomainPP;
int getPeerPPDomainLayerNum(int targetRankIdx)
{
// Target ranks are laid out as [TP, PP]; the PP-domain index varies fastest.
int ppDomainRankIdx = targetRankIdx % mDomainPPSize;
return mPeerAttentionLayerNumInDomainPP[ppDomainRankIdx];
}
};
TargetRanksInfo targetIRanks(
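As a rough illustration of how this vector is meant to be filled (matching the uneven split exercised by the tests further down), the per-rank layer counts can be derived from the total attention layer count; the helper name below is hypothetical and Python is used only as a sketch:

```python
def attention_layer_num_per_pp(num_layers: int, pp_size: int) -> list[int]:
    # Even share per PP rank; the first (num_layers % pp_size) ranks take one extra layer.
    base, extra = divmod(num_layers, pp_size)
    return [base + (1 if rank < extra else 0) for rank in range(pp_size)]

# 10 attention layers over a PP domain of size 4 -> [3, 3, 2, 2];
# getPeerPPDomainLayerNum(targetRankIdx) then reads this list at (targetRankIdx % domain_pp_size).
assert attention_layer_num_per_pp(10, 4) == [3, 3, 2, 2]
```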

View File

@ -37,4 +37,7 @@ if(NIXL_ROOT)
# Link against all NIXL libraries
target_link_libraries(${NIXL_WRAPPER_TARGET} PRIVATE NIXL::nixl)
# Link against CUDA
target_link_libraries(${NIXL_WRAPPER_TARGET} PRIVATE CUDA::cudart)
endif()

View File

@ -223,6 +223,16 @@ uint16_t getIncrmentPort(uint16_t basePort)
return list;
}
[[nodiscard]] nixl_reg_dlist_t NixlHelper::convertRegDlist(FileDescs const& descs)
{
nixl_reg_dlist_t list(FILE_SEG);
for (auto const& desc : descs.getDescs())
{
list.addDesc(nixlBlobDesc{0, desc.getLen(), desc.getFd()});
}
return list;
}
[[nodiscard]] nixl_xfer_op_t NixlHelper::convert(TransferOp const& op)
{
switch (op)
@ -243,6 +253,62 @@ uint16_t getIncrmentPort(uint16_t basePort)
return list;
}
[[nodiscard]] nixl_xfer_dlist_t NixlHelper::convertXferDist(FileDescs const& descs)
{
nixl_xfer_dlist_t list{FILE_SEG};
for (auto const& desc : descs.getDescs())
{
list.addDesc(nixlBasicDesc{0, desc.getLen(), desc.getFd()});
}
return list;
}
void NixlHelper::posixGpuToFileFallback(MemoryDescs const& memoryDescs, FileDescs const& fileDescs)
{
auto const& memVec = memoryDescs.getDescs();
auto const& fileVec = fileDescs.getDescs();
for (std::size_t i = 0; i < std::min(memVec.size(), fileVec.size()); i++)
{
auto& memDesc = memVec[i];
auto& fileDesc = fileVec[i];
ssize_t numBytes = static_cast<ssize_t>(memDesc.getLen());
std::vector<uint8_t> hostBuffer(numBytes);
cudaError_t cpyErr = cudaMemcpy(
hostBuffer.data(), reinterpret_cast<void*>(memDesc.getAddr()), numBytes, cudaMemcpyDeviceToHost);
TLLM_CHECK_WITH_INFO(cpyErr == cudaSuccess, "cudaMemcpy to host failed, error=%d", cpyErr);
ssize_t written = ::write(fileDesc.getFd(), hostBuffer.data(), numBytes);
TLLM_CHECK_WITH_INFO(written == numBytes, "POSIX write error=%zd", written);
}
}
void NixlHelper::posixFileToGpuFallback(MemoryDescs const& memoryDescs, FileDescs const& fileDescs)
{
auto const& memVec = memoryDescs.getDescs();
auto const& fileVec = fileDescs.getDescs();
for (std::size_t i = 0; i < std::min(memVec.size(), fileVec.size()); i++)
{
auto& memDesc = memVec[i];
auto& fileDesc = fileVec[i];
ssize_t numBytes = static_cast<ssize_t>(memDesc.getLen());
std::vector<uint8_t> hostBuffer(numBytes);
ssize_t bytesRead = ::read(fileDesc.getFd(), hostBuffer.data(), numBytes);
TLLM_CHECK_WITH_INFO(bytesRead == numBytes, "POSIX read error=%zd", bytesRead);
cudaError_t cpyErr = cudaMemcpy(
reinterpret_cast<void*>(memDesc.getAddr()), hostBuffer.data(), numBytes, cudaMemcpyHostToDevice);
TLLM_CHECK_WITH_INFO(cpyErr == cudaSuccess, "cudaMemcpy to device failed, error=%d", cpyErr);
}
}
NixlTransferStatus::NixlTransferStatus(nixlAgent* agent, nixlXferReqH* handle)
: mRawAgent{agent}
, mHandle{handle}
@ -457,6 +523,132 @@ NixlTransferAgent::~NixlTransferAgent()
TLLM_LOG_DEBUG("NixlTransferAgent::~NixlTransferAgent");
}
NixlLoopbackAgent::NixlLoopbackAgent(BaseAgentConfig const& config)
: mName{config.mName}
{
nixlAgentConfig nixlConfig{config.useProgThread};
nixlBackendH* backend;
nixl_status_t status;
nixl_b_params_t init;
mRawAgent = std::make_unique<nixlAgent>(config.mName, std::move(nixlConfig));
init["batch_pool_size"] = std::to_string(8);
init["batch_limit"] = std::to_string(128);
init["max_request_size"] = std::to_string(16 * 1024 * 1024);
if (config.multiThread)
{
status = mRawAgent->createBackend("GDS_MT", init, backend);
if (status != NIXL_SUCCESS || !backend)
TLLM_THROW("Failed to create NIXL GDS_MT backend, status = %d", status);
}
else
{
status = mRawAgent->createBackend("GDS", init, backend);
if (status != NIXL_SUCCESS || !backend)
TLLM_THROW("Failed to create NIXL GDS backend, status = %d", status);
}
}
int NixlLoopbackAgent::registerMemory(MemoryDescs const& descs)
{
nixl_status_t status = mRawAgent->registerMem(NixlHelper::convertRegDlist(descs));
if (status != NIXL_SUCCESS)
return -1;
return 0;
}
int NixlLoopbackAgent::deregisterMemory(MemoryDescs const& descs)
{
nixl_status_t status = mRawAgent->deregisterMem(NixlHelper::convertRegDlist(descs));
if (status != NIXL_SUCCESS)
return -1;
return 0;
}
int NixlLoopbackAgent::registerFiles(FileDescs const& descs)
{
nixl_status_t status = mRawAgent->registerMem(NixlHelper::convertRegDlist(descs));
if (status != NIXL_SUCCESS)
return -1;
return 0;
}
int NixlLoopbackAgent::deregisterFiles(FileDescs const& descs)
{
nixl_status_t status = mRawAgent->deregisterMem(NixlHelper::convertRegDlist(descs));
if (status != NIXL_SUCCESS)
return -1;
return 0;
}
std::unique_ptr<TransferStatus> NixlLoopbackAgent::submitLoopbackRequests(
MemoryDescs const& memoryDescs, FileDescs const& fileDescs, bool isOffload)
{
nixl_xfer_dlist_t vram_seg = NixlHelper::convertXferDist(memoryDescs);
nixl_xfer_dlist_t file_seg = NixlHelper::convertXferDist(fileDescs);
nixl_xfer_dlist_t& src = isOffload ? vram_seg : file_seg;
nixl_xfer_dlist_t& dst = isOffload ? file_seg : vram_seg;
nixl_xfer_op_t op = isOffload ? NIXL_WRITE : NIXL_READ;
nixlXferReqH* handle = nullptr;
nixl_status_t status = mRawAgent->createXferReq(op, src, dst, mName, handle);
TLLM_CHECK(status == NIXL_SUCCESS && handle);
status = mRawAgent->postXferReq(handle);
TLLM_CHECK(status == NIXL_IN_PROG);
return std::make_unique<NixlTransferStatus>(mRawAgent.get(), handle);
}
void NixlLoopbackAgent::executeLoopbackRequest(
MemoryDescs const& memoryDescs, FileDescs const& fileDescs, bool isOffload)
{
bool fallback = false;
int ret = this->registerFiles(fileDescs);
if (ret < 0)
{ // register can fail if no GDS support
TLLM_LOG_DEBUG("NIXL GDS register files failed, using POSIX fallback");
fallback = true;
}
else
{
ret = this->registerMemory(memoryDescs);
if (ret < 0)
{ // register can fail if no GDS support
TLLM_LOG_DEBUG("NIXL GDS register memory failed, using POSIX fallback");
this->deregisterFiles(fileDescs);
fallback = true;
}
}
if (fallback)
{
if (isOffload)
{
NixlHelper::posixGpuToFileFallback(memoryDescs, fileDescs);
}
else
{
NixlHelper::posixFileToGpuFallback(memoryDescs, fileDescs);
}
return;
}
std::unique_ptr<TransferStatus> status = this->submitLoopbackRequests(memoryDescs, fileDescs, isOffload);
TLLM_CHECK_WITH_INFO(status != nullptr, "submitLoopbackRequests failed");
status->wait();
this->deregisterMemory(memoryDescs);
this->deregisterFiles(fileDescs);
}
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
@ -471,6 +663,15 @@ extern "C"
}
}
extern "C"
{
std::shared_ptr<BaseLoopbackAgent> createNixlLoopbackAgent(BaseAgentConfig const* config)
{
TLLM_CHECK(config);
return std::make_shared<NixlLoopbackAgent>(*config);
}
}
#if defined(__clang__)
#pragma clang diagnostic pop
#endif

View File

@ -30,8 +30,12 @@ struct NixlHelper
[[nodiscard]] static nixl_mem_t convert(MemoryType type);
[[nodiscard]] static nixlBasicDesc convert(MemoryDesc const& desc);
[[nodiscard]] static nixl_reg_dlist_t convertRegDlist(RegisterDescs const& descs);
[[nodiscard]] static nixl_reg_dlist_t convertRegDlist(FileDescs const& descs);
[[nodiscard]] static nixl_xfer_op_t convert(TransferOp const& op);
[[nodiscard]] static nixl_xfer_dlist_t convertXferDist(TransferDescs const& descs);
[[nodiscard]] static nixl_xfer_dlist_t convertXferDist(FileDescs const& descs);
static void posixGpuToFileFallback(MemoryDescs const& memoryDesc, FileDescs const& fileDescs);
static void posixFileToGpuFallback(MemoryDescs const& memoryDesc, FileDescs const& fileDescs);
};
class NixlTransferStatus final : public TransferStatus
@ -97,6 +101,28 @@ private:
std::vector<char> mDRamDstBuffer;
};
class NixlLoopbackAgent final : public BaseLoopbackAgent
{
public:
NixlLoopbackAgent(BaseAgentConfig const& config);
virtual ~NixlLoopbackAgent() = default;
virtual void executeLoopbackRequest(
MemoryDescs const& memoryDescs, FileDescs const& fileDescs, bool isOffload) override;
private:
int registerMemory(MemoryDescs const& descs);
int deregisterMemory(MemoryDescs const& descs);
int registerFiles(FileDescs const& descs);
int deregisterFiles(FileDescs const& descs);
[[nodiscard]] std::unique_ptr<TransferStatus> submitLoopbackRequests(
MemoryDescs const& memoryDescs, FileDescs const& filedescs, bool isOffload);
std::unique_ptr<nixlAgent> mRawAgent;
std::string mName;
};
#if defined(__clang__)
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wreturn-type-c-linkage"
@ -107,6 +133,11 @@ extern "C"
[[nodiscard]] std::unique_ptr<BaseTransferAgent> createNixlTransferAgent(BaseAgentConfig const* config);
}
extern "C"
{
[[nodiscard]] std::shared_ptr<BaseLoopbackAgent> createNixlLoopbackAgent(BaseAgentConfig const* config);
}
#if defined(__clang__)
#pragma clang diagnostic pop
#endif

View File

@ -45,7 +45,7 @@ bool KvCacheRetentionConfig::TokenRangeRetentionConfig::operator==(TokenRangeRet
KvCacheRetentionConfig::KvCacheRetentionConfig(
std::vector<KvCacheRetentionConfig::TokenRangeRetentionConfig> const& tokenRangeRetentionPriorities,
RetentionPriority decodeRetentionPriority, std::optional<std::chrono::milliseconds> decodeDurationMs,
KvCacheTransferMode transferMode, std::optional<std::string> directory)
KvCacheTransferMode transferMode, std::string const& directory)
: mTokenRangeRetentionConfigs(std::vector<TokenRangeRetentionConfig>(tokenRangeRetentionPriorities))
, mDecodeRetentionPriority{decodeRetentionPriority}
, mDecodeDurationMs{decodeDurationMs}
@ -117,7 +117,7 @@ KvCacheTransferMode KvCacheRetentionConfig::getTransferMode() const
return mTransferMode;
}
std::optional<std::string> KvCacheRetentionConfig::getDirectory() const
std::string const& KvCacheRetentionConfig::getDirectory() const
{
return mDirectory;
}

View File

@ -535,11 +535,13 @@ kv_cache::CacheState Serialization::deserializeCacheState(std::istream& is)
auto enableAttentionDP = su::deserialize<decltype(CacheState::ParallelConfig::mEnableAttentionDP)>(is);
auto DPrank = su::deserialize<decltype(CacheState::ParallelConfig::mDPrank)>(is);
auto DPsize = su::deserialize<decltype(CacheState::ParallelConfig::mDPsize)>(is);
auto attentionLayerNumPerPP = su::deserialize<decltype(CacheState::ParallelConfig::mAttentionLayerNumPerPP)>(is);
auto dataType = su::deserialize<decltype(CacheState::mDataType)>(is);
auto attentionType = su::deserialize<decltype(CacheState::AttentionConfig::mAttentionType)>(is);
auto kvFactor = su::deserialize<decltype(CacheState::AttentionConfig::mKvFactor)>(is);
return CacheState{nbKvHeadsPerLayer, sizePerHead, tokensPerBlock, tensorParallelism, pipelineParallelism,
contextParallelism, dataType, attentionType, kvFactor, enableAttentionDP, DPrank, DPsize};
contextParallelism, attentionLayerNumPerPP, dataType, attentionType, kvFactor, enableAttentionDP, DPrank,
DPsize};
}
void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& os)
@ -553,6 +555,7 @@ void Serialization::serialize(kv_cache::CacheState const& state, std::ostream& o
su::serialize(state.mParallelConfig.mEnableAttentionDP, os);
su::serialize(state.mParallelConfig.mDPrank, os);
su::serialize(state.mParallelConfig.mDPsize, os);
su::serialize(state.mParallelConfig.mAttentionLayerNumPerPP, os);
su::serialize(state.mDataType, os);
su::serialize(state.mAttentionConfig.mAttentionType, os);
su::serialize(state.mAttentionConfig.mKvFactor, os);
@ -570,6 +573,7 @@ size_t Serialization::serializedSize(kv_cache::CacheState const& state)
totalSize += su::serializedSize(state.mParallelConfig.mEnableAttentionDP);
totalSize += su::serializedSize(state.mParallelConfig.mDPrank);
totalSize += su::serializedSize(state.mParallelConfig.mDPsize);
totalSize += su::serializedSize(state.mParallelConfig.mAttentionLayerNumPerPP);
totalSize += su::serializedSize(state.mDataType);
totalSize += su::serializedSize(state.mAttentionConfig.mAttentionType);
totalSize += su::serializedSize(state.mAttentionConfig.mKvFactor);
@ -1594,7 +1598,7 @@ KvCacheRetentionConfig Serialization::deserializeKvCacheRetentionConfig(std::ist
auto decodePriority = su::deserialize<executor::RetentionPriority>(is);
auto decodeDurationMs = intToDuration(su::deserialize<std::optional<size_t>>(is));
auto transferMode = su::deserialize<executor::KvCacheTransferMode>(is);
auto directory = su::deserialize<std::optional<std::string>>(is);
auto directory = su::deserialize<std::string>(is);
return KvCacheRetentionConfig{
tokenRangeRetentionPriorities, decodePriority, decodeDurationMs, transferMode, directory};

View File

@ -25,6 +25,7 @@
#include <nanobind/stl/optional.h>
#include <nanobind/stl/shared_ptr.h>
#include <nanobind/stl/unique_ptr.h>
#include <nanobind/stl/vector.h>
#include <nanobind/trampoline.h>
#include <torch/extension.h>
@ -90,11 +91,11 @@ void tb::CacheTransceiverBindings::initBindings(nb::module_& m)
nb::class_<tb::CacheTransceiver, tb::BaseCacheTransceiver>(m, "CacheTransceiver")
.def(nb::init<tb::kv_cache_manager::BaseKVCacheManager*, std::vector<SizeType32>, SizeType32, SizeType32,
runtime::WorldConfig, nvinfer1::DataType, executor::kv_cache::CacheState::AttentionType,
std::optional<executor::CacheTransceiverConfig>>(),
runtime::WorldConfig, std::vector<SizeType32>, nvinfer1::DataType,
executor::kv_cache::CacheState::AttentionType, std::optional<executor::CacheTransceiverConfig>>(),
nb::arg("cache_manager"), nb::arg("num_kv_heads_per_layer"), nb::arg("size_per_head"),
nb::arg("tokens_per_block"), nb::arg("world_config"), nb::arg("dtype"), nb::arg("attention_type"),
nb::arg("cache_transceiver_config") = std::nullopt);
nb::arg("tokens_per_block"), nb::arg("world_config"), nb::arg("attention_layer_num_per_pp"),
nb::arg("dtype"), nb::arg("attention_type"), nb::arg("cache_transceiver_config") = std::nullopt);
nb::class_<tb::kv_cache_manager::CacheTransBufferManager>(m, "CacheTransBufferManager")
.def(nb::init<tb::kv_cache_manager::BaseKVCacheManager*, std::optional<size_t>>(), nb::arg("cache_manager"),

View File

@ -394,7 +394,7 @@ void initRequestBindings(nb::module_& m)
new (&kvCacheRetentionConfig) tle::KvCacheRetentionConfig(
nb::cast<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>>(state[0]),
nb::cast<tle::RetentionPriority>(state[1]), nb::cast<std::optional<std::chrono::milliseconds>>(state[2]),
nb::cast<tle::KvCacheTransferMode>(state[3]), nb::cast<std::optional<std::string>>(state[4]));
nb::cast<tle::KvCacheTransferMode>(state[3]), nb::cast<std::string>(state[4]));
};
auto kvCacheRetentionConfig = nb::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig");
@ -417,7 +417,7 @@ void initRequestBindings(nb::module_& m)
// TokenRangeRetentionPriority bindings have been defined.
kvCacheRetentionConfig
.def(nb::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>, tle::RetentionPriority,
std::optional<std::chrono::milliseconds>, tle::KvCacheTransferMode, std::optional<std::string>>(),
std::optional<std::chrono::milliseconds>, tle::KvCacheTransferMode, std::string>(),
nb::arg("token_range_retention_configs"),
nb::arg("decode_retention_priority") = tle::KvCacheRetentionConfig::kDefaultRetentionPriority,
nb::arg("decode_duration_ms") = nb::none(), nb::arg("transfer_mode") = tle::KvCacheTransferMode::DRAM,

View File

@ -87,11 +87,11 @@ void tb::CacheTransceiverBindings::initBindings(py::module_& m)
py::classh<tb::CacheTransceiver, tb::BaseCacheTransceiver>(m, "CacheTransceiver")
.def(py::init<tb::kv_cache_manager::BaseKVCacheManager*, std::vector<SizeType32>, SizeType32, SizeType32,
runtime::WorldConfig, nvinfer1::DataType, executor::kv_cache::CacheState::AttentionType,
std::optional<executor::CacheTransceiverConfig>>(),
runtime::WorldConfig, std::vector<SizeType32>, nvinfer1::DataType,
executor::kv_cache::CacheState::AttentionType, std::optional<executor::CacheTransceiverConfig>>(),
py::arg("cache_manager"), py::arg("num_kv_heads_per_layer"), py::arg("size_per_head"),
py::arg("tokens_per_block"), py::arg("world_config"), py::arg("dtype"), py::arg("attention_type"),
py::arg("cache_transceiver_config") = std::nullopt);
py::arg("tokens_per_block"), py::arg("world_config"), py::arg("attention_layer_num_per_pp"),
py::arg("dtype"), py::arg("attention_type"), py::arg("cache_transceiver_config") = std::nullopt);
py::class_<tb::kv_cache_manager::CacheTransBufferManager>(m, "CacheTransBufferManager")
.def(py::init<tb::kv_cache_manager::BaseKVCacheManager*, std::optional<size_t>>(), py::arg("cache_manager"),

View File

@ -364,7 +364,7 @@ void initRequestBindings(pybind11::module_& m)
return tle::KvCacheRetentionConfig(
state[0].cast<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>>(),
state[1].cast<tle::RetentionPriority>(), state[2].cast<std::optional<std::chrono::milliseconds>>(),
state[3].cast<tle::KvCacheTransferMode>(), state[4].cast<std::optional<std::string>>());
state[3].cast<tle::KvCacheTransferMode>(), state[4].cast<std::string>());
};
auto kvCacheRetentionConfig = py::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig");
@ -386,7 +386,7 @@ void initRequestBindings(pybind11::module_& m)
// TokenRangeRetentionPriority bindings have been defined.
kvCacheRetentionConfig
.def(py::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionConfig>, tle::RetentionPriority,
std::optional<std::chrono::milliseconds>, tle::KvCacheTransferMode, std::optional<std::string>>(),
std::optional<std::chrono::milliseconds>, tle::KvCacheTransferMode, std::string>(),
py::arg("token_range_retention_configs"),
py::arg("decode_retention_priority") = tle::KvCacheRetentionConfig::kDefaultRetentionPriority,
py::arg("decode_duration_ms") = py::none(),

View File

@ -346,8 +346,9 @@ TEST_F(CacheTransBufferTest, TestForNullOptAndNoneTransSize)
auto bufferManager = tensorrt_llm::runtime::BufferManager{std::make_shared<CudaStream>()};
auto targetNum = 2;
auto targetSize = 1024;
std::vector<size_t> targetSizeVec = std::vector<size_t>(targetNum, targetSize);
auto [sendBuffers, bufferCoverTargetNum, onlyUseDynamicBuffer]
= mTransBufferManager->getOrAllocateSendBuffers(bufferId3, targetNum, targetSize, bufferManager);
= mTransBufferManager->getOrAllocateSendBuffers(bufferId3, targetNum, targetSizeVec, bufferManager);
EXPECT_EQ(sendBuffers.size(), targetNum);
EXPECT_EQ(bufferCoverTargetNum, targetNum);
EXPECT_EQ(onlyUseDynamicBuffer, true);
@ -393,8 +394,9 @@ TEST_F(CacheTransBufferTest, TestForNullOptAndDefaultTransSize)
auto bufferManager = tensorrt_llm::runtime::BufferManager{std::make_shared<CudaStream>()};
auto targetNum = 2;
auto targetSize = 1024;
std::vector<size_t> targetSizeVec = std::vector<size_t>(targetNum, targetSize);
auto [sendBuffers, bufferCoverTargetNum, onlyUseDynamicBuffer]
= mTransBufferManager->getOrAllocateSendBuffers(bufferId3, targetNum, targetSize, bufferManager);
= mTransBufferManager->getOrAllocateSendBuffers(bufferId3, targetNum, targetSizeVec, bufferManager);
EXPECT_EQ(sendBuffers.size(), targetNum);
EXPECT_EQ(bufferCoverTargetNum, targetNum);
EXPECT_EQ(onlyUseDynamicBuffer, false);
@ -407,8 +409,9 @@ TEST_F(CacheTransBufferTest, TestForNullOptAndDefaultTransSize)
auto bufferId4 = mTransBufferManager->assignBufferIndexForSend();
EXPECT_TRUE(bufferId4.has_value());
EXPECT_EQ(bufferId4.value(), 0);
targetSizeVec = std::vector<size_t>(targetNum, targetSize);
auto [sendBuffers2, bufferCoverTargetNum2, onlyUseDynamicBuffer2]
= mTransBufferManager->getOrAllocateSendBuffers(bufferId4, targetNum, targetSize, bufferManager);
= mTransBufferManager->getOrAllocateSendBuffers(bufferId4, targetNum, targetSizeVec, bufferManager);
EXPECT_EQ(sendBuffers2.size(), targetNum);
EXPECT_EQ(bufferCoverTargetNum2, targetNum / 2);
EXPECT_EQ(onlyUseDynamicBuffer2, false);
@ -418,8 +421,9 @@ TEST_F(CacheTransBufferTest, TestForNullOptAndDefaultTransSize)
auto bufferId5 = mTransBufferManager->assignBufferIndexForSend();
EXPECT_TRUE(bufferId5.has_value());
EXPECT_EQ(bufferId5.value(), 0);
targetSizeVec = std::vector<size_t>(targetNum, targetSize);
auto [sendBuffers3, bufferCoverTargetNum3, onlyUseDynamicBuffer3]
= mTransBufferManager->getOrAllocateSendBuffers(bufferId5, targetNum, targetSize, bufferManager);
= mTransBufferManager->getOrAllocateSendBuffers(bufferId5, targetNum, targetSizeVec, bufferManager);
EXPECT_EQ(sendBuffers3.size(), targetNum);
EXPECT_EQ(bufferCoverTargetNum3, targetNum);
EXPECT_EQ(onlyUseDynamicBuffer3, false);

View File

@ -18,6 +18,7 @@
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/executor/transferAgent.h"
#include "tensorrt_llm/executor/types.h"
#include "tensorrt_llm/kernels/kvCacheIndex.h"
#include "tensorrt_llm/kernels/kvCacheUtils.h"
@ -32,6 +33,7 @@
#include <chrono>
#include <cmath>
#include <cstddef>
#include <filesystem>
#include <memory>
#include <set>
#include <thread>
@ -45,6 +47,7 @@ namespace tk = tensorrt_llm::kernels;
namespace tlk = tensorrt_llm::batch_manager::kv_cache_manager;
namespace tle = tensorrt_llm::executor;
namespace tr = tensorrt_llm::runtime;
namespace fs = std::filesystem;
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
@ -182,7 +185,39 @@ TEST_F(KVCacheManagerTest, BlockManagerTest)
std::runtime_error);
}
template <typename T, nvinfer1::DataType type, int mask>
template <typename T>
void writePatternToOffloadedBlocksDRAM(T* rawBlockPtr, int blockSize, int mask)
{
for (int i = 0; i < blockSize; ++i)
{
rawBlockPtr[i] = i & mask;
}
}
template <typename T>
void writePatternToOffloadedBlocksGDS(
std::string const& directory, int blockId, SizeType32 numPools, int blockSize, int mask)
{
for (size_t poolIdx = 0; poolIdx < numPools; ++poolIdx)
{
std::string filename
= directory + "/block_" + std::to_string(blockId) + "_pool_" + std::to_string(poolIdx) + ".bin";
int fd = ::open(filename.c_str(), O_WRONLY);
if (fd >= 0)
{
auto poolBlockSize = blockSize / numPools;
std::vector<T> buffer(poolBlockSize);
for (int i = 0; i < poolBlockSize; ++i)
{
buffer[i] = i & mask;
}
ssize_t written = ::write(fd, buffer.data(), poolBlockSize * sizeof(T));
TLLM_CHECK_WITH_INFO(written == static_cast<ssize_t>(poolBlockSize * sizeof(T)), "Failed to write pattern to %s", filename.c_str());
::close(fd);
}
}
}
template <typename T, nvinfer1::DataType type, int mask, KvCacheTransferMode transferMode>
void runPartialCopyTest()
{
auto constexpr numLayers = 12;
@ -202,6 +237,16 @@ void runPartialCopyTest()
auto constexpr maxAttentionWindowAllLayer = 4096;
auto constexpr sinkTokenLen = 0;
auto constexpr canUseOneMoreBlock = true;
std::string directory = "";
static int file_num = 0;
if constexpr (transferMode == KvCacheTransferMode::GDS)
{
std::string filename = std::string("test_copy") + std::to_string(file_num++);
auto dirPath = fs::absolute(filename);
fs::create_directories(dirPath);
directory = dirPath.string();
}
SizeType32 constexpr maxNewTokens{0};
auto constexpr beamWidth = 1;
@ -256,7 +301,7 @@ void runPartialCopyTest()
auto block = blockManager.getBlockById(cacheBlockId, maxAttentionWindow);
EXPECT_TRUE(block->isPrimary());
// offload so we can write to block in CPU code
blockManager.offloadBlock(block, maxAttentionWindow);
blockManager.offloadBlock(block, maxAttentionWindow, transferMode, directory);
EXPECT_FALSE(block->isPrimary());
// need to sync so D2H transfer is done before accessing blocks
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
@ -264,12 +309,19 @@ void runPartialCopyTest()
auto memoryPoolIndex = block->getMemoryPoolBlockIndex();
auto blockPtr{tr::ITensor::slice(secondaryPoolPtr, memoryPoolIndex, 1)};
auto rawBlockPtr = reinterpret_cast<T*>(blockPtr->data());
for (int i = 0; i < blockSize; ++i)
// Write value
if constexpr (transferMode == KvCacheTransferMode::DRAM)
{
rawBlockPtr[i] = i & mask;
writePatternToOffloadedBlocksDRAM<T>(rawBlockPtr, blockSize, mask);
}
else if constexpr (transferMode == KvCacheTransferMode::GDS)
{
auto block_id = block->getBlockId();
auto numPools = blockManager.getNumPools(false);
writePatternToOffloadedBlocksGDS<T>(directory, block_id, numPools, blockSize, mask);
}
// onboard
blockManager.onboardBlock(block, maxAttentionWindow);
blockManager.onboardBlock(block, maxAttentionWindow, transferMode, directory);
EXPECT_TRUE(block->isPrimary());
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
EXPECT_TRUE(blockManager.verifyQueueIntegrity(maxAttentionWindow));
@ -344,60 +396,72 @@ void runPartialCopyTest()
}
}
EXPECT_EQ(numBad, 0);
blockManager.onboardBlock(block2, maxAttentionWindow);
blockManager.onboardBlock(block2, maxAttentionWindow, transferMode, directory);
EXPECT_TRUE(block2->isPrimary());
EXPECT_EQ(cudaDeviceSynchronize(), cudaSuccess);
blockManager.releaseBlocks(seq1, llmRequest1);
blockManager.releaseBlocks(seq2, llmRequest2);
if constexpr (transferMode == KvCacheTransferMode::GDS)
fs::remove_all(directory);
}
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT64)
{
runPartialCopyTest<std::uint64_t, nvinfer1::DataType::kINT64, -1>();
runPartialCopyTest<std::uint64_t, nvinfer1::DataType::kINT64, -1, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint64_t, nvinfer1::DataType::kINT64, -1, KvCacheTransferMode::GDS>();
}
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT32)
{
runPartialCopyTest<std::uint32_t, nvinfer1::DataType::kINT32, -1>();
runPartialCopyTest<std::uint32_t, nvinfer1::DataType::kINT32, -1, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint32_t, nvinfer1::DataType::kINT32, -1, KvCacheTransferMode::GDS>();
}
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFLOAT)
{
runPartialCopyTest<std::uint32_t, nvinfer1::DataType::kFLOAT, -1>();
runPartialCopyTest<std::uint32_t, nvinfer1::DataType::kFLOAT, -1, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint32_t, nvinfer1::DataType::kFLOAT, -1, KvCacheTransferMode::GDS>();
}
#ifdef ENABLE_BF16
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyBF16)
{
runPartialCopyTest<std::uint16_t, nvinfer1::DataType::kBF16, 65535>();
runPartialCopyTest<std::uint16_t, nvinfer1::DataType::kBF16, 65535, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint16_t, nvinfer1::DataType::kBF16, 65535, KvCacheTransferMode::GDS>();
}
#endif
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyHALF)
{
runPartialCopyTest<std::uint16_t, nvinfer1::DataType::kHALF, 65535>();
runPartialCopyTest<std::uint16_t, nvinfer1::DataType::kHALF, 65535, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint16_t, nvinfer1::DataType::kHALF, 65535, KvCacheTransferMode::GDS>();
}
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyBOOL)
{
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kBOOL, 255>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kBOOL, 255, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kBOOL, 255, KvCacheTransferMode::GDS>();
}
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyUINT8)
{
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kUINT8, 255>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kUINT8, 255, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kUINT8, 255, KvCacheTransferMode::GDS>();
}
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyINT8)
{
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kINT8, 255>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kINT8, 255, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kINT8, 255, KvCacheTransferMode::GDS>();
}
#ifdef ENABLE_FP8
TEST_F(KVCacheManagerTest, BlockManagerTestPartialCopyFP8)
{
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kFP8, 255>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kFP8, 255, KvCacheTransferMode::DRAM>();
runPartialCopyTest<std::uint8_t, nvinfer1::DataType::kFP8, 255, KvCacheTransferMode::GDS>();
}
#endif

View File

@ -78,7 +78,7 @@ protected:
auto constexpr dataType = nvinfer1::DataType::kFLOAT;
using BlocksPerWindow = std::map<SizeType32, std::tuple<SizeType32, SizeType32>>;
const BlocksPerWindow blocksPerWindow
BlocksPerWindow const blocksPerWindow
= {{maxAttentionWindow, std::make_tuple(totalNumBlocks, blocksInSecondaryPool)}};
mCacheManager = std::make_unique<KVCacheManager>(numLayers, numHeads, sizePerHead, tokensPerBlock,
@ -90,7 +90,8 @@ protected:
size_t maxNumTokens = 1024;
mTransBufferManager = std::make_unique<CacheTransBufferManager>(mCacheManager.get(), maxNumTokens);
mCacheState = std::make_unique<CacheState>(numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, 1, dataType);
mCacheState = std::make_unique<CacheState>(
numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, 1, std::vector<SizeType32>{numLayers}, dataType);
}
void TearDown() override
@ -107,7 +108,7 @@ protected:
TEST_F(AgentCommTest, AgentConnectionManagerBasic)
{
auto connectionManager = std::make_unique<AgentConnectionManager>(mTransBufferManager.get());
auto connectionManager = std::make_unique<AgentConnectionManager>(mTransBufferManager.get(), *mCacheState);
ASSERT_TRUE(connectionManager != nullptr);
ASSERT_TRUE(connectionManager->getCacheTransBufferManager() != nullptr);
ASSERT_EQ(connectionManager->getDeviceId(), 0);
@ -120,8 +121,8 @@ TEST_F(AgentCommTest, AgentConnectionManagerBasic)
TEST_F(AgentCommTest, AgentConnectionManagerConnect)
{
auto connectionManager0 = std::make_unique<AgentConnectionManager>(mTransBufferManager.get());
auto connectionManager1 = std::make_unique<AgentConnectionManager>(mTransBufferManager.get());
auto connectionManager0 = std::make_unique<AgentConnectionManager>(mTransBufferManager.get(), *mCacheState);
auto connectionManager1 = std::make_unique<AgentConnectionManager>(mTransBufferManager.get(), *mCacheState);
auto agentName0 = connectionManager0->getAgentName();
auto agentName1 = connectionManager1->getAgentName();
ASSERT_TRUE(!agentName0.empty());

View File

@ -726,7 +726,7 @@ TEST(SerializeUtilsTest, ContextPhaseParams)
{
auto state = std::make_unique<texec::DataTransceiverState>();
state->setCommState(texec::kv_cache::CommState{12, "127.0.0.1"});
state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 8, nvinfer1::DataType::kFLOAT});
state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 8, {4}, nvinfer1::DataType::kFLOAT});
auto stats = texec::ContextPhaseParams({10, 20, 30, 40, 50, 60}, 0, state.release(), VecTokens{10, 20});
auto stats2 = serializeDeserialize(stats);
EXPECT_EQ(stats, stats2);

View File

@ -16,6 +16,10 @@
#include <gmock/gmock.h>
#include <gtest/gtest.h>
#include <filesystem>
namespace fs = std::filesystem;
using namespace tensorrt_llm::executor::kv_cache;
class RegisteredHostMemory
@ -341,3 +345,118 @@ TEST_F(TransferAgentTest, SyncMessage)
nixlAgent0->invalidateRemoteAgent(agent1);
nixlAgent1->invalidateRemoteAgent(agent0);
}
class LoopbackAgentTest : public ::testing::Test,
public ::testing::WithParamInterface<bool> // NOLINT(cppcoreguidelines-pro-type-member-init)
{
public:
void SetUp() override
{
static int file_num = 0;
std::string filename = std::string("test_agent") + std::to_string(file_num++);
auto dirPath = fs::absolute(filename);
std::error_code ec;
fs::create_directories(dirPath, ec);
TLLM_CHECK_WITH_INFO(!ec, "Failed to create test directory: %s", ec.message().c_str());
mDirectory = dirPath.string();
}
void TearDown() override
{
std::error_code ec;
fs::remove_all(mDirectory, ec);
if (ec)
std::cerr << "Warning: Failed to clean up test directory: " << ec.message() << std::endl;
}
[[nodiscard]] std::shared_ptr<BaseLoopbackAgent> makeLoopbackAgent(BaseAgentConfig const& config)
{
return tensorrt_llm::executor::kv_cache::makeLoopbackAgent("nixl", &config);
}
[[nodiscard]] std::string getDirectory() const
{
return mDirectory;
}
private:
std::string mDirectory;
};
TEST_P(LoopbackAgentTest, FileToGpu)
{
std::string const agentName{"loopbackAgent"};
BaseAgentConfig config{agentName, true, GetParam()};
auto loopbackAgent = makeLoopbackAgent(config);
TLLM_CHECK(loopbackAgent);
std::vector<char> memory(100, 1);
char* cuda_mem;
TLLM_CUDA_CHECK(cudaMalloc(&cuda_mem, 100));
TLLM_CUDA_CHECK(cudaMemcpy(cuda_mem, memory.data(), 100, cudaMemcpyHostToDevice));
std::string filename = getDirectory() + std::string("/file2gpu.bin");
int fd = ::open(filename.c_str(), O_CREAT | O_WRONLY, 0664);
TLLM_CHECK_WITH_INFO(fd >= 0, "Failed to open '%s' for writing", filename.c_str());
std::vector<char> fileData(100, 10);
ssize_t bytesWritten = ::write(fd, fileData.data(), fileData.size());
TLLM_CHECK_WITH_INFO(bytesWritten == static_cast<ssize_t>(fileData.size()), "Failed to write to file");
::close(fd);
{
MemoryDesc mem_desc(cuda_mem, 100, 0);
MemoryDescs memDescs{MemoryType::kVRAM, {mem_desc}};
std::vector<FileDesc> fileDescVec;
fileDescVec.emplace_back(filename, O_RDONLY, 0664, 100);
FileDescs fileDescs{std::move(fileDescVec)};
loopbackAgent->executeLoopbackRequest(memDescs, fileDescs, false);
}
TLLM_CUDA_CHECK(cudaMemcpy(memory.data(), cuda_mem, 100, cudaMemcpyDeviceToHost));
TLLM_CHECK(memory == fileData);
TLLM_CUDA_CHECK(cudaFree(cuda_mem));
}
TEST_P(LoopbackAgentTest, GpuToFile)
{
std::string const agentName{"loopbackAgent"};
BaseAgentConfig config{agentName, true, GetParam()};
auto loopbackAgent = makeLoopbackAgent(config);
TLLM_CHECK(loopbackAgent);
std::vector<char> memory(100, 1);
char* cuda_mem;
TLLM_CUDA_CHECK(cudaMalloc(&cuda_mem, 100));
TLLM_CUDA_CHECK(cudaMemcpy(cuda_mem, memory.data(), 100, cudaMemcpyHostToDevice));
std::string filename = getDirectory() + std::string("/gpu2file.bin");
{
MemoryDesc mem_desc(cuda_mem, 100, 0);
MemoryDescs memDescs{MemoryType::kVRAM, {mem_desc}};
std::vector<FileDesc> fileDescVec;
fileDescVec.emplace_back(filename, O_CREAT | O_WRONLY, 0664, 100);
FileDescs fileDescs{std::move(fileDescVec)};
loopbackAgent->executeLoopbackRequest(memDescs, fileDescs, true);
}
int fd = ::open(filename.c_str(), O_RDONLY, 0664);
TLLM_CHECK_WITH_INFO(fd >= 0, "Failed to open '%s' for reading", filename.c_str());
std::vector<char> fileData(100);
ssize_t bytesRead = ::read(fd, fileData.data(), fileData.size());
TLLM_CHECK_WITH_INFO(bytesRead == static_cast<ssize_t>(fileData.size()), "Failed to read from file");
::close(fd);
TLLM_CHECK(fileData == memory);
TLLM_CUDA_CHECK(cudaFree(cuda_mem));
}
INSTANTIATE_TEST_SUITE_P(, LoopbackAgentTest, ::testing::Values(true, false));

View File

@ -99,7 +99,7 @@ TEST_F(RequestInfoTest, Basic)
}
auto state = std::make_unique<texec::DataTransceiverState>();
state->setCommState(texec::kv_cache::CommState{12, "127.0.0.1"});
state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 8, nvinfer1::DataType::kFLOAT});
state->setCacheState(texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 8, {10}, nvinfer1::DataType::kFLOAT});
RequestInfo info{1, *state};
auto info2 = serializeDeserialize(info);
EXPECT_EQ(info, info2);
@ -141,14 +141,16 @@ TEST_F(CacheConfigTest, EqualTo)
vocabSize, nbAttentionLayers + nbRnnLayers, nbAttentionLayers, nbRnnLayers, nbHeads, hiddenSize, dtype};
modelConfig.setTokensPerBlock(tokensPerBlock);
tr::WorldConfig worldConfig{tensorParallelism, pipelineParallelism, contextParallelism};
std::vector<SizeType32> attentionLayerNumPerPP(pipelineParallelism, nbAttentionLayers / pipelineParallelism);
texec::kv_cache::CacheState::ModelConfig cacheStateCfg{
modelConfig.getNumKvHeadsPerLayer(), modelConfig.getSizePerHead(), modelConfig.getTokensPerBlock()};
texec::kv_cache::CacheState state0{
cacheStateCfg, worldConfig, modelConfig.getKvDataType(), attentionType, kvFactor};
cacheStateCfg, worldConfig, attentionLayerNumPerPP, modelConfig.getKvDataType(), attentionType, kvFactor};
texec::kv_cache::CacheState state1{nbAttentionLayers, nbHeads, sizePerHead, tokensPerBlock, tensorParallelism,
pipelineParallelism, contextParallelism, dtype, attentionType, kvFactor, false, 0, tensorParallelism};
pipelineParallelism, contextParallelism, attentionLayerNumPerPP, dtype, attentionType, kvFactor, false, 0,
tensorParallelism};
EXPECT_EQ(state0, state1);
}
@ -165,7 +167,7 @@ public:
ON_CALL(*this, recvRequestInfo)
.WillByDefault(Return(RequestInfo{0,
texec::DataTransceiverState{
texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 1, nvinfer1::DataType::kFLOAT},
texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 1, {10}, nvinfer1::DataType::kFLOAT},
texec::kv_cache::CommState{std::vector<SizeType32>{0}, 0}}}));
ON_CALL(*this, getCounterpartsCount).WillByDefault(Return(1));
}
@ -218,7 +220,7 @@ TEST_F(MockTransceiverTest, MpiResponderBasic)
EXPECT_CALL(*sender, recvRequestInfo)
.WillOnce(Return(RequestInfo{0,
texec::DataTransceiverState{
texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 1, nvinfer1::DataType::kFLOAT},
texec::kv_cache::CacheState{10, 12, 128, 128, 8, 8, 1, {4}, nvinfer1::DataType::kFLOAT},
texec::kv_cache::CommState{std::vector<SizeType32>{0}, 0}}}));
EXPECT_CALL(*sender, sendSync).WillOnce(Return());
EXPECT_CALL(*sender, getCounterpartsCount).WillOnce(Return(1));
@ -318,8 +320,9 @@ protected:
mMaxNumSequences, maxBeamWidth, std::vector<BlockManager::SizeType32>{maxAttentionWindow}, std::nullopt,
dataType, sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, CacheType::kSELF,
std::nullopt, nullptr, true);
auto attentionLayerNumPerPP = std::vector<SizeType32>{numLayers};
mCacheState = std::make_unique<texec::kv_cache::CacheState>(
numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, 1, dataType);
numLayers, numHeads, sizePerHead, tokensPerBlock, 1, 1, 1, attentionLayerNumPerPP, dataType);
if (tensorrt_llm::common::getEnvUseUCXKvCache())
{
@ -614,7 +617,29 @@ protected:
return;
}
ASSERT_EQ(numLayers % mPpSize, 0);
// numLayers need not be divisible by mPpSize; uneven layer counts are distributed by getLayerNumPPRank below.
auto getLayerNumPPRank = [](int numLayers, int ppRank, int ppSize)
{
int layerNumPerPP = numLayers / ppSize;
int layerNumExtraInPP = numLayers % ppSize;
int layerNumInPPRank = layerNumPerPP + (ppRank < layerNumExtraInPP ? 1 : 0);
return layerNumInPPRank;
};
mAttentionLayerNumPerPP = std::vector<SizeType32>(mPpSize, 0);
for (int ppRank = 0; ppRank < mPpSize; ppRank++)
{
mAttentionLayerNumPerPP[ppRank] = getLayerNumPPRank(numLayers, ppRank, mPpSize);
}
int layerNumthisRank = getLayerNumPPRank(numLayers, mPpRank, mPpSize);
auto contextAttentionLayerNumPerPP = std::vector<SizeType32>(mContextPpSize, 0);
for (int ppRank = 0; ppRank < mContextPpSize; ppRank++)
{
contextAttentionLayerNumPerPP[ppRank] = getLayerNumPPRank(numLayers, ppRank, mContextPpSize);
}
if (!isMLA)
{
// ASSERT_EQ(numHeads % mTpSize , 0);
@ -693,19 +718,19 @@ protected:
maxAttentionWindowVec.push_back(windowAttentionToken);
}
TLLM_LOG_DEBUG(" cacheManager isWindowAttention: %d", mIsWindowAttention);
mManager = std::make_unique<KVCacheManager>(numLayers / mPpSize, numHeadsPerRank, sizePerHead, tokensPerBlock,
mManager = std::make_unique<KVCacheManager>(layerNumthisRank, numHeadsPerRank, sizePerHead, tokensPerBlock,
blocksPerWindow, mMaxNumSequences, maxBeamWidth, maxAttentionWindowVec, std::nullopt, dataType,
sinkTokenLength, stream, std::nullopt, enableBlockReuse, onboardBlocks, cacheType, std::nullopt, nullptr,
true);
texec::kv_cache::CacheState::AttentionType attentionType = isMLA
? texec::kv_cache::CacheState::AttentionType::kMLA
: texec::kv_cache::CacheState::AttentionType::kDEFAULT;
mCacheState
= std::make_unique<texec::kv_cache::CacheState>(numLayers, numHeadsPerRank, sizePerHead, tokensPerBlock,
mTpSize, mPpSize, mCpSize, dataType, attentionType, kvFactor, enableDPAttention, DPrank, DPsize);
mCacheState = std::make_unique<texec::kv_cache::CacheState>(numLayers, numHeadsPerRank, sizePerHead,
tokensPerBlock, mTpSize, mPpSize, mCpSize, mAttentionLayerNumPerPP, dataType, attentionType, kvFactor,
enableDPAttention, DPrank, DPsize);
mContextCacheState = std::make_unique<texec::kv_cache::CacheState>(numLayers, numHeadsPerRankForContext,
sizePerHead, tokensPerBlock, mContextTpSize, mContextPpSize, mContextCpSize, dataType, attentionType,
kvFactor, mContextDP, DPrank, mContextTpSize);
sizePerHead, tokensPerBlock, mContextTpSize, mContextPpSize, mContextCpSize, contextAttentionLayerNumPerPP,
dataType, attentionType, kvFactor, mContextDP, DPrank, mContextTpSize);
// UVM seems to be incompatible with MPI, and it is continuing to investigate.
bool constexpr useUvm = false;
@ -751,8 +776,8 @@ protected:
setenv("TRTLLM_NIXL_PORT", std::to_string(port).c_str(), 1);
mConnectionManager
= std::make_unique<texec::kv_cache::AgentConnectionManager>(mCacheTransBufferManager.get());
mConnectionManager = std::make_unique<texec::kv_cache::AgentConnectionManager>(
mCacheTransBufferManager.get(), *mCacheState);
}
else
{
@ -865,7 +890,8 @@ protected:
mContextCacheState->getModelConfig().mSizePerHead, mContextCacheState->getModelConfig().mTokensPerBlock,
mContextCacheState->getParallelConfig().mTensorParallelism,
mContextCacheState->getParallelConfig().mPipelineParallelism,
mContextCacheState->getParallelConfig().mContextParallelism, mContextCacheState->getDataType(),
mContextCacheState->getParallelConfig().mContextParallelism,
mContextCacheState->getParallelConfig().mAttentionLayerNumPerPP, mContextCacheState->getDataType(),
mContextCacheState->getAttentionConfig().mAttentionType, mContextCacheState->getAttentionConfig().mKvFactor,
mContextCacheState->getParallelConfig().mEnableAttentionDP, contextDpRank,
mContextCacheState->getParallelConfig().mTensorParallelism};
@ -944,8 +970,19 @@ protected:
auto const onlyWindowSize = blockManager.getPoolWindowSize(blockPoolIdx);
auto const& bufferManager = blockManager.getBufferManager(onlyWindowSize);
auto hostTensor = tensorrt_llm::runtime::BufferManager::cpu(blockData.getShape(), blockData.getDataType());
int layerSizePerRank = blockData.getDimension<1>();
int startLayerId = layerSizePerRank * mPpRank;
int layerSizeThisRank = blockData.getDimension<1>();
int startLayerId = 0;
if (mIsWindowAttention)
{
startLayerId = layerSizeThisRank * mPpRank;
}
else
{
for (int ppRank = 0; ppRank < mPpRank; ppRank++)
{
startLayerId += mAttentionLayerNumPerPP[ppRank];
}
}
int headSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.at(0);
int startHeadId = headSizePerRank * (mTpRank / mDupHeadFactor);
bool enableDP = mCacheState->getParallelConfig().mEnableAttentionDP;
@ -958,7 +995,7 @@ protected:
int startTokenId = blockId * tokensPerBlock;
int sizePerHead = mCacheState->getModelConfig().mSizePerHead;
auto dataTypeSize = tensorrt_llm::common::getDTypeSize(blockData.getDataType());
for (int layerId = 0; layerId < layerSizePerRank; layerId++)
for (int layerId = 0; layerId < layerSizeThisRank; layerId++)
{
for (int headId = 0; headId < headSizePerRank; headId++)
{
@ -1008,8 +1045,20 @@ protected:
auto const& bufferManager = blockManager.getBufferManager(onlyWindowSize);
auto hostTensor = tensorrt_llm::runtime::BufferManager::cpu(blockData.getShape(), blockData.getDataType());
int layerSizePerRank = blockData.getDimension<1>();
int startLayerId = layerSizePerRank * mPpRank;
int layerSizethisRank = blockData.getDimension<1>();
int startLayerId = 0;
if (mIsWindowAttention)
{
startLayerId = layerSizethisRank * mPpRank;
}
else
{
for (int ppRank = 0; ppRank < mPpRank; ppRank++)
{
startLayerId += mAttentionLayerNumPerPP[ppRank];
}
}
int headSizePerRank = mCacheState->getModelConfig().mNbKvHeadsPerLayer.at(0);
int startHeadId = headSizePerRank * (mTpRank / mDupHeadFactor);
bool enableDP = mCacheState->getParallelConfig().mEnableAttentionDP;
@ -1025,7 +1074,7 @@ protected:
bufferManager.copy(blockData, *hostTensor);
bufferManager.getStream().synchronize();
for (int layerId = 0; layerId < layerSizePerRank; layerId++)
for (int layerId = 0; layerId < layerSizethisRank; layerId++)
{
for (int headId = 0; headId < headSizePerRank; headId++)
{
@ -1108,6 +1157,7 @@ protected:
bool mIsMLA{false};
bool mIsWindowAttention{false};
int mDupHeadFactor{1};
std::vector<SizeType32> mAttentionLayerNumPerPP;
SizeType32 mMaxNumSequences{};
std::unique_ptr<KVCacheManager> mManager;
@ -1351,6 +1401,18 @@ INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1, AsymmetricalCacheTest,
testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2),
testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(false, true)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1EvenLayer, AsymmetricalCacheTest,
testing::Combine(testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(4),
testing::Values(1), testing::Values(10), testing::Values(4), testing::Values(4), testing::Values(8),
testing::Values(nvinfer1::DataType::kFLOAT), testing::Values(2), testing::Values(false), testing::Values(false),
testing::Values(false), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest2EvenLayer, AsymmetricalCacheTest,
testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(4),
testing::Values(1), testing::Values(10), testing::Values(4), testing::Values(4), testing::Values(8),
testing::Values(nvinfer1::DataType::kFLOAT), testing::Values(2), testing::Values(false), testing::Values(false),
testing::Values(false), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest2, AsymmetricalCacheTest,
testing::Combine(testing::Values(1), testing::Values(2), testing::Values(1), testing::Values(1),
testing::Values(1, 4), testing::Values(1), testing::Values(16), testing::Values(16), testing::Values(4),
@ -1369,6 +1431,18 @@ INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1ForMLA, AsymmetricalCacheTest,
testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1),
testing::Values(true), testing::Values(false), testing::Values(false), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest1ForMLAEvenLayer, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(1),
testing::Values(1), testing::Values(10), testing::Values(1), testing::Values(4), testing::Values(8),
testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1),
testing::Values(true), testing::Values(false), testing::Values(false, true), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest2ForMLAEvenLayer, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(1), testing::Values(4),
testing::Values(1), testing::Values(10), testing::Values(1), testing::Values(4), testing::Values(8),
testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(1),
testing::Values(true), testing::Values(false), testing::Values(false, true), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForMLA1, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2),
testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4),
@ -1403,12 +1477,19 @@ INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLA2, AsymmetricalCacheTest
testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4),
testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2),
testing::Values(false), testing::Values(false), testing::Values(true), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate0, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(4),
testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(2), testing::Values(4),
testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2),
testing::Values(false), testing::Values(true, false), testing::Values(false), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate0EvenLayer, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4), testing::Values(1),
testing::Values(1), testing::Values(5), testing::Values(2), testing::Values(4), testing::Values(16),
testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2),
testing::Values(false), testing::Values(true, false), testing::Values(false), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate1, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(2),
testing::Values(2), testing::Values(1), testing::Values(4), testing::Values(1), testing::Values(4),
@ -1419,6 +1500,7 @@ INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate2, Asymmetrical
testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(2), testing::Values(4),
testing::Values(16), testing::Values(nvinfer1::DataType::kFLOAT, nvinfer1::DataType::kINT8), testing::Values(2),
testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(false)));
INSTANTIATE_TEST_CASE_P(AsymmetricCaseTestWithDPForNoMLADuplicate4, AsymmetricalCacheTestWithDP,
testing::Combine(testing::Values(4), testing::Values(1), testing::Values(1), testing::Values(1, 2),
testing::Values(2), testing::Values(1), testing::Values(4), testing::Values(1, 2), testing::Values(4),
@ -1444,12 +1526,17 @@ TEST(targetTest, CacheStateNODP)
{
auto attentionType = isMLA ? texec::kv_cache::CacheState::AttentionType::kMLA
: texec::kv_cache::CacheState::AttentionType::kDEFAULT;
std::vector<SizeType32> contextAttentionLayerNumPerPP(
contextWC.getPipelineParallelism(), numLayers / contextWC.getPipelineParallelism());
std::vector<SizeType32> genAttentionLayerNumPerPP(
genWC.getPipelineParallelism(), numLayers / genWC.getPipelineParallelism());
auto const sharedModelConfig
= texec::kv_cache::CacheState::ModelConfig{std::vector(numLayers, numHeads), sizePerHead, tokensPerBlock};
auto const contextCache
= texec::kv_cache::CacheState(sharedModelConfig, contextWC, dataType, attentionType, kvFactor);
auto const genCache = texec::kv_cache::CacheState(sharedModelConfig, genWC, dataType, attentionType, kvFactor);
auto const contextCache = texec::kv_cache::CacheState(
sharedModelConfig, contextWC, contextAttentionLayerNumPerPP, dataType, attentionType, kvFactor);
auto const genCache = texec::kv_cache::CacheState(
sharedModelConfig, genWC, genAttentionLayerNumPerPP, dataType, attentionType, kvFactor);
auto const contextTargetInfo
= tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(genCache, contextCache, contextRank);
@ -1731,6 +1818,8 @@ TEST(targetTest, CacheStateContextDP)
int genCP = 1;
bool contextEnableDP = true;
bool genEnableDP = true;
std::vector<SizeType32> contextAttentionLayerNumPerPP(contextPP, numLayers / contextPP);
std::vector<SizeType32> genAttentionLayerNumPerPP(genPP, numLayers / genPP);
auto const verifyContext = [&](int contextRank, int generationRank, std::vector<int> const& expectRanks,
int expectPPDomain, int expectTPDomain, bool expectNeedSend)
@ -1740,13 +1829,13 @@ TEST(targetTest, CacheStateContextDP)
auto attentionType = isMLA ? texec::kv_cache::CacheState::AttentionType::kMLA
: texec::kv_cache::CacheState::AttentionType::kDEFAULT;
auto const contextCache
= tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, contextTP,
contextPP, contextCP, dataType, attentionType, kvFactor, contextEnableDP, contextDPRank, contextTP};
auto const contextCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead,
tokensPerBlock, contextTP, contextPP, contextCP, contextAttentionLayerNumPerPP, dataType, attentionType,
kvFactor, contextEnableDP, contextDPRank, contextTP};
auto const genCache
= tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, genTP,
genPP, genCP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP};
auto const genCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead,
tokensPerBlock, genTP, genPP, genCP, genAttentionLayerNumPerPP, dataType, attentionType, kvFactor,
genEnableDP, generationDPRank, genTP};
auto const contextTragetInfo
= tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(genCache, contextCache, contextRank);
@ -1847,13 +1936,13 @@ TEST(targetTest, CacheStateContextDP)
auto attentionType = isMLA ? texec::kv_cache::CacheState::AttentionType::kMLA
: texec::kv_cache::CacheState::AttentionType::kDEFAULT;
auto const contextCache
= tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, contextTP,
contextPP, contextCP, dataType, attentionType, kvFactor, contextEnableDP, contextDPRank, contextTP};
auto const contextCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead,
tokensPerBlock, contextTP, contextPP, contextCP, contextAttentionLayerNumPerPP, dataType, attentionType,
kvFactor, contextEnableDP, contextDPRank, contextTP};
auto const genCache
= tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead, tokensPerBlock, genTP,
genPP, genCP, dataType, attentionType, kvFactor, genEnableDP, generationDPRank, genTP};
auto const genCache = tensorrt_llm::executor::kv_cache::CacheState{numLayers, numHeads, sizePerHead,
tokensPerBlock, genTP, genPP, genCP, genAttentionLayerNumPerPP, dataType, attentionType, kvFactor,
genEnableDP, generationDPRank, genTP};
auto const contextTragetInfo
= tensorrt_llm::executor::kv_cache::TargetRanksInfoForDP(contextCache, genCache, generationRank);
@ -1872,6 +1961,8 @@ TEST(targetTest, CacheStateContextDP)
contextPP = 1;
genTP = 1;
genPP = 2;
contextAttentionLayerNumPerPP = std::vector<SizeType32>(contextPP, numLayers / contextPP);
genAttentionLayerNumPerPP = std::vector<SizeType32>(genPP, numLayers / genPP);
verfiyGeneration(
/*contextRank*/ 0, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1);
@ -1885,6 +1976,8 @@ TEST(targetTest, CacheStateContextDP)
contextPP = 1;
genTP = 1;
genPP = 1;
contextAttentionLayerNumPerPP = std::vector<SizeType32>(contextPP, numLayers / contextPP);
genAttentionLayerNumPerPP = std::vector<SizeType32>(genPP, numLayers / genPP);
verfiyGeneration(
/*contextRank*/ 0, /*generationRank*/ 0, /*expectRanks*/ {0}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1);

View File

@ -12,6 +12,18 @@ NVCC_VERSION_OUTPUT=$(nvcc --version)
OLD_CUDA_VER=$(echo $NVCC_VERSION_OUTPUT | grep -oP "\d+\.\d+" | head -n 1)
echo "The version of pre-installed CUDA is ${OLD_CUDA_VER}."
check_cuda_version() {
if [ -n "$CUDA_VERSION" ] && [ -n "$CUDA_DRIVER_VERSION" ]; then
CUDA_VERSION_SHORT=$(echo "$CUDA_VERSION" | cut -d'.' -f1-3)
ENV_CUDA_VER="${CUDA_VERSION_SHORT}_${CUDA_DRIVER_VERSION}"
if [ "$ENV_CUDA_VER" = "$CUDA_VER" ]; then
echo "CUDA version matches ($ENV_CUDA_VER), skipping reinstallation"
return 0
fi
fi
return 1
}
reinstall_rockylinux_cuda() {
dnf -y install epel-release
dnf remove -y "cuda*" "*cublas*" "*cufft*" "*cufile*" "*curand*" "*cusolver*" "*cusparse*" "*gds-tools*" "*npp*" "*nvjpeg*" "nsight*" "*nvvm*"
@ -25,6 +37,10 @@ reinstall_rockylinux_cuda() {
ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
case "$ID" in
rocky)
if check_cuda_version; then
echo "CUDA version matches ($CUDA_VER), skipping reinstallation"
exit 0
fi
echo "Reinstall CUDA for RockyLinux 8..."
reinstall_rockylinux_cuda
;;

View File

@ -100,7 +100,7 @@ install_rockylinux_requirements() {
done
# Remove old packages
dnf remove -y "libnccl*" "cuda-compat*" "cuda-toolkit*" "libcublas*"
dnf remove -y "libnccl*"
# Install new packages
dnf -y install \

View File

@ -133,9 +133,9 @@ Next, consider this linear layer is a `RowLinear` layer. When we partition the w
#### DoRA
TRTLLM supports DoRA as described in https://arxiv.org/abs/2402.09353 . To enable DoRA, you must add the additional `--dora_plugin enable` flag to the `trtllm-build` command.
TensorRT LLM supports DoRA as described in https://arxiv.org/abs/2402.09353 . To enable DoRA, you must add the additional `--dora_plugin enable` flag to the `trtllm-build` command.
The DoRA scales must be normalized before they are submitted to TRTLLM in an inference request. The normalization requires the base model weights. To normalize your adapter you may use the script provided in `tensorrt_llm/examples/dora/normalize_weights.py`.
The DoRA scales must be normalized before they are submitted to TensorRT LLM in an inference request. The normalization requires the base model weights. To normalize your adapter you may use the script provided in `tensorrt_llm/examples/dora/normalize_weights.py`.
When using DoRA, the format of `LoraWeights` and `LoraConfig` changes slightly.
The shape of `LoraConfig` becomes `[module_id, layer_idx, adapter_size D (i.e. R value), is_dora]`, with `is_dora` a boolean flag that determines whether the supplied adapter contains DoRA scales or not. If the old config shape is used, it is assumed the adapter does not have DoRA scales.
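To make the layout concrete, here is a minimal sketch of such a config tensor, assuming a PyTorch-style int32 row per module; the values are illustrative only and are not produced by any conversion script:

```python
import torch

# Hypothetical LoraConfig rows following the layout described above:
# [module_id, layer_idx, adapter_size (R), is_dora]
lora_config = torch.tensor(
    [
        [0, 0, 8, 1],   # module 0, layer 0, rank 8, DoRA scales supplied
        [0, 1, 8, 1],   # module 0, layer 1, rank 8, DoRA scales supplied
        [1, 0, 16, 0],  # module 1, layer 0, rank 16, plain LoRA (no DoRA scales)
    ],
    dtype=torch.int32,
)
```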

View File

@ -173,7 +173,7 @@ Similarly to ReDrafter, TensorRT-LLM implements the EAGLE model such that logits
### Disaggregated Serving
[Disaggregated Serving](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md) with EAGLE-3 using the two-model approach is supported in the PyTorch backend.
[Disaggregated Serving](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/features/disaggregated-service.md) with EAGLE3 using the two-model approach is supported in the PyTorch backend. Please refer to the following [Dynamo example](https://github.com/ai-dynamo/dynamo/blob/main/examples/tensorrt_llm/llama4_plus_eagle.md) on how to run EAGLE3 with Disaggregated Serving for Llama 4 Maverick.
## Lookahead Decoding

View File

@ -2,11 +2,11 @@
# Adding a Model
This document describes how to add a typical decoder-only model in TensorRT-LLM.
This document describes how to add a typical decoder-only model in TensorRT LLM.
## Step 1. Write Modeling Part
TensorRT-LLM provides different levels of APIs:
TensorRT LLM provides different levels of APIs:
- Low-level functions, for example, `concat`, `add`, and `sum`.
- Basic layers, such as, `Linear` and `LayerNorm`.
@ -14,7 +14,7 @@ TensorRT-LLM provides different levels of APIs:
- Base class for typical decoder-only models, such as, `DecoderModelForCausalLM`.
1. Create a model directory in `tensorrt_llm/models`, for example `my_model`.
2. Write a `model.py` with TensorRT-LLM's APIs
2. Write a `model.py` with TensorRT LLM's APIs
```python
class MyDecoderLayer(Module):
@ -52,7 +52,7 @@ class MyModelForCausalLM(DecoderModelForCausalLM):
## Step 2. Implement Weight Conversion
The weights from source framework need to be converted and bound to the new added TensorRT-LLM model. Here is an example of converting HuggingFace weights:
The weights from the source framework need to be converted and bound to the newly added TensorRT LLM model. Here is an example of converting HuggingFace weights:
```python
class MyModelForCausalLM(DecoderModelForCausalLM):
@ -62,8 +62,8 @@ class MyModelForCausalLM(DecoderModelForCausalLM):
hf_model_dir,
dtype='float16',
mapping: Optional[Mapping] = None) -> MyModelForCausalLM
# create a TensorRT-LLM MyModelForCausalLM model object
# convert HuggingFace checkpoint to TensorRT-LLM expected weights dict
# create a TensorRT LLM MyModelForCausalLM model object
# convert HuggingFace checkpoint to TensorRT LLM expected weights dict
# load the weights to MyModelForCausalLM object
```
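A hypothetical call to this entry point could look like the following; the checkpoint path is a placeholder, `MyModelForCausalLM` is the example class from this guide, and the `Mapping` arguments simply describe a single-GPU run:

```python
from tensorrt_llm import Mapping

# Sketch only: load and convert a HuggingFace checkpoint into the example model.
model = MyModelForCausalLM.from_hugging_face(
    "path/to/hf_checkpoint",                        # placeholder path
    dtype="float16",
    mapping=Mapping(world_size=1, rank=0, tp_size=1, pp_size=1),
)
```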

View File

@ -1,36 +1,36 @@
# TensorRT-LLM Checkpoint
# TensorRT LLM Checkpoint
## Overview
The earlier versions (pre-0.8 version) of TensorRT-LLM were developed with a very aggressive timeline. For those versions, emphasis was not put on defining a unified workflow. Now that TensorRT-LLM has reached some level of feature richness, the development team has decided to put more effort into unifying the APIs and workflow of TensorRT-LLM. This file documents the workflow around TensorRT-LLM checkpoint and the set of CLI tools to generate checkpoint, build engines, and evaluate engines.
The earlier versions (pre-0.8 version) of TensorRT LLM were developed with a very aggressive timeline. For those versions, emphasis was not put on defining a unified workflow. Now that TensorRT LLM has reached some level of feature richness, the development team has decided to put more effort into unifying the APIs and workflow of TensorRT LLM. This file documents the workflow around TensorRT LLM checkpoint and the set of CLI tools to generate checkpoint, build engines, and evaluate engines.
There are three steps in the workflow:
1. Convert weights from different source frameworks into TensorRT-LLM checkpoint.
2. Build the TensorRT-LLM checkpoint into TensorRT engines with a unified build command.
3. Load the engines to TensorRT-LLM model runner and evaluate with different evaluation tasks.
1. Convert weights from different source frameworks into TensorRT LLM checkpoint.
2. Build the TensorRT LLM checkpoint into TensorRT engines with a unified build command.
3. Load the engines to TensorRT LLM model runner and evaluate with different evaluation tasks.
```
NeMo -------------
|
HuggingFace ------
| convert build load
Modelopt --------- ----------> TensorRT-LLM Checkpoint --------> TensorRT Engine ------> TensorRT-LLM ModelRunner
Modelopt --------- ----------> TensorRT LLM Checkpoint --------> TensorRT Engine ------> TensorRT LLM ModelRunner
|
JAX --------------
|
DeepSpeed --------
```
## Prepare the TensorRT-LLM Checkpoint
## Prepare the TensorRT LLM Checkpoint
TensorRT-LLM aims at supporting different sources:
TensorRT LLM aims at supporting different sources:
1. Trained models from NVIDIA NeMo, Microsoft DeepSpeed, and JAX
2. Quantized models from NVIDIA Modelopt
3. Popular models from HuggingFace
TensorRT-LLM defines its own checkpoint format. A checkpoint directory includes:
TensorRT LLM defines its own checkpoint format. A checkpoint directory includes:
1. One config `json` file, which contains several model hyper-parameters.
2. One or several rank weights files, each file contains a dictionary of tensors (weights).
@ -107,7 +107,7 @@ Here is the model specific config list:
### Rank Weights
Like PyTorch, the tensor (weight) name is a string containing hierarchical information,
which is uniquely mapped to a certain parameter of a TensorRT-LLM model.
which is uniquely mapped to a certain parameter of a TensorRT LLM model.
For example, each transformer layer of the OPT model contains an `Attention` layer, an `MLP` layer, and two `LayerNorm` layers.
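As an illustration, a rank weights file can be inspected with `safetensors` to see this naming scheme; the file name and the keys shown below are assumptions for an OPT-like checkpoint, not a guaranteed layout:

```python
from safetensors.torch import load_file

# Hypothetical peek into one rank weights file of a converted checkpoint.
weights = load_file("tllm_checkpoint/rank0.safetensors")
for name in sorted(weights):
    if name.startswith("transformer.layers.0."):
        print(name, tuple(weights[name].shape))
# e.g. transformer.layers.0.attention.qkv.weight
#      transformer.layers.0.mlp.fc.weight
#      transformer.layers.0.input_layernorm.weight
```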
@ -169,7 +169,7 @@ Here is the AWQ scaling factors of `mlp.fc` linear layer:
- `transformer.layers.0.mlp.fc.prequant_scaling_factor`
```{note}
The linear weights in TensorRT-LLM checkpoint always follows (`out_feature`, `in_feature`) shape, whereas some quantized linear in TensorRT-LLM implemented by plugin may use (`in_feature`, `out_fature`) shape. The `trtllm-build` command adds a transpose operation to post-process it.
The linear weights in the TensorRT LLM checkpoint always follow the (`out_feature`, `in_feature`) shape, whereas some quantized linear layers implemented by plugins in TensorRT LLM may use the (`in_feature`, `out_feature`) shape. The `trtllm-build` command adds a transpose operation to post-process it.
### Example
@ -218,7 +218,7 @@ Here is the `config.json`:
## Build Checkpoint into TensorRT Engine
TensorRT-LLM provides a unified build command: `trtllm-build`. Before using it,
TensorRT LLM provides a unified build command: `trtllm-build`. Before using it,
you may need to add it to the `PATH`.
```bash

View File

@ -1,18 +1,75 @@
(architecture-overview)=
# Architecture Overview
# TensorRT-LLM Architecture
The `LLM` class is a core entry point for TensorRT LLM, providing a simplified `generate()` API for efficient large language model inference. This abstraction aims to streamline the user experience, as demonstrated with TinyLlama:
TensorRT-LLM is a toolkit to assemble optimized solutions to perform Large Language Model (LLM) inference. It offers a Model Definition API to define models and compile efficient [TensorRT](https://developer.nvidia.com/tensorrt) engines for NVIDIA GPUs. It also contains Python and C++ components to build runtimes to execute those engines as well as backends for the [Triton Inference
Server](https://developer.nvidia.com/nvidia-triton-inference-server) to easily create web-based services for LLMs. TensorRT-LLM supports multi-GPU and multi-node configurations (through MPI).
```python
from tensorrt_llm import LLM
As a user, the very first step to create an inference solution is to either define your own model or select a pre-defined network architecture (refer to {ref}`models` for the list of models supported by TensorRT-LLM). Once defined, that model must be trained using a training framework (training is outside of the scope of TensorRT-LLM). For pre-defined models, checkpoints can be downloaded from various providers. To illustrate that point, a lot of examples in TensorRT-LLM use model weights obtained from the [Hugging Face](https://huggingface.co) hub and trained using [NVIDIA Nemo](https://developer.nvidia.com/nemo) or [PyTorch](https://pytorch.org).
# Initialize the LLM with a specified model
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")
Equipped with the model definition and the weights, a user must use TensorRT-LLM's Model Definition API to recreate the model in a way that can be compiled by TensorRT into an efficient engine. For ease of use, TensorRT-LLM already supports a handful of standard models.
# Generate text using the model
output = llm.generate("Hello, my name is")
```
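The returned object exposes the generated text; a minimal way to read it, assuming the attribute names used in the other LLM API examples:

```python
# Sketch: print the first completion produced for the prompt above.
print(output.outputs[0].text)
```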
Together with the Model Definition API to describe models, TensorRT-LLM provides users with components to create a runtime that executes the efficient TensorRT engine. Runtime components offer beam-search, along with extensive sampling functionalities such as top-K and top-P sampling. The exhaustive list can be found in the documentation of the {ref}`gpt-runtime`. The C++ runtime is the recommended runtime.
The `LLM` class automatically manages essential pre- and post-processing steps, including tokenization (encoding input prompts into numerical representations) and detokenization (decoding model outputs back into human-readable text).
TensorRT-LLM also includes Python and C++ backends for NVIDIA Triton Inference Server to assemble solutions for LLM online serving. The C++ backend implements in-flight batching as explained in the {ref}`executor` documentation and is the recommended backend.
Internally, the `LLM` class orchestrates the creation of a dedicated `PyExecutor(Worker)` process on each rank.
## Model Weights
![TensorRT LLM Architecture Overview](../media/TRTLLM_Architecture_Overview.png)
TensorRT-LLM is a library for LLM inference, and so to use it, you need to supply a set of trained weights. You can either use your own model weights trained in a framework like [NVIDIA NeMo](https://www.nvidia.com/en-us/ai-data-science/generative-ai/nemo-framework/) or pull a set of pretrained weights from repositories like the Hugging Face Hub.
This `PyExecutor` operates in a continuous background loop, designed for the efficient, asynchronous processing of inference requests.
The `PyExecutor`'s functionality is built upon several key components:
- `Scheduler`: Responsible for determining which active requests are ready for execution at each processing step.
- `KVCacheManager`: Manages the allocation, deallocation, and maintenance of the Key-Value (KV) Cache. This is a critical optimization for Transformer models, significantly enhancing performance during autoregressive text generation by storing previously computed attention keys and values.
- `ModelEngine`: Handles the loading and highly efficient execution of the language model on the GPU hardware.
- `Sampler`: Takes the raw outputs (logits) from the ModelEngine and applies appropriate sampling strategies (e.g., greedy, top-k, top-p, beam search) to generate the final output tokens.
During each iteration of its background loop, the `PyExecutor` performs the following sequence of operations:
- Request Fetching: Retrieves new inference requests from an internal request queue, if available.
- Scheduling: Interacts with the `Scheduler` to identify and prioritize requests that are ready to be processed in the current step.
- Resource Preparation: Coordinates with the `KVCacheManager` to ensure that the necessary Key-Value (KV) Cache resources are allocated for the selected requests.
- Model Execution: Invokes the `ModelEngine` to perform a forward pass on the scheduled requests, predicting the next output tokens.
- Output Handling: Updates the partial outputs for ongoing requests and finalizes the results for any requests that have reached completion, returning them to the user.
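Put together, one iteration of this loop can be sketched roughly as follows; the method and attribute names are illustrative placeholders, not the actual `PyExecutor` API:

```python
# Simplified, hypothetical sketch of a single PyExecutor iteration.
def executor_step(self):
    # Request Fetching: pull any newly submitted requests.
    self.active_requests.extend(self.request_queue.fetch())

    # Scheduling: pick the requests that run in this step.
    scheduled_batch = self.scheduler.schedule(self.active_requests)

    # Resource Preparation: make sure KV cache blocks are allocated.
    self.kv_cache_manager.prepare_resources(scheduled_batch)

    # Model Execution: one forward pass for the whole batch.
    logits = self.model_engine.forward(scheduled_batch)
    new_tokens = self.sampler.sample(scheduled_batch, logits)

    # Output Handling: extend partial outputs and finalize finished requests.
    for request, token in zip(scheduled_batch, new_tokens):
        request.append(token)
        if request.is_finished():
            self.kv_cache_manager.free(request)
            self.respond(request)
```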
## Runtime Optimizations
TensorRT LLM enhances inference throughput and reduces latency by integrating a suite of runtime optimizations, including CUDA Graph, [Overlap Scheduler](../features/overlap-scheduler.md), [Speculative decoding](../features/speculative-decoding.md), etc.
### CUDA Graph
CUDA Graphs drastically reduce the CPU-side overhead associated with launching GPU kernels, which is particularly impactful in PyTorch-based inference where Python's host-side code can be a bottleneck. By capturing a sequence of CUDA operations as a single graph, the entire sequence can be launched with one API call, minimizing CPU-GPU synchronization and driver overhead.
To maximize the "hit rate" of these cached graphs, TensorRT LLM employs CUDA Graph padding. If an incoming batch's size doesn't match a captured graph, it's padded to the nearest larger, supported size for which a graph exists. While this incurs minor overhead from computing "wasted" tokens, it's often a better trade-off than falling back to slower eager mode execution. This optimization has a significant impact, demonstrating up to a 22% end-to-end throughput increase on certain models and hardware.
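The padding decision itself is simple to picture; here is a rough sketch under assumed captured sizes and a plain eager-mode fallback, not the actual implementation:

```python
import bisect

# Hypothetical set of batch sizes for which CUDA Graphs were captured.
CAPTURED_BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128]

def select_graph_batch_size(batch_size: int):
    """Return the padded batch size whose graph to replay, or None to fall back to eager mode."""
    idx = bisect.bisect_left(CAPTURED_BATCH_SIZES, batch_size)
    return CAPTURED_BATCH_SIZES[idx] if idx < len(CAPTURED_BATCH_SIZES) else None

print(select_graph_batch_size(13))   # -> 16, i.e. 3 padded ("wasted") slots
print(select_graph_batch_size(200))  # -> None, no captured graph is large enough
```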
### Overlap Scheduler
The Overlap Scheduler maximizes GPU utilization by hiding CPU-bound latency behind GPU computation.
The key strategy is to launch the GPU's work for the next step (n+1) immediately, without waiting for the CPU to finish processing the results of the current step (n). This allows the CPU to handle tasks like checking stop criteria or updating responses for one batch while the GPU is already executing the model for the subsequent batch.
This concurrent execution pipeline is illustrated in the `PyExecutor`'s logic:
```python
# Schedule and launch GPU work for the current step (n)
scheduled_batch, _, _ = self._schedule()
batch_outputs = self._forward_step(scheduled_batch, previous_tensors_device)
sample_state = self._sample_async(scheduled_batch, batch_outputs)
# While the GPU is busy, process the CPU-bound results from the previous step (n-1)
if self.previous_batch is not None:
self._process_previous_batch()
```
This approach effectively reduces GPU idle time and improves overall hardware occupancy. While it introduces one extra decoding step into the pipeline, the resulting throughput gain makes it a worthwhile trade-off. For this reason, the Overlap Scheduler is enabled by default in TensorRT LLM.

View File

@ -1,4 +1,4 @@
# How to get best performance on DeepSeek-R1 in TensorRT-LLM
# How to get best performance on DeepSeek-R1 in TensorRT LLM
NVIDIA has announced world-record DeepSeek-R1 inference performance at NVIDIA GTC 2025. A single NVIDIA DGX system with eight NVIDIA Blackwell GPUs can achieve over 250 tokens per second per user or a maximum throughput of over 30,000 tokens per second on the massive, state-of-the-art 671 billion parameter DeepSeek-R1 model. [NVIDIA Blackwell Delivers World-Record DeepSeek-R1 Inference Performance](https://developer.nvidia.com/blog/nvidia-blackwell-delivers-world-record-deepseek-r1-inference-performance/)
@ -6,13 +6,13 @@ In this blog, we share the configurations and procedures about how to reproduce
## Table of Contents
- [How to get best performance on DeepSeek-R1 in TensorRT-LLM](#how-to-get-best-performance-on-deepseek-r1-in-tensorrt-llm)
- [How to get best performance on DeepSeek-R1 in TensorRT LLM](#how-to-get-best-performance-on-deepseek-r1-in-tensorrt-llm)
- [Table of Contents](#table-of-contents)
- [Prerequisites: Install TensorRT-LLM and download models](#prerequisites-install-tensorrt-llm-and-download-models)
- [1. Download TensorRT-LLM](#1-download-tensorrt-llm)
- [Prerequisites: Install TensorRT LLM and download models](#prerequisites-install-tensorrt-llm-and-download-models)
- [1. Download TensorRT LLM](#1-download-tensorrt-llm)
- [2. Download the DeepSeek R1 models](#2-download-the-deepseek-r1-models)
- [3. Build and run TensorRT-LLM container](#3-build-and-run-tensorrt-llm-container)
- [4. Compile and Install TensorRT-LLM](#4-compile-and-install-tensorrt-llm)
- [3. Build and run TensorRT LLM container](#3-build-and-run-tensorrt-llm-container)
- [4. Compile and Install TensorRT LLM](#4-compile-and-install-tensorrt-llm)
- [5. Optional: Tune GPU clocks](#5-optional-tune-gpu-clocks)
- [6. Dataset preparation](#6-dataset-preparation)
- [Reproducing steps](#reproducing-steps)
@ -34,13 +34,13 @@ In this blog, we share the configurations and procedures about how to reproduce
- [Out of memory issues](#out-of-memory-issues)
## Prerequisites: Install TensorRT-LLM and download models
## Prerequisites: Install TensorRT LLM and download models
This section can be skipped if you already have TensorRT-LLM installed and have already downloaded the DeepSeek R1 model checkpoint.
This section can be skipped if you already have TensorRT LLM installed and have already downloaded the DeepSeek R1 model checkpoint.
#### 1. Download TensorRT-LLM
#### 1. Download TensorRT LLM
**You can also find more comprehensive instructions to install TensorRT-LLM in this [TensorRT-LLM installation guide](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html), refer to that guide for common issues if you encounter any here.**
**You can also find more comprehensive instructions to install TensorRT LLM in this [TensorRT LLM installation guide](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source-linux.html); refer to that guide for common issues if you encounter any here.**
``` bash
# Prerequisites
@ -50,7 +50,7 @@ git lfs install
# Replace with your actual path
YOUR_WORK_PATH=<YOUR_WORK_PATH>
# Clone the TensorRT-LLM repository
# Clone the TensorRT LLM repository
cd $YOUR_WORK_PATH
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
@ -77,7 +77,7 @@ git clone https://huggingface.co/nvidia/DeepSeek-R1-FP4
git clone https://huggingface.co/deepseek-ai/DeepSeek-R1
```
#### 3. Build and run TensorRT-LLM container
#### 3. Build and run TensorRT LLM container
``` bash
cd TensorRT-LLM
@ -85,7 +85,7 @@ make -C docker run LOCAL_USER=1 DOCKER_RUN_ARGS="-v $YOUR_MODEL_PATH:$YOUR_MODEL
```
Here we set the `LOCAL_USER=1` argument to set up a local user instead of the root account inside the container; you can remove it if running as root inside the container is fine.
#### 4. Compile and Install TensorRT-LLM
#### 4. Compile and Install TensorRT LLM
Here we compile the source inside the container:
``` bash
@ -122,11 +122,11 @@ The command to generate synthetic dataset will be attached to the max throughput
This section provides the reproducing steps for NVIDIA Blackwell B200 and H200 GPUs, for both min-latency and max-throughput scenarios.
All the benchmarking is done by the trtllm-bench command line tool provided in the TensorRT-LLM installation, see [TensorRT-LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details of this tool.
All the benchmarking is done by the trtllm-bench command line tool provided in the TensorRT LLM installation; see [TensorRT LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details of this tool.
For brevity, we only provide the commands to reproduce the perf numbers without detailed explanation of the tools and options in this doc.
All these commands here are assumed to be running inside the container started by `make -C docker run ...` command mentioned in the [Build and run TensorRT-LLM container section](#3-build-and-run-tensorrt-llm-container)
All commands here are assumed to be run inside the container started by the `make -C docker run ...` command mentioned in the [Build and run TensorRT LLM container section](#3-build-and-run-tensorrt-llm-container)
### B200 min-latency
Our benchmark results are based on **Batch = 1, ISL = 1K, OSL = 2K, num_requests = 10 from real dataset**
@ -158,7 +158,7 @@ trtllm-bench --model nvidia/DeepSeek-R1-FP4 \
```
Explanation:
- `trtllm-bench`: A CLI benchmarking utility that aims to make it easier for users to reproduce our officially published. See [TensorRT-LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details.
- `trtllm-bench`: A CLI benchmarking utility that aims to make it easier for users to reproduce our officially published benchmarks. See [TensorRT LLM Benchmarking](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html) for details.
- `--dataset`: Prompt dataset used to benchmark. Our official benchmark dataset has ISL = 1K, OSL = 2K
- `--num_requests`: Num requests used for the benchmark.
- `--concurrency`: Total concurrency for the system.
@ -186,7 +186,7 @@ Average request latency (ms): 7456.1219
Our evaluation found that FP8 KV cache does not introduce an obvious accuracy drop compared to BF16 KV cache (see [Precision strategy](./tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md#precision-strategy)), so the latest [DeepSeek-R1-0528-FP4](https://huggingface.co/nvidia/DeepSeek-R1-0528-FP4) checkpoint has FP8 KV cache enabled by default.
We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers here. The results are reproduced with TensorRT-LLM commit b6261862419c33d6ce2313aff1e7116067d6037d.
We are seeing meaningful speedup using FP8 KV cache, thus refreshing the numbers here. The results are reproduced with TensorRT LLM commit b6261862419c33d6ce2313aff1e7116067d6037d.
!! Note that the exact command to reproduce numbers can change as the API/options are refactored; the options and numbers here are a reference at the given exact commit.
@ -239,7 +239,7 @@ Per GPU Output Throughput (tps/gpu): 5393.2755
### B200 max-throughput for R1 with FP16 KV cache
Our benchmark results are based on **Batch = 3072, ISL = 1K, OSL = 2K, num_requests = 49152 from synthetic dataset**.
The results are reproduced with TensorRT-LLM commit b6261862419c33d6ce2313aff1e7116067d6037d.
The results are reproduced with TensorRT LLM commit b6261862419c33d6ce2313aff1e7116067d6037d.
!! Note that the exact command to reproduce numbers can change as the API/options are refactored; the options and numbers here are a reference at the given exact commit.
@ -401,7 +401,7 @@ Average request latency (ms): 181540.5739
## Exploring more ISL/OSL combinations
To benchmark TensorRT-LLM on DeepSeek models with more ISL/OSL combinations, you can use `prepare_dataset.py` to generate the dataset and use similar commands mentioned in the previous section. TensorRT-LLM is working on enhancements that can make the benchmark process smoother.
To benchmark TensorRT LLM on DeepSeek models with more ISL/OSL combinations, you can use `prepare_dataset.py` to generate the dataset and use similar commands mentioned in the previous section. TensorRT LLM is working on enhancements that can make the benchmark process smoother.
### WIP: Enable more features by default
Currently, there are some features that need to be enabled through a user-defined file `extra-llm-api-config.yml`, such as CUDA graph, overlap scheduler and attention dp. We're working on enabling those features by default, so that users can get good out-of-the-box performance on DeepSeek models.

View File

@ -1,13 +1,13 @@
# Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100
H200's large capacity & high memory bandwidth, paired with TensorRT-LLM's
H200's large capacity & high memory bandwidth, paired with TensorRT LLM's
optimizations, maximizes inference performance.
## Falcon-180B on a single H200 with INT4 AWQ
[Falcon-180B](https://huggingface.co/tiiuae/falcon-180B), one of the largest &
most accurate open source models available, can run on a *single* H200 GPU.
The 141GB of memory on H200, paired with TensorRT-LLM running INT4 AWQ with
The 141GB of memory on H200, paired with TensorRT LLM running INT4 AWQ with
FP8, allows for the entire large language model to fit on a single GPU, where
previously eight A100s were required. H200 Falcon-180B provides up to **800**
tok/s and retains high accuracy.
@ -30,7 +30,7 @@ BS: (in order) 256, 128 </sup>
**Model Accuracy:**
Often quantization can have adverse impacts on the accuracy of the model,
however, TensorRT-LLM's AWQ decreases memory footprint of the model by **4x**
however, TensorRT LLM's AWQ decreases memory footprint of the model by **4x**
while maintaining high accuracy.
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/5aec7af45fc0abd876fa68a9ae8c8cae084f3af3/docs/source/blogs/media/Falcon180B-H200_acc.png?raw=true" alt="Falcon-180B accuracy comparison" width="600" height="auto">
@ -52,18 +52,18 @@ retain higher accuracy than other 4bit methods and reduce memory usage, but
requires special kernels capable of handling the change in precision
performantly.
TensorRT-LLM has implemented custom kernels for AWQ, and taken the technique a
TensorRT LLM has implemented custom kernels for AWQ, and taken the technique a
step further by performing FP8 computation on Hopper GPUs instead of the
standard FP16.
Similar examples running Falcon-180B with quantization in TensorRT-LLM are
Similar examples running Falcon-180B with quantization in TensorRT LLM are
available in [examples/models/contrib/falcon](/examples/models/contrib/falcon).
## Llama-70B on H200 up to 6.7x A100
TensorRT-LLM has improved its Group Query Attention (GQA) kernels, in the
TensorRT LLM has improved its Group Query Attention (GQA) kernels, in the
generation phase, providing up to 2.4x improvement on Llama-70B over
TensorRT-LLM v0.5, achieving over **3,800** tok/s/gpu at up to **6.7x** faster
TensorRT LLM v0.5, achieving over **3,800** tok/s/gpu at up to **6.7x** faster
than A100.
**H200 6.7x A100**
@ -106,7 +106,7 @@ BS 192 </sup>
[**Grouped Query Attention (GQA)**](https://arxiv.org/abs/2305.13245v2)
(Ainslie et al., 2023), used in Llama-70B, is a variant of Multihead Attention
(MHA) which groups key-value (KV) heads together, resulting in fewer KV heads
than query (Q) heads. TensorRT-LLM has a custom implementation of MHA which
than query (Q) heads. TensorRT LLM has a custom implementation of MHA which
supports GQA, multi-query attention (MQA) and standard MHA. It leverages Tensor
Cores, including in the generation phase, and delivers great performance on
NVIDIA GPUs.
@ -116,7 +116,7 @@ NVIDIA GPUs.
These improvements will be published in the `main` branch soon, and will be
included in the v0.7 & v0.8 releases.
Similar examples running Llama-70B in TensorRT-LLM are published in
Similar examples running Llama-70B in TensorRT LLM are published in
[examples/models/core/llama](/examples/models/core/llama).
For more information about H200, please see the [H200 announcement blog](./H200launch.md).

View File

@ -1,16 +1,16 @@
> :bangbang: :new: *NVIDIA H200 has been announced & is optimized on TensorRT-LLM. Learn more about H200, & H100 comparison, here:* [**H200** achieves nearly **12,000 tokens/sec on Llama2-13B** with TensorRT-LLM](./H200launch.md)
> :bangbang: :new: *NVIDIA H200 has been announced & is optimized on TensorRT LLM. Learn more about H200, & H100 comparison, here:* [**H200** achieves nearly **12,000 tokens/sec on Llama2-13B** with TensorRT LLM](./H200launch.md)
# H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token
# H100 has 4.6x A100 Performance in TensorRT LLM, achieving 10,000 tok/s at 100ms to first token
TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x max throughput and 4.4x faster 1st token latency than A100**. H100 FP8 is able to achieve over 10,000 output tok/s at peak throughput for 64 concurrent requests, while maintaining a 1st token latency of 100ms. For min-latency applications, TRT-LLM H100 can achieve less than 10ms to 1st token latency.
TensorRT LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x max throughput and 4.4x faster 1st token latency than A100**. H100 FP8 is able to achieve over 10,000 output tok/s at peak throughput for 64 concurrent requests, while maintaining a 1st token latency of 100ms. For min-latency applications, TRT-LLM H100 can achieve less than 10ms to 1st token latency.
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/TRT_LLM_v0-5-0_H100vA100_tps.png?raw=true" alt="max throughput" width="500" height="auto">
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/TRT_LLM_v0-5-0_H100vA100_1st.png?raw=true" alt="1st token latency" width="500" height="auto">
<sub>TensorRT-LLM throughput & first token latency on H100 & A100. H100 FP8, A100 FP16, SXM 80GB GPUs, ISL/OSL's provided, TP=1, BS=32/64 max throughput, BS=1 1st token latency. TensorRT-LLM v0.5.0, TensorRT 9.1. </sub>
<sub>TensorRT LLM throughput & first token latency on H100 & A100. H100 FP8, A100 FP16, SXM 80GB GPUs, ISL/OSL's provided, TP=1, BS=32/64 max throughput, BS=1 1st token latency. TensorRT LLM v0.5.0, TensorRT 9.1. </sub>
<sub>Max throughput calculated by sweeping BS 1,2,...,64. Throughput taken at largest successful.</sub>
**Max Throughput & Min Latency**
@ -26,9 +26,9 @@ TensorRT-LLM evaluated on both Hopper and Ampere shows **H100 FP8 is up to 4.6x
| GPT-J 6B | 64 | 128 | 128 | **3.0x** | **4.7x** |
| GPT-J 6B | 1 | 128 | - | **2.4x** | 1.7x |
<sub>FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT-LLM v0.5.0., TensorRT 9.1</sub>
<sub>FP8 H100, FP16 A100, SXM 80GB GPUs, TP1, ISL/OSL's provided, TensorRT LLM v0.5.0., TensorRT 9.1</sub>
The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT-LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html)
The full data behind these charts & tables and including larger models with higher TP values can be found in TensorRT LLM's [Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html)
Stay tuned for a highlight on Llama coming soon!

View File

@ -1,8 +1,8 @@
:loudspeaker: Note: The below data is using TensorRT-LLM v0.5. There have been significant improvements in v0.6 & later. Please see updated Llama performance [here](./Falcon180B-H200.md).
:loudspeaker: Note: The below data is using TensorRT LLM v0.5. There have been significant improvements in v0.6 & later. Please see updated Llama performance [here](./Falcon180B-H200.md).
# H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM
# H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT LLM
TensorRT-LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news/nvidia-supercharges-hopper-the-worlds-leading-ai-computing-platform) achieves **11,819 tokens/s on Llama2-13B** on a single GPU. H200 is up to **1.9x faster** than H100. This performance is enabled by H200's larger, faster [HBM3e memory](#latest-hbm-memory).
TensorRT LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news/nvidia-supercharges-hopper-the-worlds-leading-ai-computing-platform) achieves **11,819 tokens/s on Llama2-13B** on a single GPU. H200 is up to **1.9x faster** than H100. This performance is enabled by H200's larger, faster [HBM3e memory](#latest-hbm-memory).
**H200 FP8 Max throughput**
@ -17,11 +17,11 @@ TensorRT-LLM evaluation of the [new H200 GPU](https://nvidianews.nvidia.com/news
| llama_70b | 64 | 1 | 2048 | 128 | 341 |
| llama_70b | 32 | 1 | 2048 | 128 | 303 |
<sub>Preliminary measured performance, subject to change. TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub>
<sub>Preliminary measured performance, subject to change. TensorRT LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub>
<sup>*(1) Largest batch supported on given TP configuration by power of 2.*</sup> <sup>*(2) TP = Tensor Parallelism*</sup>
Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT-LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html).
Additional Performance data is available on the [NVIDIA Data Center Deep Learning Product Performance](https://developer.nvidia.com/deep-learning-performance-training-inference/ai-inference) page, & soon in [TensorRT LLM's Performance Documentation](https://nvidia.github.io/TensorRT-LLM/latest/performance/perf-overview.html).
### H200 vs H100
@ -38,7 +38,7 @@ an online chat agent scenario (ISL/OSL=80/200) with GPT3-175B on a full HGX (TP8
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/H200launch_tps.png?raw=true" alt="H200 TPS" width="500" height="auto">
<sub>Preliminary measured performance, subject to change.
TensorRT-LLM v0.5.0, TensorRT v9.1.0.4. | Llama-70B: H100 FP8 BS 8, H200 FP8 BS 32 | GPT3-175B: H100 FP8 BS 64, H200 FP8 BS 128 </sub>
TensorRT LLM v0.5.0, TensorRT v9.1.0.4. | Llama-70B: H100 FP8 BS 8, H200 FP8 BS 32 | GPT3-175B: H100 FP8 BS 64, H200 FP8 BS 128 </sub>
**Max Throughput across TP/BS:**
@ -47,7 +47,7 @@ Max throughput<sup>(3)</sup> on H200 vs H100 varies by model, sequence lengths,
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/rel/docs/source/blogs/media/H200launch_H200vsH100_tps.png?raw=true" alt="max throughput llama sweep" width="500" height="auto">
<sub>Preliminary measured performance, subject to change.
TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub>
TensorRT LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub>
<sup>*(3) Max Throughput per GPU is defined as the highest tok/s per GPU, swept across TP configurations & BS powers of 2.*</sup>
@ -55,4 +55,4 @@ TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub>
### Latest HBM Memory
H200 is the newest addition to NVIDIAs data center GPU portfolio. To maximize that compute performance, H200 is the first GPU with HBM3e memory with 4.8TB/s of memory bandwidth, a 1.4X increase over H100. H200 also expands GPU memory capacity nearly 2X to 141 gigabytes (GB). The combination of faster and larger HBM memory accelerates performance of LLM model inference performance with faster throughput and tokens per second. These results are measured and preliminary, more updates expected as optimizations for H200 continue with TensorRT-LLM.
H200 is the newest addition to NVIDIA's data center GPU portfolio. To maximize that compute performance, H200 is the first GPU with HBM3e memory with 4.8TB/s of memory bandwidth, a 1.4X increase over H100. H200 also expands GPU memory capacity nearly 2X to 141 gigabytes (GB). The combination of faster and larger HBM memory accelerates LLM inference performance with higher throughput and tokens per second. These results are measured and preliminary; more updates are expected as optimizations for H200 continue with TensorRT LLM.

View File

@ -1,6 +1,6 @@
# ADP Balance Strategy
By NVIDIA TensorRT-LLM team
By NVIDIA TensorRT LLM team
## Table of Contents
- [ADP Balance Strategy](#adp-balance-strategy)
@ -96,7 +96,7 @@ The conventional approach employs a global load balancing strategy that sorts in
<div align="center">
<figure>
<img src="./../media/tech_blog10_baseline_round_robin_strategy.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_baseline_round_robin_strategy.png">
</figure>
</div>
<p align="center"><sub><em>Figure 1: Baseline round-robin strategy balances context request tokens across ranks through sorting and cyclic distribution</em></sub></p>
@ -179,7 +179,7 @@ We evaluate our approach using a comprehensive dataset comprising 16,000 inferen
<div align="center">
<figure>
<img src="./../media/tech_blog10_dataset_token_distribution.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_dataset_token_distribution.png">
</figure>
</div>
<p align="center"><sub><em>Figure 2: Distribution of input and output token lengths</em></sub></p>
@ -225,7 +225,7 @@ Figure 3 provides comprehensive insight into baseline system behavior, displayin
<div align="center">
<figure>
<img src="./../media/tech_blog10_baseline_performance_overview.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_baseline_performance_overview.png">
</figure>
</div>
<p align="center"><sub><em>Figure 3: Baseline performance overview showing token distribution and balance ratios across all iterations</em></sub></p>
@ -239,7 +239,7 @@ Figure 4 zooms into the critical imbalance period [100-12,000], revealing the dr
<div align="center">
<figure>
<img src="./../media/tech_blog10_baseline_performance_detail.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_baseline_performance_detail.png">
</figure>
</div>
<p align="center"><sub><em>Figure 4: Detailed baseline analysis for iterations 100-12,000 showing severe balance fluctuations</em></sub></p>
@ -260,7 +260,7 @@ The Context Wait mechanism (`timeout_iters=50`) demonstrates the effectiveness o
<div align="center">
<figure>
<img src="./../media/tech_blog10_context_wait_performance.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_context_wait_performance.png">
</figure>
</div>
<p align="center"><sub><em>Figure 5: Context Wait performance showing improved balance stability for iterations 100-12,000</em></sub></p>
@ -300,7 +300,7 @@ The effectiveness of our complete ADP Balance implementation is clearly demonstr
<div align="center">
<figure>
<img src="./../media/tech_blog10_full_strategy_performance.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_full_strategy_performance.png">
</figure>
</div>
<p align="center"><sub><em>Figure 6: Full ADP Balance strategy demonstrating superior balance stability for iterations 100-12,000</em></sub></p>
@ -324,7 +324,7 @@ Understanding the performance trade-offs inherent in our ADP Balance strategy is
<div align="center">
<figure>
<img src="./../media/tech_blog10_tps_ttft_pareto_curve.png">
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog10_tps_ttft_pareto_curve.png">
</figure>
</div>
<p align="center"><sub><em>Figure 7: Pareto frontier analysis showing throughput-latency trade-offs across different ADP Balance configurations</em></sub></p>
@ -364,4 +364,4 @@ The Pareto frontier analysis provides critical insights for real-world deploymen
## Acknowledgement
The ADP Balance strategy was a great team effort, covering system performance analysis and optimization. While we cannot thank every contributor individually, we are proud to acknowledge the dedicated team of engineers whose collective expertise has propelled TensorRT-LLM to new heights of performance. Through this collaborative effort, we have gained valuable insights into improving GPU utilization for large language model inference. We hope the techniques and experiences shared in this blog post will empower the developer community to better leverage the performance of NVIDIA GPUs in their mission-critical LLM inference applications.
The ADP Balance strategy was a great team effort, covering system performance analysis and optimization. While we cannot thank every contributor individually, we are proud to acknowledge the dedicated team of engineers whose collective expertise has propelled TensorRT LLM to new heights of performance. Through this collaborative effort, we have gained valuable insights into improving GPU utilization for large language model inference. We hope the techniques and experiences shared in this blog post will empower the developer community to better leverage the performance of NVIDIA GPUs in their mission-critical LLM inference applications.

View File

@ -1,4 +1,4 @@
## Running GPT-OSS-120B with Eagle3 Speculative Decoding on GB200/B200 (TensorRT-LLM)
## Running GPT-OSS-120B with Eagle3 Speculative Decoding on GB200/B200 (TensorRT LLM)
This guide sets up a production endpoint that uses Eagle3 speculative decoding on NVIDIA GB200 or B200 GPUs only. It replaces the low-latency flow from the previous guide and intentionally omits max-throughput, Hopper, and benchmarking content.
@ -17,7 +17,7 @@ Expected directory layout on the host (example):
└─ eagle/ # Eagle3 speculative decoding assets
```
### Get the TensorRT-LLM Container (1.1.0rc0)
### Get the TensorRT LLM Container (1.1.0rc0)
If required by your environment, log into NGC and pull the image:
@ -30,7 +30,7 @@ docker login nvcr.io
docker pull nvcr.io/nvidia/tensorrt-llm/release:1.1.0rc0
```
### Start the TensorRT-LLM Container
### Start the TensorRT LLM Container
Run the container and bind-mount your models directory to `/config/models` inside the container:
@ -122,7 +122,7 @@ When `Status: 200` is returned, the endpoint is ready to serve requests.
### Sample Chat Completions Request
Note: This Eagle3 + TensorRT-LLM endpoint currently supports only greedy sampling. The following Chat Completions parameters are ignored (no-ops): `temperature`, `top_p`, `top_k`, and `seed`.
Note: This Eagle3 + TensorRT LLM endpoint currently supports only greedy sampling. The following Chat Completions parameters are ignored (no-ops): `temperature`, `top_p`, `top_k`, and `seed`.
Send a simple OpenAI-compatible Chat Completions request to the running server:
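For instance, a request could be sent as follows; the port and served model name are assumptions that depend on how the server was launched:

```python
import requests

# Hypothetical OpenAI-compatible Chat Completions request; temperature/top_p/
# top_k/seed would be ignored by this greedy-only Eagle3 endpoint.
resp = requests.post(
    "http://localhost:8000/v1/chat/completions",   # assumed host/port
    json={
        "model": "gpt-oss-120b",                   # assumed served model name
        "messages": [{"role": "user", "content": "Hello!"}],
        "max_tokens": 128,
    },
    timeout=60,
)
print(resp.json()["choices"][0]["message"]["content"])
```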

View File

@ -1,5 +1,5 @@
# Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs
by NVIDIA TensorRT-LLM team
by NVIDIA TensorRT LLM team
## Table of Contents
- [Pushing Latency Boundaries: Optimizing DeepSeek-R1 Performance on NVIDIA B200 GPUs](#pushing-latency-boundaries-optimizing-deepseek-r1-performance-on-nvidia-b200-gpus)
@ -39,7 +39,7 @@ by NVIDIA TensorRT-LLM team
## Background
Recent advancements in Large Language Reasoning Models have demonstrated remarkable success, while creating new deployment challenges. A critical challenge emerges from extended Output Sequence Lengths (OSL) due to complex "thinking and reasoning" processes. Longer OSL demands stricter Token-to-Token Latency (TTL) requirements, often forcing concurrency limitations. The most extreme case, single concurrency (min-latency scenario), becomes particularly challenging for real-time applications.
This article explores how TensorRT-LLM achieves record-breaking performance for [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) in min-latency scenarios on NVIDIA's 8×B200 GPU configuration progressing from 67 tokens per second (TPS) to 253 before GTC 2025(**3.7x** speed-up), and to our current number is 368 TPS (**5.5x** speed-up).
This article explores how TensorRT LLM achieves record-breaking performance for [DeepSeek-R1](https://huggingface.co/deepseek-ai/DeepSeek-R1) in min-latency scenarios on NVIDIA's 8×B200 GPU configuration, progressing from 67 tokens per second (TPS) to 253 TPS before GTC 2025 (**3.7x** speed-up), and to our current 368 TPS (**5.5x** speed-up).
## Implementation Configuration
@ -65,7 +65,7 @@ We have explored a mixed precision recipe, which provides a better tradeoff betw
| 3x MTP Layers | bf16 |
| RouterGEMM*** | bf16 |
*TensorRT-LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla) while for this latency scenario low-precision attention computation doesn't help with performance so we choose to use bf16 precision for the Attention Modules.
*TensorRT LLM already supports [FP8 Attention](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/models/core/deepseek_v3#fp8-kv-cache-and-mla), but for this latency scenario low-precision attention computation doesn't help with performance, so we choose to use bf16 precision for the Attention Modules.
** nvfp4 model checkpoint is generated by the [NVIDIA TensorRT Model Optimizer toolkit](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
@ -263,6 +263,6 @@ It's also needed to set `use_relaxed_acceptance_for_thinking: true`, `relaxed_to
- More Exploration of MTP
## Acknowledgment
Pushing the performance boundaries of DeepSeek R1 for latency-sensitive applications has been a remarkable engineering journey. The optimizations detailed in this post represent an exceptional cross-functional collaboration across the entire AI technology stack - spanning kernel-level optimizations, runtime enhancements, model quantization techniques, algorithmic improvements, and systematic performance analysis and tuning. While we can't individually acknowledge every contributor, we're proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in TensorRT-LLM performance engineering.
Pushing the performance boundaries of DeepSeek R1 for latency-sensitive applications has been a remarkable engineering journey. The optimizations detailed in this post represent an exceptional cross-functional collaboration across the entire AI technology stack - spanning kernel-level optimizations, runtime enhancements, model quantization techniques, algorithmic improvements, and systematic performance analysis and tuning. While we can't individually acknowledge every contributor, we're proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in TensorRT LLM performance engineering.
Through this collaborative endeavor, we've developed valuable insights into maximizing GPU utilization for large language model inference. We hope that the techniques and best practices shared in this blog will empower the developer community to better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.

View File

@ -1,5 +1,5 @@
# DeepSeek R1 MTP Implementation and Optimization
by NVIDIA TensorRT-LLM team
by NVIDIA TensorRT LLM team
## Table of Contents
- [DeepSeek R1 MTP Implementation and Optimization](#deepseek-r1-mtp-implementation-and-optimization)
- [Table of Contents](#table-of-contents)
@ -7,7 +7,7 @@ by NVIDIA TensorRT-LLM team
- [Background](#background)
- [MTP Vanilla](#mtp-vanilla)
- [MTP Eagle](#mtp-eagle)
- [MTP implementation in TensorRT-LLM](#mtp-implementation-in-tensorrt-llm)
- [MTP implementation in TensorRT LLM](#mtp-implementation-in-tensorrt-llm)
- [Basic Implementation](#basic-implementation)
- [MTP Modules](#mtp-modules)
- [Attention for MTP](#attention-for-mtp)
@ -25,7 +25,7 @@ by NVIDIA TensorRT-LLM team
- [Acknowledgment](#acknowledgment)
TensorRT-LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT-LLM.
TensorRT LLM achieves world-record inference performance for DeepSeek-R1 on NVIDIA Blackwell GPUs, where Multi-Token Prediction (MTP) delivers a significant speedup. In our [previous blog post](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md), we discussed the key optimizations that enable the outstanding inference latency of the DeepSeek-R1 model. This article dives deeper into the implementation and optimization of MTP in TensorRT LLM.
## MTP for inference
Inspired by a previous [research work](https://arxiv.org/pdf/2404.19737), MTP is designed to help the DeepSeek-V3 training. It adds additional MTP modules at the end of the main model and uses them to predict additional tokens. In this way, MTP can extend the prediction scope to multiple future tokens at each position to achieve better model accuracy. During inference, those MTP modules can also be used for speculative decoding to improve the generation latency further. In this section, we will introduce the MTP speculative decoding algorithm for LLM inference.
@ -74,18 +74,18 @@ Figure 3 gives an MTP Eagle example. In the context phase, the inputs of the fir
In the generation phase, the verification stage is the same as MTP Vanilla. Once we get the accepted tokens, we use all of them along with their corresponding hidden states as inputs for the first MTP module forward. Unlike MTP Vanilla, which needs to store past tokens and hidden states, this approach is much easier to implement. Subsequent MTP module forwards follow the same input preparation method as the context phase. After predicting all draft tokens, we need to evict the key/value pairs of any rejected draft tokens from the main model's KV cache.
## MTP implementation in TensorRT-LLM
## MTP implementation in TensorRT LLM
### Basic Implementation
TensorRT-LLM has two different paths for MTP, one for [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047) and another for [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047). MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models.
TensorRT LLM has two different paths for MTP, one for [MTP Vanilla](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047) and another for [MTP Eagle](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/mtp.py#L1047). MTP Eagle is the default path for DeepSeek-V3 and DeepSeek-R1 models.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog2_overall_workflow.png" alt="tech_blog2_overall_workflow" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 4. MTP workflow in TensorRT-LLM</em></sub></p>
<p align="center"><sub><em>Figure 4. MTP workflow in TensorRT LLM</em></sub></p>
Figure 4 shows the overall workflow of MTP in TensorRT-LLM. Both paths share the runtime workflow, and the differences are in the MTP modules forward. In the context phase, there is no draft token in the inputs. TensorRT-LLM model engine fetches the input IDs from the requests and inputs to the model engine forward to get the next token and the hidden state. Then we prepare the MTP module inputs, and the MTP modules forward the inputs to predict the draft tokens.
Figure 4 shows the overall workflow of MTP in TensorRT LLM. Both paths share the runtime workflow, and the differences are in the MTP module forward passes. In the context phase, there are no draft tokens in the inputs. The TensorRT LLM model engine fetches the input IDs from the requests and feeds them into the model engine forward pass to get the next token and the hidden state. Then we prepare the MTP module inputs, and the MTP modules forward the inputs to predict the draft tokens.
The generation workflow is more complicated. We need to do both the verification and draft stages. The predicted new token and draft tokens are the inputs for the main model. After the main model forward, we can sample from the output logits and get the following new tokens. Then compare them with the input draft tokens to get the final accepted tokens. The verification stage will be finished here. We will use the accepted tokens and hidden states to start a new draft stage, which uses the MTP layers to predict new draft tokens for the next iteration. Finally, we need to rewind the KV cache to evict keys/values corresponding to those rejected tokens.
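As a rough illustration of the verification stage described above (a sketch only, not the TensorRT LLM implementation), chain-based acceptance of draft tokens can be written as:

```python
# sampled_tokens: K+1 tokens greedily sampled from the main model's logits,
#                 one per input position; draft_tokens: the K input draft tokens.
def verify_draft_tokens(sampled_tokens, draft_tokens):
    accepted = [sampled_tokens[0]]            # token after the last verified position
    for i, draft in enumerate(draft_tokens):
        if draft != sampled_tokens[i]:        # the chain breaks at the first mismatch
            break
        accepted.append(sampled_tokens[i + 1])
    num_rejected = len(draft_tokens) - (len(accepted) - 1)
    return accepted, num_rejected             # rejected slots are rewound from the KV cache
```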
@ -107,7 +107,7 @@ The MTP module follows the design in DeepSeek-V3. The embedding layer and output
### Attention for MTP
Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For the normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently, we only have the chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT-LLM will use the fp8 flashMLA generation kernel on Hopper GPU, while using TRTLLM customized attention kernels on Blackwell for better performance.
Attention is also a very important component in supporting MTP inference. The changes are mainly in the attention kernels for the generation phase. For the normal request, there will be only one input token in the generation phase, but for MTP, there will be $K+1$ input tokens. Since MTP sequentially predicts additional tokens, the predicted draft tokens are chained. Though we have an MTP Eagle path, currently, we only have the chain-based support for MTP Eagle. So, a causal mask is enough for the attention kernel to support MTP. In our implementation, TensorRT LLM will use the fp8 flashMLA generation kernel on Hopper GPU, while using TRTLLM customized attention kernels on Blackwell for better performance.
### How to run DeepSeek models with MTP
Run DeepSeek-V3/R1 models with MTP, use [examples/llm-api/quickstart_advanced.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llm-api/quickstart_advanced.py) with additional options:
@ -237,12 +237,12 @@ We validated the Relaxed Acceptance on different datasets. In Figure 8, we show
</div>
<p align="center"><sub><em>Figure 9. Comparison between the chain-based and tree-based speculative decoding</em></sub></p>
TensorRT-LLM PyTorch backend can only support chain-based speculative decoding now, both MTP Vanilla and MTP Eagle. However, the tree-based speculative decoding technique is widely used in previous advanced methods, such as Ealge2 and Eagle3, to increase the acceptance rate. MTPs in TensorRT-LLM can also be extended to support the tree-based technique. Figure 9 compares the chain-based method with the tree-based method. Both full tree and dynamic tree methods can help expand the candidate combinations, so that we can have more choices for the draft tokens.
The TensorRT LLM PyTorch backend can only support chain-based speculative decoding now, both MTP Vanilla and MTP Eagle. However, the tree-based speculative decoding technique is widely used in previous advanced methods, such as Eagle2 and Eagle3, to increase the acceptance rate. MTPs in TensorRT LLM can also be extended to support the tree-based technique. Figure 9 compares the chain-based method with the tree-based method. Both full tree and dynamic tree methods can help expand the candidate combinations, so that we can have more choices for the draft tokens.
### Eagle3 support
Another important method is Eagle3. From the [Eagle3 paper](https://arxiv.org/pdf/2503.01840), the promising results show that it can help greatly increase the acceptance rate by leveraging different levels hidden states to predict draft tokens. Since TensorRT-LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035) now, in the future, we also want to train an Eagle3 head to support DeepSeek-V3/R1+Eagle3 to achieve better speedup.
Another important method is Eagle3. The promising results in the [Eagle3 paper](https://arxiv.org/pdf/2503.01840) show that it can greatly increase the acceptance rate by leveraging hidden states from different levels to predict draft tokens. Since TensorRT LLM already has [Eagle-3 support](https://github.com/NVIDIA/TensorRT-LLM/pull/3035), in the future we also want to train an Eagle3 head for DeepSeek-V3/R1+Eagle3 to achieve better speedup.
## Acknowledgment
This was a remarkable cross-team effort to support and optimize MTP in TensorRT-LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement & analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog.
This was a remarkable cross-team effort to support and optimize MTP in TensorRT LLM. We would like to extend our gratitude to everyone who contributed to making this possible, as it involved a typical system/algorithm co-design approach spanning multiple technical layers—including kernel optimization, runtime enhancements, algorithmic improvements, and performance measurement & analysis. And a special thanks goes to the DeepSeek team for developing the MTP method, which lays down the foundation of this blog.

View File

@ -1,6 +1,6 @@
# Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers
By NVIDIA TensorRT-LLM team
By NVIDIA TensorRT LLM team
## Table of Contents
- [Optimizing DeepSeek R1 Throughput on NVIDIA Blackwell GPUs: A Deep Dive for Developers](#optimizing-deepseek-r1-throughput-on-nvidia-blackwell-gpus-a-deep-dive-for-developers)
- [Table of Contents](#table-of-contents)
@ -18,7 +18,7 @@ By NVIDIA TensorRT-LLM team
- [Acknowledgment](#acknowledgment)
## Introduction
The open source DeepSeek R1 model's innovative architecture including the multi-head latent attention (MLA) and large sparse Mixture-of-Experts (MoE) significantly improved the inference efficiency of the LLM models. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies for DeepSeek R1 throughput oriented scenarios (TPS/GPU), developed by NVIDIA within TensorRT-LLM on NVIDIA's Blackwell B200 GPUs. We will explore the rationale behind each enhancement. [The other min-latency optimization blog](./blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) explained in detail how TensorRT-LLM optimizes the R1 performance to achieve the best of the TPS/USER.
The open-source DeepSeek R1 model's innovative architecture, including multi-head latent attention (MLA) and a large sparse Mixture-of-Experts (MoE), significantly improves LLM inference efficiency. However, harnessing the full potential of such an innovative structure requires equally important hardware/software co-optimization. This post delves into the optimization strategies for DeepSeek R1 throughput-oriented scenarios (TPS/GPU), developed by NVIDIA within TensorRT LLM on NVIDIA's Blackwell B200 GPUs. We will explore the rationale behind each enhancement. [The other min-latency optimization blog](./blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) explains in detail how TensorRT LLM optimizes R1 performance to achieve the best TPS/user.
These optimizations have significantly boosted DeepSeek R1 throughput on Blackwell. Performance increased from approximately 2000 TPS/GPU in February to 4600 TPS/GPU on ISL/OSL 1K/2K dataset. The optimizations are general and applicable to other ISL/OSL configs too. These optimization items were broadly categorized into three areas: MLA layers, MoE layers, and runtime.
@ -29,12 +29,12 @@ The mixed precision recipe for DeepSeek R1 throughput scenario is almost the sam
* FP8 KV cache and FP8 attention, rather than BF16 precision.
* FP4 Allgather for better communication bandwidth utilization.
The checkpoint used in this blog is hosted in [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy score of common dataset on this FP4 checkpoint and TensorRT-LLM implementations are:
The checkpoint used in this blog is hosted at [nvidia/DeepSeek-R1-FP4](https://huggingface.co/nvidia/DeepSeek-R1-FP4), generated by [NVIDIA Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer). The accuracy scores on common datasets for this FP4 checkpoint with the TensorRT LLM implementation are:
| Precision | GPQA Diamond | MATH-500 |
| :-- | :-- | :-- |
| TensorRT-LLM FP8 | 0.697 | 0.954 |
| TensorRT-LLM FP4 | 0.705 | 0.96 |
| TensorRT LLM FP8 | 0.697 | 0.954 |
| TensorRT LLM FP4 | 0.705 | 0.96 |
** Note that there is some run-to-run variance in these evaluations, so the FP4 numbers are slightly higher here. We think FP4 has comparable accuracy to FP8 on these datasets.
@ -62,12 +62,12 @@ In the following sections we will explain the rationale why DP and EP are chosen
### Weights absorb and MQA
The core idea of MLA is the low-rank joint compression for the attention keys and values to reduce KV-cache size during the inference. Based on the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to establish a normal multi-head attention (MHA). Due to the nature of the matrix multiplication, the up projection weights matrix of the K (W^UK) can be multiplied by the up-projection weights matrix of Q (W^Q) firstly, the computed results of these 2 can be then multiplied to Q. The up-projection weights matrix of V (W^UV) and the attention output projection matrix W^O can also be multiplied after the attention output. The DeepSeek-V2 technical report calls this technique "absorb". After the weights are absorbed, the MLA is equivalent to multiple query attention(MQA). Please see the [original DeepSeek-V2 technical paper](https://arxiv.org/pdf/2405.04434) for the detailed formulas and explanations, the following block diagram shows the computational flow of weights absorbed MLA in TensorRT-LLM.
The core idea of MLA is the low-rank joint compression of the attention keys and values to reduce the KV-cache size during inference. Based on the MLA formulas, the down-projected KV latent is up-projected to multiple heads and combined with the up-projected Q to establish a normal multi-head attention (MHA). Due to the associativity of matrix multiplication, the up-projection weight matrix of K (W^UK) can first be multiplied with the up-projection weight matrix of Q (W^Q), and the combined result can then be applied to Q. Similarly, the up-projection weight matrix of V (W^UV) and the attention output projection matrix W^O can be multiplied after the attention output. The DeepSeek-V2 technical report calls this technique "absorb". After the weights are absorbed, the MLA is equivalent to multi-query attention (MQA). Please see the [original DeepSeek-V2 technical paper](https://arxiv.org/pdf/2405.04434) for the detailed formulas and explanations. The following block diagram shows the computational flow of weights-absorbed MLA in TensorRT LLM.
![Weights Absorb](../media/tech_blog3_mla_absorb.png "Weights Absorbed MLA")
For the decoding phase, weight absorption significantly reduces the math FLOPs needed to up-project K and V, since the FLOPs for these KV up-projections are linear in the KV cache length, while the Q length is always 1 in the decoding phase. The longer the KV cache history, the more FLOPs are needed, and the up-projections are repeated for every decoded token since only the down-projected KV latent is saved, which further increases the FLOPs needed.
For the prefill phase, the weights-absorbed version changes the dimensions of Q and KV, thus increasing the number of FLOPs for attention. Based on roofline analysis, the non-absorbed version is beneficial for the prefill phase with input lengths of 256 or larger.
The TensorRT-LLM MLA implementation chooses different highly optimized kernels for prefill and decoding, see [MLA](../../../../tensorrt_llm/_torch/modules/attention.py).
The TensorRT LLM MLA implementation chooses different highly optimized kernels for prefill and decoding, see [MLA](../../../../tensorrt_llm/_torch/modules/attention.py).
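The absorption trick is just matrix associativity. The toy NumPy sketch below (single head, made-up dimensions and weight names) checks that folding the K up-projection into the query side yields the same attention logits while avoiding the per-token up-projection of every cached latent:
```python
import numpy as np

rng = np.random.default_rng(0)
d_q, d_kv, d_head, T = 32, 16, 64, 128       # toy sizes; T = cached KV length

W_UQ = rng.standard_normal((d_head, d_q))    # query up-projection (one head, illustrative name)
W_UK = rng.standard_normal((d_head, d_kv))   # key up-projection (one head, illustrative name)
q_lat = rng.standard_normal(d_q)             # query latent for the new token
C_kv = rng.standard_normal((T, d_kv))        # cached compressed KV latents

# Naive: up-project every cached latent before scoring; cost grows with T each step.
scores_naive = (C_kv @ W_UK.T) @ (W_UQ @ q_lat)

# Absorbed: fold W_UK into the query side once, then score against the raw latents.
q_absorbed = (W_UQ @ q_lat) @ W_UK           # shape (d_kv,)
scores_absorbed = C_kv @ q_absorbed

assert np.allclose(scores_naive, scores_absorbed)
```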
### Data Parallel for Attention module (ADP)
@ -147,7 +147,7 @@ The following optimizations are already done for MoE layers.
## Runtime Optimizations
These optimizations target the overall execution flow, scheduling, and resource management within the inference system. They are shared between DeepSeek R1 models and other models supported in the TensorRT-LLM, here we are sharing some ablation study for the performance benefits on DeepSeek R1 on B200.
These optimizations target the overall execution flow, scheduling, and resource management within the inference system. They are shared between DeepSeek R1 and other models supported in TensorRT LLM; here we share some ablation studies of the performance benefits for DeepSeek R1 on B200.
* CUDA Graph
@ -155,13 +155,13 @@ These optimizations target the overall execution flow, scheduling, and resource
CUDA Graphs allow capturing a sequence of CUDA operations and launching them as a single unit, drastically reducing kernel launch overheads. This is particularly beneficial for models with many small kernels, and especially on the PyTorch flow, because the Python host code normally executes slower than C++. Since a CUDA Graph freezes the kernel launch parameters, which are normally associated with the tensor shapes, it can only be safely used with static shapes, meaning that different CUDA Graphs need to be captured for different batch sizes. Each graph has some cost in memory usage and capturing time, so we cannot capture a CUDA Graph for every possible batch size. For the non-captured batch sizes, PyTorch eager mode code will be executed.
There is a feature called CUDA Graph padding in TensorRT-LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest one with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to wasted tokens computation.
There is a feature called CUDA Graph padding in TensorRT LLM, which is a good trade-off between the number of CUDA Graphs and the CUDA Graph hit ratio; it tries to pad a batch to the nearest size with a captured CUDA Graph. Normally you should enable the CUDA Graph padding feature to increase the CUDA Graph hit rate, but the padding itself has some overhead due to the wasted computation on padded tokens.
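A minimal sketch of the padding trade-off (the capture list and helper below are illustrative, not the actual TensorRT LLM scheduling code):
```python
import bisect

CAPTURED_BATCH_SIZES = [1, 2, 4, 8, 16, 32, 64, 128]   # example capture list

def pick_cuda_graph(batch_size: int):
    """Pad the runtime batch up to the nearest captured size, if any.

    Returns (graph_batch_size, num_padded_slots), or None to fall back to
    eager mode when the batch is larger than anything captured.
    """
    i = bisect.bisect_left(CAPTURED_BATCH_SIZES, batch_size)
    if i == len(CAPTURED_BATCH_SIZES):
        return None                          # no captured graph is large enough
    target = CAPTURED_BATCH_SIZES[i]
    return target, target - batch_size       # padded slots = wasted computation

print(pick_cuda_graph(3))    # (4, 1): run the bs=4 graph with 1 padded slot
print(pick_cuda_graph(200))  # None: fall back to eager mode
```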
Users can opt out of the CUDA Graph padding feature to evaluate its performance benefit by setting `cuda_graph_config:\n enable_padding: False`; see the API here: [PyTorch backend config](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/config.py#L41).
* Overlap Scheduler:
Showed a **4% E2E performance impact** and should generally **always be used**. This scheduler manages the execution of different operations (like computation and communication) to overlap them effectively on the GPU and network. The intuition is to hide latency by performing computation while waiting for data transfers or vice versa, improving overall hardware utilization. The overlap schedule is already defaulted on in TensorRT-LLM by [commit](https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf). In case there are corner cases where it does not work, users can still opt-out this feature by set *disable_overlap_scheduler* to true.
Showed a **4% E2E performance impact** and should generally **always be used**. This scheduler manages the execution of different operations (like computation and communication) to overlap them effectively on the GPU and network. The intuition is to hide latency by performing computation while waiting for data transfers or vice versa, improving overall hardware utilization. The overlap scheduler is enabled by default in TensorRT LLM since [this commit](https://github.com/NVIDIA/TensorRT-LLM/commit/b4e5df0ee0024eda3eeb83a6ba822245a30ab428#diff-3c4f29d6594b37af0f1fbb97f5291b18e49f3f2510f9d296c7adb2829e9da0bf). For corner cases where it does not work, users can still opt out of this feature by setting *disable_overlap_scheduler* to true.
* Memory Optimizations
@ -179,4 +179,4 @@ See [Perf practices](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/sourc
## Acknowledgment
The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT-LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware.
The substantial throughput advancements for DeepSeek R1 on Blackwell GPUs, as detailed in this post, are the fruit of a dedicated and collaborative engineering effort. Achieving nearly a 2.3x increase in TPS/GPU required a deep dive into MLA layers, MoE layers, and runtime optimizations. We extend our sincere appreciation to all the engineers involved in this intensive optimization process. Their collective expertise in pushing the boundaries of throughput performance within TensorRT LLM has been instrumental. We trust that sharing these specific strategies for maximizing throughput will prove beneficial to the developer community as they tackle demanding LLM inference workloads on NVIDIA hardware.

View File

@ -1,9 +1,9 @@
# Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)
# Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)
By NVIDIA TensorRT-LLM Team
By NVIDIA TensorRT LLM Team
## Table of Contents
- [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llm-part-1-design-and-implementation-of-large-scale-ep)
- [Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)](#scaling-expert-parallelism-in-tensorrt-llm-part-1-design-and-implementation-of-large-scale-ep)
- [Table of Contents](#table-of-contents)
- [Motivation for large-scale EP](#motivation-for-large-scale-ep)
- [Observations over one machine translation dataset](#observations-over-one-machine-translation-dataset)
@ -39,7 +39,7 @@ In the past, we have shared TensorRT-LLMs optimization experience to [push th
The DeepSeek team has also shared their valuable experience and practice on how to optimize this kind of large-scale Expert Parallelism (EP) model, including [DeepEP](https://github.com/deepseek-ai/DeepEP) and [EPLB](https://github.com/deepseek-ai/EPLB). Also, the DeepSeek team has shared their concrete design considerations in [this](https://arxiv.org/abs/2412.19437) tech report. On top of this great prior work, there are also nice community efforts to implement large-scale EP in other inference engines, such as [this](https://lmsys.org/blog/2025-05-05-large-scale-ep/) effort from the SGLang team.
In this tech blog, we will introduce the details of the design and implementation to support E2E large-scale EP in TensorRT-LLM. This blog post mainly covers the following:
In this tech blog, we will introduce the details of the design and implementation to support E2E large-scale EP in TensorRT LLM. This blog post mainly covers the following:
* How to leverage NVIDIA GB200 Multi-Node NVLink (MNNVL) HW features to implement high-performance communication kernels.
* How to design and implement an online expert workload balancer to dynamically balance the expert load distribution and adapt to the changes of online traffic patterns. We present:
@ -48,16 +48,16 @@ In this tech blog, we will introduce the details of the design and implementatio
* The design and implementation of the replication/placement strategy.
* The MoE weight load/re-distributer to balance the online workload across multiple GPUs.
* The changes needed to the MoE router and computation module to adapt to the expert load balancer needs.
* Some preliminary data demonstrating the effectiveness of the current implementation in TensorRT-LLM.
* Some preliminary data demonstrating the effectiveness of the current implementation in TensorRT LLM.
In future tech blogs, we will also cover the following topics:
* The introduction of performance tuning and optimization for TensorRT-LLM large-scale EP GB200 implementation.
* The introduction of performance tuning and optimization for TensorRT LLM large-scale EP GB200 implementation.
* How to implement efficient large-scale EP support for B200/Hopper and other NVIDIA GPUs without MNNVL.
* The best practices to leverage large-scale EP and get performance gains.
* How to combine large-scale EP with other system optimization techniques.
Even if, in this tech blog, we focus on TensorRT-LLM, we believe the core ideas and implementation can also be applied to other inference engines to help the inference performance on NVIDIA GPUs. Also, with the help of the community, we would like to figure out how to better modularize the current TensorRT-LLM large-scale EP implementation and make it more easily reusable by the community.
Even though, in this tech blog, we focus on TensorRT LLM, we believe the core ideas and implementation can also be applied to other inference engines to help inference performance on NVIDIA GPUs. Also, with the help of the community, we would like to figure out how to better modularize the current TensorRT LLM large-scale EP implementation and make it more easily reusable by the community.
Finally, in this tech blog, there are implementation details which are targeted towards the GB200 system, such as the communication components leveraging the GB200 MNNVL inter-GPU connection, and the MoE weight load/re-distributer module leveraging the high-bandwidth C2C connection between Grace CPU and Blackwell GPU. Nevertheless, the overall design principle and software architecture can still apply to non-GB200 NVIDIA GPU systems. To facilitate the extension to other non-GB200 systems, we have deliberately paid attention to the generalization of the design and implementation. These changes should be easily composable with other existing components.
@ -221,7 +221,7 @@ To make sure large-scale EP can run well, careful considerations are needed to m
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog4_Picture12.png">
</figure>
</div>
<p align="center"><sub><em>Figure 12: the high-level design of TensorRT-LLM large-scale EP</em></sub></p>
<p align="center"><sub><em>Figure 12: the high-level design of TensorRT LLM large-scale EP</em></sub></p>
In this design, there are both CPU and GPU side logics:
@ -306,7 +306,7 @@ The current technical decision is:
The considerations are:
* DeepEP is a great piece of work done by the DeepSeek team. When we started the TensorRT-LLM large-scale EP efforts, our first focus was on GB200. We chose to implement our own custom EP communication kernels as it was easier to introduce optimizations requiring the GB200 MNNVL capability. Also, based on our current evaluation, DeepEP does not provide CUDA graph compatibility for all the scenarios. We believe that CUDA graph is needed for the scenario we are interested in.
* DeepEP is a great piece of work done by the DeepSeek team. When we started the TensorRT LLM large-scale EP efforts, our first focus was on GB200. We chose to implement our own custom EP communication kernels as it was easier to introduce optimizations requiring the GB200 MNNVL capability. Also, based on our current evaluation, DeepEP does not provide CUDA graph compatibility for all the scenarios. We believe that CUDA graph is needed for the scenario we are interested in.
* When we started the efforts to enable large-scale EP on Hopper, we concluded that DeepEP could be adapted and meet our needs on this platform. We plan to extend DeepEP to work for B200 in the future.
We are also actively evaluating the possibility of consolidating GB200 and non-GB200 EP communication kernels into a single solution to make the system simpler, and we will keep the community posted on the status.
@ -333,7 +333,7 @@ More details can be found in [PR 3504](https://github.com/NVIDIA/TensorRT-LLM/pu
## EP Load Balancer
TensorRT-LLM implements a set of functionalities to achieve EP Load Balancing. There are several key components:
TensorRT LLM implements a set of functionalities to achieve EP Load Balancing. There are several key components:
### Python Interface
@ -364,7 +364,7 @@ The GPU core logic contains the following components:
There are GPU/CPU synchronization components implemented. More details can be found in [PR 4384](https://github.com/NVIDIA/TensorRT-LLM/pull/4384) and [PR 4495](https://github.com/NVIDIA/TensorRT-LLM/pull/4495).
Based on these core utilities, there are two versions of EP Load Balancer in TensorRT-LLM: Offline EP Load Balancer and Online EP Load Balancer.
Based on these core utilities, there are two versions of EP Load Balancer in TensorRT LLM: Offline EP Load Balancer and Online EP Load Balancer.
### Online EP Load Balancer
@ -687,14 +687,14 @@ Based on our current performance analysis, when you plan to apply large-scale EP
**Please use your own judgement to decide whether to use large-scale EP into your system or not, and when you use it, what is the suitable EP size and concrete deployment settings suitable for your own requirements.**
The current TensorRT-LLM large-scale EP implementation is not perfect and there are still known limitations (community contributions are welcome to help us improve). For example, we need:
The current TensorRT LLM large-scale EP implementation is not perfect and there are still known limitations (community contributions are welcome to help us improve). For example, we need:
* More platforms coverage
* Extending the support to cover other non-GB200 NVIDIA GPU HWs. **We are actively working on this now.**
* Currently the large-EP support only covers NVFP4 data precision; incremental efforts are needed to cover FP8 and INT8/INT4 data precisions.
* Performance
* Further performance tuning and optimizations. **We are actively working on this now.**
* More validation with workloads close to production traffic. **Here we highly welcome the community's feedback to help us calibrate the TensorRT-LLM large-scale EP implementation based on more concrete workloads.**
* More validation with workloads close to production traffic. **Here we highly welcome the community's feedback to help us calibrate the TensorRT LLM large-scale EP implementation based on more concrete workloads.**
* The thorough validation of combination with other inference core features, such as dis-aggregated serving, speculative decoding, validation on more MoE model families, etc. **We are actively working on this now.**
* Ease-of-use
* Easy customization
@ -707,11 +707,11 @@ The current TensorRT-LLM large-scale EP implementation is not perfect and there
* Because large-scale EP deployment solution may lead to an increased fault ratio of the online deployment system, it may increase the need for cross-layer interactions with multiple components of the E2E LLM inference system on NVIDIA GPUs. This includes the low-level communication kernel, the cluster-level orchestrator and scheduler, etc. We are actively working with various NVIDIA engineering teams to push forward on this.
We believe the current implementation can be viewed as a reasonable E2E large-scale EP implementation and we encourage the community to try new ideas and performance validation. We encourage the community to share feedback to help us move fast in this area. We are actively tracking the TensorRT-LLM large-scale EP execution in [this](https://github.com/NVIDIA/TensorRT-LLM/issues/4127) GitHub issue to ensure transparency to the community.
We believe the current implementation can be viewed as a reasonable E2E large-scale EP implementation and we encourage the community to try new ideas and performance validation. We encourage the community to share feedback to help us move fast in this area. We are actively tracking the TensorRT LLM large-scale EP execution in [this](https://github.com/NVIDIA/TensorRT-LLM/issues/4127) GitHub issue to ensure transparency to the community.
## Acknowledgement
The large-scale EP work is another great team effort, spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT-LLM.
The large-scale EP work is another great team effort, spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT LLM.
Through this collaborative endeavor, we have developed valuable insights to allow us improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community to better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.

View File

@ -1,10 +1,10 @@
# Disaggregated Serving in TensorRT-LLM
# Disaggregated Serving in TensorRT LLM
By NVIDIA TensorRT-LLM Team
By NVIDIA TensorRT LLM Team
- [Disaggregated Serving in TensorRT-LLM](#disaggregated-serving-in-tensorrt-llm)
- [Disaggregated Serving in TensorRT LLM](#disaggregated-serving-in-tensorrt-llm)
- [Motivation](#motivation)
- [Disaggregated Serving in TensorRT-LLM](#disaggregated-serving-in-tensorrt-llm-1)
- [Disaggregated Serving in TensorRT LLM](#disaggregated-serving-in-tensorrt-llm-1)
- [trtllm-serve](#trtllm-serve)
- [Dynamo](#dynamo)
- [Triton Inference Server](#triton-inference-server)
@ -24,7 +24,7 @@ By NVIDIA TensorRT-LLM Team
- [Future Work](#future-work)
- [Acknowledgement](#acknowledgement)
In the past tech blogs, we have introduced optimization specifically for [low-latency](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md) and [throughput](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md) oriented optimizations. For production deployment, users also care about per GPU throughput satisfying certain latency constraints. In this tech blog, we will introduce the design concept and usage of the TensorRT-LLM disaggregated serving which directly targets throughput@latency performance scenarios, together with performance study results.
In past tech blogs, we have introduced optimizations specifically for [low-latency](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog1_Pushing_Latency_Boundaries_Optimizing_DeepSeek-R1_Performance_on_NVIDIA_B200_GPUs.md)- and [throughput](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog3_Optimizing_DeepSeek_R1_Throughput_on_NVIDIA_Blackwell_GPUs.md)-oriented scenarios. For production deployment, users also care about per-GPU throughput under certain latency constraints. In this tech blog, we will introduce the design concept and usage of TensorRT LLM disaggregated serving, which directly targets throughput@latency performance scenarios, together with performance study results.
## Motivation
@ -55,15 +55,15 @@ Disaggregated serving resolves these challenges by decoupling the two phases, al
You can also refer to [this paper](https://arxiv.org/pdf/2506.05508) for more details about the rationale and design considerations of disaggregated serving.
## Disaggregated Serving in TensorRT-LLM
## Disaggregated Serving in TensorRT LLM
There are three different approaches to do disaggregation LLM inference with TensorRT-LLM, where each approach offers distinct architectural and operational characteristics suited to different deployment scenarios.
There are three different approaches to disaggregated LLM inference with TensorRT LLM, where each approach offers distinct architectural and operational characteristics suited to different deployment scenarios.
### trtllm-serve
[`trtllm-serve`](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html) is a command-line utility that facilitates the deployment of an OpenAI-compatible server for TensorRT-LLM instances.
[`trtllm-serve`](https://nvidia.github.io/TensorRT-LLM/commands/trtllm-serve.html) is a command-line utility that facilitates the deployment of an OpenAI-compatible server for TensorRT LLM instances.
The first approach to do disaggregated LLM inference with TensorRT-LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 3 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 3). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request.
The first approach to do disaggregated LLM inference with TensorRT LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 3 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 3). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request.
<div align="center">
<figure>
@ -124,11 +124,11 @@ In the Dynamo workflow, requests are initially processed by pre- and post-proces
Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments.
For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html).
For more information on how to use Dynamo with TensorRT LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html).
### Triton Inference Server
The third approach to do disaggregated LLM inference with TensorRT-LLM utilizes the Triton Inference Server. With this approach a Triton ensemble model is employed, comprising a preprocessor, an orchestrator implemented as [a Python business logic scripting (BLS) backend](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/bls.html), and a post-processor. The orchestrator is responsible for routing client requests to context and generation instances, managing the flow of prompt tokens, and handling the return of generated tokens. This approach is illustrated in Figure 5. The Triton Inference Server approach relies on the Triton TensorRT-LLM backend and the Executor API, which is supported only for the TensorRT backend. For more information on how to use this approach, please refer to [this documentation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/triton_backend/all_models/disaggregated_serving#running-disaggregated-serving-with-triton-tensorrt-llm-backend).
The third approach to do disaggregated LLM inference with TensorRT LLM utilizes the Triton Inference Server. With this approach a Triton ensemble model is employed, comprising a preprocessor, an orchestrator implemented as [a Python business logic scripting (BLS) backend](https://docs.nvidia.com/deeplearning/triton-inference-server/user-guide/docs/user_guide/bls.html), and a post-processor. The orchestrator is responsible for routing client requests to context and generation instances, managing the flow of prompt tokens, and handling the return of generated tokens. This approach is illustrated in Figure 5. The Triton Inference Server approach relies on the Triton TensorRT LLM backend and the Executor API, which is supported only for the TensorRT backend. For more information on how to use this approach, please refer to [this documentation](https://github.com/NVIDIA/TensorRT-LLM/tree/main/triton_backend/all_models/disaggregated_serving#running-disaggregated-serving-with-triton-tensorrt-llm-backend).
<div align="center">
<figure>
@ -141,7 +141,7 @@ The third approach to do disaggregated LLM inference with TensorRT-LLM utilizes
### Multi-backend Support
In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache manager and the underlying communication libraries, as shown in Figure 6. The KV cache exchange module is responsible for efficient transmission and reception of the cache, promptly releasing cache space, and performing cache layout conversions during the exchange process. Currently, mainstream communication protocols—MPI, UCX, and NIXL—are all supported by TensorRT-LLM, and the underlying communication protocols utilize RDMA / NVLink. Currently, we recommend using UCX and NIXL backends, as we are adding a dynamic scaling mechanism on top of them—specifically, dynamic node joining and leaving. This allows customers to adjust the load based on traffic demands or switch roles between context and generation dynamically.
In TensorRT LLM, the KV cache exchange is modularly decoupled from the KV cache manager and the underlying communication libraries, as shown in Figure 6. The KV cache exchange module is responsible for efficient transmission and reception of the cache, promptly releasing cache space, and performing cache layout conversions during the exchange process. Currently, mainstream communication protocols—MPI, UCX, and NIXL—are all supported by TensorRT LLM, and the underlying communication protocols utilize RDMA / NVLink. Currently, we recommend using UCX and NIXL backends, as we are adding a dynamic scaling mechanism on top of them—specifically, dynamic node joining and leaving. This allows customers to adjust the load based on traffic demands or switch roles between context and generation dynamically.
<div align="center">
<figure>
@ -152,7 +152,7 @@ In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache
### Overlap Optimization
To optimize the overall performance of disaggregated serving, TensorRT-LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 7. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel.
To optimize the overall performance of disaggregated serving, TensorRT LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 7. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel.
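Conceptually, the overlap looks like the following toy asyncio sketch (purely illustrative; the real implementation overlaps CUDA streams and network transfers, not Python coroutines):
```python
import asyncio

async def transfer_kv(req_id: str) -> None:
    # Stand-in for sending/receiving KV cache blocks over RDMA/NVLink.
    await asyncio.sleep(0.05)
    print(f"request {req_id}: KV cache transfer done")

async def run_forward(step: int) -> None:
    # Stand-in for one forward pass over the currently scheduled batch.
    await asyncio.sleep(0.02)
    print(f"forward step {step} done")

async def main() -> None:
    # While request 'A' streams its KV cache, other requests keep computing.
    transfer = asyncio.create_task(transfer_kv("A"))
    for step in range(3):
        await run_forward(step)
    await transfer

asyncio.run(main())
```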
<div align="center">
<figure>
@ -163,7 +163,7 @@ To optimize the overall performance of disaggregated serving, TensorRT-LLM overl
### Cache Layout Transformation
To minimize KV cache transmission latency, TensorRT-LLM currently uses direct transmission between device memories for cache transfer. The KV cache transmission supports using different parallel strategies for the context and generation phases. In such cases, careful orchestration of KV cache block mapping is required. Figure 8 illustrates this using the example of context phase with TP2 and generation phase with PP2.
To minimize KV cache transmission latency, TensorRT LLM currently uses direct transmission between device memories for cache transfer. The KV cache transmission supports using different parallel strategies for the context and generation phases. In such cases, careful orchestration of KV cache block mapping is required. Figure 8 illustrates this using the example of context phase with TP2 and generation phase with PP2.
<div align="center">
<figure>
@ -172,7 +172,7 @@ To minimize KV cache transmission latency, TensorRT-LLM currently uses direct tr
</div>
<p align="center"><sub><em>Figure 8. KV cache layout conversion</em></sub></p>
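To make the remapping concrete, here is a toy NumPy sketch of the TP2-context to PP2-generation case from Figure 8 (shapes and layout are illustrative, not the actual block format):
```python
import numpy as np

layers, heads, tokens, dim = 4, 8, 16, 64
full_kv = np.arange(layers * heads * tokens * dim).reshape(layers, heads, tokens, dim)

# Context phase, TP2: each context rank holds all layers but half of the heads.
ctx_tp2 = [full_kv[:, :4], full_kv[:, 4:]]

# Generation phase, PP2: each generation rank needs all heads but half of the layers,
# so every generation rank pulls a slice from *both* context ranks and
# re-concatenates along the head dimension.
gen_pp2 = [
    np.concatenate([part[:2] for part in ctx_tp2], axis=1),   # layers 0-1
    np.concatenate([part[2:] for part in ctx_tp2], axis=1),   # layers 2-3
]

assert np.array_equal(np.concatenate(gen_pp2, axis=0), full_kv)
```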
The optimizations required for KV cache transmission vary depending on whether it's single-node multi-GPU, multi-node multi-GPU, or different GPU models. To accommodate this, TensorRT-LLM provides a set of environment variables for selection in different environments. Please refer to [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/advanced/disaggregated-service.md) for details.
The optimizations required for KV cache transmission vary depending on whether it's single-node multi-GPU, multi-node multi-GPU, or different GPU models. To accommodate this, TensorRT LLM provides a set of environment variables for selection in different environments. Please refer to [this document](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/features/disagg-serving.md) for details.
## Performance Studies
@ -281,7 +281,7 @@ We provide a set of scripts to reproduce the performance data presented in this
## Future Work
Although we can already demonstrate the performance benefits of doing disaggregated LLM inference with TensorRT-LLM, there is still work to be done to further improve the performance and ease of use. Among other things, we plan to:
Although we can already demonstrate the performance benefits of doing disaggregated LLM inference with TensorRT LLM, there is still work to be done to further improve the performance and ease of use. Among other things, we plan to:
* Provide detailed steps and scripts to automate the generation of throughput-latency performance curves comparing aggregated with disaggregated.
* Continue to improve performance at larger scales (large-scale EP for example).
@ -290,4 +290,4 @@ Although we can already demonstrate the performance benefits of doing disaggrega
## Acknowledgement
Adding support for disaggregated serving in TensorRT-LLM is a typical one-team effort requiring close collaboration spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT-LLM. Through this collaborative endeavor, we have developed valuable insights to allow us to improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.
Adding support for disaggregated serving in TensorRT LLM is a typical one-team effort requiring close collaboration spanning kernel-level optimizations, runtime enhancements, and systematic performance analysis and tuning. While we cannot individually acknowledge every contributor, we are proud to recognize the dedicated team of engineers whose collective expertise has helped advance the state-of-the-art in terms of performance in TensorRT LLM. Through this collaborative endeavor, we have developed valuable insights to allow us to improve GPU utilization for large language model inference. We hope that the techniques and the experience shared in this blog will help the developer community better leverage NVIDIA GPU capabilities in their mission-critical LLM inference applications.

View File

@ -1,8 +1,8 @@
# How to launch Llama4 Maverick + Eagle3 TensorRT-LLM server
# How to launch Llama4 Maverick + Eagle3 TensorRT LLM server
Artificial Analysis has benchmarked the Llama4 Maverick with Eagle3 enabled TensorRT-LLM server running at over [1000 tokens per second per user on 8xB200 GPUs](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/). This implementation leverages NVIDIA's TensorRT-LLM combined with speculative decoding using the Eagle3 model to further boost performance.
Artificial Analysis has benchmarked the Eagle3-enabled Llama4 Maverick TensorRT LLM server running at over [1000 tokens per second per user on 8xB200 GPUs](https://developer.nvidia.com/blog/blackwell-breaks-the-1000-tps-user-barrier-with-metas-llama-4-maverick/). This implementation leverages NVIDIA's TensorRT LLM combined with speculative decoding using the Eagle3 model to further boost performance.
In the guide below, we will walk you through how to launch your own high-performance Llama4 Maverick with Eagle3 enabled TensorRT-LLM server, from build to deployment. (Note that your specific performance numbers may vary—speculative decoding speedups depend upon the dataset!)
In the guide below, we will walk you through how to launch your own high-performance, Eagle3-enabled Llama4 Maverick TensorRT LLM server, from build to deployment. (Note that your specific performance numbers may vary—speculative decoding speedups depend upon the dataset!)
## Prerequisites
@ -18,7 +18,7 @@ In the guide below, we will walk you through how to launch your own high-perform
* [NVIDIA Llama 4 Maverick 17B 128E Instruct FP8](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8)
* [NVIDIA Llama 4 Maverick 17B 128E Eagle3 BF16](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Eagle3)
In [Step 4: Start the TensorRT-LLM server](#step-4-start-the-tensorrt-llm-server), `/path/to/maverick` and `/path/to/eagle` refer to the download paths of the above respective models.
In [Step 4: Start the TensorRT LLM server](#step-4-start-the-tensorrt-llm-server), `/path/to/maverick` and `/path/to/eagle` refer to the download paths of the above respective models.
## Launching the server
@ -33,14 +33,14 @@ git lfs pull
The last command, `git lfs pull`, ensures all large files stored with Git LFS are properly downloaded. If `git lfs` is not installed, please install following [Install Git LFS](https://docs.github.com/en/repositories/working-with-files/managing-large-files/installing-git-large-file-storage)
### Step 2: Prepare the TensorRT-LLM release Docker image
### Step 2: Prepare the TensorRT LLM release Docker image
#### Option 1. Use weekly release NGC docker image
TensorRT-LLM provides weekly release [docker image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release)
TensorRT LLM provides a weekly release [Docker image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release).
#### Option 2. Build TensorRT-LLM Docker image (Alternative way)
If you want to compile a specific TensorRT-LLM commit, you can build the docker image by checking out the specific branch or commit and running a make command. This may take 15-30 minutes depending on your system.
#### Option 2. Build TensorRT LLM Docker image (Alternative way)
If you want to compile a specific TensorRT LLM commit, you can build the docker image by checking out the specific branch or commit and running a make command. This may take 15-30 minutes depending on your system.
```
make -C docker release_build
@ -57,7 +57,7 @@ docker push docker.io/<username>/tensorrt_llm:main
Replace `<username>` with your Docker Hub username or your private registry path.
### Step 4: Start the TensorRT-LLM server
### Step 4: Start the TensorRT LLM server
This command launches the server with Llama4 Maverick as the main model and Eagle3 as the draft model for speculative decoding. Make sure you have downloaded both model checkpoints before running this command.

View File

@ -1,4 +1,4 @@
# N-Gram Speculative Decoding in TensorRT-LLM
# N-Gram Speculative Decoding in TensorRT LLM
N-Gram speculative decoding leverages the natural repetition in many LLM workloads. It splits previously seen text into configurable (key, value) n-gram pairs and, during generation, swiftly proposes draft tokens by matching the current key against n-gram pools in memory.
In this blog, we introduce design choices in TensorRT LLM's N-Gram speculative decoding algorithm, share our experimental results of performance gains, and explain N-Gram's low barrier to adoption by deriving a simple heuristic to enable it.
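The core matching idea can be sketched in a few lines of Python (a toy illustration with made-up token IDs, not the actual TensorRT LLM implementation):
```python
from collections import defaultdict

def build_ngram_pool(tokens, key_len=2, value_len=3):
    """Map every key_len-gram seen so far to the tokens that followed it."""
    pool = defaultdict(list)
    for i in range(len(tokens) - key_len):
        key = tuple(tokens[i : i + key_len])
        pool[key] = tokens[i + key_len : i + key_len + value_len]  # keep most recent continuation
    return pool

def propose_draft(tokens, pool, key_len=2):
    key = tuple(tokens[-key_len:])
    return pool.get(key, [])       # draft tokens to verify, possibly empty

history = [3, 7, 9, 2, 5, 3, 7]    # toy token IDs with a repeated bigram
pool = build_ngram_pool(history)
print(propose_draft(history, pool))  # -> [9, 2, 5]
```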
@ -35,7 +35,7 @@ Speculative decoding drafts several tokens, verifies them on the model, and keep
## Algorithm & Complexity
`NGramDecodingConfig` in TensorRT-LLM:
`NGramDecodingConfig` in TensorRT LLM:
```python
spec_config = NGramDecodingConfig(
    max_draft_len=v,  # max length of draft tokens

View File

@ -1,11 +1,11 @@
# Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)
# Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)
This blog post continues our previous work on [Scaling Expert Parallelism in TensorRT-LLM (Part 1: Design and Implementation of Large-scale EP)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), where we introduced the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT-LLM. Building upon that foundation, we have made significant performance improvements through various optimizations, achieving better throughput and latency for large-scale MoE models.
This blog post continues our previous work on [Scaling Expert Parallelism in TensorRT LLM (Part 1: Design and Implementation of Large-scale EP)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), where we introduced the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT LLM. Building upon that foundation, we have made significant performance improvements through various optimizations, achieving better throughput and latency for large-scale MoE models.
*By NVIDIA TensorRT-LLM Team*
*By NVIDIA TensorRT LLM Team*
## Table of Contents
- [Scaling Expert Parallelism in TensorRT-LLM (Part 2: Performance Status and Optimization)](#scaling-expert-parallelism-in-tensorrt-llm-part-2-performance-status-and-optimization)
- [Scaling Expert Parallelism in TensorRT LLM (Part 2: Performance Status and Optimization)](#scaling-expert-parallelism-in-tensorrt-llm-part-2-performance-status-and-optimization)
- [Table of Contents](#table-of-contents)
- [Optimization Highlights](#optimization-highlights)
- [Kernel Optimizations](#kernel-optimizations)
@ -28,7 +28,7 @@ This blog post continues our previous work on [Scaling Expert Parallelism in Ten
## Optimization Highlights
Following the introduction of the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT-LLM in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), the TensorRT-LLM team has focused on optimizing the large EP implementation to improve performance.
Following the introduction of the fundamental design and implementation of large-scale Expert Parallelism (EP) in TensorRT LLM in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md), the TensorRT LLM team has focused on optimizing the large EP implementation to improve performance.
At the kernel level, we analyzed kernel duration and optimized performance by either improving existing kernels or developing new kernels that perform better. At the system level, we refined and optimized the EPLB implementation (which also helps reduce kernel scalability issues), integrated additional features such as MTP, and optimized host overhead to prevent Python code from slowing down inference.
@ -94,7 +94,7 @@ This optimization was implemented in [PR 5570](https://github.com/NVIDIA/TensorR
### Expert Parallelism Load Balancer (EPLB)
As introduced in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#ep-load-balancer), EP-level workload imbalance is common for large-scale EP inference across multiple datasets and has significant performance impacts. TensorRT-LLM implements a set of functionalities to address this issue. We have refined the code and improved the usability of this feature, and the benefits of EPLB are directly reflected in kernel duration improvements.
As introduced in our [previous blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md#ep-load-balancer), EP-level workload imbalance is common for large-scale EP inference across multiple datasets and has significant performance impacts. TensorRT LLM implements a set of functionalities to address this issue. We have refined the code and improved the usability of this feature, and the benefits of EPLB are directly reflected in kernel duration improvements.
The core challenge with EP scaling is that different experts receive varying amounts of work based on the routing decisions made by the MoE layer. This imbalance becomes more pronounced as EP size increases, leading to scenarios where some GPUs are heavily loaded while others remain underutilized. The Expert Parallelism Load Balancer (EPLB) addresses this by dynamically redistributing expert assignments to achieve better load balance across all participating GPUs.
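As a toy illustration of the idea (a greedy placement only; the real EPLB also supports expert replication and online re-balancing), consider:
```python
def balance_experts(expert_loads, num_ranks):
    """Greedy toy balancer: place the heaviest experts on the least-loaded rank."""
    ranks = [{"experts": [], "load": 0} for _ in range(num_ranks)]
    order = sorted(range(len(expert_loads)), key=lambda e: -expert_loads[e])
    for expert in order:
        target = min(ranks, key=lambda r: r["load"])
        target["experts"].append(expert)
        target["load"] += expert_loads[expert]
    return ranks

# Skewed routing: expert 0 is "hot" and would overload a naive uniform placement.
loads = [900, 120, 100, 90, 80, 70, 60, 50]
for i, rank in enumerate(balance_experts(loads, num_ranks=4)):
    print(f"rank {i}: experts={rank['experts']} load={rank['load']}")
```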
@ -235,7 +235,7 @@ After implementing huge pages, we found that warmup kernels now execute in only
### Multi-Token Prediction (MTP)
MTP allows verifying and accepting several draft tokens in a single iteration, which is very beneficial for scenarios that prefer low latency. TensorRT-LLM has supported MTP, and we refer to our previous [MTP blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md#mtp-implementation-in-tensorrt-llm) for more details on the implementation.
MTP allows verifying and accepting several draft tokens in a single iteration, which is very beneficial for scenarios that prefer low latency. TensorRT LLM already supports MTP; see our previous [MTP blog](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog2_DeepSeek_R1_MTP_Implementation_and_Optimization.md#mtp-implementation-in-tensorrt-llm) for more details on the implementation.
For large EP, we have also extended the implementation so that it works well with online EPLB. This was implemented in [PR 5213](https://github.com/NVIDIA/TensorRT-LLM/pull/5213).
@ -247,11 +247,11 @@ To address the increased host overhead when scaling parallelism in the system, w
#### Reduce Binding and Inter-Process Communication Overhead
TensorRT-LLM is designed to be composed of both C++ and Python code, so that C++ can handle the most performance-sensitive parts while Python handles higher-level logic. As we try to put more logic into Python to make the program easier to read and debug, there are still frequent conversations through binding interfaces between C++ and Python. Besides, since most of the logic is implemented in Python, there are several layers of implementation that communicate with each other through inter-process communication overhead. Frequent binding calls and serialization/deserialization introduced by inter-process communication slow down the core library.
TensorRT LLM is designed to be composed of both C++ and Python code, so that C++ can handle the most performance-sensitive parts while Python handles higher-level logic. As we try to put more logic into Python to make the program easier to read and debug, there are still frequent crossings of the binding interfaces between C++ and Python. Besides, since most of the logic is implemented in Python, there are several layers of implementation that communicate with each other through inter-process communication. The frequent binding calls and the serialization/deserialization introduced by inter-process communication slow down the core library.
To improve program efficiency, we used environment variables introduced in the [performance analysis guidance](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/performance/perf-analysis.md) to measure and profile CPU overhead, and improved performance by reducing and reusing different binding calls as much as possible, and delaying Python object deserialization to avoid duplicated serialization and reduce message size when doing inter-process communication. This optimization was added in [PR 5224](https://github.com/NVIDIA/TensorRT-LLM/pull/5224). We have also reduced Python garbage collection (GC) impacts in [PR 5141](https://github.com/NVIDIA/TensorRT-LLM/pull/5141).
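The "delay deserialization" idea can be illustrated with a small toy wrapper (illustrative only, not the actual TensorRT LLM code):
```python
import pickle

class LazyResponse:
    """Carry the serialized payload across process boundaries and only
    deserialize when (and if) somebody actually reads the fields."""

    def __init__(self, payload: bytes):
        self._payload = payload
        self._obj = None

    @property
    def value(self):
        if self._obj is None:          # deserialize at most once, on first access
            self._obj = pickle.loads(self._payload)
        return self._obj

payload = pickle.dumps({"request_id": 7, "tokens": [1, 2, 3]})
resp = LazyResponse(payload)           # cheap to pass around or re-enqueue
print(resp.value["tokens"])            # cost paid only when the data is needed
```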
To enable powerful NVTX markers for easier analysis of host overheads, TensorRT-LLM provides several useful environment variables:
To enable powerful NVTX markers for easier analysis of host overheads, TensorRT LLM provides several useful environment variables:
```bash
export TLLM_NVTX_DEBUG=1 # enables more NVTX markers
@ -261,9 +261,9 @@ export TLLM_PROFILE_START_STOP=100-150 # enable specific iterations profiling
#### Support Stream Interval
As mentioned previously, one outcome of large-scale workloads is that they significantly increase the number of requests and responses that the system must handle, putting huge pressure on Python threads. When the GPU finishes one iteration of calculation, a batch of responses are generated under streaming mode. For each response, TensorRT-LLM must perform detokenization so that output IDs are converted to strings, and OpenAI API protocol objects need to be initialized so that responses can be returned to the user. This becomes time-consuming, especially when the number of responses is huge and the CPU must process them on each iteration. One observation from the user side will be reduced streaming performance when compared to non-streaming.
As mentioned previously, one outcome of large-scale workloads is that they significantly increase the number of requests and responses that the system must handle, putting huge pressure on Python threads. When the GPU finishes one iteration of calculation, a batch of responses are generated under streaming mode. For each response, TensorRT LLM must perform detokenization so that output IDs are converted to strings, and OpenAI API protocol objects need to be initialized so that responses can be returned to the user. This becomes time-consuming, especially when the number of responses is huge and the CPU must process them on each iteration. One observation from the user side will be reduced streaming performance when compared to non-streaming.
To address this problem, TensorRT-LLM has supported a feature called stream interval. Instead of handling all responses on each iteration, a user-specified `stream_interval` `N` indicates that responses will be handled and returned every `N` iterations. This way, on each iteration, there will still be one output ID generated, but it won't be returned to users immediately (except for the first token for the sake of time-to-first-token latency). Instead, tokens accumulate for `N` iterations, and one response is created to handle those `N` generated tokens, which greatly reduces pressure on the CPU side by giving more time for the CPU to catch up. Meanwhile, users can still get streamed output.
To address this problem, TensorRT LLM supports a feature called stream interval. Instead of handling all responses on each iteration, a user-specified `stream_interval` `N` indicates that responses will be handled and returned every `N` iterations. This way, on each iteration there will still be one output ID generated, but it won't be returned to users immediately (except for the first token, for the sake of time-to-first-token latency). Instead, tokens accumulate for `N` iterations, and one response is created to handle those `N` generated tokens, which greatly reduces pressure on the CPU side by giving the CPU more time to catch up. Meanwhile, users can still get streamed output.
This feature was added in [PR 5284](https://github.com/NVIDIA/TensorRT-LLM/pull/5284), and we have verified that it works effectively to reduce host overhead. In most cases, setting `stream_interval` to 2 or 4 should close the gap (if any) between streaming and non-streaming modes. The feature can be enabled by setting the following in the YAML extra config file:
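For illustration, a minimal extra-options file that turns this on might look like the sketch below, assuming `stream_interval` is accepted as a top-level key (the value `4` is only an example to verify against your TensorRT LLM version):

```bash
# Hedged sketch: write an extra-options YAML that sets the stream interval.
cat <<EOF > extra_llm_api_options.yaml
stream_interval: 4
EOF
```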
@ -307,7 +307,7 @@ When enabling MTP, there is an extra performance boost compared to the baseline.
</div>
<p align="center"><sub><em>Figure 8: DeepSeek R1 throughput on ISL/OSL 8k/1k with MTP enabled.</em></sub></p>
To reproduce the numbers, refer to the [`examples/wide_ep/slurm_scripts`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts) directory. The scripts there demonstrate how to launch TensorRT-LLM disaggregated serving with large-scale EP and other features enabled on a SLURM cluster.
To reproduce the numbers, refer to the [`examples/wide_ep/slurm_scripts`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts) directory. The scripts there demonstrate how to launch TensorRT LLM disaggregated serving with large-scale EP and other features enabled on a SLURM cluster.
## Future Work
@ -317,6 +317,6 @@ We are planning to implement more performance optimizations for the large EP imp
## Acknowledgements
This work represents an outstanding example of collaborative engineering excellence within the TensorRT-LLM team. The successful implementation and optimization of large-scale Expert Parallelism required coordinated efforts across multiple domains - from low-level CUDA kernel optimizations to high-level system architecture design. The dedication and technical expertise demonstrated by our team members throughout this project has been truly remarkable.
This work represents an outstanding example of collaborative engineering excellence within the TensorRT LLM team. The successful implementation and optimization of large-scale Expert Parallelism required coordinated efforts across multiple domains - from low-level CUDA kernel optimizations to high-level system architecture design. The dedication and technical expertise demonstrated by our team members throughout this project has been truly remarkable.
Large-scale Expert Parallelism represents one of the important workloads in users' production scenarios, enabling efficient deployment of large MoE models. The performance improvements achieved through this work demonstrate the transformative potential of expert parallelism at scale, and this work opens new possibilities for deploying increasingly sophisticated AI models in production environments.

View File

@ -1,7 +1,8 @@
# Running a High Performance GPT-OSS-120B Inference Server with TensorRT-LLM
# Running a High Performance GPT-OSS-120B Inference Server with TensorRT LLM
NVIDIA has [announced](https://developer.nvidia.com/blog/delivering-1-5-m-tps-inference-on-nvidia-gb200-nvl72-nvidia-accelerates-openai-gpt-oss-models-from-cloud-to-edge/) day-0 support for OpenAI's new open-source model series, [gpt-oss](https://openai.com/index/introducing-gpt-oss/). In the guide below, we will walk you through how to launch your own
high-performance TensorRT-LLM server for **gpt-oss-120b** for inference.
In the guide below, we will walk you through how to launch your own
high-performance TensorRT LLM server for **gpt-oss-120b** for inference.
This guide covers both low-latency and max-throughput cases.
**Low-latency** use cases aim to maximize the number of tokens per second per user (tps/user) with limited concurrency.
@ -13,17 +14,14 @@ For **max-throughput**, the goal is to maximize the tokens produced per GPU per
- Fast SSD storage for model weights
- Access to the gpt-oss-120b model checkpoint
We have a forthcoming guide for achieving great performance on H100; however, this guide focuses on the GPUs listed above.
We have a forthcoming guide for getting great performance on H100; however, this guide focuses on the GPUs listed above.
## Install TensorRT-LLM
In this section, we introduce several ways to install TensorRT-LLM.
## Launching the TensorRT LLM docker container
### NGC Docker Image
The container image that you will use will be pulled from NVIDIA's NGC. This container is multi-platform and will run on both x64 and arm64 architectures: `nvcr.io/nvidia/tensorrt-llm/release:gpt-oss-dev`
Visit the [NGC TensorRT-LLM Release page](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) to find the most up-to-date NGC container image to use. You can also check the latest [release notes](https://github.com/NVIDIA/TensorRT-LLM/releases) to keep track of the support status of the latest releases.
Run the following Docker command to start the TensorRT-LLM container in interactive mode (change the image tag to match latest release):
Run the following Docker command to start the TensorRT LLM container in interactive mode:
```bash
docker run --rm --ipc=host -it \
@ -43,49 +41,18 @@ Explanation of the command:
- Allows container to interact with the host's IPC resources and shared memory for optimal performance (`--ipc=host`)
- Runs the container in interactive mode (`-it`)
- Sets up shared memory and stack limits for optimal performance
- Maps port 8000 from the container to the host
- Enables PDL for performance optimization
- Maps port 8000 from the container to your host
- Enables PDL for low-latency performance optimization
- Disables parallel weight loading
Additionally, the container mounts your user `.cache` directory to save the downloaded model checkpoints, which are stored in `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container. You can also download the weights to a custom location (we assume `${local_model_path}` is the path to the local model weights).
### Build from source
Support for gpt-oss has been [merged](https://github.com/NVIDIA/TensorRT-LLM/pull/6645) into the **main branch** of TensorRT-LLM. As we continue to optimize gpt-oss performance, you can build TensorRT-LLM from source to get the latest features and support. Please refer to the [doc](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html) if you want to build from source yourself.
Lastly the container mounts your user `.cache` directory to save the downloaded model checkpoints which are saved to `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container.
### TensorRT-LLM Python Wheel Install
## Running the TensorRT LLM Server
Regular releases of TensorRT-LLM are also provided as [Python wheels](https://pypi.org/project/tensorrt-llm/#history). You can find instructions on the pip install [here](https://nvidia.github.io/TensorRT-LLM/installation/linux.html).
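As a hedged sketch (consult the linked instructions for the exact prerequisites and any extra index URLs your environment needs), a typical install looks like:

```bash
# Upgrade packaging tools, then install the TensorRT-LLM wheel from PyPI.
pip3 install --upgrade pip setuptools wheel
pip3 install tensorrt_llm
```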
## Performance Benchmarking and Model Serving
This guide covers how to configure for both low-latency and max-throughput cases, as well as how to benchmark end-to-end performance.
### Prepare the dataset
Before getting started, we need to prepare a dataset of randomized tokens for benchmarking:
```bash
python benchmarks/cpp/prepare_dataset.py \
--stdout \
--tokenizer openai/gpt-oss-120b \
token-norm-dist \
--input-mean 1024 \
--output-mean 2048 \
--input-stdev 0 \
--output-stdev 0 \
--num-requests 20000 > gpt-oss-120b-1k2k.txt
```
### Low-latency Use Case
The low-latency configuration maximizes tps/user under limited concurrency (e.g., 1, 4, 8, or 16 users). Please set the number of GPUs and concurrency according to your specific situation and workload.
```bash
num_gpus=8
max_batch_size=1
```
As pointed out in the introduction, this guide covers low-latency and max-throughput cases. Each requires different configurations and commands to run. We will first cover the low-latency use case, followed by the max-throughput use case.
### Low-latency Use-Case
#### Creating the Extra Options Configuration
@ -102,18 +69,14 @@ moe_config:
EOF
```
Key takeaways:
- `enable_attention_dp` is set to `false` to use TP instead of DP for attention.
- `cuda_graph_config.max_batch_size` is the maximum batch size for CUDA graphs.
- `cuda_graph_config.enable_padding` is set to `true` to enable CUDA graph padding.
- `moe_config.backend` is set to `TRTLLM` to use the `trtllm-gen` MoE kernels which are optimized for low concurrency.
> Note: If you are using NVIDIA H200 GPUs it is highly recommended to set the `moe_config.backend` to TRITON to use the OpenAI Triton MoE kernel. See the section [(H200 Only) Using OpenAI Triton Kernels for MoE](#h200-only-using-openai-triton-kernels-for-moe) for more details.
> Note: If you are using NVIDIA H200 GPUs please set the `moe_config.backend` to `TRITON` to use the OpenAI Triton MoE kernel regardless of use case. See the section [(H200/H100 Only) Using OpenAI Triton Kernels for MoE](#h200h100-only-using-openai-triton-kernels-for-moe) for more details.
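Putting the takeaways above together, a low-latency extra-options file might look like the following sketch (the file name `low_latency.yaml` matches the serve command later in this guide; the values are illustrative and should be tuned for your workload):

```bash
# Hedged sketch of the low-latency extra options described above.
cat <<EOF > low_latency.yaml
enable_attention_dp: false
cuda_graph_config:
  enable_padding: true
  max_batch_size: 1
moe_config:
  backend: TRTLLM
EOF
```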
#### Launching TensorRT LLM Serve
To launch the TensorRT LLM Server to serve the model with the **low latency** config, run the following command. Commands for different GPU configurations are provided (1xGPU, 8xGPU, 4xGPU):
#### Run the benchmark
Use `trtllm-bench` to benchmark the performance of your system:
<details open> <summary>1x B200/GB200/H200</summary>
```bash
trtllm-bench \
@ -172,9 +135,11 @@ Compared to the low-latency configuration, we:
- set `stream_interval` to 10 to stream results to the client every 10 tokens. At high concurrency, the detokenization overhead of streaming mode cannot be hidden under GPU execution time, so `stream_interval` serves as a workaround to reduce this overhead.
- set `moe_config.backend` to `CUTLASS` to use the `CUTLASS` MoE kernels which are optimized for high throughput.
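Putting these changes together, a max-throughput extra-options file might look like the following sketch (the file name `max_throughput.yaml` matches the serve command below; `enable_attention_dp: true` and the batch size are assumptions to adjust for your setup):

```bash
# Hedged sketch of the max-throughput extra options described above.
cat <<EOF > max_throughput.yaml
enable_attention_dp: true
cuda_graph_config:
  enable_padding: true
  max_batch_size: 640
moe_config:
  backend: CUTLASS
stream_interval: 10
EOF
```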
#### Run the benchmark
#### Launching TensorRT LLM Serve
Run the following command to benchmark the throughput of your system:
To launch the TensorRT LLM Server to serve the model with the **max throughput** config, run the following command. Commands for different GPU configurations are provided (1xGPU, 8xGPU, 4xGPU):
<details open> <summary>1x B200/GB200/H200</summary>
```bash
trtllm-bench \
@ -210,22 +175,21 @@ We can use `trtllm-serve` to serve the model by translating the benchmark comman
**Note:** You can also point to a local path containing the model weights instead of the HF repo (e.g., `${local_model_path}`).
```bash
trtllm-serve \
openai/gpt-oss-120b \
mpirun -n 1 --oversubscribe --allow-run-as-root \
trtllm-serve openai/gpt-oss-120b \
--host 0.0.0.0 \
--port 8000 \
--backend pytorch \
--tp_size ${num_gpus} \
--ep_size 1 \
--extra_llm_api_options low_latency.yaml \
--kv_cache_free_gpu_memory_fraction 0.9 \
--max_batch_size ${max_batch_size} \ # E.g., 1
--trust_remote_code
--tp_size 8 \
--ep_size 8 \
--max_batch_size 640 \
--trust_remote_code \
--extra_llm_api_options max_throughput.yaml \
--kv_cache_free_gpu_memory_fraction 0.9
```
</details>
The initialization may take several minutes as it loads and optimizes the models.
For max-throughput configuration, run:
<details> <summary>4x GB200/B200/H200</summary>
```bash
trtllm-serve \
@ -233,17 +197,47 @@ trtllm-serve \
--host 0.0.0.0 \
--port 8000 \
--backend pytorch \
--tp_size ${num_gpus} \
--ep_size ${num_gpus} \
--tp_size 4 \
--ep_size 4 \
--max_batch_size 640 \
--trust_remote_code \
--extra_llm_api_options max_throughput.yaml \
--kv_cache_free_gpu_memory_fraction 0.9 \
--max_batch_size ${max_batch_size} \ # E.g., 640
--trust_remote_code
--kv_cache_free_gpu_memory_fraction 0.9
```
</details>
This command:
- Maps port 8000 from the container to your host
- Uses the PyTorch backend and specifies the tensor and expert parallel sizes
- References the low latency or max throughput configuration file for extra options
- Configures memory settings for optimal performance
- Enables all GPUs with attention data parallelism for the max throughput scenario
The initialization may take several minutes as it loads and optimizes the models.
## (H200 Only) Using OpenAI Triton Kernels for MoE
OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT LLM can leverage these kernels on Hopper-based GPUs like NVIDIA's H200 for best performance. The NGC TensorRT LLM container image mentioned above already includes the required kernels, so you do not need to build or install them. It is highly recommended to enable them with the steps below:
### Selecting Triton as the MoE backend
To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--extra_llm_api_options`:
```yaml
moe_config:
backend: TRITON
```
Alternatively, the TRITON backend can be enabled by passing the following CLI flag to the `trtllm-serve` command at runtime:
```bash
--moe_backend TRITON
```
### Test the Server with a Sample Request
## Test the Server with a Sample Request
To check the server's health and readiness:
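For example, assuming the server is listening on port 8000 as configured above, a minimal readiness probe looks like:

```bash
# Returns "Status: 200" once the server is ready to accept requests.
curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
```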

View File

@ -1,7 +1,7 @@
trtllm-bench
===========================
trtllm-bench is a comprehensive benchmarking tool for TensorRT-LLM engines. It provides three main subcommands for different benchmarking scenarios:
trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It provides three main subcommands for different benchmarking scenarios:
**Common Options for All Commands:**

View File

@ -0,0 +1,89 @@
trtllm-eval
===========
About
-----
The ``trtllm-eval`` command provides developers with a unified entry point for accuracy evaluation. It shares the core evaluation logic with the `accuracy test suite <https://github.com/NVIDIA/TensorRT-LLM/tree/main/tests/integration/defs/accuracy>`_ of TensorRT LLM.
``trtllm-eval`` is built on the offline API -- LLM API. Compared to the online ``trtllm-serve``, the offline API provides clearer error messages and simplifies the debugging workflow.
The following tasks are currently supported:
.. list-table::
:header-rows: 1
:widths: 20 25 15 15 15
* - Dataset
- Task
- Metric
- Default ISL
- Default OSL
* - CNN Dailymail
- summarization
- rouge
- 924
- 100
* - MMLU
- QA; multiple choice
- accuracy
- 4,094
- 2
* - GSM8K
- QA; regex matching
- accuracy
- 4,096
- 256
* - GPQA
- QA; multiple choice
- accuracy
- 32,768
- 4,096
* - JSON mode eval
- structured generation
- accuracy
- 1,024
- 512
.. note::
``trtllm-eval`` originates from the TensorRT LLM accuracy test suite and serves as a lightweight utility for verifying and debugging accuracy. At this time, ``trtllm-eval`` is intended solely for development and is not recommended for production use.
Usage and Examples
------------------
Some evaluation tasks (e.g., GSM8K and GPQA) depend on the ``lm_eval`` package. To run these tasks, you need to install ``lm_eval`` with:
.. code-block:: bash
pip install -r requirements-dev.txt
Alternatively, you can install the ``lm_eval`` version specified in ``requirements-dev.txt``.
Here are some examples:
.. code-block:: bash
# Evaluate Llama-3.1-8B-Instruct on MMLU
trtllm-eval --model meta-llama/Llama-3.1-8B-Instruct mmlu
# Evaluate Llama-3.1-8B-Instruct on GSM8K
trtllm-eval --model meta-llama/Llama-3.1-8B-Instruct gsm8k
# Evaluate Llama-3.3-70B-Instruct on GPQA Diamond
trtllm-eval --model meta-llama/Llama-3.3-70B-Instruct gpqa_diamond
The ``--model`` argument accepts either a Hugging Face model ID or a local checkpoint path. By default, ``trtllm-eval`` runs the model with the PyTorch backend; you can pass ``--backend tensorrt`` to switch to the TensorRT backend.
Alternatively, the ``--model`` argument also accepts a local path to pre-built TensorRT engines. In this case, you should pass the Hugging Face tokenizer path to the ``--tokenizer`` argument.
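For example, a hedged sketch with hypothetical paths (the engine directory and tokenizer ID below are placeholders for illustration, not values from this guide):

.. code-block:: bash

   # Both the engine path and the tokenizer ID are illustrative placeholders.
   trtllm-eval --model /path/to/trt_engines \
       --tokenizer meta-llama/Llama-3.1-8B-Instruct \
       --backend tensorrt \
       mmlu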
For more details, see ``trtllm-eval --help`` and ``trtllm-eval <task> --help``.
Syntax
------
.. click:: tensorrt_llm.commands.eval:main
:prog: trtllm-eval
:nested: full

View File

@ -1,6 +1,6 @@
# Run benchmarking with `trtllm-serve`
TensorRT-LLM provides an OpenAI-compatible API via the `trtllm-serve` command.
TensorRT LLM provides an OpenAI-compatible API via the `trtllm-serve` command.
A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference).
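For example, once a server is up (the port, model name, and prompt below are placeholders for illustration), a completion request can be sent directly with `curl`:

```bash
curl http://localhost:8000/v1/completions \
    -H "Content-Type: application/json" \
    -d '{
        "model": "meta-llama/Llama-3.1-70B-Instruct",
        "prompt": "Where is New York City?",
        "max_tokens": 32
    }'
```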
This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B and Qwen2.5-VL-7B for multimodal models:
@ -20,7 +20,7 @@ The overall performance benchmarking involves:
## Launch the NGC container
TensorRT-LLM distributes the pre-built container on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags).
TensorRT LLM distributes the pre-built container on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags).
You can launch the container using the following command:
@ -115,7 +115,7 @@ Then we can run the benchmark using the command below.
bash -x bench.sh &> output_bench.log
```
Below is some example TensorRT-LLM serving benchmark output. Your actual results may vary.
Below is some example TensorRT LLM serving benchmark output. Your actual results may vary.
```
============ Serving Benchmark Result ============

View File

@ -106,7 +106,7 @@ container published for a previous
[GitHub pre-release or release](https://github.com/NVIDIA/TensorRT-LLM/releases)
(see also [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags)).
```
""",
"""
}
autosummary_generate = True

View File

@ -0,0 +1,12 @@
Model Recipes
================
.. toctree::
:maxdepth: 1
:caption: Model Recipes
:name: Model Recipes
quick-start-recipe-for-deepseek-r1-on-trtllm.md
quick-start-recipe-for-llama3.3-70b-on-trtllm.md
quick-start-recipe-for-llama4-scout-on-trtllm.md
quick-start-recipe-for-gpt-oss-on-trtllm.md

View File

@ -1,10 +1,10 @@
# Quick Start Recipe for DeepSeek R1 on TensorRT-LLM - Blackwell & Hopper Hardware
# Quick Start Recipe for DeepSeek R1 on TensorRT LLM - Blackwell & Hopper Hardware
## Introduction
This deployment guide provides step-by-step instructions for running the DeepSeek R1 model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output.
This deployment guide provides step-by-step instructions for running the DeepSeek R1 model using TensorRT LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required: from accessing model weights and preparing the software environment to configuring TensorRT LLM parameters, launching the server, and validating inference output.
The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIAs accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution.
The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA's accelerated stack, starting with the PyTorch container from NGC, then installing TensorRT LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution.
## Prerequisites
@ -38,7 +38,7 @@ The default moe backend is `CUTLASS`, so for the combination which is not suppor
### Run Docker Container
Run the docker container using the TensorRT-LLM NVIDIA NGC image.
Run the docker container using the TensorRT LLM NVIDIA NGC image.
```shell
docker run --rm -it \
@ -58,11 +58,11 @@ Note:
* The command also maps port `8000` from the container to your host so you can access the LLM API endpoint from your host
* See the <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags> for all the available containers. The containers published in the main branch weekly have `rcN` suffix, while the monthly release with QA tests has no `rcN` suffix. Use the `rc` release to get the latest model and feature support.
If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to <https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html>.
If you want to use the latest main branch, you can choose to build TensorRT LLM from source; the steps are described at [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html).
### Creating the TRT-LLM Server config
We create a YAML configuration file `/tmp/config.yml` for the TensorRT-LLM Server and populate it with the following recommended performance settings.
We create a YAML configuration file `/tmp/config.yml` for the TensorRT LLM Server and populate it with the following recommended performance settings.
```shell
EXTRA_LLM_API_FILE=/tmp/config.yml
@ -143,7 +143,7 @@ These options are used directly on the command line when you start the `trtllm-s
#### `--backend pytorch`
* **Description:** Tells TensorRT-LLM to use the **pytorch** backend.
&emsp;**Description:** Tells TensorRT LLM to use the **pytorch** backend.
#### `--max_batch_size`
@ -159,7 +159,7 @@ These options are used directly on the command line when you start the `trtllm-s
#### `--trust_remote_code`
* **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
&emsp;**Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
#### Extra LLM API Options (YAML Configuration)
@ -230,7 +230,7 @@ Refer to the wide EP [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main
### Basic Test
Start a new terminal on the host to test the TensorRT-LLM server you just launched.
Start a new terminal on the host to test the TensorRT LLM server you just launched.
You can query the health/readiness of the server using:
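A sketch of such a probe, matching the command used in the companion guides (port 8000 is assumed from the container setup above):

```shell
curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
```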
@ -270,12 +270,12 @@ Here is an example response, showing that the TRT-LLM server returns “New York
We use the `lm-eval` tool to test the model's accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
To run the evaluation harness exec into the running TensorRT-LLM container and install with this command:
To run the evaluation harness, exec into the running TensorRT LLM container and install it with this command:
```shell
docker exec -it tensorrt_llm /bin/bash
pip install lm_eval
pip install -U lm-eval
```
FP8 command for GSM8K:
@ -318,7 +318,7 @@ Sample result in Blackwell:
## Benchmarking Performance
To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper `bench.sh` script.
To benchmark the performance of your TensorRT LLM server you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.
```shell
cat <<EOF > bench.sh
@ -366,7 +366,7 @@ Run `bench.sh` to begin a serving benchmark. This will take a long time if you r
./bench.sh
```
Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations.
Sample TensorRT LLM serving benchmark output. Your results may vary due to ongoing software optimizations.
```
============ Serving Benchmark Result ============

View File

@ -1,10 +1,10 @@
# Quick Start Recipe for Llama3.3 70B on TensorRT-LLM - Blackwell & Hopper Hardware
# Quick Start Recipe for Llama3.3 70B on TensorRT LLM - Blackwell & Hopper Hardware
## Introduction
This deployment guide provides step-by-step instructions for running the Llama 3.3-70B Instruct model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output.
This deployment guide provides step-by-step instructions for running the Llama 3.3-70B Instruct model using TensorRT LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required: from accessing model weights and preparing the software environment to configuring TensorRT LLM parameters, launching the server, and validating inference output.
The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIAs accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution.
The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA's accelerated stack, starting with the PyTorch container from NGC, then installing TensorRT LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution.
## Access & Licensing
@ -12,15 +12,15 @@ To use Llama 3.3-70B, you must first agree to Metas Llama 3 Community License
## Prerequisites
GPU: NVIDIA Blackwell or Hopper Architecture
OS: Linux
Drivers: CUDA Driver 575 or Later
Docker with NVIDIA Container Toolkit installed
GPU: NVIDIA Blackwell or Hopper Architecture
OS: Linux
Drivers: CUDA Driver 575 or Later
Docker with NVIDIA Container Toolkit installed
Python3 and python3-pip (Optional, for accuracy evaluation only)
## Models
* FP8 model: [Llama-3.3-70B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8)
* FP8 model: [Llama-3.3-70B-Instruct-FP8](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP8)
* NVFP4 model: [Llama-3.3-70B-Instruct-FP4](https://huggingface.co/nvidia/Llama-3.3-70B-Instruct-FP4)
@ -30,7 +30,7 @@ Note that NVFP4 is only supported on NVIDIA Blackwell
### Run Docker Container
Run the docker container using the TensorRT-LLM NVIDIA NGC image.
Run the docker container using the TensorRT LLM NVIDIA NGC image.
```shell
docker run --rm -it \
@ -43,18 +43,18 @@ nvcr.io/nvidia/tensorrt-llm/release:1.0.0rc6 \
/bin/bash
```
Note:
Note:
* The command mounts your user `.cache` directory to save the downloaded model checkpoints which are saved to `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container. If the `~/.cache` directory doesn't exist, please create it using `$ mkdir ~/.cache`.
* You can mount additional directories and paths using the `-v <host_path>:<container_path>` flag if needed, such as mounting the downloaded weight paths.
* The command also maps port `8000` from the container to your host so you can access the LLM API endpoint from your host
* See the <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags> for all the available containers. The containers published in the main branch weekly have `rcN` suffix, while the monthly release with QA tests has no `rcN` suffix. Use the `rc` release to get the latest model and feature support.
* You can mount additional directories and paths using the `-v <local_path>:<path>` flag if needed, such as mounting the downloaded weight paths.
* The command mounts your user `.cache` directory to save the downloaded model checkpoints which are saved to `~/.cache/huggingface/hub/` by default. This prevents having to redownload the weights each time you rerun the container. If the `~/.cache` directory doesn't exist, please create it using `mkdir ~/.cache`.
* The command also maps port **8000** from the container to your host so you can access the LLM API endpoint from your host
* See [https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags) for all the available containers. The containers published in the main branch weekly have an `rcN` suffix, while the monthly release with QA tests has no `rcN` suffix. Use the `rc` release to get the latest model and feature support.
If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to <https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html>.
If you want to use the latest main branch, you can choose to build TensorRT LLM from source; the steps are described at [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html).
### Creating the TRT-LLM Server config
We create a YAML configuration file `/tmp/config.yml` for the TensorRT-LLM Server and populate it with the following recommended performance settings.
We create a YAML configuration file `/tmp/config.yml` for the TensorRT LLM Server and populate it with the following recommended performance settings.
```shell
EXTRA_LLM_API_FILE=/tmp/config.yml
@ -64,7 +64,7 @@ enable_attention_dp: false
cuda_graph_config:
enable_padding: true
max_batch_size: 1024
kv_cache_config:
kv_cache_config:
dtype: fp8
EOF
```
@ -93,96 +93,98 @@ After the server is set up, the client can now send prompt requests to the serve
### Configs and Parameters
These options are used directly on the command line when you start the `trtllm-serve` process.
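For orientation, here is a hedged sketch of how these flags can be combined for the FP8 checkpoint (the parallel sizes, token limits, and batch size are illustrative values, not tuned recommendations):

```shell
trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 \
    --host 0.0.0.0 \
    --port 8000 \
    --backend pytorch \
    --tp_size 4 \
    --ep_size 4 \
    --max_batch_size 1024 \
    --max_num_tokens 8192 \
    --max_seq_len 8192 \
    --kv_cache_free_gpu_memory_fraction 0.9 \
    --trust_remote_code \
    --extra_llm_api_options ${EXTRA_LLM_API_FILE}
```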
#### `--tp_size`
* **Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance.
&emsp;**Description:** Sets the **tensor-parallel size**. This should typically match the number of GPUs you intend to use for a single model instance.
#### `--ep_size`
* **Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tp_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models.
&emsp;**Description:** Sets the **expert-parallel size** for Mixture-of-Experts (MoE) models. Like `tp_size`, this should generally match the number of GPUs you're using. This setting has no effect on non-MoE models.
#### `--kv_cache_free_gpu_memory_fraction`
* **Description:** A value between `0.0` and `1.0` that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
* **Recommendation:** If you experience OOM errors, try reducing this value to `0.8` or lower.
&emsp;**Description:** A value between 0.0 and 1.0 that specifies the fraction of free GPU memory to reserve for the KV cache after the model is loaded. Since memory usage can fluctuate, this buffer helps prevent out-of-memory (OOM) errors.
&emsp;**Recommendation:** If you experience OOM errors, try reducing this value to **0.8** or lower.
#### `--backend pytorch`
* **Description:** Tells TensorRT-LLM to use the **pytorch** backend.
&emsp;**Description:** Tells TensorRT LLM to use the **pytorch** backend.
#### `--max_batch_size`
* **Description:** The maximum number of user requests that can be grouped into a single batch for processing.
&emsp;**Description:** The maximum number of user requests that can be grouped into a single batch for processing.
#### `--max_num_tokens`
* **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch.
&emsp;**Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch.
#### `--max_seq_len`
* **Description:** The maximum possible sequence length for a single request, including both input and generated output tokens.
&emsp;**Description:** The maximum possible sequence length for a single request, including both input and generated output tokens.
#### `--trust_remote_code`
* **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
&emsp;**Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
#### Extra LLM API Options (YAML Configuration)
These options provide finer control over performance and are set within a YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument.
These options provide finer control over performance and are set within a YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument.
#### `kv_cache_config`
* **Description**: A section for configuring the Key-Value (KV) cache.
&emsp;**Description**: A section for configuring the Key-Value (KV) cache.
* **Options**:
&emsp;**Options**:
* `dtype`: Sets the data type for the KV cache.
**Default**: `"auto"` (uses the data type specified in the model checkpoint).
&emsp;&emsp;dtype: Sets the data type for the KV cache.
&emsp;&emsp;**Default**: auto (uses the data type specified in the model checkpoint).
#### `cuda_graph_config`
* **Description**: A section for configuring CUDA graphs to optimize performance.
&emsp;**Description**: A section for configuring CUDA graphs to optimize performance.
* **Options**:
&emsp;**Options**:
* `enable_padding`: If `"true"`, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance.
&emsp;&emsp;`enable_padding`: If true, input batches are padded to the nearest `cuda_graph_batch_size`. This can significantly improve performance.
**Default**: `false`
&emsp;&emsp;**Default**: false
* `max_batch_size`: Sets the maximum batch size for which a CUDA graph will be created.
&emsp;&emsp;`max_batch_size`: Sets the maximum batch size for which a CUDA graph will be created.
**Default**: `0`
&emsp;&emsp;**Default**: 0
**Recommendation**: Set this to the same value as the `--max_batch_size` command-line option.
&emsp;&emsp;**Recommendation**: Set this to the same value as the `--max_batch_size` command-line option.
* `batch_sizes`: A specific list of batch sizes to create CUDA graphs for.
&emsp;&emsp;`batch_sizes`: A specific list of batch sizes to create CUDA graphs for.
**Default**: `None`
&emsp;&emsp;**Default**: None
#### `moe_config`
* **Description**: Configuration for Mixture-of-Experts (MoE) models.
&emsp;**Description**: Configuration for Mixture-of-Experts (MoE) models.
* **Options**:
&emsp;**Options**:
* `backend`: The backend to use for MoE operations.
**Default**: `CUTLASS`
&emsp;&emsp;backend: The backend to use for MoE operations.
&emsp;&emsp;**Default**: CUTLASS
#### `attention_backend`
* **Description**: The backend to use for attention calculations.
&emsp;**Description**: The backend to use for attention calculations.
* **Default**: `TRTLLM`
&emsp;**Default**: TRTLLM
See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`.
See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`.
## Testing API Endpoint
### Basic Test
Start a new terminal on the host to test the TensorRT-LLM server you just launched.
Start a new terminal on the host to test the TensorRT LLM server you just launched.
You can query the health/readiness of the server using:
@ -192,7 +194,7 @@ curl -s -o /dev/null -w "Status: %{http_code}\n" "http://localhost:8000/health"
When the `Status: 200` code is returned, the server is ready for queries. Note that the very first query may take longer due to initialization and compilation.
After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
After the TRT-LLM server is set up and shows Application startup complete, you can send requests to the server.
```shell
curl http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{
@ -211,35 +213,35 @@ Here is an example response, showing that the TRT-LLM server returns “New York
### Troubleshooting Tips
* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`.
* Ensure your model checkpoints are compatible with the expected format.
* For performance issues, check GPU utilization with nvidia-smi while the server is running.
* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed.
* For connection issues, make sure the server port (`8000` in this guide) is not being used by another application.
* If you encounter CUDA out-of-memory errors, try reducing `max_batch_size` or `max_seq_len`
* Ensure your model checkpoints are compatible with the expected format
* For performance issues, check GPU utilization with nvidia-smi while the server is running
* If the container fails to start, verify that the NVIDIA Container Toolkit is properly installed
* For connection issues, make sure port 8000 is not being used by another application
### Running Evaluations to Verify Accuracy (Optional)
We use the lm-eval tool to test the models accuracy. For more information see <https://github.com/EleutherAI/lm-evaluation-harness>.
We use the lm-eval tool to test the model's accuracy. For more information see [https://github.com/EleutherAI/lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness).
To run the evaluation harness exec into the running TensorRT-LLM container and install with this command:
To run the evaluation harness, exec into the running TensorRT LLM container and install it with this command:
```shell
docker exec -it tensorrt_llm /bin/bash
pip install lm_eval
pip install -U lm-eval
```
FP8 command for GSM8K
* Note: The tokenizer will add BOS (beginning of sentence token) before input prompt by default which leads to accuracy regression on GSM8K task for Llama 3.3 70B instruction model. So, set `add_special_tokens=False` to avoid it.
* Note: The tokenizer will add BOS (beginning of sentence token) before input prompt by default which leads to accuracy regression on GSM8K task for Llama 3.3 70B instruction model. So, set `add_special_tokens=False` to avoid it.
```shell
```
MODEL_PATH=nvidia/Llama-3.3-70B-Instruct-FP8
lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs temperature=0.0,add_special_tokens=False --num_fewshot 5 --model_args model=${MODEL_PATH},base_url=http://localhost:8000/v1/completions,num_concurrent=32,max_retries=20,tokenized_requests=False --log_samples --output_path trtllm.fp8.gsm8k
```
Sample result in Blackwell.
Sample result in Blackwell.
```
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
@ -250,7 +252,7 @@ Sample result in Blackwell.
FP4 command for GSM8K
* Note: The tokenizer will add BOS before input prompt by default, which leads to accuracy regression on GSM8K task for LLama 3.3 70B instruction model. So set `add_special_tokens=False` to avoid it.
* Note: The tokenizer will add BOS before input prompt by default, which leads to accuracy regression on GSM8K task for Llama 3.3 70B instruction model. So set `add_special_tokens=False` to avoid it.
```shell
MODEL_PATH=nvidia/Llama-3.3-70B-Instruct-FP4
@ -260,7 +262,7 @@ lm_eval --model local-completions --tasks gsm8k --batch_size 256 --gen_kwargs t
Sample result in Blackwell
```
```shell
|Tasks|Version| Filter |n-shot| Metric | |Value | |Stderr|
|-----|------:|----------------|-----:|-----------|---|-----:|---|-----:|
|gsm8k| 3|flexible-extract| 5|exact_match|↑ |0.9356|± |0.0068|
@ -269,7 +271,7 @@ Sample result in Blackwell
## Benchmarking Performance
To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper `bench.sh` script.
To benchmark the performance of your TensorRT LLM server you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.
```shell
cat <<EOF > bench.sh
@ -299,7 +301,7 @@ EOF
chmod +x bench.sh
```
To benchmark the FP4 model, replace `--model nvidia/Llama-3.3-70B-Instruct-FP8` with `--model nvidia/Llama-3.3-70B-Instruct-FP4`.
To benchmark the FP4 model, replace `--model nvidia/Llama-3.3-70B-Instruct-FP8` with `--model nvidia/Llama-3.3-70B-Instruct-FP4`.
If you want to save the results to a file add the following options.
@ -309,15 +311,15 @@ If you want to save the results to a file add the following options.
--result-filename "concurrency_${concurrency}.json"
```
For more benchmarking options see <https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt\_llm/serve/scripts/benchmark\_serving.py>.
For more benchmarking options see [https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py).
Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script.
Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script.
```shell
./bench.sh
```
Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations.
Sample TensorRT LLM serving benchmark output. Your results may vary due to ongoing software optimizations.
```
============ Serving Benchmark Result ============
@ -350,13 +352,13 @@ P99 E2EL (ms): [result]
### Key Metrics
* Median Time to First Token (TTFT)
* The typical time elapsed from when a request is sent until the first output token is generated.
* Median Time Per Output Token (TPOT)
* The typical time required to generate each token *after* the first one.
* Median Inter-Token Latency (ITL)
* The typical time delay between the completion of one token and the completion of the next.
* Median End-to-End Latency (E2EL)
* The typical total time from when a request is submitted until the final token of the response is received.
* Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.
* Median Time to First Token (TTFT)
* The typical time elapsed from when a request is sent until the first output token is generated.
* Median Time Per Output Token (TPOT)
* The typical time required to generate each token *after* the first one.
* Median Inter-Token Latency (ITL)
* The typical time delay between the completion of one token and the completion of the next.
* Median End-to-End Latency (E2EL)
* The typical total time from when a request is submitted until the final token of the response is received.
* Total Token Throughput
* The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens.

View File

@ -1,10 +1,10 @@
# Quick Start Recipe for Llama4 Scout 17B on TensorRT-LLM - Blackwell & Hopper Hardware
# Quick Start Recipe for Llama4 Scout 17B on TensorRT LLM - Blackwell & Hopper Hardware
## Introduction
This deployment guide provides step-by-step instructions for running the Llama-4-Scout-17B-16E-Instruct model using TensorRT-LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required; from accessing model weights and preparing the software environment to configuring TensorRT-LLM parameters, launching the server, and validating inference output.
This deployment guide provides step-by-step instructions for running the Llama-4-Scout-17B-16E-Instruct model using TensorRT LLM with FP8 and NVFP4 quantization, optimized for NVIDIA GPUs. It covers the complete setup required: from accessing model weights and preparing the software environment to configuring TensorRT LLM parameters, launching the server, and validating inference output.
The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIAs accelerated stack—starting with the PyTorch container from NGC, then installing TensorRT-LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution.
The guide is intended for developers and practitioners seeking high-throughput or low-latency inference using NVIDIA's accelerated stack, starting with the PyTorch container from NGC, then installing TensorRT LLM for model serving, FlashInfer for optimized CUDA kernels, and ModelOpt to enable FP8 and NVFP4 quantized execution.
## Access & Licensing
@ -29,7 +29,7 @@ Note that NVFP4 is only supported on NVIDIA Blackwell platform.
### Run Docker Container
Run the docker container using the TensorRT-LLM NVIDIA NGC image.
Run the docker container using the TensorRT LLM NVIDIA NGC image.
```shell
docker run --rm -it \
@ -49,11 +49,11 @@ Note:
* The command also maps port `8000` from the container to your host so you can access the LLM API endpoint from your host
* See the <https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags> for all the available containers. The containers published in the main branch weekly have `rcN` suffix, while the monthly release with QA tests has no `rcN` suffix. Use the `rc` release to get the latest model and feature support.
If you want to use latest main branch, you can choose to build from source to install TensorRT-LLM, the steps refer to <https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html>.
If you want to use the latest main branch, you can choose to build TensorRT LLM from source; the steps are described at [https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html](https://nvidia.github.io/TensorRT-LLM/latest/installation/build-from-source-linux.html).
### Creating the TRT-LLM Server config
We create a YAML configuration file `/tmp/config.yml` for the TensorRT-LLM Server and populate it with the following recommended performance settings.
We create a YAML configuration file `/tmp/config.yml` for the TensorRT LLM Server and populate it with the following recommended performance settings.
```shell
EXTRA_LLM_API_FILE=/tmp/config.yml
@ -108,7 +108,7 @@ These options are used directly on the command line when you start the `trtllm-s
#### `--backend pytorch`
* **Description:** Tells TensorRT-LLM to use the **pytorch** backend.
&emsp;**Description:** Tells TensorRT LLM to use the **pytorch** backend.
#### `--max_batch_size`
@ -124,7 +124,7 @@ These options are used directly on the command line when you start the `trtllm-s
#### `--trust_remote_code`
* **Description:** Allows TensorRT-LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
&emsp;**Description:** Allows TensorRT LLM to download models and tokenizers from Hugging Face. This flag is passed directly to the Hugging Face API.
#### Extra LLM API Options (YAML Configuration)
@ -175,13 +175,13 @@ These options provide finer control over performance and are set within a YAML f
* **Default**: `TRTLLM`
See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`.
See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`.
## Testing API Endpoint
### Basic Test
Start a new terminal on the host to test the TensorRT-LLM server you just launched.
Start a new terminal on the host to test the TensorRT LLM server you just launched.
You can query the health/readiness of the server using:
@ -220,12 +220,12 @@ Here is an example response, showing that the TRT-LLM server returns “New York
We use the lm-eval tool to test the model's accuracy. For more information see <https://github.com/EleutherAI/lm-evaluation-harness>.
To run the evaluation harness exec into the running TensorRT-LLM container and install with this command:
To run the evaluation harness, exec into the running TensorRT LLM container and install it with this command:
```shell
docker exec -it tensorrt_llm /bin/bash
pip install lm_eval
pip install -U lm-eval
```
FP8 command for GSM8K
@ -264,7 +264,7 @@ Sample result in Blackwell
## Benchmarking Performance
To benchmark the performance of your TensorRT-LLM server you can leverage the built-in `benchmark_serving.py` script. To do this first creating a wrapper `bench.sh` script.
To benchmark the performance of your TensorRT LLM server you can leverage the built-in `benchmark_serving.py` script. To do this, first create a wrapper `bench.sh` script.
```shell
cat <<EOF > bench.sh
@ -312,7 +312,7 @@ Run bench.sh to begin a serving benchmark. This will take a long time if you run
./bench.sh
```
Sample TensorRT-LLM serving benchmark output. Your results may vary due to ongoing software optimizations.
Sample TensorRT LLM serving benchmark output. Your results may vary due to ongoing software optimizations.
```
============ Serving Benchmark Result ============

View File

@ -1,9 +1,9 @@
(build-image-to-dockerhub)=
# Build the TensorRT-LLM Docker Image
# Build the TensorRT LLM Docker Image
When you develop TensorRT LLM on a cloud platform such as RunPod, you may need to provide a Docker image for the platform, so you first need to upload the image to Docker Hub.
## Build the TensorRT-LLM Docker Image and Upload to DockerHub
## Build the TensorRT LLM Docker Image and Upload to DockerHub
```bash
make -C docker build

View File

@ -0,0 +1,98 @@
(perf-analysis)=
# Performance Analysis
NVIDIA Nsight Systems reports at the application level are highly informative. Metric sampling capabilities have increased over generations and provide a clean middle-ground between timing analysis and kernel-level deep dives with NVIDIA Nsight Compute.
Given the potentially long runtimes of Large Language Models (LLMs) and the diversity of workloads a model may experience during a single inference pass or binary execution, NVIDIA has added features to TensorRT LLM to get the most out of Nsight Systems capabilities. This document outlines those features and provides examples of how best to utilize them to understand your application.
## Feature Descriptions
The main functionality:
* Relies on toggling the CUDA profiler runtime API on and off.
* (PyTorch workflow only) Toggling the PyTorch profiler on and off.
* Provides a means to understand which regions a user may want to focus on.
Toggling the CUDA profiler runtime API on and off:
* Allows users to know specifically what the profiled region corresponds to.
* Results in smaller files to post-process (for metric extraction or similar).
(PyTorch workflow only) Toggling the PyTorch profiler on and off:
* Helps users analyze the performance breakdown in the model.
* Results in smaller files to post-process (for metric extraction or similar).
## Coordinating with NVIDIA Nsight Systems Launch
Consult the Nsight Systems User Guide for a full overview of options.
In the PyTorch workflow, basic NVTX markers are provided by default. In the C++/TensorRT workflow, append `--nvtx` when calling the `scripts/build_wheel.py` script to compile, and clean-build the code.
### Only collect specific iterations
To reduce the Nsight Systems profile size and ensure that only specific iterations are collected, set the environment variable `TLLM_PROFILE_START_STOP=A-B` and append `-c cudaProfilerApi` to the `nsys profile` command.
### Enable more NVTX markers for debugging
Set environment variable `TLLM_NVTX_DEBUG=1`.
### Enable garbage collection (GC) NVTX markers
Set environment variable `TLLM_PROFILE_RECORD_GC=1`.
### Enable GIL information in NVTX markers
Append `python-gil` to the Nsys `-t` option.
## Coordinating with PyTorch profiler (PyTorch workflow only)
### Collect PyTorch profiler results
1. Set environment variable `TLLM_PROFILE_START_STOP=A-B` to specify the range of the iterations to be collected.
2. Set environment variable `TLLM_TORCH_PROFILE_TRACE=<path>`, and the results will be saved to `<path>`.
### Visualize the PyTorch profiler results
Use [chrome://tracing/](chrome://tracing/) to inspect the saved profile.
## Examples
Consult the Nsight Systems User Guide for a full overview of MPI-related options.
### Profiling specific iterations on a `trtllm-bench`/`trtllm-serve` run
Say we want to profile iterations 100 to 150 of a `trtllm-bench`/`trtllm-serve` run, and we want to collect as much information as possible for debugging, such as GIL activity, debugging NVTX markers, and so on:
```bash
#!/bin/bash
# Prepare dataset for the benchmark
python3 benchmarks/cpp/prepare_dataset.py \
--tokenizer=${MODEL_PATH} \
--stdout token-norm-dist --num-requests=${NUM_SAMPLES} \
--input-mean=1000 --output-mean=1000 --input-stdev=0 --output-stdev=0 > /tmp/dataset.txt
# Benchmark and profile
TLLM_PROFILE_START_STOP=100-150 nsys profile \
-o trace -f true \
-t 'cuda,nvtx,python-gil' -c cudaProfilerApi \
--cuda-graph-trace node \
-e TLLM_PROFILE_RECORD_GC=1,TLLM_LLMAPI_ENABLE_NVTX=1,TLLM_TORCH_PROFILE_TRACE=trace.json \
--trace-fork-before-exec=true \
# Application under profile: trtllm-bench (or a trtllm-serve command)
trtllm-bench \
--model deepseek-ai/DeepSeek-V3 \
--model_path ${MODEL_PATH} \
throughput \
--dataset /tmp/dataset.txt --warmup 0 \
--backend pytorch \
--streaming
```
The Nsight Systems reports will be saved to `trace.nsys-rep`. Use NVIDIA Nsight Systems application to open it.
The PyTorch profiler results will be saved to `trace.json`. Use [chrome://tracing/](chrome://tracing/) to inspect the saved profile.

View File

@ -0,0 +1,468 @@
(perf-benchmarking)=
# TensorRT LLM Benchmarking
```{important}
This benchmarking suite is a work in progress.
Expect breaking API changes.
```
TensorRT LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it
easier for users to reproduce our officially published [performance overview](./perf-overview.md#throughput-measurements). `trtllm-bench` provides the following:
- A streamlined way to build tuned engines for benchmarking for a variety of models and platforms.
- An entirely Python workflow for benchmarking.
- Ability to benchmark various flows and features within TensorRT LLM.
`trtllm-bench` executes all benchmarks using in-flight batching -- for more information, see
the [in-flight batching section](../advanced/gpt-attention.md#in-flight-batching) that describes the concept
in further detail.
## Before Benchmarking
For rigorous benchmarking where consistent and reproducible results are critical, proper GPU configuration is essential. These settings help maximize GPU utilization, eliminate performance variability, and ensure optimal conditions for accurate measurements. While not strictly required for normal operation, we recommend applying these configurations when conducting performance comparisons or publishing benchmark results.
### Persistence mode
Ensure persistence mode is enabled to maintain consistent GPU state:
```shell
sudo nvidia-smi -pm 1
```
### GPU Clock Management
Allow the GPU to dynamically adjust its clock speeds based on workload and temperature. While locking clocks at maximum frequency might seem beneficial, it can sometimes lead to thermal throttling and reduced performance. Reset GPU clocks using:
```shell
sudo nvidia-smi -rgc
```
### Set power limits
First query the maximum power limit:
```shell
nvidia-smi -q -d POWER
```
Then configure the GPU to operate at its maximum power limit for consistent performance:
```shell
sudo nvidia-smi -pl <max_power_limit>
```
### Boost settings
A GPU may also support boost levels. First, query the available boost levels:
```shell
sudo nvidia-smi boost-slider -l
```
If supported, enable the boost slider using one of the available levels for maximum performance:
```shell
sudo nvidia-smi boost-slider --vboost <max_boost_slider>
```
## Throughput Benchmarking
### Limitations and Caveats
#### Validated Networks for Benchmarking
While `trtllm-bench` should be able to run any network that TensorRT LLM supports, the following models have been
validated extensively; this is the same listing as seen on the
[Performance Overview](./perf-overview.md) page.
- [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
- [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)
- [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)
- [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
- [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B)
- [meta-llama/Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- [meta-llama/Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
- [meta-llama/Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
- [mistralai/Mixtral-8x7B-v0.1-Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1-Instruct)
```{tip}
`trtllm-bench` can automatically download the model from Hugging Face Model Hub.
Export your token in the `HF_TOKEN` environment variable.
```
#### Supported Quantization Modes
`trtllm-bench` supports the following quantization modes:
- None (no quantization applied)
- `FP8`
- `NVFP4`
For more information about quantization, refer to [](../reference/precision.md) and
the [support matrix](../reference/precision.md#support-matrix) of the supported quantization methods for each network.
```{tip}
Although TensorRT LLM supports more quantization modes than listed above, `trtllm-bench` currently configures only
a smaller subset.
```
### Preparing a Dataset
The throughput benchmark utilizes a fixed JSON schema to specify requests. The schema is defined as follows:
| Key | Required | Type | Description |
| :-------------- | :------: | :-----------: | :---------------------------------------------- |
| `task_id` | Y | String | Unique identifier for the request. |
| `prompt` | N* | String | Input text for a generation request. |
| `input_ids`     | Y*       | List[Integer] | List of token IDs that make up the request prompt. |
| `output_tokens` | Y | Integer | Number of generated tokens for this request. |
```{tip}
\* Specifying `prompt` or `input_ids` is required. However, you cannot have both `prompt` and token IDs (`input_ids`)
defined at the same time. If you specify `input_ids`, the `prompt` entry is ignored for request generation.
```
Refer to the following examples of valid entries for the benchmark:
- Entries with a human-readable prompt and no logits.
```json
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
```
- Entries which contain logits.
```json
{"task_id":0,"input_ids":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
{"task_id":1,"input_ids":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
```
```{tip}
Specify each entry on one line so that the benchmarker can read a line and assume a complete JSON entry.
When creating a dataset, make sure that every line contains a complete JSON entry.
```
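If you generate a dataset programmatically, a minimal sketch like the following (with illustrative prompts and token IDs) writes one complete JSON entry per line in the expected format:
```python
import json

# Illustrative entries only; real datasets are typically produced with prepare_dataset.py.
entries = [
    {"task_id": 0, "prompt": "Summarize the plot of Hamlet in one paragraph.", "output_tokens": 128},
    {"task_id": 1, "input_ids": [863, 22056, 25603, 11943, 8932], "output_tokens": 128},
]

with open("/tmp/dataset.txt", "w") as f:
    for entry in entries:
        # One complete JSON object per line (JSON Lines).
        f.write(json.dumps(entry) + "\n")
```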
In order to prepare a synthetic dataset, you can use the provided script in the `benchmarks/cpp`
directory. For example, to generate a synthetic dataset of 1000 requests with a uniform ISL/OSL of
128/128 for [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B), run:
```shell
python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-3.1-8B token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 1000 > /tmp/synthetic_128_128.txt
```
### Running with the PyTorch Workflow
To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with the [dataset](#preparing-a-dataset) generated in the previous steps. The `throughput` benchmark initializes the backend by tuning against the
dataset provided via `--dataset` (or the other build mode settings described [above](#other-build-modes)).
Note that CUDA graphs are enabled by default. You can pass additional PyTorch configuration options with
`--extra_llm_api_options` followed by the path to a YAML file. For more details, refer to the
help text by running the command with `--help`.
```{tip}
The command below specifies the `--model_path` option. The model path is optional and used only when you want to run a locally
stored checkpoint. When using `--model_path`, the `--model` option is still required for reporting reasons and to look up
parameters for build heuristics.
```
```shell
trtllm-bench --model meta-llama/Llama-3.1-8B \
--model_path /Ckpt/Path/To/Llama-3.1-8B \
throughput \
--dataset /tmp/synthetic_128_128.txt \
--backend pytorch
# Example output
<snip verbose logging>
===========================================================
= PyTorch backend
===========================================================
Model: meta-llama/Llama-3.1-8B
Model Path: /Ckpt/Path/To/Llama-3.1-8B
TensorRT-LLM Version: 0.17.0
Dtype: bfloat16
KV Cache Dtype: None
Quantization: FP8
===========================================================
= WORLD + RUNTIME INFORMATION
===========================================================
TP Size: 1
PP Size: 1
Max Runtime Batch Size: 2048
Max Runtime Tokens: 4096
Scheduling Policy: Guaranteed No Evict
KV Memory Percentage: 90.00%
Issue Rate (req/sec): 7.6753E+14
===========================================================
= PERFORMANCE OVERVIEW
===========================================================
Number of requests: 3000
Average Input Length (tokens): 128.0000
Average Output Length (tokens): 128.0000
Token Throughput (tokens/sec): 20685.5510
Request Throughput (req/sec): 161.6059
Total Latency (ms): 18563.6825
```
When streaming is enabled, time to first token (TTFT) and inter-token latency (ITL) metrics are also recorded. Pass `--streaming` to enable it:
```shell
trtllm-bench --model meta-llama/Llama-3.1-8B \
--model_path /Ckpt/Path/To/Llama-3.1-8B \
throughput \
--dataset /tmp/synthetic_128_128.txt \
--backend pytorch \
--streaming
```
Alternatively, users can benchmark the low latency mode:
```shell
trtllm-bench --model meta-llama/Llama-3.1-8B \
--model_path /Ckpt/Path/To/Llama-3.1-8B \
latency \
--dataset /tmp/synthetic_128_128.txt \
--backend pytorch
```
#### Benchmarking with LoRA Adapters in PyTorch workflow
The PyTorch workflow supports benchmarking with LoRA (Low-Rank Adaptation) adapters. This requires preparing a dataset with LoRA metadata and configuring the LoRA settings.
**Preparing LoRA Dataset**
Use `prepare_dataset.py` with LoRA-specific options to generate requests with LoRA metadata:
```shell
python3 benchmarks/cpp/prepare_dataset.py \
--stdout \
--rand-task-id 0 1 \
--tokenizer /path/to/tokenizer \
--lora-dir /path/to/loras \
token-norm-dist \
--num-requests 100 \
--input-mean 128 \
--output-mean 128 \
--input-stdev 16 \
--output-stdev 24 \
> synthetic_lora_data.json
```
Key LoRA options:
- `--lora-dir`: Parent directory containing LoRA adapter subdirectories named by their task IDs (e.g., `0/`, `1/`, etc.)
- `--rand-task-id`: Range of LoRA task IDs to randomly assign to requests
- `--task-id`: Fixed LoRA task ID for all requests (alternative to `--rand-task-id`)
The generated dataset will include LoRA request metadata. Below is an example of a single such request data entry:
```json
{
"task_id": 0,
"input_ids": [3452, 88226, 102415, ...],
"output_tokens": 152,
"lora_request": {
"lora_name": "lora_0",
"lora_int_id": 0,
"lora_path": "/path/to/loras/0"
}
}
```
**LoRA Configuration**
Create an `extra-llm-api-options.yaml` file with LoRA configuration:
```yaml
lora_config:
lora_dir:
- /path/to/loras/0
- /path/to/loras/1
max_lora_rank: 64
lora_target_modules:
- attn_q
- attn_k
- attn_v
trtllm_modules_to_hf_modules:
attn_q: q_proj
attn_k: k_proj
attn_v: v_proj
```
**Running LoRA Benchmark**
```shell
trtllm-bench --model /path/to/base/model \
throughput \
--dataset synthetic_lora_data.json \
--backend pytorch \
--extra_llm_api_options extra-llm-api-options.yaml
```
```{note}
The LoRA directory structure should have task-specific subdirectories named by their task IDs (e.g., `loras/0/`, `loras/1/`).
Each subdirectory should contain the LoRA adapter files for that specific task.
```
#### Running multi-modal models in the PyTorch Workflow
To benchmark multi-modal models with the PyTorch workflow, you can follow a similar approach to the one above.
First, prepare the dataset:
```shell
python ./benchmarks/cpp/prepare_dataset.py \
--tokenizer Qwen/Qwen2-VL-2B-Instruct \
--stdout \
dataset \
--dataset-name lmms-lab/MMMU \
--dataset-split test \
--dataset-image-key image \
--dataset-prompt-key question \
--num-requests 10 \
--output-len-dist 128,5 > mm_data.jsonl
```
This downloads the media files to the `/tmp` directory and prepares the dataset with their paths. Note that the `prompt` fields contain text rather than tokenized IDs, because
the prompt and the media (image/video) are processed together by a multimodal preprocessor.
Sample dataset for multimodal:
```
{"task_id":0,"prompt":"Brahma Industries sells vinyl replacement windows to home improvement retailers nationwide. The national sales manager believes that if they invest an additional $25,000 in advertising, they would increase sales volume by 10,000 units. <image 1> What is the total contribution margin?","media_paths":["/tmp/tmp9so41y3r.jpg"],"output_tokens":126}
{"task_id":1,"prompt":"Let us compute for the missing amounts under work in process inventory, what is the cost of goods manufactured? <image 1>","media_paths":["/tmp/tmpowsrb_f4.jpg"],"output_tokens":119}
{"task_id":2,"prompt":"Tsuji is reviewing the price of a 3-month Japanese yen/U.S. dollar currency futures contract, using the currency and interest rate data shown below. Because the 3-month Japanese interest rate has just increased to .50%, Itsuji recognizes that an arbitrage opportunity exists nd decides to borrow $1 million U.S. dollars to purchase Japanese yen. Calculate the yen arbitrage profit from Itsuji's strategy, using the following data: <image 1> ","media_paths":["/tmp/tmpxhdvasex.jpg"],"output_tokens":126}
...
```
Run the benchmark:
```shell
trtllm-bench --model Qwen/Qwen2-VL-2B-Instruct \
throughput \
--dataset mm_data.jsonl \
--backend pytorch \
--num_requests 10 \
--max_batch_size 4 \
--modality image
```
Sample output:
```
===========================================================
= REQUEST DETAILS
===========================================================
Number of requests: 10
Number of concurrent requests: 5.3019
Average Input Length (tokens): 411.6000
Average Output Length (tokens): 128.7000
===========================================================
= WORLD + RUNTIME INFORMATION
===========================================================
TP Size: 1
PP Size: 1
EP Size: None
Max Runtime Batch Size: 4
Max Runtime Tokens: 12288
Scheduling Policy: GUARANTEED_NO_EVICT
KV Memory Percentage: 90.00%
Issue Rate (req/sec): 1.4117E+17
===========================================================
= PERFORMANCE OVERVIEW
===========================================================
Request Throughput (req/sec): 1.4439
Total Output Throughput (tokens/sec): 185.8351
Per User Output Throughput (tokens/sec/user): 38.1959
Per GPU Output Throughput (tokens/sec/gpu): 185.8351
Total Token Throughput (tokens/sec): 780.1607
Total Latency (ms): 6925.4963
Average request latency (ms): 3671.8441
-- Request Latency Breakdown (ms) -----------------------
[Latency] P50 : 3936.3022
[Latency] P90 : 5514.4701
[Latency] P95 : 5514.4701
[Latency] P99 : 5514.4701
[Latency] MINIMUM: 2397.1047
[Latency] MAXIMUM: 5514.4701
[Latency] AVERAGE: 3671.8441
===========================================================
= DATASET DETAILS
===========================================================
Dataset Path: /workspaces/tensorrt_llm/mm_data.jsonl
Number of Sequences: 10
-- Percentiles statistics ---------------------------------
Input Output Seq. Length
-----------------------------------------------------------
MIN: 167.0000 119.0000 300.0000
MAX: 1059.0000 137.0000 1178.0000
AVG: 411.6000 128.7000 540.3000
P50: 299.0000 128.0000 427.0000
P90: 1059.0000 137.0000 1178.0000
P95: 1059.0000 137.0000 1178.0000
P99: 1059.0000 137.0000 1178.0000
===========================================================
```
**Notes and Limitations**:
- Only image datasets are supported for now.
- `--output-len-dist` is a required argument for multimodal datasets.
- Tokenizer is unused during the prepare step but it is still a required argument.
- Since the images are converted to tokens when the model is run, `trtllm-bench` uses a default large value for the maximum input sequence length when setting up the execution settings.
You can modify this behavior by specifying a different value with the `--max_input_len` flag to suit your use case.
#### Quantization in the PyTorch Flow
To run a quantized benchmark with `trtllm-bench` utilizing the PyTorch flow, you will need to use a pre-quantized
checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkpoints via HuggingFace:
- [`nvidia/Llama-3.1-8B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-8B-Instruct-FP8)
- [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8)
- [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8)
To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/TensorRT-Model-Optimizer/getting_started/3_quantization.html).
`trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration
file is present in checkpoints quantized with [TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer)
and describes the compute and KV cache quantization that the checkpoint was compiled with. For example, from the checkpoints
above:
```json
{
"producer": {
"name": "modelopt",
"version": "0.23.0rc1"
},
"quantization": {
"quant_algo": "FP8",
"kv_cache_quant_algo": null
}
}
```
The checkpoints above are quantized to run with a compute precision of `FP8` and default to no KV cache quantization (full
`FP16` cache). When running `trtllm-bench throughput`, the benchmark automatically selects a KV cache quantization that is best suited
to the compute precision in the checkpoint if `kv_cache_quant_algo` is specified as `null`; otherwise, it is
forced to match the specified non-null KV cache quantization. The following are the mappings that `trtllm-bench`
follows when a checkpoint does not specify a KV cache quantization algorithm:
| Checkpoint Compute Quant | Checkpoint KV Cache Quant | `trtllm-bench` | Note |
| - | - | - | - |
| `null` | `null` | `null` | In this case, a quantization config doesn't exist. |
| `FP8` | `FP8` | `FP8` | Matches the checkpoint |
| `FP8` | `null` | `FP8` | Set to `FP8` via benchmark |
| `NVFP4` | `null` | `FP8` | Set to `FP8` via benchmark |
If you would like to force the KV cache quantization, you can specify the following in the YAML file to force the precision
when the checkpoint precision is `null`:
```yaml
kv_cache_dtype: "fp8"
```
```{tip}
The two valid values for `kv_cache_dtype` are `auto` and `fp8`.
```

View File

@ -2,7 +2,7 @@
## Quantization
TensorRT-LLM can quantize the Hugging Face model automatically. By setting the appropriate flags in the `LLM` instance. For example, to perform an Int4 AWQ quantization, the following code triggers the model quantization. Please refer to complete list of [supported flags](https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/quantization/mode.html#QuantAlgo) and acceptable values.
TensorRT LLM can quantize the Hugging Face model automatically by setting the appropriate flags in the `LLM` instance. For example, to perform Int4 AWQ quantization, the following code triggers the model quantization. Refer to the complete list of [supported flags](https://nvidia.github.io/TensorRT-LLM/_modules/tensorrt_llm/quantization/mode.html#QuantAlgo) and acceptable values.
``` python
from tensorrt_llm.llmapi import QuantConfig, QuantAlgo

View File

@ -0,0 +1,18 @@
Dynamo K8s Example
=================================
1. Install Dynamo Cloud
Please follow `this guide <https://docs.nvidia.com/dynamo/latest/guides/dynamo_deploy/dynamo_cloud.html>`_
to install Dynamo cloud for your Kubernetes cluster.
2. Deploy the TRT-LLM Deployment
Dynamo uses custom resource definitions (CRDs) to manage the lifecycle of the
deployments. You can use the `DynamoDeploymentGraph yaml <https://github.com/ai-dynamo/dynamo/tree/main/components/backends/trtllm/deploy>`_
files to create aggregated and disaggregated TRT-LLM deployments.
Please see `Deploying Dynamo Inference Graphs to Kubernetes using the Dynamo
Cloud Platform <https://docs.nvidia.com/dynamo/latest/guides/dynamo_deploy/operator_deployment.html>`_
for more details.

View File

@ -0,0 +1,47 @@
# How to Change KV Cache Behavior
Set KV cache behavior by providing the optional ```kv_cache_config``` argument when you create the LLM engine. Consider the quickstart example found in ```examples/pytorch/quickstart.py```:
```python
from tensorrt_llm import LLM, SamplingParams
def main():
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=32)
llm = LLM(model='TinyLlama/TinyLlama-1.1B-Chat-v1.0')
outputs = llm.generate(prompts, sampling_params)
for i, output in enumerate(outputs):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"[{i}] Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == '__main__':
main()
```
This example runs with default KV cache properties. The default for `free_gpu_memory_fraction` is 0.9, which means TensorRT LLM will try to allocate 90% of free GPU memory for KV cache. Depending on your system, this may be too aggressive, so you may decide to dial that back to 0.7. You can do this by adding the following lines to the quickstart example:
```python
from tensorrt_llm.llmapi import KvCacheConfig
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
llm = LLM(model='TinyLlama/TinyLlama-1.1B-Chat-v1.0', kv_cache_config=kv_cache_config)
```
You can also set properties after you create ```KvCacheConfig```. For example:
```python
kv_cache_config = KvCacheConfig()
kv_cache_config.enable_block_reuse = False
llm = LLM(model='TinyLlama/TinyLlama-1.1B-Chat-v1.0', kv_cache_config=kv_cache_config)
```
This code disables block reuse for the quick start example.

View File

@ -0,0 +1,68 @@
# How to Change Block Priorities
You can change block priority by providing the optional ```kv_cache_retention_config``` argument when you submit a request to the LLM engine. Consider the quick start example found in ```examples/pytorch/quickstart.py```:
```python
from tensorrt_llm import LLM, SamplingParams
def main():
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=32)
llm = LLM(model='TinyLlama/TinyLlama-1.1B-Chat-v1.0')
outputs = llm.generate(prompts, sampling_params)
for i, output in enumerate(outputs):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"[{i}] Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == '__main__':
main()
```
The blocks from the prompts are stored for reuse with the default priority of 35 on a scale from 1 to 100, where 100 is highest priority and 1 is lowest priority. Assume you know that the first four tokens of each prompt represent a system prompt that should be stored with high priority (100). You can achieve this by providing a KV cache retention config object when you submit the prompts for generation:
```python
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import KvCacheRetentionConfig
def main():
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
]
sampling_params = SamplingParams(max_tokens=32)
llm = LLM(model='TinyLlama/TinyLlama-1.1B-Chat-v1.0')
# Set priority for first 4 prompt tokens to 100. All other tokens set to default (35) priority.
# This policy never lapses.
tokenRangeRetentionConfig = KvCacheRetentionConfig.TokenRangeRetentionConfig(0, 4, 100, None)
kv_cache_retention_config = KvCacheRetentionConfig(
token_range_retention_configs=[tokenRangeRetentionConfig],
decode_retention_priority=35, # Set generated tokens to default priority
decode_duration_ms=None)
outputs = llm.generate(prompts, sampling_params, kv_cache_retention_config=kv_cache_retention_config)
for i, output in enumerate(outputs):
prompt = output.prompt
generated_text = output.outputs[0].text
print(f"[{i}] Prompt: {prompt!r}, Generated text: {generated_text!r}")
if __name__ == '__main__':
main()
```
This example uses a single ```kv_cache_retention_config``` object for all the prompts. You can also provide a list that must have the same length as the list of prompts.
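For example, continuing the script above (reusing its ```llm```, ```prompts```, and ```sampling_params```), a per-prompt list could be built as in the following sketch; the priorities are arbitrary illustrative values:
```python
from tensorrt_llm.llmapi import KvCacheRetentionConfig

# One retention config per prompt; the list length must match the number of prompts.
# The priorities below are arbitrary values chosen for illustration.
per_prompt_configs = []
for priority in (100, 80, 50, 35):
    token_range = KvCacheRetentionConfig.TokenRangeRetentionConfig(0, 4, priority, None)
    per_prompt_configs.append(
        KvCacheRetentionConfig(
            token_range_retention_configs=[token_range],
            decode_retention_priority=35,
            decode_duration_ms=None))

outputs = llm.generate(prompts, sampling_params,
                       kv_cache_retention_config=per_prompt_configs)
```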

View File

@ -0,0 +1,390 @@
(attention)=
# Multi-Head, Multi-Query, and Group-Query Attention
This document details the implementation of multi-head attention (MHA),
multi-query attention (MQA), and group-query attention (GQA) for autoregressive
models in TensorRT LLM's PyTorch backend.
Multi-head attention involves a sequence of batched matrix multiplications, a softmax operation, and another batched matrix multiplication,
as described in the [Attention Is All You Need](https://arxiv.org/abs/1706.03762) paper.
[Multi-query Attention (MQA)](https://arxiv.org/abs/1911.02150) and [Group-query Attention (GQA)](https://arxiv.org/abs/2307.09288) are
variants of MHA that use fewer KV heads than the number of query heads.
TensorRT LLM provides several implementations using different backends in `tensorrt_llm/_torch/attention_backend/`.
The following sections explain how to use these implementations and provide a brief guide on implementing new backends.
## Attention Backends
There are currently three available attention backends: the vanilla backend, the TRT-LLM backend, and the Flashinfer backend.
You can specify the desired attention backend using `PyTorchConfig.attn_backend`. For instance, to utilize the Flashinfer backend, you can pass `attn_backend="flashinfer"` to the `LLM` constructor as follows: `LLM(attn_backend="flashinfer")`. This will enable the use of the Flashinfer backend for your model.
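For example, a minimal sketch of selecting a backend through the LLM API (the model name is illustrative; any supported checkpoint works):
```python
from tensorrt_llm import LLM

# Select the Flashinfer attention backend for this model.
llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", attn_backend="flashinfer")
```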
The vanilla backend, `VanillaAttention`, is a reference implementation designed primarily for inflight batching and linear KV cache support. While it serves as a useful baseline, it is not recommended for production use due to its limited optimizations.
In contrast, the Flashinfer backend, `FlashInferAttention`, is performance-optimized and supports both inflight batching and paged KV cache. It also includes the following advanced features:
1. **FP8 Quantization**: This feature enables the quantization of inputs and KV cache into FP8 format, significantly reducing memory usage and improving computational throughput.
2. **RoPE Fusion**: By integrating rotary position embedding (RoPE) directly into the attention computation, this feature enhances efficiency and reduces overhead.
The TRT-LLM backend, `TrtllmAttention`, serves as the default backend and supports all the features available in the Flashinfer backend while being further optimized for enhanced performance. It is the recommended choice for production environments. Additionally, it offers the following advanced features:
1. **Fused QKV Input**: It can accept a single QKV tensor as input, which is more efficient compared to using separate Q, K, and V tensors.
2. **FP8 Output**: It supports outputting the attention result in FP8 format, fusing quantization into the attention computation process.
## Implement a New Attention Backend
You can implement a new attention backend to integrate other attention libraries.
An attention backend consists of an `AttentionBackend` class and an `AttentionMetadata` class.
There are three stages in the PyTorch workflow that involve the attention backend:
1. Model construction: During the model's `__init__`, call `AttentionBackend.__init__` to create an attention backend for each layer.
2. Metadata preparation: Before each forward step of the model:
1. If the metadata is uninitialized, call `AttentionMetadata.__init__` to create the attention metadata.
2. If using CUDA graphs, call `AttentionMetadata.create_cuda_graph_metadata` to convert the metadata to CUDA graph metadata, which pre-allocates all tensors and can be used to capture CUDA graphs. Do not re-allocate any tensors stored inside `AttentionMetadata` after the initial warmup run when using CUDA graphs.
3. To prepare parameters of the input and KV cache, call `AttentionMetadata.prepare` to convert from existing metadata and KV cache manager.
3. Single step forward: During the forward pass of each attention layer, call `AttentionBackend.forward` to perform the attention operation. The `AttentionMetadata` will be provided as a forward argument.
### Implement `AttentionMetadata`
The `AttentionMetadata` class stores metadata from the batched input and KV cache for the attention backend.
It contains the following predefined fields:
| Field | Type | Description |
| ----- | ---- | ----------- |
| max_num_requests | int | The max number of requests in a single batch. |
| num_contexts | int | The number of context-phase sequences in the batch. |
| num_generations | int | The number of generation-phase sequences in the batch. |
| max_num_tokens | int | The max number of tokens in all requests in a single batch. |
| num_tokens | int | Number of tokens in the batch. |
| num_ctx_tokens | int | Number of tokens in sequences in the context phase. |
| kv_cache_manager | KVCacheManager | The KV cache manager. |
| is_cuda_graph | bool | Whether CUDA graph is enabled. |
| seq_lens | Tensor | The length of each sequence in the batch. The shape is (batch_size), and located on CPU memory. |
| seq_lens_cuda | Tensor | A copy of `seq_lens` stored on the GPU. |
| context_lens | Tensor | The length of each context-phase sequence in the batch. The shape is (`num_contexts`). |
| position_ids | Optional[Tensor] | The position of each token in each sequence. May be None if positional embedding is applied outside of the backend. |
| request_ids | List[int] | The request ID of each sequence in the batch. |
| prompt_lens | List[int] | The prompt length of each sequence in the batch. |
| kv_cache_params | KVCacheParams | The parameters for the KV cache. |
During `AttentionMetadata.__init__`, you can initialize additional fields for the new attention metadata.
For example, the Flashinfer metadata initializes `decode_wrapper` here.
During `AttentionMetadata.prepare`, the runtime will fill all predefined fields, and you can fill your customized fields according to these predefined fields.
For example, the Flashinfer metadata fills `qo_indptr` by combining `context_lens` and `num_generations` here.
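As a conceptual illustration, a custom metadata class might look like the sketch below. The import path, the `super().prepare()` call, and the `cu_seqlens` field are assumptions made for the example, not the actual Flashinfer implementation:
```python
import torch

# Assumed location of the base class; the backends live under
# tensorrt_llm/_torch/attention_backend/ as noted above.
from tensorrt_llm._torch.attention_backend.interface import AttentionMetadata


class MyAttentionMetadata(AttentionMetadata):
    """Illustrative metadata with one hypothetical backend-specific field."""

    def prepare(self) -> None:
        # Assumption: the base class fills the predefined fields (seq_lens_cuda, ...).
        super().prepare()
        # Derive a custom field from the predefined ones, similar to how the
        # Flashinfer metadata derives qo_indptr from context_lens and num_generations.
        self.cu_seqlens = torch.cumsum(self.seq_lens_cuda, dim=0)
```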
### Implement `AttentionBackend`
The `AttentionBackend` delegates the attention operation to the backend implementation.
Its `__init__` accepts the following arguments:
| Field | Type | Description |
| ----- | ---- | ----------- |
| layer_idx | int | The index of the attention layer in the model. |
| num_heads | int | The number of query heads. |
| head_dim | int | The size of each attention head `(hidden_size // num_heads)`. |
| num_kv_heads | Optional[int] | The number of KV heads. Defaults to num_heads if None. |
| quant_config | QuantConfig | Optional quantization configuration. If None, no quantization is applied. |
| pos_embd_params | PositionalEmbeddingParams | Optional parameters defining how positional embedding should be applied. If None, positional embedding should be applied by the model before calling the backend. Otherwise, the backend is in charge of applying positional embedding and may cache K without embedding it first. |
Its `forward` accepts the following arguments:
| Field | Type | Description |
| ----- | ---- | ----------- |
| q | Tensor | Query tensor with shape `(num_tokens, num_heads * head_dim)`. |
| k | Tensor | Key tensor with shape `(num_tokens, num_kv_heads * head_dim)`. |
| v | Tensor | Value tensor with shape `(num_tokens, num_kv_heads * head_dim)`. |
| metadata | AttentionMetadata | Metadata for the attention operation. |
| attention_mask | AttentionMask | Optional attention mask. If None, causal mask is applied. |
For example, the Flashinfer backend calls `append_paged_kv_cache` and then `wrapper.run` to perform the attention operation here.
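Continuing the sketch above, a minimal backend could delegate to PyTorch's scaled dot-product attention. This sketch ignores the KV cache and assumes `num_kv_heads == num_heads`; the `Metadata` attribute and the stored `num_heads`/`head_dim` fields are assumptions about the base class:
```python
import torch.nn.functional as F

from tensorrt_llm._torch.attention_backend.interface import AttentionBackend  # assumed path


class MyAttentionBackend(AttentionBackend):
    """Illustrative backend: plain SDPA, no KV-cache handling, MHA only."""

    Metadata = MyAttentionMetadata  # assumption: backends advertise their metadata class

    def forward(self, q, k, v, metadata, attention_mask=None, **kwargs):
        # q/k/v arrive packed as (num_tokens, num_heads * head_dim); reshape for SDPA.
        def split(x):
            return x.view(1, -1, self.num_heads, self.head_dim).transpose(1, 2)

        out = F.scaled_dot_product_attention(
            split(q), split(k), split(v),
            is_causal=attention_mask is None)  # None means a causal mask is applied
        return out.transpose(1, 2).reshape(q.shape[0], -1)
```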
## The Features of the `TrtllmAttention` Backend
The following sections introduce some features of the default `TrtllmAttention` backend.
### Packed Tensors
In the `TrtllmAttention` backend, the attention operator supports packed (i.e., non-padded) QKV inputs.
A naive layout for the QKV inputs is to pad the sequences that are shorter than
`max_sequence_length` to the maximum length. This may result in excessive memory
consumption as well as unneeded computations on padding tokens (in the various
matrix multiplications that surround the MHA block).
To overcome that problem, TensorRT LLM supports a mode without padding where
the different tokens are packed together and the user provides the operator
with a 1D tensor containing the lengths of the different sequences.
### Context and Generation Phases
The `TrtllmAttention` backend encapsulates different implementations for both
context and generation phases into a single custom torch op.
#### Context Phase
A context-phase implementation without optimization maps to a sequence of GPU kernels that will store the
intermediate `Q*K^T` tensor in memory before calling the softmax operator. It
is the slowest method, and the memory footprint is significant (it grows quadratically with the sequence length).
The `TrtllmAttention` backend will trigger a kernel that performs the MHA/MQA block
using a single kernel instead. For short sequences, that kernel uses a vanilla
implementation of MHA/MQA. For larger sequences, this kernel uses the Flash
Attention algorithm as described in
[FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135)
and
[FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning](https://arxiv.org/abs/2307.08691).
Currently, the implementation triggers extra kernels that apply pre-processing
to the elements (like RoPE) and populate the KV cache (see below). In a future
release, the number of such kernels may be reduced to improve the overall performance.
#### FP8 Context FMHA
When FP8 quantization is activated, the attention can be further accelerated by
enabling FP8 Context FMHA.
FP8 Paged Context FMHA is also supported with the fp8 quantization workflow.
You need to specify `use_paged_context_fmha = True` for the attention operator.
Please be aware that this feature is only supported on Ada, Hopper and above.
#### Generation Phase
The generation phase is implemented using a single kernel called the masked
multi-head attention in TensorRT LLM. That kernel is able to apply
pre-processing on the Q, K, and V elements on-the-fly: it adds the QKV bias, applies
RoPE, and performs dequantization and quantization. TensorRT LLM will continue to add (or
enable) additional features in future releases, such as enabling support for IA3.
The masked MHA kernel has a special version that distributes the work across
multiple CUDA thread-blocks on the GPU for cases where the GPU occupancy is
low. That mode, called multi-block, is always enabled.
NVIDIA recommends that users test that mode in scenarios where both the batch
size and the number of heads in the model are relatively small.
The definition of 'small' in that context is hard to quantify because it depends on the GPU model.
However, NVIDIA currently recommends testing that mode when `batch_size * num_heads` is less than the number of multi-processors on the GPU.
This guidance may be subject to change in the future.
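As a rough sketch of that guidance (not part of TensorRT LLM), you could compare `batch_size * num_heads` against the GPU's multiprocessor count before deciding whether the multi-block path is worth benchmarking:
```python
import torch


def worth_testing_multi_block(batch_size: int, num_heads: int, device: int = 0) -> bool:
    """Heuristic from the guidance above: consider testing multi-block mode when
    batch_size * num_heads is below the GPU's streaming multiprocessor count."""
    sm_count = torch.cuda.get_device_properties(device).multi_processor_count
    return batch_size * num_heads < sm_count


# Example: a batch of 4 requests on a model with 32 query heads.
print(worth_testing_multi_block(batch_size=4, num_heads=32))
```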
Note that even if the multi-block mode is enabled, the attention operator will
not immediately trigger the multi-block version of the GPU kernel. There is a
minimum number of tokens (input + generated) that are required for the
multi-block version to become more efficient than the "vanilla" implementation
that uses a single CUDA thread-block per head. It is controlled by an internal
heuristic.
Another note is that because the masked MHA kernels use an amount of shared memory
proportional to the sequence length, there can be cases where the GPU's shared
memory is insufficient when multi-block mode is not enabled. To get the masked MHA kernel to work in those cases, multi-block mode is forced on and a warning message is printed in the log.
#### XQA Optimization
XQA optimization is another optimization for MQA/GQA in the generation phase.
It currently only supports a limited number of model configurations, such as the LLAMA2 70B model.
Support matrix of the XQA optimization:
- FP16 / BF16 compute data type.
- FP16 / BF16 / FP8 / INT8 KV cache data type.
- Paged KV cache (8 / 16 / 32 / 64 / 128 tokens per block).
By default, this optimization is enabled. Note that a heuristic algorithm
is also used to decide whether to use the XQA kernel or the masked MHA kernel for
better performance.
If you want to use the XQA kernel whenever possible, set `TRTLLM_FORCE_XQA=1` to force its use when the model configuration is supported.
Supported configurations can be found using the `shouldUse` function of the `DecoderXQARunner` class in
`cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h`.
(inflight-batching)=
### In-flight Batching
TensorRT LLM supports in-flight batching of requests (also known as continuous
batching or iteration-level batching) for higher serving throughput. With this feature,
sequences in the context phase can be processed together with sequences in
the generation phase. The purpose of that technique is to better interleave
requests to reduce latency as well as make better use of the GPUs.
For efficiency reasons (1), the support for inflight batching ***requires the
input tensors to be packed (no padding)***.
***In the current implementation, the sequences that are going through the
context phase must be before the sequences in the generation phase in the input
tensor. For example, for sequences `S0`, `S1` and `S2`, if `S0` and `S2` are in
context phase (and `S1` in generation), tokens from `S0` and `S2` must appear
before the tokens of `S1` in the input tensor***.
_(1) Padding sequences in the generation phase, that contain a single token, to
the length of the maximum input sequence is inefficient use of resources_.
### Chunked Context
Originally, the common behavior was to process all context tokens at
once. This feature splits the context into several chunks. In this way, the
context chunks can be batched with more tokens during the generation phase,
which is expected to increase the total throughput. Chunking contexts also removes
constraints on input length. Except for the last chunk, the size of each context chunk
must be an integer multiple of the KV cache block size.
> To enable this feature, the FMHA paged kv-cache also needs to be enabled.
### KV Cache
In the generation phase, a common optimization is to provide the MHA kernel
with a cache containing the values of the past K and V elements that have
already been computed. That cache is known as the KV cache. TensorRT LLM uses
that technique to accelerate its generation phase. In TensorRT LLM, there is
one KV cache per Transformer layer, which means that there are as many KV
caches as layers in a model. The current version of TensorRT LLM supports two
different types of KV caches: **contiguous** and **paged** KV caches.
#### Contiguous KV Cache
The contiguous KV cache is a monolithic tensor. Its shape is:
```
[max_batch_size * max_beam_width, 2, num_heads, max_seqlen, hidden_dim_per_head].
```
That implementation uses a lot more memory than needed when the sequences are
shorter than the maximum sequence length (even if they end up close to the
limit after the generation of many output tokens, it may take a lot of steps to
reach that point).
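To illustrate the scale, the following sketch computes the memory a contiguous FP16 cache would occupy for some illustrative model and runtime sizes (all numbers are assumptions for the example, not recommendations):
```python
# Illustrative sizes only.
max_batch_size, max_beam_width = 8, 1
num_layers, num_heads, hidden_dim_per_head = 32, 32, 128
max_seqlen = 4096
bytes_per_elem = 2  # FP16

# One contiguous cache per layer, shaped
# [max_batch_size * max_beam_width, 2, num_heads, max_seqlen, hidden_dim_per_head].
elems_per_layer = (max_batch_size * max_beam_width) * 2 * num_heads * max_seqlen * hidden_dim_per_head
total_gib = num_layers * elems_per_layer * bytes_per_elem / 1024**3
print(f"Contiguous KV cache: {total_gib:.1f} GiB")  # reserved even if sequences stay short
```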
#### Paged KV Cache
The paged KV cache decomposes the KV cache into blocks that are distributed to
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocates new blocks from a pool and recycles those
blocks when required. See the implementation of
[`KVCacheManager`](source:tensorrt_llm/_torch/pyexecutor/resource_manager.py).
#### INT8/FP8 KV Caches
In its current implementation, even if the rest of the network runs in INT8 or
FP8, the attention operator works with FP32, FP16, and BFloat16 inputs and
outputs. However, TensorRT LLM supports INT8 and FP8
(`QuantMode.INT8_KV_CACHE` and
`QuantMode.FP8_KV_CACHE`) KV caches.
The attention operator populates the KV cache. When INT8 or FP8 KV caches
are enabled, the input values have to be quantized to 8 bits using a scaling
factor. For quantization, the scaling factor is stored in the
`kv_cache_scaling_factor` tensor. Its shape is `[1]` and only per-tensor
quantization is supported in the current version. Quantization uses the inverse scale,
since the kernel multiplies as `fp_value * (1.0 / kv_cache_scaling_factor)`.
During generation, the values read from the cache are dequantized on-the-fly in
the MHA/MQA kernel. Dequantization is defined as
`quantized_value * kv_cache_scaling_factor`.
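In scalar form, the quantize/dequantize round trip described above looks like the following numerical sketch (INT8-style rounding is used for illustration; the real work happens inside the attention kernels):
```python
import torch

kv_cache_scaling_factor = torch.tensor([0.05])  # per-tensor scale, shape [1]
fp_value = torch.tensor([1.7, -0.3, 0.02])

# Quantization multiplies by the inverse scale before rounding to 8 bits.
quantized = torch.clamp(torch.round(fp_value * (1.0 / kv_cache_scaling_factor)), -128, 127)
# Dequantization on the read path multiplies by the scale again.
dequantized = quantized * kv_cache_scaling_factor
print(quantized.tolist(), dequantized.tolist())
```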
### Sliding Window Attention, Cyclic (Rolling Buffer) KV Cache
TensorRT LLM has a feature called `Cyclic KV Cache`, which treats the kv cache
as a circular buffer. This means that it only stores the kv cache for the last N
tokens, where N is determined by the `attention_window_size` parameter in
`TrtllmAttention.forward`. When the cache is full, the KV cache of new tokens
overwrites the "least recently used" entries.
In the context phase, if the input length surpasses the `attention_window_size`,
`Sliding Window Attention` is activated; the `attention_window_size` parameter serves
the same function as the sliding window size.
This feature helps to reduce the memory footprint of the kv cache when
dealing with very long sequences.
_Note that the cyclic KV cache feature doesn't currently work with beam search, as
the context KV cache is shared across beams._
### StreamingLLM
The StreamingLLM feature uses window attention to perform efficient and stable language modeling
on long texts, which means that only `N` tokens need to be stored in the KV cache.
Similar to the cyclic KV cache feature in TensorRT LLM, `attention_window_size`
parameter is used to determine `N`. Different from the cyclic KV cache feature,
the first `S` tokens, called sink tokens, are always kept in the attention window,
where `S` is determined by `sink_token_length` parameter.
In the context phase, however, the self-attention is dense in the official implementation of
StreamingLLM: it uses all of the tokens for computation and only saves `N` tokens
to the KV cache.
In addition, the relative position embedding is also changed in StreamingLLM.
When determining the relative distance and adding positional information to tokens,
StreamingLLM uses the positions within the cache rather than those in the original text.
`sink_token_length` is also used to enable this feature.
### Beam-Search
The attention operator supports beam-search. In the context phase, a single
beam is computed per input sequence. In the generation phase, the MHA/MQA/GQA
kernel uses an additional tensor to reconstruct the correct path for each beam.
That tensor is called the `cache_indirection`. Its shape is `[batch_size,
beam_width, max_seqlen]`.
For a sequence `si`, a beam `bi` and a token `ti`, the element
`cache_indirection[si][bi][ti]` is an integer between `0` and `beam_width-1`
that indicates which path in the beam to read the K and V elements from in the
KV cache. This tensor is populated in the sampling stage.
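Conceptually, reading the K elements for token `ti` of beam `bi` in sequence `si` follows the indirection below. The cache layout here is an illustrative stand-in, not the actual kernel's data structure:
```python
import torch

batch_size, beam_width, max_seqlen = 2, 3, 16
num_heads, head_dim = 4, 8

# Hypothetical per-beam K cache and the cache_indirection tensor filled by sampling.
k_cache = torch.randn(batch_size, beam_width, max_seqlen, num_heads, head_dim)
cache_indirection = torch.randint(0, beam_width, (batch_size, beam_width, max_seqlen))

si, bi, ti = 0, 1, 5
src_beam = cache_indirection[si, bi, ti]  # which beam's path to read K/V from
k_elem = k_cache[si, src_beam, ti]        # K elements for this token
```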
### Input QKV tensor
The input QKV tensor packs the Q, K and V tensors (concatenated along the last
dimension) after the projection of the hidden states. It is a 3D tensor. RoPE
and quantization to INT8 or FP8 (when needed) are performed by the GPT
attention operator.
In packed mode, its shape is `[num_tokens, 3 * hidden_dim]` where
`num_tokens` is the total number of tokens in the batch. For the sequences in
context phase, the number of tokens of a sequence corresponds to its input
length (even if the beam width is greater than `1` for beam search). For the
sequences in generation phase, there are `beam_width` tokens per sequence. The
beam width can be different for each sequence.
The following pseudo code explains how the number of tokens is computed:
```python
num_tokens = 0
# Add the length of each sequence in context phase.
for seq in context_phase:
num_tokens += seq.length
# Add the width of the beam for each sequence in generation phase.
for seq in generation_phase:
num_tokens += seq.beam_width
```
### Rotary Positional Embedding (RoPE)
The attention operator can perform the computation of the Rotary
Positional Embedding (RoPE). When that operation is enabled (`rotary_embedding_dim`
is set to a value greater than 0), it is fused with other
operations. The operator supports the GPT-NeoX and GPT-J forms of RoPE by
setting `position_embedding_type` to `PositionEmbeddingType.rope_gpt_neox`
or `PositionEmbeddingType.rope_gptj`.
### ALiBi
The attention operator can apply ALiBi to the result of the `Q*K^T`
product. The bias is computed on-the-fly from the ALiBi slopes in the optimized
kernel.
### Scaling factor(s)
In MHA, the output of the `Q*K^T` product is scaled by a constant value that
is computed as:
```
norm_factor = 1.f / (q_scaling * sqrt(head_size)).
```
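For instance, with `q_scaling = 1.0` and a head size of 128 (illustrative values), the formula above gives:
```python
import math

q_scaling, head_size = 1.0, 128
norm_factor = 1.0 / (q_scaling * math.sqrt(head_size))
print(norm_factor)  # ~0.0884
```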
### Cross Attention
In addition to MHA as the self-attention needed by GPT-style decoder-only models, the attention operator also supports cross-attention.
This enables the attention operator to be used more broadly as a generic decoder component. For example, the Encoder-Decoder model uses it to issue both the self-attention and cross-attention modules in its decoder.

View File

@ -0,0 +1,93 @@
# Benchmarking with trtllm-bench
AutoDeploy is integrated with the `trtllm-bench` performance benchmarking utility, enabling you to measure comprehensive performance metrics such as token throughput, request throughput, and latency for your AutoDeploy-optimized models.
## Getting Started
Before benchmarking with AutoDeploy, review the [TensorRT-LLM benchmarking guide](../../performance/perf-benchmarking.md#running-with-the-pytorch-workflow) to familiarize yourself with the standard trtllm-bench workflow and best practices.
## Basic Usage
Invoke the AutoDeploy backend by specifying `--backend _autodeploy` in your `trtllm-bench` command:
```bash
trtllm-bench \
--model meta-llama/Llama-3.1-8B \
throughput \
--dataset /tmp/synthetic_128_128.txt \
--backend _autodeploy
```
```{note}
As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench build` step. The model is automatically optimized during benchmark initialization.
```
## Advanced Configuration
For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file:
```bash
trtllm-bench \
--model meta-llama/Llama-3.1-8B \
throughput \
--dataset /tmp/synthetic_128_128.txt \
--backend _autodeploy \
--extra_llm_api_options autodeploy_config.yaml
```
### Configuration Examples
#### Basic Performance Configuration (`autodeploy_config.yaml`)
```yaml
# Compilation backend
compile_backend: torch-opt
# Runtime engine
runtime: trtllm
# Model loading
skip_loading_weights: false
# Fraction of free memory to use for kv-caches
free_mem_ratio: 0.8
# CUDA Graph optimization
cuda_graph_batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256]
# Attention backend
attn_backend: flashinfer
# Sequence configuration
max_batch_size: 256
```
Enable multi-GPU execution by specifying `--tp n`, where `n` is the number of GPUs.
## Configuration Options Reference
### Core Performance Settings
| Parameter | Default | Description |
|-----------|---------|-------------|
| `compile_backend` | `torch-compile` | Compilation backend: `torch-simple`, `torch-compile`, `torch-cudagraph`, `torch-opt` |
| `runtime` | `trtllm` | Runtime engine: `trtllm`, `demollm` |
| `free_mem_ratio` | `0.0` | Fraction of available GPU memory for KV cache (0.0-1.0) |
| `skip_loading_weights` | `false` | Skip weight loading for architecture-only benchmarks |
### CUDA Graph Optimization
| Parameter | Default | Description |
|-----------|---------|-------------|
| `cuda_graph_batch_sizes` | `null` | List of batch sizes for CUDA graph creation |
```{tip}
For optimal CUDA graph performance, specify batch sizes that match your expected workload patterns. For example: `[1, 2, 4, 8, 16, 32, 64, 128]`
```
## Performance Optimization Tips
1. **Memory Management**: Set `free_mem_ratio` to 0.8-0.9 for optimal KV cache utilization
1. **Compilation Backend**: Use `torch-opt` for production workloads
1. **Attention Backend**: `flashinfer` generally provides the best performance for most models
1. **CUDA Graphs**: Enable CUDA graphs for batch sizes that match your production traffic patterns.

View File

@ -0,0 +1,49 @@
# Example Run Script
To build and run AutoDeploy example, use the `examples/auto_deploy/build_and_run_ad.py` script:
```bash
cd examples/auto_deploy
python build_and_run_ad.py --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
```
You can configure your experiment with various options. Use the `-h/--help` flag to see available options:
```bash
python build_and_run_ad.py --help
```
The following is a non-exhaustive list of common configuration options:
| Configuration Key | Description |
|-------------------|-------------|
| `--model` | The HF model card or path to a HF checkpoint folder |
| `--args.model-factory` | Choose model factory implementation (`"AutoModelForCausalLM"`, ...) |
| `--args.skip-loading-weights` | Only load the architecture, not the weights |
| `--args.model-kwargs` | Extra kwargs that are being passed to the model initializer in the model factory |
| `--args.tokenizer-kwargs` | Extra kwargs that are being passed to the tokenizer initializer in the model factory |
| `--args.world-size` | The number of GPUs used for auto-sharding the model |
| `--args.runtime` | Specifies which type of Engine to use during runtime (`"demollm"` or `"trtllm"`) |
| `--args.compile-backend` | Specifies how to compile the graph at the end |
| `--args.attn-backend` | Specifies kernel implementation for attention |
| `--args.mla-backend` | Specifies implementation for multi-head latent attention |
| `--args.max-seq-len` | Maximum sequence length for inference/cache |
| `--args.max-batch-size` | Maximum dimension for statically allocated KV cache |
| `--args.attn-page-size` | Page size for attention |
| `--prompt.batch-size` | Number of queries to generate |
| `--benchmark.enabled` | Whether to run the built-in benchmark (true/false) |
For default values and additional configuration options, refer to the `ExperimentConfig` class in `examples/auto_deploy/build_and_run_ad.py` file.
The following is a more complete example of using the script:
```bash
cd examples/auto_deploy
python build_and_run_ad.py \
--model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
--args.world-size 2 \
--args.runtime "demollm" \
--args.compile-backend "torch-compile" \
--args.attn-backend "flashinfer" \
--benchmark.enabled True
```

View File

@ -0,0 +1,178 @@
# Expert Configuration of LLM API
For advanced TensorRT-LLM users, the full set of `tensorrt_llm._torch.auto_deploy.llm_args.LlmArgs` is exposed. Use at your own risk. The argument list may diverge from the standard TRT-LLM argument list.
- All configuration fields used by the AutoDeploy core pipeline, `InferenceOptimizer`, are exposed exclusively in `AutoDeployConfig` in `tensorrt_llm._torch.auto_deploy.llm_args`.
Please make sure to refer to those first.
- For advanced users, the full set of `LlmArgs` in `tensorrt_llm._torch.auto_deploy.llm_args` can be used to configure the AutoDeploy `LLM` API, including runtime options.
- Note that some fields in the full `LlmArgs`
object are overlapping, duplicated, and/or _ignored_ in AutoDeploy, particularly arguments
pertaining to configuring the model itself since AutoDeploy's model ingestion+optimize pipeline
significantly differs from the default manual workflow in TensorRT-LLM.
- However, with proper care, the full `LlmArgs`
object can be used to configure advanced runtime options in TensorRT-LLM.
- Any valid field can simply be provided as a keyword argument (`**kwargs`) to the AutoDeploy `LLM` API.
# Expert Configuration of `build_and_run_ad.py`
For advanced users, `build_and_run_ad.py` provides advanced configuration capabilities using a flexible argument parser powered by Pydantic Settings and OmegaConf. You can use dot notation for CLI arguments, provide multiple YAML configuration files, and utilize sophisticated configuration precedence rules to create complex deployment configurations.
## CLI Arguments with Dot Notation
The script supports flexible CLI argument parsing using dot notation to modify nested configurations dynamically. You can target any field in both the `ExperimentConfig` in `examples/auto_deploy/build_and_run_ad.py` and nested `AutoDeployConfig` or `LlmArgs` objects in `tensorrt_llm._torch.auto_deploy.llm_args`:
```bash
# Configure model parameters
# NOTE: config values like num_hidden_layers are automatically resolved into the appropriate nested
# dict value ``{"args": {"model_kwargs": {"num_hidden_layers": 10}}}`` although not explicitly
# specified as CLI arg
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
--args.model-kwargs.num-hidden-layers=10 \
--args.model-kwargs.hidden-size=2048 \
--args.tokenizer-kwargs.padding-side=left
# Configure runtime and backend options
python build_and_run_ad.py \
--model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
--args.world-size=2 \
--args.compile-backend=torch-opt \
--args.attn-backend=flashinfer
# Configure prompting and benchmarking
python build_and_run_ad.py \
--model "microsoft/phi-4" \
--prompt.batch-size=4 \
--prompt.sp-kwargs.max-tokens=200 \
--prompt.sp-kwargs.temperature=0.7 \
--benchmark.enabled=true \
--benchmark.bs=8 \
--benchmark.isl=1024
```
## YAML Configuration Files
Both `ExperimentConfig` and `AutoDeployConfig`/`LlmArgs` inherit from `DynamicYamlMixInForSettings`, which enables you to provide multiple YAML configuration files that are automatically deep-merged at runtime.
Create a YAML configuration file (e.g., `my_config.yaml`):
```yaml
# my_config.yaml
args:
model_kwargs:
num_hidden_layers: 12
hidden_size: 1024
world_size: 4
compile_backend: torch-compile
attn_backend: triton
max_seq_len: 2048
max_batch_size: 16
transforms:
sharding:
strategy: auto
quantization:
enabled: false
prompt:
batch_size: 8
sp_kwargs:
max_tokens: 150
temperature: 0.8
top_k: 50
benchmark:
enabled: true
num: 20
bs: 4
isl: 1024
osl: 256
```
Create an additional override file (e.g., `production.yaml`):
```yaml
# production.yaml
args:
world_size: 8
compile_backend: torch-opt
max_batch_size: 32
benchmark:
enabled: false
```
Then use these configurations:
```bash
# Using single YAML config
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
--yaml-configs my_config.yaml
# Using multiple YAML configs (deep merged in order, later files have higher priority)
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
--yaml-configs my_config.yaml production.yaml
# Targeting nested AutoDeployConfig with separate YAML
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
--yaml-configs my_config.yaml \
--args.yaml-configs autodeploy_overrides.yaml
```
## Configuration Precedence and Deep Merging
The configuration system follows a precedence order in which higher priority sources override lower priority ones:
1. **CLI Arguments** (highest priority) - Direct command line arguments
1. **YAML Configs** - Files specified via `--yaml-configs` and `--args.yaml-configs`
1. **Default Settings** (lowest priority) - Built-in defaults from the config classes
**Deep Merging**: Unlike simple overwriting, deep merging recursively combines nested dictionaries. For example:
```yaml
# Base config
args:
model_kwargs:
num_hidden_layers: 10
hidden_size: 1024
max_seq_len: 2048
```
```yaml
# Override config
args:
model_kwargs:
hidden_size: 2048 # This will override
# num_hidden_layers: 10 remains unchanged
world_size: 4 # This gets added
```
**Nested Config Behavior**: When using nested configurations, outer YAML configuration files become initialization settings for inner objects, giving them higher precedence:
```bash
# The outer yaml-configs affects the entire ExperimentConfig
# The inner args.yaml-configs affects only the AutoDeployConfig
python build_and_run_ad.py \
--model "meta-llama/Meta-Llama-3.1-8B-Instruct" \
--yaml-configs experiment_config.yaml \
--args.yaml-configs autodeploy_config.yaml \
--args.world-size=8 # CLI override beats both YAML configs
```
## Built-in Default Configuration
Both `AutoDeployConfig` and `LlmArgs` classes automatically load a built-in `default.yaml` configuration file that provides defaults for the AutoDeploy inference optimizer pipeline. This file is specified in the `_get_config_dict()` function in `tensorrt_llm._torch.auto_deploy.llm_args` and defines default transform configurations for graph optimization stages.
The built-in defaults are automatically merged with your configurations at the lowest priority level, ensuring that your custom settings always override the defaults. You can inspect the current default configuration to understand the baseline transform pipeline:
```bash
# View the default configuration
cat tensorrt_llm/_torch/auto_deploy/config/default.yaml
# Override specific transform settings
python build_and_run_ad.py \
--model "TinyLlama/TinyLlama-1.1B-Chat-v1.0" \
--args.transforms.export-to-gm.strict=true
```

View File

@ -0,0 +1,14 @@
# Logging Level
Use the following env variable to specify the logging level of our built-in logger, ordered by
decreasing verbosity:
```bash
AUTO_DEPLOY_LOG_LEVEL=DEBUG
AUTO_DEPLOY_LOG_LEVEL=INFO
AUTO_DEPLOY_LOG_LEVEL=WARNING
AUTO_DEPLOY_LOG_LEVEL=ERROR
AUTO_DEPLOY_LOG_LEVEL=INTERNAL_ERROR
```
The default log level is `INFO`.

View File

@ -0,0 +1,30 @@
### Incorporating `auto_deploy` into your own workflow
AutoDeploy can be seamlessly integrated into existing workflows using TRT-LLM's LLM high-level API. This section provides an example for configuring and invoking AutoDeploy in custom applications.
The following example demonstrates how to build an LLM object with AutoDeploy integration:
```python
from tensorrt_llm._torch.auto_deploy import LLM
# Construct the LLM high-level interface object with autodeploy as backend
llm = LLM(
model=<HF_MODEL_CARD_OR_DIR>,
world_size=<DESIRED_WORLD_SIZE>,
compile_backend="torch-compile",
model_kwargs={"num_hidden_layers": 2}, # test with smaller model configuration
attn_backend="flashinfer", # choose between "triton" and "flashinfer"
attn_page_size=64, # page size for attention (tokens_per_block, should be == max_seq_len for triton)
skip_loading_weights=False,
model_factory="AutoModelForCausalLM", # choose appropriate model factory
mla_backend="MultiHeadLatentAttention", # for models that support MLA
free_mem_ratio=0.8, # fraction of available memory for cache
simple_shard_only=False, # tensor parallelism sharding strategy
max_seq_len=<MAX_SEQ_LEN>,
max_batch_size=<MAX_BATCH_SIZE>,
)
```
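Once constructed, the `llm` object can be used like any other LLM API object; the following minimal sketch (prompt and sampling values are illustrative) generates a completion:
```python
from tensorrt_llm.sampling_params import SamplingParams

# Generate a completion with the AutoDeploy-backed LLM object
sampling_params = SamplingParams(max_tokens=32)
outputs = llm.generate(["What is the capital of France?"], sampling_params)
print(outputs[0].outputs[0].text)
```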
For more information about configuring AutoDeploy via the `LLM` API using `**kwargs`, see the AutoDeploy LLM API in `tensorrt_llm._torch.auto_deploy.llm` and the `AutoDeployConfig` class in `tensorrt_llm._torch.auto_deploy.llm_args`.

View File

@ -0,0 +1,80 @@
# AutoDeploy (Prototype)
```{note}
This project is under active development and currently in a prototype stage. The code is subject to change and may include backward-incompatible updates. While we strive for correctness, there are no guarantees regarding functionality, stability, or reliability.
```
## Seamless Model Deployment from PyTorch to TensorRT LLM
AutoDeploy is a prototype designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT LLM.
![AutoDeploy overview](../../media/ad_overview.png)
<sub><em>AutoDeploy overview and relation with TensorRT LLM's LLM API</em></sub>
AutoDeploy provides an alternative method for deploying models using the LLM API without requiring code changes to the source model (for example, Hugging Face Transformers models) or manual implementation of inference optimizations, such as KV-caches, multi-GPU parallelism, or quantization. Instead, AutoDeploy extracts a computation graph from the source model and applies inference optimizations through a series of automated graph transformations. AutoDeploy generates an inference-optimized graph that can be directly executed in the TensorRT LLM PyTorch runtime and leverages various runtime optimizations including in-flight batching, paging, and overlap scheduling.
## Key Features
- **Seamless Model Translation:** Automatically converts PyTorch/Hugging Face models to TensorRT LLM without manual rewrites.
- **Unified Model Definition:** Maintain a single source of truth with your original PyTorch/Hugging Face model.
- **Optimized Inference:** Built-in transformations for sharding, quantization, KV-cache integration, MHA fusion, and CudaGraph optimization.
- **Immediate Deployment:** Day-0 support for models with continuous performance enhancements.
- **Quick Setup & Prototyping:** Lightweight pip package for easy installation with a demo environment for fast testing.
## Get Started
1. **Install AutoDeploy:**
AutoDeploy is included with the TRT-LLM installation.
```bash
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
```
You can refer to [TRT-LLM installation guide](../../installation/linux.md) for more information.
2. **Run Llama Example:**
You are now ready to run an in-framework Llama demo.
The general entry point for running the AutoDeploy demo is the `build_and_run_ad.py` script. Checkpoints are loaded directly from Hugging Face (HF) or a local HF-like directory:
```bash
cd examples/auto_deploy
python build_and_run_ad.py --model "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
```
## Support Matrix
AutoDeploy streamlines the model deployment process through an automated workflow designed for efficiency and performance. The workflow begins with a PyTorch model, which is exported using `torch.export` to generate a standard Torch graph. This graph contains core PyTorch ATen operations alongside custom attention operations, determined by the attention backend specified in the configuration.
The exported graph then undergoes a series of automated transformations, including graph sharding, KV-cache insertion, and GEMM fusion, to optimize model performance. After these transformations, the graph is compiled using one of the supported compile backends (like `torch-opt`), followed by deploying it via the TensorRT LLM runtime.
- [Support Matrix](support_matrix.md)
## Advanced Usage
- [Example Run Script](./advanced/example_run.md)
- [Logging Level](./advanced/logging.md)
- [Incorporating AutoDeploy into Your Own Workflow](./advanced/workflow.md)
- [Expert Configurations](./advanced/expert_configurations.md)
- [Performance Benchmarking](./advanced/benchmarking_with_trtllm_bench.md)
## Roadmap
We are actively expanding AutoDeploy to support a broader range of model architectures and inference features.
**Upcoming Model Support:**
- Vision-Language Models (VLMs)
- Structured State Space Models (SSMs) and Linear Attention architectures
**Planned Features:**
- Low-Rank Adaptation (LoRA)
- Speculative Decoding for accelerated generation
To track development progress and contribute, visit our [GitHub Project Board](https://github.com/orgs/NVIDIA/projects/83/views/13).
We welcome community contributions; see `examples/auto_deploy/CONTRIBUTING.md` for guidelines.

View File

@ -0,0 +1,127 @@
## Support Matrix
AutoDeploy streamlines model deployment with an automated workflow designed for efficiency and performance. The workflow begins with a PyTorch model, which is exported using `torch.export` to generate a standard Torch graph. This graph contains core PyTorch ATen operations alongside custom attention operations, determined by the attention backend specified in the configuration.
The exported graph then undergoes a series of automated transformations, including graph sharding, KV-cache insertion, and GEMM fusion, to optimize model performance. After these transformations, the graph is compiled using one of the supported compile backends (like `torch-opt`), followed by deploying it via the TRT-LLM runtime.
### Supported Models
**Bring Your Own Model**: AutoDeploy leverages `torch.export` and dynamic graph pattern matching, enabling seamless integration for a wide variety of models without relying on hard-coded architectures.
AutoDeploy supports Hugging Face models compatible with `AutoModelForCausalLM` and `AutoModelForImageTextToText`.
In addition, the following models have been officially validated using the default configuration: `runtime=trtllm`, `compile_backend=torch-compile`, and `attn_backend=flashinfer`.
<details>
<summary>Click to expand supported models list</summary>
- Qwen/QwQ-32B
- Qwen/Qwen2.5-0.5B-Instruct
- Qwen/Qwen2.5-1.5B-Instruct
- Qwen/Qwen2.5-3B-Instruct
- Qwen/Qwen2.5-7B-Instruct
- Qwen/Qwen3-0.6B
- Qwen/Qwen3-235B-A22B
- Qwen/Qwen3-30B-A3B
- Qwen/Qwen3-4B
- Qwen/Qwen3-8B
- TinyLlama/TinyLlama-1.1B-Chat-v1.0
- apple/OpenELM-1_1B-Instruct
- apple/OpenELM-270M-Instruct
- apple/OpenELM-3B-Instruct
- apple/OpenELM-450M-Instruct
- bigcode/starcoder2-15b-instruct-v0.1
- bigcode/starcoder2-7b
- deepseek-ai/DeepSeek-Prover-V1.5-SFT
- deepseek-ai/DeepSeek-Prover-V2-7B
- deepseek-ai/DeepSeek-R1-Distill-Llama-70B
- deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B
- deepseek-ai/DeepSeek-R1-Distill-Qwen-32B
- google/codegemma-7b-it
- google/gemma-1.1-7b-it
- google/gemma-2-27b-it
- google/gemma-2-2b-it
- google/gemma-2-9b-it
- google/gemma-2b
- google/gemma-3-1b-it
- ibm-granite/granite-3.1-2b-instruct
- ibm-granite/granite-3.1-8b-instruct
- ibm-granite/granite-3.3-2b-instruct
- ibm-granite/granite-3.3-8b-instruct
- ibm-granite/granite-guardian-3.1-2b
- ibm-granite/granite-guardian-3.2-5b
- meta-llama/CodeLlama-34b-Instruct-hf
- meta-llama/CodeLlama-7b-Instruct-hf
- meta-llama/CodeLlama-7b-Python-hf
- meta-llama/Llama-2-13b-chat-hf
- meta-llama/Llama-2-7b-chat-hf
- meta-llama/Llama-3.1-8B-Instruct
- meta-llama/Llama-3.2-1B-Instruct
- meta-llama/Llama-3.2-3B-Instruct
- meta-llama/Llama-3.3-70B-Instruct
- meta-llama/Llama-4-Maverick-17B-128E-Instruct
- meta-llama/Llama-4-Scout-17B-16E-Instruct
- microsoft/Phi-3-medium-128k-instruct
- microsoft/Phi-3-medium-4k-instruct
- microsoft/Phi-4-mini-instruct
- microsoft/Phi-4-mini-reasoning
- microsoft/Phi-4-reasoning
- microsoft/Phi-4-reasoning-plus
- microsoft/phi-4
- mistralai/Codestral-22B-v0.1
- mistralai/Mistral-7B-Instruct-v0.2
- mistralai/Mistral-7B-Instruct-v0.3
- mistralai/Mixtral-8x22B-Instruct-v0.1
- nvidia/Llama-3.1-405B-Instruct-FP8
- nvidia/Llama-3.1-70B-Instruct-FP8
- nvidia/Llama-3.1-8B-Instruct-FP8
- nvidia/Llama-3.1-Minitron-4B-Depth-Base
- nvidia/Llama-3.1-Minitron-4B-Width-Base
- nvidia/Llama-3.1-Nemotron-70B-Instruct-HF
- nvidia/Llama-3.1-Nemotron-Nano-8B-v1
- nvidia/Llama-3_1-Nemotron-51B-Instruct
- nvidia/Llama-3_1-Nemotron-Ultra-253B-v1
- nvidia/Llama-3_1-Nemotron-Ultra-253B-v1-FP8
- nvidia/Llama-3_3-Nemotron-Super-49B-v1
- nvidia/Mistral-NeMo-Minitron-8B-Base
- perplexity-ai/r1-1776-distill-llama-70b
</details>
### Runtime Integrations
AutoDeploy runs natively with the complete `TRT-LLM` stack via the `LLM` API. In addition, we provide a lightweight wrapper of the `LLM` API for onboarding and debugging new models:
| `"runtime"` | Description |
|-------------|-------------|
| `trtllm` | A robust, production-grade runtime optimized for high-performance inference. |
| `demollm` | A lightweight runtime wrapper designed for development and testing, featuring a naive scheduler and KV-cache manager for simplified debugging and testing. |
### Compile Backends
AutoDeploy supports multiple backends for compiling the exported Torch graph:
| `"compile_backend"` | Description |
|--------------------|-------------|
| `torch-simple` | Exports the graph without additional optimizations. |
| `torch-compile` | Applies `torch.compile` to the graph after all AutoDeploy transformations have been completed. |
| `torch-cudagraph` | Performs CUDA graph capture (without torch.compile). |
| `torch-opt` | Uses `torch.compile` along with CUDA Graph capture to enhance inference performance. |
### Attention Backends
Optimize attention operations with different attention kernel implementations:
| `"attn_backend"` | Description |
|----------------------|-------------|
| `triton` | Custom fused multi-head attention (MHA) with KV Cache kernels for efficient attention processing. |
| `flashinfer` | Uses optimized attention kernels with KV Cache from the [`flashinfer`](https://github.com/flashinfer-ai/flashinfer.git) library. |
### Precision Support
AutoDeploy supports models with various precision formats, including quantized checkpoints generated by [`TensorRT-Model-Optimizer`](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
**Supported precision types include:**
- BF16 / FP16 / FP32
- FP8
- [NVFP4](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)

View File

@ -0,0 +1,326 @@
# Checkpoint Loading
The PyTorch backend provides a flexible and extensible infrastructure for loading model checkpoints from different formats, such as HuggingFace (HF). This system allows you to load models from various sources (e.g., HuggingFace or custom formats) by implementing the required components, such as the checkpoints weight loader, mapper, and configuration parser.
## Table of Contents
1. [Overview](#overview)
2. [Core Components](#core-components)
3. [Built-in Checkpoint Formats](#built-in-checkpoint-formats)
4. [Using Checkpoint Loaders](#using-checkpoint-loaders)
5. [Creating Custom Checkpoint Loaders](#creating-custom-checkpoint-loaders)
## Overview
The checkpoint loading design is built around a plugin-like architecture that is separated into four distinct components:
- **Checkpoint Loaders**: Orchestrate the loading process for specific formats
- **Config Loaders**: Handle model configuration parsing and validation
- **Weight Loaders**: Manage the actual loading of model weights from storage into memory
- **Weight Mappers**: Map and transform loaded weights onto the TensorRT LLM model definition
This modular design allows for easy extension to support new checkpoint formats while maintaining backward compatibility and performance optimizations. By separating the checkpoint loading components into four different subcomponents, any user can employ any relevant previous work while also introducing their own custom checkpoint-specific components.
If one wishes to support a new checkpoint format, they must implement all four components.
However, if the format shares some components with an already supported framework (e.g., HF), only the custom-specific components need to be implemented.
## Core Components
### BaseCheckpointLoader
The `BaseCheckpointLoader` is the central base interface for all operators required for checkpoint loading. It provides a unified API regardless of the underlying checkpoint format. This interface is responsible for holding and exposing all objects required for the loading and parsing process.
**Key Methods:**
- `load_config(checkpoint_dir, **kwargs)`: Loads and returns a `ModelConfig` object
- `load_weights(checkpoint_dir, **kwargs)`: Loads and returns a dictionary of weights
- `get_initialized_weight_mapper(model, config)`: Returns a runtime initialized weight mapper for the model
- `cleanup()`: Releases resources and cleans up internal state
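As an illustration of this interface, the following minimal sketch uses the built-in HF loader (introduced later in this document) against a local HF-style checkpoint directory; the directory path is a placeholder:
```python
from tensorrt_llm._torch.models.checkpoints.hf.checkpoint_loader import HfCheckpointLoader

loader = HfCheckpointLoader()
model_config = loader.load_config("llama-models-v2/llama-v2-13b-hf")  # parsed ModelConfig
weights = loader.load_weights("llama-models-v2/llama-v2-13b-hf")      # dict of parameter tensors
loader.cleanup()                                                      # release loader resources
```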
### BaseConfigLoader
Responsible for loading model configurations from checkpoint directories and parsing them into TRTLLM `ModelConfig`:
```python
from tensorrt_llm._torch.model_config import ModelConfig
from tensorrt_llm._torch.models.checkpoints.base_config_loader import BaseConfigLoader
class CustomConfigLoader(BaseConfigLoader):
def load(self, checkpoint_dir: str, **kwargs) -> ModelConfig:
# Load and parse configuration from your custom format
pretrained_config = self._get_pretrained_config(checkpoint_dir, **kwargs)
return ModelConfig(pretrained_config=pretrained_config,
...)
def _get_pretrained_config(self, checkpoint_dir, **kwargs):
...
```
### BaseWeightLoader
Handles the loading of model weights from storage:
```python
from typing import Any

from tensorrt_llm._torch.models.checkpoints.base_weight_loader import BaseWeightLoader
class CustomWeightLoader(BaseWeightLoader):
def load_weights(self, checkpoint_dir: str) -> dict[str, Any]:
# Load weights from your custom format
# Return a dictionary mapping parameter names to tensors
return weights_dict
```
### BaseWeightMapper
Transforms weights between different naming conventions and applies model-specific transformations so they can be loaded into the TRT-LLM model object.
## Built-in Checkpoint Formats
### HuggingFace Format
Currently, the HF checkpoint loader is the primary built-in format, supporting:
- **Weights loading** (`.safetensors/.bin/.pth`) - Loading HF compatible weights from disk
- **Configuration parser** - Parsing HF stored configuration information to TRTLLM `ModelConfig` object
- **Weights Mapping** - Converting HF weights into TRTLLM compatible representation
## Using Checkpoint Loaders
### Basic Usage
There are two main approaches to using the checkpoint loading objects.
The first approach is through the LLM API, as shown in the following example:
```python
from tensorrt_llm import LLM
hf_model_dir = "llama-models-v2/llama-v2-13b-hf"
llm = LLM(model=hf_model_dir)
```
In this example, `HfCheckpointLoader` will be selected by default.
To explicitly set the checkpoint loader, pass the required checkpoint-specific loader:
```python
from tensorrt_llm import LLM
from tensorrt_llm._torch.models.checkpoints.hf.checkpoint_loader import HfCheckpointLoader
hf_model_dir = "llama-models-v2/llama-v2-13b-hf"
llm = LLM(model=hf_model_dir,
checkpoint_loader=HfCheckpointLoader())
```
Similarly, to use a built-in checkpoint loader with a custom subcomponent, provide that subcomponent explicitly:
```python
from tensorrt_llm import LLM
from tensorrt_llm._torch.models.checkpoints.hf.checkpoint_loader import HfCheckpointLoader
hf_model_dir = "llama-models-v2/llama-v2-13b-hf"
llm = LLM(model=hf_model_dir,
checkpoint_loader=HfCheckpointLoader(weight_loader=MyCustomWeightLoader()))
```
In the second approach, you can use the checkpoint loading components directly.
```python
from tensorrt_llm._torch.models.checkpoints.hf.gemma3_weight_mapper import \
Gemma3HfWeightMapper
from tensorrt_llm._torch.models.modeling_gemma3 import Gemma3ForCausalLM
gemma3 = Gemma3ForCausalLM(model_config)
weight_mapper = Gemma3HfWeightMapper()
weight_mapper.init_model_and_config(gemma3, model_config)
gemma3.load_weights(hf_gemma3.state_dict(), weight_mapper)
```
## Creating Custom Checkpoint Loaders
To support a new checkpoint format, you need to implement all four components. This section provides minimal templates for each component.
### When to Create Custom Components
- **Complete New Format**: Implement all four components when supporting a completely new checkpoint format
- **Custom Weight Storage**: Only implement a custom weight loader if you have a unique weight storage format (e.g., custom binary format, database storage, etc.)
- **Custom Configuration**: Only implement a custom config loader if your configuration format cannot be parsed by existing parsers.
- **Custom Weight Mapping**: Only implement a custom weight mapper if your model has unique weight naming or transformation requirements that are checkpoint-specific.
### Step 1: Create the Checkpoint Loader
```python
from typing import Optional
from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import BaseCheckpointLoader
from tensorrt_llm._torch.models.checkpoints.base_config_loader import BaseConfigLoader
from tensorrt_llm._torch.models.checkpoints.base_weight_loader import BaseWeightLoader
from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import BaseWeightMapper
from tensorrt_llm._torch.models.modeling_utils import register_checkpoint_loader
@register_checkpoint_loader("CUSTOM_FORMAT")
class CustomCheckpointLoader(BaseCheckpointLoader):
def __init__(self,
*,
weight_loader: Optional[BaseWeightLoader] = None,
weight_mapper: Optional[BaseWeightMapper] = None,
config_loader: Optional[BaseConfigLoader] = None):
self._weight_loader = weight_loader or self.get_default_weight_loader()
self._config_loader = config_loader or self.get_default_config_loader()
self._weight_mapper = weight_mapper
self._checkpoint_format = "CUSTOM_FORMAT"
def get_default_weight_loader(self) -> BaseWeightLoader:
return CustomWeightLoader()
def get_default_config_loader(self) -> BaseConfigLoader:
return CustomConfigLoader()
```
### Step 2: Create the Checkpoint Weight Loader
```python
from typing import Any
from tensorrt_llm._torch.models.checkpoints.base_weight_loader import BaseWeightLoader
from tensorrt_llm._torch.models.modeling_utils import register_checkpoint_weight_loader
@register_checkpoint_weight_loader("CUSTOM_FORMAT")
class CustomWeightLoader(BaseWeightLoader):
def load_weights(self, checkpoint_dir: str, **kwargs) -> dict[str, Any]:
"""
Load weights from your custom format.
Args:
checkpoint_dir: Directory containing checkpoint files
**kwargs: Additional loading parameters
Returns:
Dictionary mapping parameter names to tensors
"""
weights = {}
# Implement your custom weight loading logic here
# Examples:
# - Load from custom binary files
# - Load from databases
# - Load from compressed archives
# - Apply custom preprocessing
return weights
```
### Step 3: Create the Checkpoint Config Loader
```python
from tensorrt_llm._torch.model_config import ModelConfig
from tensorrt_llm._torch.models.checkpoints.base_config_loader import BaseConfigLoader
from tensorrt_llm._torch.models.modeling_utils import register_config_loader
@register_config_loader("CUSTOM_FORMAT")
class CustomConfigLoader(BaseConfigLoader):
def load(self, checkpoint_dir: str, **kwargs) -> ModelConfig:
"""
Load and parse configuration from your custom format.
Args:
checkpoint_dir: Directory containing configuration files
**kwargs: Additional loading parameters
Returns:
ModelConfig object containing parsed configuration
"""
# Load your custom configuration format
# Examples:
# - Parse YAML/TOML files
# - Convert from proprietary formats
pretrained_config = self._load_pretrained_config(checkpoint_dir, **kwargs)
return ModelConfig(
pretrained_config=pretrained_config,
# Add other ModelConfig parameters as needed
)
def _load_pretrained_config(self, checkpoint_dir: str, **kwargs):
"""Load the raw configuration from your custom format."""
pass
```
### Step 4: Create the Checkpoint Weight Mapper
```python
from torch import nn
from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import BaseWeightMapper
from tensorrt_llm._torch.models.modeling_utils import register_mapper
@register_mapper("CUSTOM_FORMAT")
class CustomWeightMapper(BaseWeightMapper):
def __init__(self):
super().__init__()
# Define any weight transformation callbacks
self._callbacks = [
# Add your custom weight transformation functions
# self._custom_transform_function,
]
def map_weights(self) -> None:
"""
Define mappings between source and target weight names.
"""
self.mapping.update({
# Map source names to target names
# 'target_module_name': ['source_param1', 'source_param2'],
# Example: 'qkv_proj': ['q_proj', 'k_proj', 'v_proj']
})
def apply_callbacks(self, module: nn.Module, module_name: str,
module_names_breakdown: list[str],
weights: dict) -> list[dict]:
"""
Apply weight transformations for modules that require special handling.
Args:
module: The target module
module_name: The specific module name being processed
module_names_breakdown: Module path components
weights: Source weights dictionary
Returns:
List of transformed weight dictionaries
"""
module_weights = []
for new_name in self._mapping[module_name]:
# Filter weights for this specific parameter
fw = self.filter_weights(
'.'.join(module_names_breakdown + [new_name]), weights)
# Apply transformation callbacks
for callback in self._callbacks:
fw = callback(module, new_name, fw)
module_weights.append(fw)
return module_weights
def should_skip_module(self, module_name: str) -> bool:
"""
Define which modules should be skipped during loading.
"""
# Add logic to skip specific modules based on your requirements
# Examples:
# - Skip LoRA-specific modules
# - Skip temporary/auxiliary modules
return super().should_skip_module(module_name)
```
Note: when creating a custom mapper, you can either define a checkpoint-format-specific mapper. For example:
```python
@register_mapper("CUSTOM_FORMAT")
class CustomWeightMapper(BaseWeightMapper)
```
Alternatively, you can define a checkpoint-model-specific mapper. For example:
```python
@register_mapper("CUSTOM_FORMAT", "Gemma3ForCausalLM")
class CustomWeightMapper(BaseWeightMapper)
```
By setting the model name, the registered mapper will be associated with that specific model.

View File

@ -0,0 +1,267 @@
# Disaggregated Serving (Beta)
```{note}
This feature is currently in beta, and the related APIs are subject to change in future versions.
```
- [Motivation](#Motivation)
- [KV Cache Exchange](#KV-Cache-Exchange)
- [Multi-backend Support](#Multi-backend-Support)
- [Overlap Optimization](#Overlap-Optimization)
- [Cache Layout Transformation](#Cache-Layout-Transformation)
- [Usage](#Usage)
- [trtllm-serve](#trtllm-serve)
- [Dynamo](#Dynamo)
- [Environment Variables](#Environment-Variables)
- [Troubleshooting and FAQ](#Troubleshooting-and-FAQ)
## Motivation
LLM inference has two stages: context (prefill) and generation (decode) phases. The context phase computes KV cache for prompt tokens whereas the generation phase generates tokens one by one using cached values. These phases have different compute characteristics.
There are two ways of serving LLM inference requests:
* Aggregated LLM serving (sometimes called in-flight batching or IFB in this tech blog), in which the context and generation phases are run on the same GPU.
* Disaggregated LLM serving, in which the context and generation phases are run on different GPUs.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture1.png" width="640" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 1. The execution timeline of aggregated LLM serving</em></sub></p>
In aggregated LLM serving, both the context and generation phases share the same GPU resources and parallelism strategy. This can lead to interference where context processing delays token generation, increasing token-to-token latency (TPOT) and reducing interactivity. This is illustrated in Figure 1 which shows the execution timeline for aggregated LLM serving. Aggregated LLM serving also forces a single GPU type and parallelism configuration for both phases, even though their compute needs differ. As a result, optimizing for one metric such as time-to-first-token (TTFT), often comes at the expense of another metric such as TPOT.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture2.png" width="580" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 2. The execution timeline of dis-aggregated LLM serving</em></sub></p>
Disaggregated serving resolves these challenges by decoupling the two phases, allowing each to run on separate GPU pools and using different parallelism strategies. This separation removes the interference between context and generation phases, as shown in Figure 2, and enables independent optimization of TTFT and TPOT. Although disaggregation incurs overhead for transferring the KV cache blocks from context to generation GPUs, the advantages can be substantial—particularly for workloads with long input sequences and moderate output lengths where interference is most severe.
You can also refer to [this paper](https://arxiv.org/pdf/2506.05508) for more details about the rationale and design considerations of disaggregated serving.
## KV Cache Exchange
### Multi-backend Support
In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache manager and the underlying communication libraries, as shown in Figure 3. The KV cache exchange module is responsible for efficient transmission and reception of the cache, promptly releasing cache space, and performing cache layout conversions during the exchange process. Currently, mainstream communication protocols—MPI, UCX, and NIXL—are all supported by TensorRT-LLM, and the underlying communication protocols utilize RDMA / NVLink. Currently, we recommend using UCX and NIXL backends, as we are adding a dynamic scaling mechanism on top of them—specifically, dynamic node joining and leaving. This allows customers to adjust the load based on traffic demands or switch roles between context and generation dynamically.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture6.png" width="890" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 3. KV cache exchange architecture</em></sub></p>
### Overlap Optimization
To optimize the overall performance of disaggregated serving, TensorRT-LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 4. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture7.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 4. KV cache exchange timing diagram</em></sub></p>
### Cache Layout Transformation
To minimize KV cache transmission latency, TensorRT-LLM currently uses direct transmission between device memories for cache transfer. The KV cache transmission supports using different parallel strategies for the context and generation phases. In such cases, careful orchestration of KV cache block mapping is required. Figure 5 illustrates this using the example of context phase with TP2 and generation phase with PP2.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture8.png" width="680" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 5. KV cache layout conversion</em></sub></p>
The optimizations required for KV cache transmission vary depending on whether it's single-node multi-GPU, multi-node multi-GPU, or different GPU models. To accommodate this, TensorRT-LLM provides a set of environment variables for selection in different environments. Please refer to the following section for details [Environment Variables](#Environment-Variables).
## Usage
### trtllm-serve
The first approach to do disaggregated LLM inference with TensorRT-LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 6 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 6). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture3.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 6. `trtllm-serve` integration with disaggregated service</em></sub></p>
To run TRT-LLM in disaggregated mode, you must first launch context (prefill) and generation (decode) servers using `trtllm-serve`.
We use the `cache_transceiver_config` configuration to set up disaggregated serving, which includes the following parameters:
```yaml
cache_transceiver_config:
backend: <str>
max_tokens_in_buffer: <int>
```
`backend` specifies the communication backend for transferring the KV cache. Valid options include `DEFAULT`, `UCX`, `NIXL`, and `MPI`; the default backend is UCX.
`max_tokens_in_buffer` defines the buffer size for KV cache transfers. For optimal performance, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests.
For example, you could launch two context servers and one generation server as follows:
```bash
# Generate context_extra-llm-api-config.yml
# Overlap scheduler for context servers are disabled because it's not supported for disaggregated context servers yet
echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml
# Start Context servers
CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 &
CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 &
# Generate gen_extra-llm-api-config.yml
echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml
# Start Generation servers
CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --extra_llm_api_options ./gen_extra-llm-api-config.yml &> log_gen_0 &
```
Once the context and generation servers are launched, you can launch the disaggregated
server, which will accept requests from clients and do the orchestration between context
and generation servers. The disaggregated server can be launched with:
```bash
trtllm-serve disaggregated -c disagg_config.yaml
```
where `disagg_config.yaml` contains information about the context and generation servers. For the current example,
it would look like:
```yaml
hostname: localhost
port: 8000
backend: pytorch
context_servers:
num_instances: 2
urls:
- "localhost:8001"
- "localhost:8002"
generation_servers:
num_instances: 1
urls:
- "localhost:8003"
```
When routing requests to the context servers, the disaggregated server will mark the requests as "context-only" to skip the generation phase. Similarly,
when routing requests to the generation servers, the disaggregated server will mark the requests as "generation-only" to skip the context phase.
Clients can then send requests to the disaggregated server at `localhost:8000`, which is an OpenAI compatible endpoint. For example, you can send requests to the disaggregated server using curl:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
"prompt": "NVIDIA is a great company because",
"max_tokens": 16,
"temperature": 0
}' -w "\n"
```
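Because the endpoint is OpenAI compatible, you can also query it from Python. The following minimal sketch assumes the `openai` Python package is installed; the API key is a placeholder since the server does not check it:
```python
from openai import OpenAI

# Point the OpenAI client at the disaggregated server
client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

completion = client.completions.create(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    prompt="NVIDIA is a great company because",
    max_tokens=16,
    temperature=0,
)
print(completion.choices[0].text)
```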
#### Launching disaggregated servers on SLURM clusters
Please refer to [Disaggregated Inference Benchmark Scripts](../../scripts/disaggregated).
### Dynamo
The second approach involves the use of [Dynamo](https://github.com/ai-dynamo/dynamo), a data center-scale inference server developed specifically for LLM workloads. Dynamo introduces several advanced features not present in the other methods, including decoupled pre- and post-processing workers, which are particularly beneficial under high concurrency conditions. The disaggregated LLM inference workflow with Dynamo is illustrated in Figure 7.
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/tech_blog5_Picture4.png" width="800" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 7. Dynamo integration with disaggregated service</em></sub></p>
In the Dynamo workflow, requests are initially processed by pre- and post-processing workers, which then query a smart router to determine the optimal decode worker to route the requests to. Depending on the availability of KV cache blocks, the decoder worker may bypass the prefill stage or forward the request to the prefill worker. Once the prefill worker is done processing the prompt, the KV cache blocks can be sent from the prefill worker to the decoder worker, using the metadata referred to as ctx_params in the figure above.
Dynamo also includes built-in support for Kubernetes deployment, monitoring, and metrics collection. The development team is actively working on enabling dynamic instance scaling, further enhancing its suitability for production environments.
For more information on how to use Dynamo with TensorRT-LLM, please refer to [this documentation](https://docs.nvidia.com/dynamo/latest/examples/trtllm.html).
## Environment Variables
TRT-LLM uses some environment variables to control the behavior of disaggregated service.
* `TRTLLM_PARALLEL_CACHE_SEND`: If set to `1`, contextExecutor will attempt to send KV cache for multiple requests in parallel. The default value is `0`.
* `TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP`: If set to `1`, generationExecutor will not overlap KV cache transfer with model inference. The default value is `0`.
* `TRTLLM_ENABLE_KVCACHE_RECEIVE_PARALLEL`: When the generation rank receives KV cache from multiple context ranks within a single context instance, it will receive KV cache from each rank sequentially. If set to `1`, the generation rank will receive KV cache from each rank within one context instance in parallel. The default value is `0`.
* `TRTLLM_REQUEST_KV_CACHE_CONCURRENT`: If set to `1`, generationExecutor prepares independent resources for each context executor to receive KV cache; requests whose KV cache is received from different context executors will be processed concurrently. If set to `0`, the generation executor will reuse the same resource to process KV cache transfer for each request sequentially, reducing the resources used by KV cache transmission and thereby lowering the risk of running out of memory. The default value is `0`.
* `TRTLLM_TRY_ZCOPY_FOR_KVCACHE_TRANSFER`: TRT-LLM typically copies non-contiguous data into a temporary buffer before sending KV cache. If set to `1`, TRT-LLM will attempt to directly transmit each KV cache block, eliminating extra copies. The default value is `0`.
* `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE`: By default, TRT-LLM uses a `stream-ordered memory allocator` to allocate temporary buffers. If this environment variable is set to a size value, TRT-LLM will use `cudaMalloc` to allocate a buffer of that size for KV cache transmission. The default value is `512MB`. For example, users can set `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE=1GB` to allocate a 1 GB buffer with `cudaMalloc` for KV cache transmission.
* `TRTLLM_KVCACHE_TRANSFER_USE_ASYNC_BUFFER`: If set to `1`, TRT-LLM will use `cudaMallocAsync` to allocate buffers for KV cache transmission. The default value is `0`. This environment variable only takes effect when `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE` is greater than 0.
* `TRTLLM_KVCACHE_SEND_MAX_CONCURRENCY_NUM`: The maximum number of concurrent KV cache sends. The default value is `4`. This environment variable only takes effect when `TRTLLM_KVCACHE_TRANSFER_BUFFER_SIZE` is greater than 0.
There are some other useful environment variables that may help when encountering failures or performance issues.
* `NCCL_GRAPH_MIXING_SUPPORT`: With the default value `1`, the CUDA driver may create too many CUDA streams while working with one CUDA graph, leading to a performance drop. Setting it to `0` will reduce the number of CUDA streams, but please make sure there are no other NCCL ops outside the one CUDA graph; otherwise it is unsafe.
* `UCX_MAX_RNDV_RAILS`: With the default value `2`, UCX attempts to use two InfiniBand (IB) NIC devices per GPU for Rendezvous (RNDV) transfers. When both the context and generation instances enable tensor- and expert-parallelism (TEP), multiple TP ranks may transfer KV cache concurrently. Because each TP rank can use up to two NIC devices, some NIC devices can be shared across GPUs, causing contention and reduced throughput. Setting `UCX_MAX_RNDV_RAILS=1` can reduce contention in this case.
## Troubleshooting and FAQ
### General FAQs
*Q. What are the limitations of disaggregated serving in TRT-LLM?*
A. Currently, only decoder-only models and beam width of 1 are supported. Also the KV cache at each layer of the model is required to be homogeneous, with the same data type and the same number of attention heads.
*Q. When using the TRT backend, is the engine used for disaggregated serving different from other engines?*
A. No. There are no special requirements for the arguments to build engine.
*Q. When using the TRT backend, do the engines used by the context and generation instances need to be the same?*
A. No. The engines used by context and generation instances can be different, and their parallelism can be heterogeneous, that is, TP and PP can be different, and TRT-LLM will handle the heterogeneity of the KV cache.
*Q. Can a TRT-LLM server instance handle both context-only requests and generation-only requests?*
A. Yes, but it's not recommended. TRT-LLM does not implement optimal scheduling for the case where the instance handles mixed context-only requests and generation-only requests. It's better to run context-only requests and generation-only requests on separate sets of servers.
*Q. Does disaggregated serving in TRT-LLM support multi-gpu and multi-node?*
A. Yes, it's recommended that different server instances use different GPUs. We support running context and generation servers on the same node or different nodes. The `CUDA_VISIBLE_DEVICES` env variable can be used to control which GPUs are used by each instance.
### Debugging FAQs
*Q. How to handle error `Disaggregated serving is not enabled, please check the configuration?`*
A. Please set the `backendType` of `CacheTransceiverConfig`:
```cpp
ExecutorConfig executorConfig{...};
executorConfig.setCacheTransceiverConfig(texec::CacheTransceiverConfig(BackendType::DEFAULT));
```
*Q. Does TRT-LLM support using GPU direct RDMA for inter-node KV Cache transfer?*
A. Yes, TRT-LLM supports using GPU direct RDMA for inter-node KV cache transfer.
*Q. What causes the substantial bandwidth fluctuations in kvCache transfers, especially during the first few requests following service initialization?*
A. The communication channels for KV cache transfer between executors are established dynamically. Connection establishment incurs significant overhead, which explains the lower KV cache transfer bandwidth observed during the initial requests after service startup. When conducting benchmarks, it is recommended to perform a warm-up phase to ensure accurate performance measurements.
*Q. When my servers are running on different NVLink domains, some servers hang or have a lower performance. How to fix that?*
A. NVLink domain can be found with `nvidia-smi -q` in the `Fabric.ClusterUUID` field. A few UCX environment variables can be adjusted when your servers have different NVLink domains:
* `UCX_CUDA_IPC_ENABLE_MNNVL`: Set to `n`. This also can reduce UCX timeout error messages like `UCX ERROR cuMemImportFromShareableHandle failed: invalid resource handle`, although these errors don't necessarily cause your trtllm-serve to fail.
* `UCX_NET_DEVICES`: Check if this is set correctly, or unset this variable to allow UCX to use all possible devices.
* `UCX_RNDV_SCHEME`: Set to `get_zcopy` or `put_zcopy` on GB200 for better performance. The default value is `auto`.

View File

@ -0,0 +1,19 @@
# Feature Combination Matrix
| Feature | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Sliding Window Attention | Logits Post Processor | Guided Decoding | LoRA |
| -------------------------- | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | -------- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ---------------------- | --------------------- | --------------- | ---- |
| Overlap Scheduler | --- | | | | | | | | | | | | | | |
| CUDA Graph | Yes | --- | | | | | | | | | | | | | |
| Attention Data Parallelism | Yes | Yes | --- | | | | | | | | | | | | |
| Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | | |
| Chunked Prefill | Yes | Yes | Yes | Yes | --- | | | | | | | | | | |
| MTP | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | | | |
| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | | |
| EAGLE-3(Two Model Engine) | No | Yes | Yes | Yes | Yes | No | No | --- | | | | | | | |
| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | |
| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | | |
| KV Cache Reuse | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | |
| Sliding Window Attention | Yes | Yes | Yes | Yes | Yes | No | Untested | Untested | Yes | Yes | WIP | --- | | | |
| Logits Post Processor | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | | |
| Guided Decoding | Yes | Yes | Yes | Yes | Yes | No | No | Yes | Yes | Yes | Yes | Yes | Yes | --- | |
| LoRA | Yes | No | Untested | Untested | Untested | Untested | Untested | Untested | Yes | Yes | Yes | Yes | Yes | Untested | --- |

View File

@ -0,0 +1,79 @@
# KV Cache System
The KV cache stores previously computed key-value pairs for reuse during generation in order to avoid redundant calculations. The TensorRT-LLM KV cache system also supports reuse across requests and uses a suite of tools like offloading and prioritized eviction to increase reuse. It supports variable attention window sizes and Multi-Head Attention (MHA) optimization techniques such as MQA and GQA.
## The Basics
The KV cache is a pool of blocks that can hold KV state for a fixed number of tokens. Multiple layers are packed within a single block, which requires all the layers to have the same number of heads and the same attention window size. A separate pool is created for each combination of attention window size and number of heads to support variable attention window size and optimization techniques like GQA.
The number of tokens that can be stored in a single block can be set by the user when the model engine is created. It must be a power of two greater than 1. Blocks are assigned to requests as needed. Blocks are stored in a search structure as they are filled by requests; this allows later requests to reuse KV state if they have a matching prefix.
If more than one pool is created, available memory is divided among the pools. The fraction to assign to each pool is determined during initialization and is static. This is not optimal and we are working on providing a better solution.
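To make the block granularity concrete, the following back-of-the-envelope calculation shows how much memory a single block occupies; the model dimensions are hypothetical and used only for illustration:
```python
# Illustrative arithmetic for a hypothetical model (numbers are not tied to any checkpoint):
# bytes per block = 2 (K and V) * layers_in_pool * kv_heads * head_dim
#                   * tokens_per_block * bytes_per_element
layers_in_pool = 32
kv_heads = 8            # GQA: fewer KV heads than query heads
head_dim = 128
tokens_per_block = 64   # must be a power of two greater than 1
bytes_per_element = 2   # e.g., an FP16/BF16 KV cache

block_bytes = 2 * layers_in_pool * kv_heads * head_dim * tokens_per_block * bytes_per_element
print(f"{block_bytes / 2**20:.1f} MiB per block")  # 8.0 MiB for these values
```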
## Reuse Across Requests
Blocks containing KV state computed for previous requests are stored in a radix search tree as soon as they are filled. A search is performed when a new request is added, and matched blocks are reused instead of calculated. Blocks that are reused can be shared among multiple requests, so reuse saves memory as well as computations.
Blocks remain reusable until they are evicted from the search tree. Eviction happens when a new (blank) block is needed. The core eviction scheme is prioritized LRU. All blocks are assigned a priority between 0 and 100 (100 being most important). All blocks of the lowest priority must be evicted before any blocks of the next priority can be evicted. If all blocks have the same priority, the least recently used block is evicted.
When a block is evicted from primary memory, its KV state is copied to a block in secondary memory. The secondary memory block remains in the search tree, so the block remains reusable until it is evicted from secondary memory. Eviction from secondary memory happens when a new block in secondary memory is needed to offload a primary block. The eviction scheme is the same for primary and secondary blocks.
One caveat in the current code is that only leaf blocks can be evicted (leaves are blocks with no descendants in the radix tree). This design works well for full attention layers, but not for limited attention layers. This will be fixed in a future version.
### Retention Policy
Blocks are assigned priority in line with the [retention policy](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.KvCacheRetentionConfig) of the request. Blocks with lower priority scores will be freed preferentially to blocks with higher priority. The retention policy is a list of [TokenRangeRetentionConfig](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.KvCacheRetentionConfig.TokenRangeRetentionConfig) objects, each specifying priority for a given range of tokens, such as "assign priority X to tokens 10 through 61". You can also assign a duration in milliseconds for this to remain in effect. Priority reverts to the default of 35 after a period of ```duration_ms``` has elapsed from the first time the block was made available for reuse. TokenRangeRetentionConfig only applies to input (prompt) tokens. The property ```decode_retention_policy``` specifies what priority to assign to blocks with generated (decoded) tokens and ```decode_duration_ms``` specifies how long this should remain in effect. Priority reverts to the default after expiration. Any property that expects a duration can be set to None. This indicates that particular part of the retention policy never expires.
Not in use: ```transfer_mode``` is a debug option and should not be used.
See [this example](../examples/kvcacheretentionconfig.md) for an example of how to change block priorities of specific requests by altering their retention policy.
### Speculative Decoding
Reuse across requests is supported by all speculative decoding models. Please see [speculative decoding](speculative-decoding.md) for more details.
## Limited Attention Window Size
TensorRT-LLM takes advantage of layers with limited attention window size in order to reduce computations and memory usage. Blocks that leave the attention window are freed and placed on the radix search tree so they can be reused.
## MQA / GQA
TensorRT-LLM takes advantage of grouped query attention in order to save memory. The KV cache creates blocks with only enough space to store state for the distinct query head groups. For MHA, there is one group per head; for MQA, there is a single group for all the heads. GQA strikes a balance between these two.
## Controlling KV Cache Behavior
Many of the features in the KV cache system are optional or have user defined properties that alter how they work. Users can control KV cache features through class [KVCacheConfig](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.KvCacheConfig). The remainder of this section describes how to change the most important behaviors of the KV cache system.
See [this example](../examples/kvcacheconfig.md) for an example of how to use KvCacheConfig to control KV cache behavior.
### Datatype
Perhaps the most important property is ```dtype``` which specifies what data type is held in KV cache. The default 'auto' specifies that data type should be inferred from model config.
### How Much Memory is Allocated to KV Cache
Property ```free_gpu_memory_fraction``` is a ratio > 0 and < 1 that specifies how much of free GPU memory should be allocated to KV cache. The default is 90% (ratio of 0.9). If ```max_tokens``` is also set, KV cache will determine how much memory is needed to hold ```max_tokens``` and will allocate the lesser of that amount and the amount implied by ```free_gpu_memory_fraction```.
### Enable/Disable Cross Request Reuse
Block reuse across requests is enabled by default, but can be disabled by setting ```enable_block_reuse``` to False.
### Enable Offloading to Host Memory
Before a block is evicted from GPU memory, it can optionally be offloaded to host (CPU) memory. The block remains reusable until it is evicted from host memory. When an offloaded block is reused, it is first copied back into GPU memory. Offloading is controlled with property ```host_cache_size``` which specifies how much host memory (in bytes) should be allocated for offloading. The default is 0.
When offloading is enabled, the client can prevent specific blocks from being offloaded by toggling block priority. Blocks with lower priority than a certain threshold are not offloaded; they are evicted directly from GPU memory to reduce traffic between GPU and host. This priority is set with ```secondary_offload_min_priority```. Default value is 35, meaning any block with lower priority than 35 will not be offloaded.
### Partial Reuse
Partial reuse of a block can happen when some but not all tokens are matched. It is enabled by default, but can be disabled by setting ```enable_partial_reuse``` to False.
The property ```copy_on_partial_reuse``` specifies whether a block should be copied or not in order to allow partial reuse. If copying is disabled, a partially matched block can only be reused if no other request is using it. If copying is enabled, partially matched blocks are not reused directly, instead a new block is created and the matched tokens are copied into the new block. This allows multiple requests to partially reuse a block.
### Attention Window Size
Property ```max_attention_window``` specifies the maximum attention window size for each layer in the model as a list of integer values. If the length of this list is less than number of layers, the list is repeated as many times as necessary. For instance, if the model has only full attention layers and maximum sequence length is 4096, you can specify this as ```max_attention_window = [4096]```. If the first layer is full attention, the second layer is limited attention with window size 256 and then this repeats for the remaining layers, you specify this as ```max_attention_window = [4096,256]```. This means first layer is full attention, second layer is limited attention, third layer is full attention, fourth layer is limited attention and so on.
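The following minimal sketch pulls the properties described in this section together; the values are illustrative and should be tuned for your deployment:
```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Illustrative KV cache configuration using the properties described above
kv_cache_config = KvCacheConfig(
    dtype="auto",                      # infer KV cache data type from the model config
    free_gpu_memory_fraction=0.85,     # use 85% of free GPU memory for KV cache
    enable_block_reuse=True,           # reuse blocks across requests
    host_cache_size=4 * 1024**3,       # offload evicted blocks to 4 GB of host memory
    enable_partial_reuse=True,         # allow reuse of partially matched blocks
    max_attention_window=[4096, 256],  # alternate full / limited attention layers
)

llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", kv_cache_config=kv_cache_config)
```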
### Deprecated Properties
Properties ```use_uvm``` and ```sink_token_length``` have been deprecated and will be removed in a future release.

View File

@ -0,0 +1,71 @@
# Long Sequences
In many real-world scenarios, such as long document summarization or multi-turn conversations, LLMs are required to perform cognitive tasks across long sequences to get better results. This presents challenges for LLM inference. TensorRT-LLM supports different methods to process long sequences efficiently. This document introduces those optimization techniques.
## Chunked Context
Chunked context allows TensorRT-LLM to divide the input tokens into smaller chunks and batch those chunks with the decode requests.
With the chunked context feature, there are two benefits:
- This can prevent the context phase from becoming a bottleneck, enable more parallelization with tokens in the decode phase, and increase GPU utilization.
- Chunked context allows TensorRT-LLM to handle requests with longer contexts while achieving higher concurrency. Since memory usage depends on the number of tokens processed per iteration, chunked context decouples memory consumption from the input request's context length, changing it to the smaller chunk size. This enables TensorRT-LLM to process longer contexts without increasing memory requirements, which can also help increase the concurrency under the same memory consumption.
To enable chunked context, please set `enable_chunked_prefill` in the `LLM` API to `True`.
```python
llm = LLM(
...
enable_chunked_prefill=True,
...
)
```
Note that if chunked context is enabled, please set the `max_num_tokens` to be an integer multiple of the kv-cache block size `tokens_per_block`, which defaults to 64.
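For example, a configuration that enables chunked context while keeping `max_num_tokens` a multiple of the default block size of 64 might look like the following sketch (the model and token budget are illustrative):
```python
from tensorrt_llm import LLM

llm = LLM(
    model="TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    enable_chunked_prefill=True,
    max_num_tokens=8192,  # integer multiple of tokens_per_block (64)
)
```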
## Chunked attention
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/feat_long_seq_chunked_attention.png" alt="feat_long_seq_chunked_attention" width="240" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 1. Illustration of chunked attention </em></sub></p>
Instead of splitting the input tokens into smaller chunks for the whole model, chunked attention is another method that is only applied to the attention layers in models.
With chunked attention, the tokens in context requests are split into chunks of a specified size. Then tokens can only attend to other tokens in the same chunk. For example, if the chunk size is 3, we might have a mask illustrated in Figure 1. Each token only needs to attend to at most the past chunk-sized tokens. As a result, both the KV cache size and the attention computation can be significantly reduced.
Currently, TensorRT-LLM only supports chunked attention in the Llama 4 model with the TRTLLM attention backend. TensorRT-LLM reads `attention_chunk_size` from the model config; if it is not None, chunked attention is enabled with chunk size `attention_chunk_size`. If you want to enable chunked attention for other models, you can set `attention_chunk_size` in the attention API to a valid value.
Note that chunked attention can only be applied to context requests.
## Sliding Window Attention
<div align="center">
<figure>
<img src="https://github.com/NVIDIA/TensorRT-LLM/raw/main/docs/source/blogs/media/feat_long_seq_chunked_attention.png" alt="feat_long_seq_sliding_win_attn" width="240" height="auto">
</figure>
</div>
<p align="center"><sub><em>Figure 2. Illustration of sliding window attention </em></sub></p>
Since attention layers are usually the performance bottleneck when processing requests with long sequences, sliding window attention is an effective method to limit the attention span of each token to a fixed size window around it, dramatically reducing the amount of computation and memory required.
Figure 2 shows the sliding window attention mask. Each token will only attend to the past `N` tokens. If the number of past tokens surpasses the max attention window size, `Sliding Window Attention` will be activated.
TensorRT-LLM treats the kv cache as a circular buffer to support this feature, which is also called `Cyclic KV Cache`. It only stores the kv cache for the last `N` tokens, where `N` is determined by the `KvCacheConfig.max_attention_window` parameter in `LLM` API. TensorRT-LLM allows different `N` values for each layer and users can simply provide a `list[int]` to the `KvCacheConfig.max_attention_window`. To enable this feature, users can set
```python
kv_cache_config = KvCacheConfig(
...
max_attention_window = [...],
...
)
llm = LLM(
...
kv_cache_config=kv_cache_config,
...
)
```
If the number of elements provided in `KvCacheConfig.max_attention_window` is less than the number of layers, the list is repeated cyclically until every layer has a value. Note, however, that the memory allocation for the KV cache is still based on the largest value in the list.
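For example, a minimal sketch of a per-layer window configuration; the model path and window sizes below are placeholders:
```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Hypothetical 32-layer model: the two-element list is repeated cyclically,
# so even-indexed layers keep a 2048-token window and odd-indexed layers a
# 4096-token window. Memory is still allocated based on the largest value.
kv_cache_config = KvCacheConfig(max_attention_window=[2048, 4096])
llm = LLM(model="/path/to/model", kv_cache_config=kv_cache_config)
```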
Note that the `Sliding Window Attention` feature currently does not work with beam search, as the context KV cache is shared across beams.

View File

@ -0,0 +1,220 @@
# LoRA (Low-Rank Adaptation)
LoRA (Low-Rank Adaptation) is a parameter-efficient fine-tuning technique that enables adapting large language models to specific tasks without modifying the original model weights. Instead of fine-tuning all parameters, LoRA introduces small trainable rank decomposition matrices that are added to existing weights during inference.
## Table of Contents
1. [Background](#background)
2. [Basic Usage](#basic-usage)
- [Single LoRA Adapter](#single-lora-adapter)
- [Multi-LoRA Support](#multi-lora-support)
3. [Advanced Usage](#advanced-usage)
- [LoRA with Quantization](#lora-with-quantization)
- [NeMo LoRA Format](#nemo-lora-format)
- [Cache Management](#cache-management)
4. [TRTLLM serve with LoRA](#trtllm-serve-with-lora)
- [YAML Configuration](#yaml-configuration)
- [Starting the Server](#starting-the-server)
- [Client Usage](#client-usage)
5. [TRTLLM bench with LoRA](#trtllm-bench-with-lora)
- [YAML Configuration](#yaml-configuration)
- [Run trtllm-bench](#run-trtllm-bench)
## Background
The PyTorch backend provides LoRA support, allowing you to:
- Load and apply multiple LoRA adapters simultaneously
- Switch between different adapters for different requests
- Use LoRA with quantized models
- Support both HuggingFace and NeMo LoRA formats
## Basic Usage
### Single LoRA Adapter
```python
from tensorrt_llm import LLM
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.sampling_params import SamplingParams
# Configure LoRA
lora_config = LoraConfig(
lora_dir=["/path/to/lora/adapter"],
max_lora_rank=8,
max_loras=1,
max_cpu_loras=1
)
# Initialize LLM with LoRA support
llm = LLM(
model="/path/to/base/model",
lora_config=lora_config
)
# Create LoRA request
lora_request = LoRARequest("my-lora-task", 0, "/path/to/lora/adapter")
# Generate with LoRA
prompts = ["Hello, how are you?"]
sampling_params = SamplingParams(max_tokens=50)
outputs = llm.generate(
prompts,
sampling_params,
lora_request=[lora_request]
)
```
### Multi-LoRA Support
```python
# Configure for multiple LoRA adapters
lora_config = LoraConfig(
lora_target_modules=['attn_q', 'attn_k', 'attn_v'],
max_lora_rank=8,
max_loras=4,
max_cpu_loras=8
)
llm = LLM(model="/path/to/base/model", lora_config=lora_config)
# Create multiple LoRA requests
lora_req1 = LoRARequest("task-1", 0, "/path/to/adapter1")
lora_req2 = LoRARequest("task-2", 1, "/path/to/adapter2")
prompts = [
"Translate to French: Hello world",
"Summarize: This is a long document..."
]
# Apply different LoRAs to different prompts
outputs = llm.generate(
prompts,
sampling_params,
lora_request=[lora_req1, lora_req2]
)
```
## Advanced Usage
### LoRA with Quantization
```python
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization.mode import QuantAlgo
# Configure quantization
quant_config = QuantConfig(
quant_algo=QuantAlgo.FP8,
kv_cache_quant_algo=QuantAlgo.FP8
)
# LoRA works with quantized models
llm = LLM(
model="/path/to/model",
quant_config=quant_config,
lora_config=lora_config
)
```
### NeMo LoRA Format
```python
# For NeMo-format LoRA checkpoints
lora_config = LoraConfig(
lora_dir=["/path/to/nemo/lora"],
lora_ckpt_source="nemo",
max_lora_rank=8
)
lora_request = LoRARequest(
"nemo-task",
0,
"/path/to/nemo/lora",
lora_ckpt_source="nemo"
)
```
### Cache Management
```python
from tensorrt_llm.llmapi.llm_args import PeftCacheConfig
# Fine-tune cache sizes
peft_cache_config = PeftCacheConfig(
host_cache_size=1024*1024*1024, # 1GB CPU cache
device_cache_percent=0.1 # 10% of free GPU memory
)
llm = LLM(
model="/path/to/model",
lora_config=lora_config,
peft_cache_config=peft_cache_config
)
```
## TRTLLM serve with LoRA
### YAML Configuration
Create an `extra_llm_api_options.yaml` file:
```yaml
lora_config:
lora_target_modules: ['attn_q', 'attn_k', 'attn_v']
max_lora_rank: 8
```
### Starting the Server
```bash
python -m tensorrt_llm.commands.serve \
    /path/to/model \
    --extra_llm_api_options extra_llm_api_options.yaml
```
### Client Usage
```python
import openai
client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
response = client.completions.create(
model="/path/to/model",
prompt="What is the capital city of France?",
max_tokens=20,
extra_body={
"lora_request": {
"lora_name": "lora-example-0",
"lora_int_id": 0,
"lora_path": "/path/to/lora_adapter"
}
},
)
```
## TRTLLM bench with LoRA
### YAML Configuration
Create an `extra_llm_api_options.yaml` file:
```yaml
lora_config:
lora_dir:
- /workspaces/tensorrt_llm/loras/0
max_lora_rank: 64
max_loras: 8
max_cpu_loras: 8
lora_target_modules:
- attn_q
- attn_k
- attn_v
trtllm_modules_to_hf_modules:
attn_q: q_proj
attn_k: k_proj
attn_v: v_proj
```
### Run trtllm-bench
```bash
trtllm-bench --model $model_path throughput --dataset $dataset_path --extra_llm_api_options extra_llm_api_options.yaml --num_requests 64 --concurrency 16
```

View File

@ -0,0 +1,53 @@
# Multimodal Support in TensorRT LLM
TensorRT LLM supports a variety of multimodal models, enabling efficient inference with inputs beyond just text.
---
## Background
Multimodal LLMs typically handle non-text inputs by combining a multimodal encoder with an LLM decoder. The encoder first transforms non-text modality input into embeddings, which are then fused with text embeddings and fed into the LLM decoder for downstream inference. Compared to standard LLM inference, multimodal LLM inference involves three additional stages to support non-text modalities.
* **Multimodal Input Processor**: Preprocess raw multimodal input into a format suitable for the multimodal encoder, such as pixel values for vision models.
* **Multimodal Encoder**: Encodes the processed input into embeddings that are aligned with the LLM's embedding space.
* **Integration with LLM Decoder**: Fuses multimodal embeddings with text embeddings as the input to the LLM decoder.
## Optimizations
TensorRT LLM incorporates some key optimizations to enhance the performance of multimodal inference:
* **In-Flight Batching**: Batches multimodal requests within the GPU executor to improve GPU utilization and throughput.
* **CPU/GPU Concurrency**: Asynchronously overlaps data preprocessing on the CPU with image encoding on the GPU.
* **Raw data hashing**: Leverages image hashes and token chunk information to improve KV cache reuse and minimize collisions.
Further optimizations are under development and will be updated as they become available.
## Model Support Matrix
Please refer to the latest multimodal [support matrix](../models/supported-models.md#multimodal-feature-support-matrix-pytorch-backend).
## Examples
The following examples demonstrate how to use TensorRT LLM's multimodal support in various scenarios, including quick run examples, serving endpoints, and performance benchmarking.
### Quick start
Quickly try out TensorRT LLM's multimodal support using our `LLM-API` and a ready-to-run [example](source:examples/llm-api/quickstart_multimodal.py):
```bash
python3 quickstart_multimodal.py --model_dir Efficient-Large-Model/NVILA-8B --modality image
```
### OpenAI-Compatible Server via [`trtllm-serve`](../../source/commands/trtllm-serve/trtllm-serve.rst)
Launch an OpenAI-compatible server with multimodal support using the `trtllm-serve` command, for example:
```bash
trtllm-serve Qwen/Qwen2-VL-7B-Instruct --backend pytorch
```
You can then send OpenAI-compatible requests, such as via curl or API clients, to the server endpoint. See [curl chat client for multimodal script](source:examples/serve/curl_chat_client_for_multimodal.sh) as an example.
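For instance, a minimal sketch of such a request using the Python OpenAI client; the port, model name, and image URL are placeholders:
```python
import openai

client = openai.OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
response = client.chat.completions.create(
    model="Qwen/Qwen2-VL-7B-Instruct",
    max_tokens=64,
    messages=[{
        "role": "user",
        "content": [
            {"type": "text", "text": "Describe this image."},
            {"type": "image_url", "image_url": {"url": "https://example.com/image.png"}},
        ],
    }],
)
print(response.choices[0].message.content)
```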
### Run with [`trtllm-bench`](../../source/commands/trtllm-bench.rst)
Evaluate offline inference performance with multimodal inputs using the `trtllm-bench` tool. For detailed instructions, see the [benchmarking guide](../../source/performance/perf-benchmarking.md).

View File

@ -0,0 +1,37 @@
# Overlap Scheduler
To maximize GPU utilization, the scheduler overlaps CPU tasks (e.g., checking sampling stop criteria, updating responses, scheduling the next batch) with GPU computation.
## How It Works
At step *n*, the system launches GPU computation for step *n+1* without waiting for CPU tasks (e.g., stop criteria checks) from step *n* to complete. This allows:
- CPU work (step *n*) and GPU computation (step *n+1*) to run concurrently.
- Better GPU occupancy by reducing idle time.
This concurrent execution pipeline is illustrated in the `PyExecutor`'s logic:
```python
# Schedule and launch GPU work for the current step (n)
scheduled_batch, _, _ = self._schedule()
batch_outputs = self._forward_step(scheduled_batch, previous_tensors_device)
sample_state = self._sample_async(scheduled_batch, batch_outputs)
# While the GPU is busy, process the CPU-bound results from the previous step (n-1)
if self.previous_batch is not None:
self._process_previous_batch()
```
## Tradeoff
The optimization introduces one extra decoding step but significantly improves throughput.
## Usage
Enabled by default. To disable, set `disable_overlap_scheduler=True` in the configuration.
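For example, a minimal sketch with the LLM API; the model path is a placeholder:
```python
from tensorrt_llm import LLM

# Overlap scheduling is on by default; disable it only when a feature
# that requires it to be off (e.g., beam search) is used.
llm = LLM(model="/path/to/model", disable_overlap_scheduler=True)
```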
## References
- [NanoFlow: Towards Optimal Large Language Model Serving Throughput](https://arxiv.org/abs/2408.12757)
- https://lmsys.org/blog/2024-12-04-sglang-v0-4/#zero-overhead-batch-scheduler

View File

@ -0,0 +1,144 @@
# Paged Attention, IFB, and Request Scheduling
## In-flight Batching
TensorRT LLM supports in-flight batching of requests (also known as continuous
batching or iteration-level batching) for higher serving throughput. With this feature,
sequences in the context phase can be processed together with sequences in the
generation phase. The purpose of that technique is to better interleave
requests to reduce latency as well as make better use of the GPUs.
For efficiency reasons (1), the support for inflight batching ***requires the
input tensors to be packed (no padding)***.
***In the current implementation, the sequences that are going through the
context phase must come before the sequences in the generation phase in the input
tensor. For example, for sequences `S0`, `S1` and `S2`, if `S0` and `S2` are in
context phase (and `S1` in generation), tokens from `S0` and `S2` must appear
before the tokens of `S1` in the input tensor***. The constraint may or may not
be relaxed in a future version.
_(1) Padding sequences in the generation phase, which contain a single token each, to the length of the maximum input sequence is an inefficient use of resources._
### `max_batch_size`, `max_seq_len` and `max_num_tokens`
<p align="center">
<img src="https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/media/max_bs_toks_len.svg?raw=true" alt="Explain `max_batch_size`, `max_seq_len` and `max_num_tokens`" width="30%" height="auto">
</p>
#### `max_batch_size`
`max_batch_size` defines the maximum number of requests that the engine can handle.
It controls the maximum number of requests that can be scheduled at runtime.
Set a sufficiently high `max_batch_size` when building the engine so that it does not become the bottleneck of the throughput, and use runtime `max_batch_size` to tune throughput or latency without rebuilding the engine.
#### `max_seq_len`
`max_seq_len` defines the maximum sequence length of a single request.
Starting from TensorRT LLM v0.11, when `--remove_input_padding` and `--context_fmha` are enabled, `max_seq_len` can replace `max_input_len` and `max_output_len`, and is set to `max_position_embeddings` by default.
Use the default `max_seq_len` (which is `max_position_embeddings`); there is no need to tune it unless you are sure about the maximum sequence lengths of your workloads. If GPU memory is so limited that it cannot support even a single request reaching `max_seq_len`, reduce it.
#### `max_num_tokens`
`max_num_tokens` defines the maximum number of batched input tokens after padding is removed in each batch.
`max_num_tokens` is set to 8192 by default starting from v0.11. You can tune it at runtime via `max_num_tokens` without rebuilding the engine. Tuning `--max_num_tokens` is recommended for better performance.
The maximum number of tokens does not take effect when input padding is not removed. When input padding is removed, the tokens from different sequences are packed together, and the maximum number of tokens can be set to a different (lower) value, which is 8192 by default.
There are two aspects that must be considered. Firstly, some input sequences
will be shorter than the maximum input length. Secondly, when in-flight
sequence batching is enabled, requests in context phase will be executed with
requests in generation phase. Those latter requests produce a lot fewer tokens
than `max_input_len` (at most, `beam_width` tokens).
Using a more realistic value for `max_num_tokens` allows TensorRT LLM to
allocate more memory to store the KV cache and execute more requests together.
It leads to an increased efficiency.
Increasing `max_num_tokens` appropriately can benefit performance. However, GPU utilization plateaus at some point; pushing `--max_num_tokens` beyond that saturation point may hurt both first-token latency and total end-to-end latency.
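As an illustration, these limits can be set when constructing the LLM; the values are placeholders, and this assumes the PyTorch-backend LLM API exposes them as constructor arguments:
```python
from tensorrt_llm import LLM

# Tune max_num_tokens first; revisit max_batch_size and max_seq_len only if
# they become the bottleneck for your workload.
llm = LLM(
    model="/path/to/model",
    max_batch_size=256,
    max_seq_len=8192,
    max_num_tokens=8192,
)
```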
## Chunked Context (a.k.a Chunked Prefill)
The original behavior was to process all context tokens at once. However, this feature splits the context into several chunks. In this way, the
context chunks can be batched with more tokens during the generation phase,
which should increase overall throughput. Chunking contexts also removes
constraints on input length. To enable this feature, the FMHA paged kv-cache also
needs to be enabled. Except for the last chunk, the size of each context chunk needs to be an integer multiple of the kv-cache block size.
## KV Cache
In the generation phase, a common optimization is to provide the MHA kernel
with a cache containing the values of the past K and V elements that have
already been computed. That cache is known as the KV cache. TensorRT LLM uses
that technique to accelerate its generation phase. In TensorRT LLM, there is
one KV cache per Transformer layer, which means that there are as many KV
caches as layers in a model. The current version of TensorRT LLM supports two
different types of KV caches: **contiguous** and **paged** KV caches.
### Contiguous KV Cache
The contiguous KV cache is a monolithic tensor. Its shape is:
```
[max_batch_size * max_beam_width, 2, num_heads, max_seqlen, hidden_dim_per_head].
```
This implementation uses much more memory than needed when sequences are shorter than the maximum sequence length; even sequences that eventually approach the limit only do so after generating many output tokens, so the full buffer is over-provisioned for most of a request's lifetime.
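For intuition, a quick back-of-the-envelope calculation of the contiguous buffer size; the model dimensions below are assumed purely for illustration:
```python
# Assumed example: batch 8, beam 1, 32 KV heads, max sequence length 4096,
# head dim 128, FP16 (2 bytes per element), 32 layers.
max_batch_size, max_beam_width = 8, 1
num_heads, max_seqlen, head_dim = 32, 4096, 128
bytes_per_elem, num_layers = 2, 32

per_layer = (max_batch_size * max_beam_width * 2  # K and V
             * num_heads * max_seqlen * head_dim * bytes_per_elem)
total_gib = per_layer * num_layers / 1024**3
print(f"{total_gib:.1f} GiB")  # 16.0 GiB reserved, even for short sequences
```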
### Paged KV Cache
The paged KV cache decomposes the KV cache into blocks that are distributed to
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocates new blocks from a pool, and recycles those blocks when required. See the simplified implementation of
[`tensorrt_llm.runtime.KVCacheManager`](source:tensorrt_llm/runtime/kv_cache_manager.py).
A more efficient C++ implementation is included in the
[Batch Manager](source:cpp/include/tensorrt_llm/batch_manager).
## The schedulers
This section visualizes how TensorRT LLM schedules requests based on max-batch size and max-num tokens. The example starts out with a newly initialized engine as well as a few unscheduled requests that have come in. For the sake of this example, toy values are set to `max batch size = 4` and `max num tokens = 12`. Each square block represents a token, and its color represents which request it belongs to.
![TRT-LLM Scheduler Visualization 1](../media/TRTLLM_Scheduler_Vis_1.svg)
Now the scheduler takes the first two requests, Request 1 and Request 2, and schedules them to execute the context phase. However, it cannot schedule any more requests because the prompts of the first two requests had 5 tokens each, leaving a budget of 2 tokens due to the max num tokens limit. Since all remaining requests have more than 2 prompt tokens, none of them can be scheduled (context chunking can help in this situation; see the paged context attention section below). The tokens are marked with a "C" to represent that they are prompt tokens that were processed in the context phase.
> Note: The tokens for different requests are shown on different rows simply for visualization purposes and are not representative of actual memory layouts
![TRT-LLM Scheduler Visualization 2](../media/TRTLLM_Scheduler_Vis_2.svg)
Now the engine runs an iteration of execution, completing the context phases for both of the scheduled requests. After it is done, the kv-cache of the prompts for both requests have been created and the first token has been generated. Tokens that were generated are marked with "G(n)" - for example a token marked "G1" represents that it is the first token generated for its request.
TRT-LLM prioritizes scheduling requests in generation phase first so the two generated tokens are queued to be processed in the next iteration. Now, since the two previously scheduled requests have entered generation phase and only take up two tokens out of the max num token budget of 12, the scheduler is able to schedule two additional requests, Request 3 and Request 4. It cannot schedule the last request, Request 5, even though there is space for it in the max num tokens budget because of the max batch size limit of 4.
![TRT-LLM Scheduler Visualization 3](../media/TRTLLM_Scheduler_Vis_3.svg)
After the next iteration of execution, the second tokens for Requests 1 and 2 have been generated, and the first tokens for Request 3 and 4 have been generated. Let's say that G2, which was generated for Request 1, is the stop token signifying that Request 1 is completed. In this case the scheduler would evict Request 1 before performing another execution iteration and prepare to return it to the user. This eviction puts the state of the engine below the max batch size limit and allows Request 5 to be scheduled.
Also note that G1, which was generated for Request 2, has been added to the kv-cache for Request 2, illustrating how the kv-cache for a request grows as more tokens are generated.
![TRT-LLM Scheduler Visualization 4](../media/TRTLLM_Scheduler_Vis_4.svg)
Overall, the max batch size and max num tokens limits play a key role in determining when requests are executed. Tuning these parameters can significantly impact throughput, as well as how the engine balances previously scheduled requests in the generation phase with new requests in the context phase.
> Note: This presents a simplified visualization of the scheduler to highlight how max batch size and max num tokens affect it. The scheduler also considers things like amount of free memory available to be used for kv-cache and has other configurable options that can affect its behavior. See the Runtime Flags of the Additional Options page for more on this.
## Revisiting Paged Context Attention and Context Chunking
[Previously](./useful-build-time-flags.md#paged-context-attention) we recommended enabling paged context attention even though in our case study it didn't affect performance significantly. Now that we understand the TensorRT LLM scheduler, we can explain why this is beneficial. In short, we recommend enabling it because it enables context chunking, which allows the context phase of a request to be broken up into pieces and processed over several execution iterations, allowing the engine to provide a more stable balance of context and generation phase execution.
The [visualization](#the-schedulers) of the TensorRT LLM scheduler showed that initially Request 3 couldn't be scheduled because it would put the scheduler over the max-num tokens limit. However, with context chunking, this is no longer the case, and the first chunk of Request 3 can be scheduled.
![TRT-LLM Scheduler Visualization Chunked Context 1](../media/TRTLLM_Scheduler_Vis_Chunked_Context_1.svg)
This is extremely beneficial for several reasons. First, it eliminates the possibility of requests with large prompts (relative to max num tokens) not being scheduled due to other requests already in-flight. In production workloads, this can help improve worst-case TTFT numbers. Second, it allows for setting smaller values of max num tokens, since you no longer need max num tokens to be at least as large as the longest prompt you want to support. For long-context cases this is extremely important, because setting extremely large values of max num tokens takes away from the memory available for KV cache. Given that chunked context has minimal impact on performance in the worst case and can significantly benefit it in many situations, NVIDIA recommends that you always enable it.

View File

@ -0,0 +1,180 @@
# Parallelism in TensorRT LLM
Parallelism across multiple GPUs becomes necessary when either
* the model cannot fit in a single GPU's memory, or
* a single GPU cannot deliver the desired performance.
TensorRT LLM supports multiple parallelism strategies for deployment on both single and multiple nodes:
* **Tensor Parallel (TP)** - Shards model weights across GPUs
* **Pipeline Parallel (PP)** - Distributes model layers across GPUs
* **Data Parallel (DP)** - Replicates model across GPUs for different requests
* **Expert Parallel (EP)** - Distributes experts across GPUs for MoE models
* **Context Parallel (CP)** - Distributes context processing across GPUs
* **Wide Expert Parallel (Wide-EP)** - Advanced EP with load balancing for large-scale MoE models
## Overview of Parallelism Strategies
### Tensor Parallelism (TP)
Tensor parallelism splits the model weights across multiple GPUs. Each GPU holds a portion of the weights and processes the same input tokens, with results combined through communication.
**Best for:** Small batch sizes, memory-constrained scenarios
### Pipeline Parallelism (PP)
Pipeline parallelism distributes different layers of the model across multiple GPUs. Each GPU processes a subset of layers, with activations passed between GPUs.
**Best for:** Large models that don't fit in single GPU memory
### Data Parallelism (DP)
Data parallelism replicates the entire model across multiple GPUs. Each GPU processes different requests independently.
**Best for:** Large batch sizes, high throughput scenarios
### Expert Parallelism (EP)
Expert parallelism is specifically designed for Mixture of Experts (MoE) models, where different experts are distributed across GPUs.
**Best for:** MoE models with high expert count
### Context Parallelism (CP)
Context parallelism distributes the processing of long sequences across multiple GPUs.
**Best for:** Long context scenarios
### Wide Expert Parallelism (Wide-EP)
Wide-EP is an advanced form of expert parallelism that addresses the inherent workload imbalance in large-scale MoE models through intelligent load balancing and expert replication.
**Best for:** Large-scale MoE models like DeepSeek-V3/R1, LLaMA4, Qwen3
## Module-level Parallelism Guide
### Attention Module
TensorRT LLM supports two strategies for attention modules:
- **Tensor Parallelism (TP)** — best for small batch sizes
- **Data Parallelism (DP)** — best for large batch sizes
#### Tensor Parallelism (TP)
* The GEMM weights before and after the attention kernel are evenly sharded across GPUs, as are the attention `num_heads`.
* Exceptions:
1. **DeepSeek-R1**: the `fused_A` GEMM is *not* sharded.
2. **GQA / MQA / MLA**: if `num_heads < tensor_parallel_size`, the KV-cache is replicated on every GPU.
#### Data Parallelism (DP)
* All GEMM weights are **replicated** on every GPU.
* The KV-cache is **partitioned**, because different user requests are routed to different DP ranks.
#### How to Enable Attention Parallelism
To deploy a model with the above parallel strategies using `trtllm-serve` or run benchmarking with `trtllm-bench`, create a YAML configuration file named `parallel_config.yaml`:
```bash
# TP-8 (attention weights and heads sharded across 8 GPUs)
cat <<EOF > parallel_config.yaml
tensor_parallel_size: 8
enable_attention_dp: false # default
EOF

# DP-8 (attention replicated, requests spread across 8 GPUs)
cat <<EOF > parallel_config.yaml
tensor_parallel_size: 8
enable_attention_dp: true
EOF
```
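The same options can also be passed directly to the LLM API for offline runs; a minimal sketch, assuming the constructor argument names match the YAML keys above:
```python
from tensorrt_llm import LLM

# DP-8 attention: 8-way mapping with attention data parallelism enabled.
llm = LLM(
    model="/path/to/model",
    tensor_parallel_size=8,
    enable_attention_dp=True,
)
```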
### FFN Module
#### Dense Models
Tensor Parallelism is supported for the FFN layers of dense models.
#### Mixture of Experts (MoE)
MoE replaces a single FFN with multiple experts. A router selects the top-k experts for each token and dispatches the corresponding hidden states.
TensorRT LLM supports three execution patterns for MoE:
* **TP** - Every expert's weight matrix is sliced across all GPUs. Each GPU sees all tokens.
* **EP** - Full weights of each expert reside on a single GPU. Each GPU only sees tokens routed to its local experts.
* **Hybrid ETP** - Each GPU stores a subset of experts (EP) and shards those weights further (TP), balancing workload and kernel efficiency.
#### How to Enable MoE Parallelism
To deploy a model with the above parallel strategies using `trtllm-serve` or run benchmarking with `trtllm-bench`, create a YAML configuration file named `parallel_config.yaml` as follows:
```bash
# TP only
cat <<EOF > parallel_config.yaml
tensor_parallel_size: 8
moe_tensor_parallel_size: 8
EOF

# EP only
cat <<EOF > parallel_config.yaml
tensor_parallel_size: 8
moe_expert_parallel_size: 8
EOF

# Hybrid (TP-4 × EP-2)
cat <<EOF > parallel_config.yaml
tensor_parallel_size: 8 # 4 × 2
moe_tensor_parallel_size: 4
moe_expert_parallel_size: 2
EOF
```
```{note}
The product of `moe_tensor_parallel_size` and `moe_expert_parallel_size` must equal `tensor_parallel_size`.
```
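Likewise, a minimal MoE sketch via the LLM API, assuming the constructor argument names match the YAML keys above:
```python
from tensorrt_llm import LLM

# EP only: every expert's full weights live on exactly one of the 8 GPUs.
llm = LLM(
    model="/path/to/moe_model",
    tensor_parallel_size=8,
    moe_expert_parallel_size=8,
)
```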
## Wide Expert Parallelism (Wide-EP)
Wide Expert Parallelism (Wide-EP) is TensorRT LLM's advanced solution for large-scale MoE model inference. It addresses the challenges of traditional expert parallelism through intelligent load balancing and expert replication strategies.
### Motivation for Wide-EP
Large-scale MoE models like DeepSeek-V3/R1, LLaMA4, and Qwen3 use fine-grained expert designs that introduce new challenges:
- **High memory demands** for expert weights
- **Inherent expert-level workload imbalance** due to sparse execution patterns
- **Communication overhead** in distributed expert parallelism
- **Hot expert problem** where certain experts receive significantly more tokens than others
### Key Features of Wide-EP
#### 1. Expert Replication and Load Balancing
Wide-EP introduces the concept of **expert slots** that are decoupled from specific experts. This allows:
- Multiple replicas of hot experts across different GPUs
- Dynamic expert placement based on workload patterns
- Both offline and online load balancing strategies
#### 2. Custom EP Communication Kernels
- Optimized for NVIDIA GB200 Multi-Node NVLink (MNNVL)
- Efficient all-to-all communication for expert dispatch and combine
- Reduced communication overhead compared to traditional EP
#### 3. Expert Parallelism Load Balancer (EPLB)
- **Offline EPLB**: Pre-computed expert placement based on historical workload statistics
- **Online EPLB**: Dynamic expert placement that adapts to real-time traffic patterns
- Layer-wise weight redistribution to minimize inference disruption
### Architecture Overview
Wide-EP separates the concepts of **experts** and **slots**:
- **Expert**: The concept from the model's perspective (e.g., Expert 0, Expert 1, etc.)
- **Slot**: The concept from the model engine's perspective (e.g., Slot 0, Slot 1, etc.)
The system maintains a routing table that maps Expert IDs to Slot IDs, which can be updated by the load balancing policy.
### Best Practices
1. **Start with offline EPLB** for production deployments with known workload patterns
2. **Use online EPLB** for dynamic workloads or when traffic patterns change frequently
3. **Monitor expert statistics** to understand workload distribution
4. **Tune max_num_tokens** based on your memory constraints and EP size
5. **Test with representative datasets** to validate load balancing effectiveness
### References
- [Technical Blog: Scaling Expert Parallelism in TensorRT LLM](https://github.com/NVIDIA/TensorRT-LLM/blob/main/docs/source/blogs/tech_blog/blog4_Scaling_Expert_Parallelism_in_TensorRT-LLM.md)
- [DeepSeek-V3 Paper](https://arxiv.org/abs/2412.19437)
- [EPLB Implementation](https://github.com/deepseek-ai/EPLB)
For detailed implementation examples and advanced usage, see:
- [`examples/wide_ep/`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/): Complete Wide-EP examples
- [`examples/wide_ep/ep_load_balancer/`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/ep_load_balancer/): Load balancing tools
- [`examples/wide_ep/slurm_scripts/`](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/wide_ep/slurm_scripts/): Cluster deployment scripts

View File

@ -0,0 +1,111 @@
# Quantization
## Quantization in TensorRT LLM
Quantization is a technique that reduces memory footprint and computational cost by converting the model's weights and/or activations from high-precision floating-point numbers (like BF16) to lower-precision data types, such as INT8, FP8, or FP4.
TensorRT LLM offers a variety of quantization recipes to optimize LLM inference. These recipes can be broadly categorized as follows:
* FP4
* FP8 Per Tensor
* FP8 Block Scaling
* FP8 Rowwise
* FP8 KV Cache
* W4A16 GPTQ
* W4A8 GPTQ
* W4A16 AWQ
* W4A8 AWQ
## Usage
The default PyTorch backend supports FP4 and FP8 quantization on the latest Blackwell and Hopper GPUs.
### Running Pre-quantized Models
TensorRT LLM can directly run [pre-quantized models](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4) generated with the [NVIDIA TensorRT Model Optimizer](https://github.com/NVIDIA/TensorRT-Model-Optimizer).
```python
from tensorrt_llm import LLM
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
llm.generate("Hello, my name is")
```
#### FP8 KV Cache
```{note}
TensorRT LLM allows you to enable the FP8 KV cache manually, even for checkpoints that do not have it enabled by default.
```
Here is an example of how to set the FP8 KV Cache option:
```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig
llm = LLM(model='/path/to/model',
kv_cache_config=KvCacheConfig(dtype='fp8'))
llm.generate("Hello, my name is")
```
### Offline Quantization with ModelOpt
If a pre-quantized model is not available on the [Hugging Face Hub](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4), you can quantize it offline using ModelOpt.
Follow this step-by-step guide to quantize a model:
```bash
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
cd TensorRT-Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
```
## Model Supported Matrix
| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ |
| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: |
| BERT | . | . | . | . | . | Y | . | . | . | . |
| DeepSeek-R1 | Y | . | . | Y | . | Y | . | . | . | . |
| EXAONE | . | . | Y | . | . | Y | Y | Y | . | . |
| Gemma 3 | . | . | Y | . | . | Y | Y | Y | . | . |
| GPT-OSS | . | Y | . | . | . | Y | . | . | . | . |
| LLaMA | Y | . | Y | . | . | Y | . | Y | . | Y |
| LLaMA-v2 | Y | . | Y | . | . | Y | Y | Y | . | Y |
| LLaMA 3 | . | . | . | . | Y | Y | Y | . | . | . |
| LLaMA 4 | Y | . | Y | . | . | Y | . | . | . | . |
| Mistral | . | . | Y | . | . | Y | . | Y | . | . |
| Mixtral | Y | . | Y | . | . | Y | . | . | . | . |
| Phi | . | . | . | . | . | Y | Y | . | . | . |
| Qwen | . | . | . | . | . | Y | Y | Y | . | Y |
| Qwen-2/2.5 | Y | . | Y | . | . | Y | Y | Y | . | Y |
| Qwen-3 | Y | . | Y | . | . | Y | . | Y | . | Y |
| BLIP2-OPT | . | . | . | . | . | Y | . | . | . | . |
| BLIP2-T5 | . | . | . | . | . | Y | . | . | . | . |
| LLaVA | . | . | Y | . | . | Y | . | Y | . | Y |
| VILA | . | . | Y | . | . | Y | . | Y | . | Y |
| Nougat | . | . | . | . | . | Y | . | . | . | . |
```{note}
The vision component of multi-modal models (BLIP2-OPT/BLIP2-T5/LLaVA/VILA/Nougat) uses FP16 by default.
The language component decides which quantization methods are supported by a given multi-modal model.
```
## Hardware Support Matrix
| Model | NVFP4 | MXFP4 | FP8(per tensor)| FP8(block scaling) | FP8(rowwise) | FP8 KV Cache |W4A8 AWQ | W4A16 AWQ | W4A8 GPTQ | W4A16 GPTQ |
| :------------- | :---: | :---: | :---: | :---: | :---: | :---: | :-------: | :-------: | :--------: | :--------: |
| Blackwell(sm120) | Y | Y | Y | . | . | Y | . | . | . | . |
| Blackwell(sm100) | Y | Y | Y | Y | . | Y | . | . | . | . |
| Hopper | . | . | Y | Y | Y | Y | Y | Y | Y | Y |
| Ada Lovelace | . | . | Y | . | . | Y | Y | Y | Y | Y |
| Ampere | . | . | . | . | . | Y | . | Y | . | Y |
```{note}
FP8 block-wise scaling GEMM kernels for sm100 use the MXFP8 recipe (E4M3 act/weight and UE8M0 act/weight scale), which is slightly different from the SM90 FP8 recipe (E4M3 act/weight and FP32 act/weight scale).
```
## Quick Links
- [Pre-quantized Models by ModelOpt](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4)
- [ModelOpt Support Matrix](https://nvidia.github.io/TensorRT-Model-Optimizer/guides/0_support_matrix.html)

View File

@ -0,0 +1,135 @@
# Sampling
The PyTorch backend supports most of the sampling features that are available in the C++ backend, such as temperature, top-k and top-p sampling, beam search, stop words, bad words, penalties, context and generation logits, log probabilities, guided decoding, and logits processors.
## General usage
To use the feature:
1. Enable the `enable_trtllm_sampler` option in the `LLM` class
2. Pass a [`SamplingParams`](../../../../tensorrt_llm/sampling_params.py#L125) object with the desired options to the `generate()` function
The following example prepares two identical prompts which will give different results due to the sampling parameters chosen:
```python
from tensorrt_llm import LLM, SamplingParams
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True)
sampling_params = SamplingParams(
temperature=1.0,
top_k=8,
top_p=0.5,
)
llm.generate(["Hello, my name is",
"Hello, my name is"], sampling_params)
```
Note: The `enable_trtllm_sampler` option is not currently supported when using speculative decoders, such as MTP or Eagle-3, so there is a smaller subset of sampling options available.
## Beam search
Beam search is a decoding strategy that maintains multiple candidate sequences (beams) during text generation, exploring different possible continuations to find higher quality outputs. Unlike greedy decoding or sampling, beam search considers multiple hypotheses simultaneously.
To enable beam search, you must:
1. Enable the `use_beam_search` option in the `SamplingParams` object
2. Set the `max_beam_width` parameter in the `LLM` class to match the `best_of` parameter in `SamplingParams`
3. Disable overlap scheduling using the `disable_overlap_scheduler` parameter of the `LLM` class
4. Disable the usage of CUDA Graphs by passing `None` to the `cuda_graph_config` parameter of the `LLM` class
Parameter Configuration:
- `best_of`: Controls the number of beams processed during generation (beam width)
- `n`: Controls the number of output sequences returned (can be less than `best_of`)
- If `best_of` is omitted, the number of beams processed defaults to `n`
- `max_beam_width` in the `LLM` class must equal `best_of` in `SamplingParams`
The following example demonstrates beam search with a beam width of 4, returning the top 3 sequences:
```python
from tensorrt_llm import LLM, SamplingParams
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
enable_trtllm_sampler=True,
max_beam_width=4, # must equal SamplingParams.best_of
disable_overlap_scheduler=True,
cuda_graph_config=None)
sampling_params = SamplingParams(
best_of=4, # must equal LLM.max_beam_width
use_beam_search=True,
n=3, # return top 3 sequences
)
llm.generate(["Hello, my name is",
"Hello, my name is"], sampling_params)
```
## Guided decoding
Guided decoding controls the generation outputs to conform to pre-defined structured formats, ensuring outputs follow specific schemas or patterns.
The PyTorch backend supports guided decoding with the XGrammar and Low-level Guidance (llguidance) backends and the following formats:
- JSON schema
- JSON object
- Regular expressions
- Extended Backus-Naur form (EBNF) grammar
- Structural tags
To enable guided decoding, you must:
1. Set the `guided_decoding_backend` parameter to `'xgrammar'` or `'llguidance'` in the `LLM` class
2. Create a [`GuidedDecodingParams`](../../../../tensorrt_llm/sampling_params.py#L14) object with the desired format specification
* Note: Depending on the type of format, a different parameter needs to be chosen to construct the object (`json`, `regex`, `grammar`, `structural_tag`).
3. Pass the `GuidedDecodingParams` object to the `guided_decoding` parameter of the `SamplingParams` object
The following example demonstrates guided decoding with a JSON schema:
```python
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import GuidedDecodingParams
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8',
guided_decoding_backend='xgrammar')
structure = '{"title": "Example JSON", "type": "object", "properties": {...}}'
guided_decoding_params = GuidedDecodingParams(json=structure)
sampling_params = SamplingParams(
guided_decoding=guided_decoding_params,
)
llm.generate("Generate a JSON response", sampling_params)
```
You can find a more detailed example on guided decoding [here](../../../../examples/llm-api/llm_guided_decoding.py).
## Logits processor
Logits processors allow you to modify the logits produced by the network before sampling, enabling custom generation behavior and constraints.
To use a custom logits processor:
1. Create a custom class that inherits from [`LogitsProcessor`](../../../../tensorrt_llm/sampling_params.py#L48) and implements the `__call__` method
2. Pass an instance of this class to the `logits_processor` parameter of `SamplingParams`
The following example demonstrates logits processing:
```python
import torch
from typing import List, Optional
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.sampling_params import LogitsProcessor
class MyCustomLogitsProcessor(LogitsProcessor):
def __call__(self,
req_id: int,
logits: torch.Tensor,
token_ids: List[List[int]],
stream_ptr: Optional[int],
client_id: Optional[int]
) -> None:
# Implement your custom inplace logits processing logic
logits *= logits
llm = LLM(model='nvidia/Llama-3.1-8B-Instruct-FP8')
sampling_params = SamplingParams(
logits_processor=MyCustomLogitsProcessor()
)
llm.generate(["Hello, my name is"], sampling_params)
```
You can find a more detailed example on logits processors [here](../../../../examples/llm-api/llm_logits_processor.py).

View File

@ -0,0 +1,241 @@
# Speculative Decoding
There are two flavors of speculative decoding currently supported in the PyTorch backend:
- The "one model" implementation -- a variant which inserts a drafter directly into the model code as a submodule.
- The "two model" implementation -- a variant which produces draft tokens in the `PyExecutor`. The draft tokens are attached to requests before they are passed
into the target model's `ModelEngine`.
In general, the one model implementation is faster. It achieves better performance in extremely low-latency scenarios because it can launch the entire drafting loop as a single CUDA graph. The trade-off is flexibility: the one model implementation does not support dynamic draft lengths, and only a subset of models/speculative decoding algorithms support it.
The table below enumerates all of the algorithm/model combinations that are supported.
| Speculative Decoding Algorithm | Model |
| ------------------------------ | ------------------------------ |
| EAGLE 3 | Llama 4 Maverick |
| MTP | Deepseek V3/R1 |
| EAGLE-style MTP | Deepseek V3/R1 |
The two model implementation supports the following speculative decoding algorithms:
| Speculative Decoding Algorithm | Model |
| --------------------------------------------- | --------------------------------------------- |
| EAGLE 3 | Llama 4 Maverick, Llama 3.1 8B, Llama 3.3 70B |
| Draft/target | All models |
| NGram | All models |
| User-provided | All models |
## Quick Start
For all speculation algorithms, when speculation is enabled, a single sequence of draft tokens of length `max_draft_len` is created for every request. There is currently no way to dynamically disable speculation, so speedups are only observable at low batch sizes.
### Draft/Target
Draft/target is the simplest form of speculative decoding. In this approach, an arbitrary draft model is used to produce draft tokens. It is important to make sure that the draft and target models were trained with the same tokenizer; otherwise, the acceptance rate will be extremely low and performance will regress.
```python
from tensorrt_llm.llmapi import DraftTargetDecodingConfig
speculative_config = DraftTargetDecodingConfig(
max_draft_len=3, speculative_model="/path/to/draft_model")
llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True)
```
### EAGLE 3
The EAGLE 3 algorithm is described in the paper [EAGLE-3: Scaling up Inference Acceleration of Large Language Models via Training-Time Test](https://arxiv.org/pdf/2503.01840).
TRT-LLM supports a modified version of the algorithm presented in the paper: tree structures for draft sequences are not supported. Instead, each request uses a single sequence of draft tokens with length `max_draft_len`.
The following draft model checkpoints can be used for EAGLE 3:
* Llama 3 variants: [use the checkpoints from the authors of the original EAGLE 3 paper](https://huggingface.co/yuhuili).
* Llama 4 Maverick: [use the checkpoint from the NVIDIA HuggingFace repository](https://huggingface.co/nvidia/Llama-4-Maverick-17B-128E-Eagle3).
```python
from tensorrt_llm.llmapi import EagleDecodingConfig
# Enable to use the faster one-model implementation for Llama 4.
eagle3_one_model = False
speculative_config = EagleDecodingConfig(
max_draft_len=3, speculative_model="/path/to/draft_model", eagle3_one_model=eagle3_one_model)
# Only need to disable overlap scheduler if eagle3_one_model is False.
llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True)
```
### NGram
The NGram method is an implementation of [this Prompt Lookup Decoding algorithm](https://github.com/apoorvumang/prompt-lookup-decoding).
When the NGram algorithm is used, TRT-LLM will maintain a map from token prefixes to candidate draft sequences. For example, the 3-gram ["The ", " future ", " is"] could map to the draft sequence [" bright", " because"]. The prefixes are token sequences that are extracted from the prompt and the tokens generated by the target model. The NGram pool and matching procedure can be tuned with the following options:
* `max_draft_len`: Maximum draft candidate length.
* `max_matching_ngram_size`: Maximum prompt suffix length to match with keys in the pool.
* `is_public_pool`: If true, a single ngram pool is shared for all requests. Otherwise, each request has its own ngram pool.
* `is_keep_all`: If true, draft candidates will be retained in the pool forever. Otherwise, only the largest draft candidate is retained.
* `is_use_oldest`: If true, the oldest draft candidate is always proposed for a given match. Otherwise, the newest draft candidate is used. Only applicable if `is_keep_all == True` because `is_keep_all == False` means we'll only ever have a single value for each key.
```python
from tensorrt_llm.llmapi import NGramDecodingConfig
speculative_config = NGramDecodingConfig(
max_draft_len=3, max_matching_ngram_size=4, is_public_pool=True)
llm = LLM("/path/to/target_model", speculative_config=speculative_config, disable_overlap_scheduler=True)
```
### MTP
MTP is currently only supported by Deepseek. MTP can be tuned with the following configuration options:
* `max_draft_len`: Maximum draft candidate length.
* `num_nextn_predict_layers`: Number of MTP modules to use. Currently must match `max_draft_len`.
* `use_relaxed_acceptance_for_thinking`: If true, use relaxed decoding for reasoning models in the thinking phase. In this mode, speculation requirements are relaxed for the thinking phase - a draft token may be accepted if it appears in a candidate set constructed with `relaxed_topk` and `relaxed_delta`.
* `relaxed_topk`: The top K tokens are sampled from the target model's logits to create the initial candidate set for relaxed decoding.
* `relaxed_delta`: Used to further filter the top K candidate set for relaxed decoding. We remove tokens `t` for which `log(P(top 1 token)) - log(P(t)) > relaxed_delta`.
```python
from tensorrt_llm.llmapi import MTPDecodingConfig
speculative_config = MTPDecodingConfig(
max_draft_len=3, num_nextn_predict_layers=3)
llm = LLM("/path/to/deepseek_model", speculative_config=speculative_config)
```
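For intuition only, the relaxed candidate set described above can be sketched as follows; this is an illustrative re-implementation of the formula, not the TensorRT-LLM code:
```python
import torch

def relaxed_candidates(target_logits: torch.Tensor, relaxed_topk: int,
                       relaxed_delta: float) -> list[int]:
    """Token ids a draft token may match during the thinking phase."""
    log_probs = torch.log_softmax(target_logits, dim=-1)
    top_logp, top_ids = torch.topk(log_probs, relaxed_topk)
    # Keep token t only if log P(top-1 token) - log P(t) <= relaxed_delta.
    keep = (top_logp[0] - top_logp) <= relaxed_delta
    return top_ids[keep].tolist()
```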
### User-provided drafting
A completely user-defined drafting method can be supplied with a `UserProvidedDecodingConfig` that includes
* `max_draft_len`: Maximum draft candidate length.
* `drafter`: An object of type `Drafter` that implements the `prepare_draft_tokens` method (see item 7 in the [Developer Guide](speculative-decoding.md#developer-guide)).
* `resource_manager`: An optional `ResourceManager` object (see item 4 in the [Developer Guide](speculative-decoding.md#developer-guide)).
```python
from tensorrt_llm.llmapi import UserProvidedDecodingConfig
speculative_config = UserProvidedDecodingConfig(
max_draft_len=3, drafter=MyDrafter())
llm = LLM("/path/to/target_model", speculative_config=speculative_config)
```
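For illustration, a minimal sketch of such a drafter is shown below. The import path and the request accessors used here are assumptions that may differ between releases; only the `prepare_draft_tokens(scheduled_requests)` contract and the `py_draft_tokens` field come from this document (see the Developer Guide below).
```python
# Assumed import path for the Drafter base class; adjust to your installed version.
from tensorrt_llm._torch.speculative.drafter import Drafter

class MyDrafter(Drafter):
    """Toy drafter that proposes the request's last generated token three times."""

    def prepare_draft_tokens(self, scheduled_requests) -> None:
        for request in scheduled_requests.generation_requests:  # assumed accessor
            last_token = request.get_last_tokens(0)  # assumed helper for the newest token
            request.py_draft_tokens = [last_token] * 3
```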
## Usage with `trtllm-bench` and `trtllm-serve`
Speculative decoding options must be specified via `--extra_llm_api_options config.yaml` for both `trtllm-bench` and `trtllm-serve`. All speculative decoding options can be specified in this YAML file. An additional `decoding_type` option is used to specify the type of speculation to use. The available options are:
* `MTP`
* `Eagle` (for EAGLE 3)
* `NGram`
* `DraftTarget`
The rest of the argument names/valid values are the same as in their corresponding configuration class described in the Quick Start section. For example, a YAML configuration could look like this:
```yaml
disable_overlap_scheduler: true
speculative_config:
decoding_type: Eagle
max_draft_len: 4
speculative_model: /path/to/draft/model
```
## Developer Guide
This section describes the components of a speculative decoding algorithm. All of the interfaces are defined in [`_torch/speculative/interface.py`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/speculative/interface.py).
1. `SpeculativeDecodingMode`: this is a simple `IntEnum`, one for each supported algorithm. There are a few
nontrivial methods, however.
- `needs_kv_cache_rewind`. See "KV Cache Rewind" below. In general, this is true for all two model speculative
decoding algorithms.
- `extend_ctx`: If true, the speculative decoding dispatches requests with `py_draft_tokens` attached to them
to the *prefill* version of the attention kernels. This usually needs to be true. The exception is when you're on
Blackwell using the TensorRT LLM attention backend. In that case, use the generation kernels for better performance.
This optimized kernel has one limitation: all draft lengths must be the same (or padding must be used).
> *These may be refactored in the future to reduce the difficulty of adding a new speculative
decoding algorithm. `extend_ctx` in particular is problematic. Ideally, we would move all of the kernel dispatching logic
to a lower level of abstraction.*
2. `SpecMetadata`: Defines all metadata that should be passed to the model during the forward pass to facilitate speculative decoding.
Each speculative decoding algorithm defines a subclass of `SpecMetadata`. Similar to `AttentionMetadata`, each `CUDAGraphRunner` owns
its own `SpecMetadata`, and CUDA-graph compatible `SpecMetadata` objects may be created by invoking `create_cuda_graph_metadata(batch_size)`.
`SpecMetadata` has many fields. Many of them are exclusively used by the one model implementation. For the two model implementation, the
main purpose of `SpecMetadata` is to facilitate the capture of hidden states. In EAGLE 3, we need to capture hidden states from the
target model to use as draft model inputs. The `SpecMetadata` stores a list of layers to capture and the model calls
`maybe_capture_hidden_states(layer_id, hidden_states, residual)` during its forward pass. If the layer ID is in the list of layers to capture,
the hidden states are saved. For CUDA graph compatibility, these may be saved in pre-allocated buffers.
`SpecMetadata` is derived from a `SpecConfig` object in `_torch/speculative/utils.py`. There are a few other optional components created in
this file too:
4. `ResourceManager`: Create a custom resource manager to prepare and free resources before and after target forward passes; see
the section on `ResourceManager` in `arch.md`. This is used by the n-gram method to manage its pool. The one model implementation also uses
`ResourceManager`s to manage hidden states.
5. `Sampler`: Each speculative decoding algorithm can optionally create its own sampler. This is mostly used by the one model implementation.
The default `TorchSampler` is used as a fallback if no custom sampler is provided. EAGLE 3 two model also has a simple custom decoder to handle
differences in the draft/target model vocab sizes.
6. `Worker`: This is exclusive to the one-model implementation. The `Worker` is the object that gets injected into the target model as a
submodule.
7. `Drafter`: All of the logic required to actually produce draft tokens should be implemented in a `Drafter` subclass. There is a single
abstract method, `prepare_draft_tokens`. It takes a set of requests (a `ScheduledRequests` object) and returns nothing. The [`PyExecutor`](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/_torch/pyexecutor/py_executor.py#L162) expects
draft tokens to be attached to the `py_draft_tokens` field of each request that speculation is to be performed for.
## Two Model Speculative Decoding Architecture
Two-model speculative decoding implementations do not support the overlap scheduler; it is disabled automatically.
In this approach, there are two new steps to the `PyExecutor`'s `_executor_loop`.
* `_prepare_draft_requests`
* `_prepare_draft_tokens`
### `_prepare_draft_requests`
This stage occurs for all speculative decoding algorithms before scheduling. The purpose
of this stage is to make the KV cache and scheduler aware of the fact that speculative decoding
will occur. Draft tokens take up extra KV cache pages and count towards the executor's
`max_num_tokens` limit. Thus, we need a way to tell the scheduler that drafting will occur
**before we do the scheduling**.
To achieve this, we simply attach the maximum number of draft tokens to each request. The
scheduler and KV cache manager will automatically account for tokens attached to the
`py_draft_tokens` attribute.
```python
for req in self.active_requests:
req.py_draft_tokens = [0] * max_draft_len
```
### `_prepare_draft_tokens`
This stage occurs after scheduling and KV cache allocation. The purpose of this stage
is to attach draft tokens to the `py_draft_tokens` attribute. This occurs by calling `self.drafter.prepare_draft_tokens`;
each speculative decoding algorithm should have a concrete instance of the `Drafter` class associated with it that defines
the drafting logic.
In addition to producing all "real" draft tokens, `_prepare_draft_tokens` currently must also pad
all `py_draft_tokens` to the maximum draft length. This is a CUDA graph limitation - the target
model captures its CUDA graphs using the maximum number of draft tokens on each request.
### Verification and Sampling
Once the draft tokens are obtained, the target model runs a forward pass through the usual flow.
Everything is the same, except that the logits for all the draft tokens are returned and passed
to the sampler.
Currently, only greedy sampling is supported for speculative decoding. A draft token is accepted if it matches the previously decoded token exactly. For example, suppose there is a generation request `[t, d1, d2, d3]`, where `d1`, `d2`, and `d3` are draft tokens. If the token predicted after `t` (determined with the `argmax` of the logits) is `d1`, then `d1` is accepted. If the token predicted after `d1` is `d2`, then `d2` is also accepted, and so on until a draft token is rejected.
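This greedy acceptance rule can be summarized with a small sketch (illustrative only, not the TensorRT-LLM implementation):
```python
def count_accepted(draft_tokens: list[int], target_tokens: list[int]) -> int:
    """target_tokens[i] is the argmax the target model produces at draft position i."""
    accepted = 0
    for draft, target in zip(draft_tokens, target_tokens):
        if draft != target:
            break
        accepted += 1
    return accepted

# d1 and d2 match the target's greedy choices, d3 does not: 2 tokens accepted.
assert count_accepted([11, 42, 7], [11, 42, 9]) == 2
```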
### KV Cache Rewind
KV cache space allocated to rejected tokens is freed before the next iteration. This is achieved by setting
the `request.py_rewind_len` attribute to `num_draft_tokens_allocated - num_accepted_tokens`. The pages are
freed as part of the `resource_manager.free_resources` routine.
The purpose of KV cache rewind is to avoid complicated page reuse logic in the KV cache manager's `prepare_resources`
function. In practice, this is very cheap since the blocks are just marked as available; no memory is actually freed.

View File

@ -1,9 +1,9 @@
.. TensorRT-LLM documentation master file, created by
.. TensorRT LLM documentation master file, created by
sphinx-quickstart on Wed Sep 20 08:35:21 2023.
You can adapt this file completely to your liking, but it should at least
contain the root `toctree` directive.
Welcome to TensorRT-LLM's Documentation!
Welcome to TensorRT LLM's Documentation!
========================================
.. toctree::
@ -13,21 +13,7 @@ Welcome to TensorRT-LLM's Documentation!
overview.md
quick-start-guide.md
key-features.md
torch.md
release-notes.md
.. toctree::
:maxdepth: 2
:caption: Installation
:name: Installation
.. installation/overview.md
installation/containers.md
installation/linux.md
installation/build-from-source-linux.md
installation/index.rst
.. toctree::
@ -35,117 +21,121 @@ Welcome to TensorRT-LLM's Documentation!
:caption: Deployment Guide
:name: Deployment Guide
deployment-guide/quick-start-recipe-for-llama4-scout-on-trtllm.md
deployment-guide/quick-start-recipe-for-deepseek-r1-on-trtllm.md
deployment-guide/quick-start-recipe-for-llama3.3-70b-on-trtllm.md
deployment-guide/quick-start-recipe-for-gpt-oss-on-trtllm.md
.. toctree::
:maxdepth: 2
:caption: LLM API
:hidden:
:glob:
llm-api/*
.. toctree::
:maxdepth: 2
:caption: Examples
:hidden:
examples/index.rst
examples/customization.md
examples/llm_api_examples
examples/llm_api_examples.rst
examples/trtllm_serve_examples
examples/dynamo_k8s_example.rst
deployment-guide/index.rst
.. toctree::
:maxdepth: 2
:caption: Models
:name: Models
models/supported-models.md
models/adding-new-model.md
.. toctree::
:maxdepth: 2
:caption: Model Definition API
:hidden:
python-api/tensorrt_llm.layers.rst
python-api/tensorrt_llm.functional.rst
python-api/tensorrt_llm.models.rst
python-api/tensorrt_llm.plugin.rst
python-api/tensorrt_llm.quantization.rst
python-api/tensorrt_llm.runtime.rst
.. toctree::
:maxdepth: 2
:caption: C++ API
:hidden:
_cpp_gen/executor.rst
_cpp_gen/runtime.rst
.. toctree::
:maxdepth: 2
:caption: Command-Line Reference
:name: Command-Line Reference
:caption: CLI Reference
:name: CLI Reference
commands/trtllm-bench
commands/trtllm-build
commands/trtllm-eval
commands/trtllm-serve/index
.. toctree::
:maxdepth: 2
:caption: Architecture
:name: Architecture
:caption: API Reference
llm-api/index.md
llm-api/reference.rst
.. toctree::
:maxdepth: 2
:caption: Features
features/feature-combination-matrix.md
features/attention.md
features/disagg-serving.md
features/kvcache.md
features/long-sequence.md
features/lora.md
features/multi-modality.md
features/overlap-scheduler.md
features/paged-attention-ifb-scheduler.md
features/parallel-strategy.md
features/quantization.md
features/sampling.md
features/speculative-decoding.md
features/checkpoint-loading.md
features/auto_deploy/auto-deploy.md
.. toctree::
:maxdepth: 2
:caption: Developer Guide
architecture/overview.md
architecture/core-concepts.md
architecture/checkpoint.md
architecture/workflow.md
architecture/add-model.md
.. toctree::
:maxdepth: 2
:caption: Advanced
:name: Advanced
advanced/gpt-attention.md
advanced/gpt-runtime.md
advanced/executor.md
advanced/graph-rewriting.md
advanced/inference-request.md
advanced/lora.md
advanced/expert-parallelism.md
advanced/kv-cache-management.md
advanced/kv-cache-reuse.md
advanced/speculative-decoding.md
advanced/disaggregated-service.md
.. toctree::
:maxdepth: 2
:caption: Performance
:name: Performance
performance/perf-overview.md
Benchmarking <performance/perf-benchmarking.md>
performance/performance-tuning-guide/index
performance/perf-analysis.md
developer-guide/perf-analysis.md
developer-guide/perf-benchmarking.md
developer-guide/ci-overview.md
developer-guide/dev-containers.md
.. toctree::
:maxdepth: 2
:caption: Reference
:name: Reference
.. .. toctree::
.. :maxdepth: 2
.. :caption: Architecture
.. :name: Architecture
reference/troubleshooting.md
reference/support-matrix.md
.. architecture/overview.md
.. architecture/core-concepts.md
.. architecture/checkpoint.md
.. architecture/workflow.md
.. architecture/add-model.md
.. reference/upgrading.md
.. .. toctree::
.. :maxdepth: 2
.. :caption: Advanced
.. :name: Advanced
reference/precision.md
reference/memory.md
reference/ci-overview.md
reference/dev-containers.md
.. advanced/gpt-attention.md
.. advanced/gpt-runtime.md
.. advanced/executor.md
.. advanced/graph-rewriting.md
.. advanced/inference-request.md
.. advanced/lora.md
.. advanced/expert-parallelism.md
.. advanced/kv-cache-management.md
.. advanced/kv-cache-reuse.md
.. advanced/speculative-decoding.md
.. advanced/disaggregated-service.md
.. .. toctree::
.. :maxdepth: 2
.. :caption: Performance
.. :name: Performance
.. performance/perf-overview.md
.. Benchmarking <performance/perf-benchmarking.md>
.. performance/performance-tuning-guide/index
.. performance/perf-analysis.md
.. .. toctree::
.. :maxdepth: 2
.. :caption: Reference
.. :name: Reference
.. reference/troubleshooting.md
.. reference/support-matrix.md
.. .. reference/upgrading.md
.. reference/precision.md
.. reference/memory.md
.. toctree::
@ -153,12 +143,21 @@ Welcome to TensorRT-LLM's Documentation!
:caption: Blogs
:glob:
blogs/H100vsA100.md
blogs/H200launch.md
blogs/Falcon180B-H200.md
blogs/quantization-in-TRT-LLM.md
blogs/XQA-kernel.md
blogs/tech_blog/*
blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md
blogs/H200launch.md
blogs/XQA-kernel.md
blogs/H100vsA100.md
.. toctree::
:maxdepth: 2
:caption: Quick Links
Releases <https://github.com/NVIDIA/TensorRT-LLM/releases>
Github Code <https://github.com/NVIDIA/TensorRT-LLM>
Roadmap <https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap>
.. toctree::
:maxdepth: 2

View File

@ -2,17 +2,17 @@
# Building from Source Code on Linux
This document provides instructions for building TensorRT-LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT-LLM wheel on PyPI. Note that the current pre-built TensorRT-LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which uses the new CXX11 ABI.
This document provides instructions for building TensorRT LLM from source code on Linux. Building from source is recommended for achieving optimal performance, enabling debugging capabilities, or when you need a different [GNU CXX11 ABI](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html) configuration than what is available in the pre-built TensorRT LLM wheel on PyPI. Note that the current pre-built TensorRT LLM wheel on PyPI is linked against PyTorch 2.7.0 and subsequent versions, which use the new CXX11 ABI.
## Prerequisites
Use [Docker](https://www.docker.com) to build and run TensorRT-LLM. Instructions to install an environment to run Docker containers for the NVIDIA platform can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
Use [Docker](https://www.docker.com) to build and run TensorRT LLM. Instructions to install an environment to run Docker containers for the NVIDIA platform can be found [here](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html).
If you intend to build any TensortRT-LLM artifacts, such as any of the container images (note that there exist pre-built [develop](#build-from-source-tip-develop-container) and [release](#build-from-source-tip-release-container) container images in NGC), or the TensorRT-LLM Python wheel, you first need to clone the TensorRT-LLM repository:
If you intend to build any TensorRT LLM artifacts, such as any of the container images (note that there exist pre-built [develop](#build-from-source-tip-develop-container) and [release](#build-from-source-tip-release-container) container images in NGC), or the TensorRT LLM Python wheel, you first need to clone the TensorRT LLM repository:
```bash
# TensorRT-LLM uses git-lfs, which needs to be installed in advance.
# TensorRT LLM uses git-lfs, which needs to be installed in advance.
apt-get update && apt-get -y install git git-lfs
git lfs install
@ -22,24 +22,24 @@ git submodule update --init --recursive
git lfs pull
```
## Building a TensorRT-LLM Docker Image
## Building a TensorRT LLM Docker Image
There are two options to create a TensorRT-LLM Docker image. The approximate disk space required to build the image is 63 GB.
There are two options to create a TensorRT LLM Docker image. The approximate disk space required to build the image is 63 GB.
### Option 1: Build TensorRT-LLM in One Step
### Option 1: Build TensorRT LLM in One Step
```{tip}
:name: build-from-source-tip-release-container
If you just want to run TensorRT-LLM, you can instead [use the pre-built TensorRT-LLM Release container images](containers).
If you just want to run TensorRT LLM, you can instead [use the pre-built TensorRT LLM Release container images](containers).
```
TensorRT-LLM contains a simple command to create a Docker image. Note that if you plan to develop on TensorRT-LLM, we recommend using [Option 2: Build TensorRT-LLM Step-By-Step](#option-2-build-tensorrt-llm-step-by-step).
TensorRT LLM contains a simple command to create a Docker image. Note that if you plan to develop on TensorRT LLM, we recommend using [Option 2: Build TensorRT LLM Step-By-Step](#option-2-build-tensorrt-llm-step-by-step).
```bash
make -C docker release_build
```
You can add the `CUDA_ARCHS="<list of architectures in CMake format>"` optional argument to specify which architectures should be supported by TensorRT-LLM. It restricts the supported GPU architectures but helps reduce compilation time:
You can add the `CUDA_ARCHS="<list of architectures in CMake format>"` optional argument to specify which architectures should be supported by TensorRT LLM. It restricts the supported GPU architectures but helps reduce compilation time:
```bash
# Restrict the compilation to Ada and Hopper architectures.
@ -52,19 +52,20 @@ After the image is built, the Docker container can be run.
make -C docker release_run
```
The `make` command supports the `LOCAL_USER=1` argument to switch to the local user account instead of `root` inside the container. The examples of TensorRT-LLM are installed in the `/app/tensorrt_llm/examples` directory.
The `make` command supports the `LOCAL_USER=1` argument to switch to the local user account instead of `root` inside the container. The TensorRT LLM examples are installed in the `/app/tensorrt_llm/examples` directory.
Since TensorRT-LLM has been built and installed, you can skip the remaining steps.
Since TensorRT LLM has been built and installed, you can skip the remaining steps.
### Option 2: Container for building TensorRT-LLM Step-by-Step
(option-2-build-tensorrt-llm-step-by-step)=
### Option 2: Container for building TensorRT LLM Step-by-Step
If you are looking for more flexibility, TensorRT-LLM has commands to create and run a development container in which TensorRT-LLM can be built.
If you are looking for more flexibility, TensorRT LLM has commands to create and run a development container in which TensorRT LLM can be built.
```{tip}
:name: build-from-source-tip-develop-container
As an alternative to building the container image following the instructions below,
you can pull a pre-built [TensorRT-LLM Develop container image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel) from NGC (see [here](containers) for information on container tags).
Follow the linked catalog entry to enter a new container based on the pre-built container image, with the TensorRT source repository mounted into it. You can then skip this section and continue straight to [building TensorRT-LLM](#build-tensorrt-llm).
you can pull a pre-built [TensorRT LLM Develop container image](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel) from NGC (see [here](containers) for information on container tags).
Follow the linked catalog entry to enter a new container based on the pre-built container image, with the TensorRT LLM source repository mounted into it. You can then skip this section and continue straight to [building TensorRT LLM](#build-tensorrt-llm).
```
**On systems with GNU `make`**
@ -110,21 +111,21 @@ Follow the linked catalog entry to enter a new container based on the pre-built
```
Note: please make sure to set `--ipc=host` as a docker run argument to avoid `Bus error (core dumped)`.
Once inside the container, follow the next steps to build TensorRT-LLM from source.
Once inside the container, follow the next steps to build TensorRT LLM from source.
### Advanced topics
For more information on building and running various TensorRT-LLM container images,
For more information on building and running various TensorRT LLM container images,
check <https://github.com/NVIDIA/TensorRT-LLM/tree/main/docker>.
## Build TensorRT-LLM
## Build TensorRT LLM
### Option 1: Full Build with C++ Compilation
The following command compiles the C++ code and packages the compiled libraries along with the Python files into a wheel. When developing C++ code, you need this full build command to apply your code changes.
```bash
# To build the TensorRT-LLM code.
# To build the TensorRT LLM code.
python3 ./scripts/build_wheel.py
```
@ -147,13 +148,13 @@ directory, add the `--clean` option:
python3 ./scripts/build_wheel.py --clean
```
It is possible to restrict the compilation of TensorRT-LLM to specific CUDA
It is possible to restrict the compilation of TensorRT LLM to specific CUDA
architectures. For that purpose, the `build_wheel.py` script accepts a
semicolon-separated list of CUDA architectures as shown in the following
example:
```bash
# Build TensorRT-LLM for Ampere.
# Build TensorRT LLM for Ampere.
python3 ./scripts/build_wheel.py --cuda_architectures "80-real;86-real"
```
@ -179,15 +180,15 @@ relevant classes. The associated unit tests should also be consulted for underst
This feature will not be enabled when [`building only the C++ runtime`](#link-with-the-tensorrt-llm-c++-runtime).
#### Linking with the TensorRT-LLM C++ Runtime
#### Linking with the TensorRT LLM C++ Runtime
The `build_wheel.py` script will also compile the library containing the C++ runtime of TensorRT-LLM. If Python support and `torch` modules are not required, the script provides the option `--cpp_only` which restricts the build to the C++ runtime only.
The `build_wheel.py` script will also compile the library containing the C++ runtime of TensorRT LLM. If Python support and `torch` modules are not required, the script provides the option `--cpp_only` which restricts the build to the C++ runtime only.
```bash
python3 ./scripts/build_wheel.py --cuda_architectures "80-real;86-real" --cpp_only --clean
```
This is particularly useful for avoiding linking issues that may arise with older versions of `torch` (prior to 2.7.0) due to the [Dual ABI support in GCC](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html). The `--clean` option removes the build directory before starting a new build. By default, TensorRT-LLM uses `cpp/build` as the build directory, but you can specify a different location with the `--build_dir` option. For a complete list of available build options, run `python3 ./scripts/build_wheel.py --help`.
This is particularly useful for avoiding linking issues that may arise with older versions of `torch` (prior to 2.7.0) due to the [Dual ABI support in GCC](https://gcc.gnu.org/onlinedocs/libstdc++/manual/using_dual_abi.html). The `--clean` option removes the build directory before starting a new build. By default, TensorRT LLM uses `cpp/build` as the build directory, but you can specify a different location with the `--build_dir` option. For a complete list of available build options, run `python3 ./scripts/build_wheel.py --help`.
The shared library can be found in the following location:
@ -203,18 +204,18 @@ cpp/build/tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so
#### Supported C++ Header Files
When using TensorRT-LLM, you need to add the `cpp` and `cpp/include` directories to the project's include paths. Only header files contained in `cpp/include` are part of the supported API and may be directly included. Other headers contained under `cpp` should not be included directly since they might change in future versions.
When using TensorRT LLM, you need to add the `cpp` and `cpp/include` directories to the project's include paths. Only header files contained in `cpp/include` are part of the supported API and may be directly included. Other headers contained under `cpp` should not be included directly since they might change in future versions.
### Option 2: Python-Only Build without C++ Compilation
If you only need to modify Python code, it is possible to package and install TensorRT-LLM without compilation.
If you only need to modify Python code, it is possible to package and install TensorRT LLM without compilation.
```bash
# Package TensorRT-LLM wheel.
# Package TensorRT LLM wheel.
TRTLLM_USE_PRECOMPILED=1 pip wheel . --no-deps --wheel-dir ./build
# Install TensorRT-LLM wheel.
# Install TensorRT LLM wheel.
pip install ./build/tensorrt_llm*.whl
```

View File

@ -1,7 +1,9 @@
(containers)=
# Pre-built release container images on NGC
Pre-built TensorRT-LLM releases are made available as container images
on NGC. This is likely the simplest way to obtain TensorRT-LLM. Please refer to the [documentation in NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) for usage instructions.
Pre-built TensorRT LLM releases are made available as container images
on NGC. This is likely the simplest way to obtain TensorRT LLM. Please refer to the [documentation in NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release) for usage instructions.
{{container_tag_admonition}}

View File

@ -0,0 +1,23 @@
.. _installation:
Installation
============
There are multiple ways to install and run TensorRT LLM. For most users, the options below are listed in order from simplest to most involved. All approaches are equivalent in terms of the supported features.
Note: **This project will download and install additional third-party open source software projects. Review the license terms of these open source projects before use.**
1. :ref:`containers`
2. Pre-built release wheels on `PyPI <https://pypi.org/project/tensorrt-llm>`_ (see :ref:`linux`)
3. :ref:`build-from-source-linux`
.. toctree::
:maxdepth: 1
:caption: Links
:hidden:
containers
linux
build-from-source-linux

View File

@ -2,7 +2,7 @@
# Installing on Linux via `pip`
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
1. Install TensorRT LLM (tested on Ubuntu 24.04).
### Install prerequisites
@ -23,14 +23,14 @@
```{tip}
Instead of manually installing the prerequisites as described
above, it is also possible to use the pre-built [TensorRT-LLM Develop container
above, it is also possible to use the pre-built [TensorRT LLM Develop container
image hosted on NGC](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/devel)
(see [here](containers) for information on container tags).
```
### Install pre-built TensorRT-LLM wheel
### Install pre-built TensorRT LLM wheel
Once all prerequisites are in place, TensorRT-LLM can be installed as follows:
Once all prerequisites are in place, TensorRT LLM can be installed as follows:
```bash
pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm
@ -46,12 +46,12 @@
**Known limitations**
There are some known limitations when you pip install pre-built TensorRT-LLM wheel package.
There are some known limitations when you pip install the pre-built TensorRT LLM wheel package.
1. MPI in the Slurm environment
If you encounter an error while running TensorRT-LLM in a Slurm-managed cluster, you need to reconfigure the MPI installation to work with Slurm.
The setup methods depends on your slurm configuration, pls check with your admin. This is not a TensorRT-LLM specific, rather a general mpi+slurm issue.
If you encounter an error while running TensorRT LLM in a Slurm-managed cluster, you need to reconfigure the MPI installation to work with Slurm.
The setup method depends on your Slurm configuration; please check with your cluster administrator. This is not a TensorRT LLM-specific issue, but rather a general MPI + Slurm issue.
```
The application appears to have been direct launched using "srun",
but OMPI was not built with SLURM support. This usually happens

View File

@ -10,7 +10,7 @@ While the LLM API simplifies inference workflows with a high-level interface, it
## Quick Start Example
A simple inference example with TinyLlama using the LLM API:
```{literalinclude} ../../examples/llm-api/quickstart_example.py
```{literalinclude} ../../../examples/llm-api/quickstart_example.py
:language: python
:linenos:
```
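For reference, a minimal sketch of what such a quickstart looks like with the LLM API is shown below; the file included above is authoritative, and the model name and sampling settings here are placeholders:

```python
from tensorrt_llm import LLM, SamplingParams

def main():
    # Placeholder checkpoint; the bundled quickstart example may use a different one.
    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")

    prompts = ["Hello, my name is", "The capital of France is"]
    sampling_params = SamplingParams(temperature=0.8, top_p=0.95)

    # generate() returns one result per prompt, each carrying the generated text.
    for output in llm.generate(prompts, sampling_params):
        print(f"Prompt: {output.prompt!r} -> {output.outputs[0].text!r}")

if __name__ == "__main__":
    main()
```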

Some files were not shown because too many files have changed in this diff.