Update TensorRT-LLM (#2389)

* Update TensorRT-LLM

---------

Co-authored-by: Alessio Netti <netti.alessio@gmail.com>
Kaiyu Xie 2024-10-29 22:24:38 +08:00 committed by GitHub
parent 3c46c2794e
commit f14d1d433c
197 changed files with 4711 additions and 811 deletions

2
.gitmodules vendored
View File

@ -13,7 +13,7 @@
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git
url = https://github.com/rapidsai/ucxx.git
[submodule "3rdparty/pybind11"]
path = 3rdparty/pybind11
url = https://github.com/pybind/pybind11.git

2
3rdparty/ucxx vendored

@ -1 +1 @@
Subproject commit b99181779672965c6f325a95a29eb433b6e9cbbd
Subproject commit 5c745102c26df11e68f11368bcd9649e81e981da

View File

@ -6,8 +6,8 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![cuda](https://img.shields.io/badge/cuda-12.6.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.5.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

View File

@ -7,5 +7,6 @@ There are currently three workflows to benchmark TensorRT-LLM:
- The recommended workflow that uses the TensorRT-LLM C++ API and can take advantage of the latest features of TensorRT-LLM.
* [Python benchmarks](./python)
- The Python benchmarking scripts can only benchmark the Python runtime, which does not support the latest features, such as in-flight batching.
* [The Python benchmarking suite](./Suite.md)
- This benchmarking suite is a current work in progress and is prone to large changes.
* [The Python benchmarking suite](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html)
- This benchmarking suite is native to TensorRT-LLM and provides a Python workflow for reproducing and testing TensorRT-LLM performance.
- _NOTE_: This benchmarking suite is a work in progress and is subject to large changes.

View File

@ -145,6 +145,7 @@ struct BenchmarkParams
{
std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt};
std::optional<float> freeGpuMemoryFraction{std::nullopt};
std::optional<float> crossKvCacheFraction{std::nullopt};
bool enableTrtOverlap{false};
bool enableBlockReuse{false};
bool enableChunkedContext{false};
@ -882,7 +883,8 @@ public:
texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
benchmarkParams.crossKvCacheFraction);
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
std::nullopt, benchmarkParams.loraHostCacheSize);
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
@ -1486,6 +1488,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
std::nullopt, // pTuning
loraConfig, // loraConfig
lookaheadConfig, // lookaheadConfig
std::nullopt, // kvCacheRetentionConfig
std::nullopt, // logitsPostProcessorName
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
}
@ -1509,6 +1512,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
{
optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
}
if (benchmarkParams.crossKvCacheFraction)
{
optionalParams.kvCacheConfig.crossKvCacheFraction = benchmarkParams.crossKvCacheFraction;
}
if (benchmarkParams.maxAttentionWindowVec)
{
optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
@ -1953,6 +1960,8 @@ int main(int argc, char* argv[])
"random_seed", "integer random seed for exponential time delays.", cxxopts::value<int>()->default_value("420"));
options.add_options()(
"kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
options.add_options()(
"cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value<float>());
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
@ -2126,6 +2135,20 @@ int main(int argc, char* argv[])
{
benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
}
// Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
{
if (result.count("cross_kv_cache_fraction"))
{
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
}
else
{
benchmarkParams.crossKvCacheFraction
= 0.5f; // Default value if not set; non enc-dec models should not have this param set at all
}
}
// Argument: Enable TRT overlap
benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
@ -2342,14 +2365,14 @@ int main(int argc, char* argv[])
{
texec::ModelType executorModelType;
std::optional<std::string> decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt;
if (result.count("encoder_engine_dir") && result.count("engine_dir"))
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
{
TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
TLLM_CHECK_WITH_INFO(
modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
executorModelType = texec::ModelType::kENCODER_DECODER;
decoderEngineDir = result["engine_dir"].as<std::string>();
encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
}
else if (result.count("engine_dir"))
{

View File

@ -30,45 +30,47 @@ class BaseEvictionPolicy
public:
virtual ~BaseEvictionPolicy() = default;
virtual void initialize(
std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks)
// TODO(TRTLLM-1564): Don't use a separate `initialize` function. Ensure eviction policies can't be in-between a
// state of construction and initialization.
virtual void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt)
= 0;
// Get a free block from the primary memory pool
virtual BlockPtr getFreePrimaryBlock() = 0;
// Get a free block from the secondary memory pool
virtual BlockPtr getFreeSecondaryBlock() = 0;
// Release a block. Prioritize the block for eviction if toFront=true
/// @brief Get a free block from the specified cache level
/// @returns The pointer to the free block, along with whether it can be offloaded
virtual std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) = 0;
/// @brief Release a block. Prioritize the block for eviction if toFront=true
virtual void releaseBlock(BlockPtr block, bool toFront = false) = 0;
// Get the amount of free blocks in the primary memory pool
virtual SizeType32 getNumFreePrimaryBlocks() = 0;
// Get the amount of free blocks in the secondary memory pool
virtual SizeType32 getNumFreeSecondaryBlocks() = 0;
// Claim a free block. Called when the cache manager allocates or reuses a new block
virtual void claimBlock(KVCacheBlock block) = 0;
/// @brief Get the amount of free blocks in the primary memory pool
virtual SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) = 0;
/// @brief Claim a free block. Called when the cache manager allocates or reuses a new block
virtual void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt) = 0;
};
class LRUEvictionPolicy : public BaseEvictionPolicy
{
public:
void initialize(
std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) override;
BlockPtr getFreePrimaryBlock() override;
BlockPtr getFreeSecondaryBlock() override;
void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt) override;
std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) override;
void releaseBlock(BlockPtr block, bool toFront = false) override;
SizeType32 getNumFreePrimaryBlocks() override;
SizeType32 getNumFreeSecondaryBlocks() override;
void claimBlock(KVCacheBlock block);
SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) override;
void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt) override;
private:
FreeBlocksQueue mFreePrimaryBlocks;
FreeBlocksQueue mFreeSecondaryBlocks;
// Check if the block should be added to mFreeQueues.
bool isReleasedLeafBlock(BlockPtr block);
// Queues of available leaf blocks, split by cache level and priority level
std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
// All blocks that have been released, along with the amount of released children
std::vector<std::unordered_set<SizeType32>> mReleasedBlocks;
// Iterators to block entries in mFreeQueues
std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
SizeType32 mFreePrimaryBlocksSize;
SizeType32 mFreeSecondaryBlocksSize;
// Amount of free blocks at each cache level
std::vector<SizeType32> mNumFreeBlocksPerLevel;
// Secondary offload threshold. Blocks below this priority won't be offloaded to secondary memory.
executor::RetentionPriority mSecondaryOffloadMinPriority;
};
} // namespace tensorrt_llm::batch_manager::eviction_policy
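
The refactored interface replaces the fixed primary/secondary queue pair with per-cache-level, per-priority free queues. Below is a minimal standalone sketch of the lookup this layout implies; it uses hypothetical simplified types and assumes the offload-eligibility check documented for secondaryOffloadMinPriority at the executor level, so it is an illustration, not the library's implementation.

#include <cstddef>
#include <deque>
#include <memory>
#include <tuple>
#include <vector>

// Hypothetical, simplified stand-ins for the real block/queue types; this mirrors the
// mFreeQueues[cacheLevel][priority] layout only.
struct Block
{
    int id;
};
using BlockPtr = std::shared_ptr<Block>;
using FreeQueue = std::deque<BlockPtr>;

std::tuple<BlockPtr, bool> getFreeBlockSketch(
    std::vector<std::vector<FreeQueue>>& freeQueues, int cacheLevel, int secondaryOffloadMinPriority)
{
    auto& queuesByPriority = freeQueues[cacheLevel];
    // Scan priority buckets from lowest to highest: lower-priority blocks are evicted first.
    for (std::size_t priority = 0; priority < queuesByPriority.size(); ++priority)
    {
        if (!queuesByPriority[priority].empty())
        {
            BlockPtr block = queuesByPriority[priority].front();
            queuesByPriority[priority].pop_front();
            // Assumed rule: only blocks above the threshold may be offloaded to secondary
            // memory; lower-priority blocks are simply evicted.
            bool const canOffload = static_cast<int>(priority) > secondaryOffloadMinPriority;
            return {block, canOffload};
        }
    }
    return {nullptr, false};
}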

View File

@ -41,7 +41,9 @@ public:
std::optional<std::vector<SizeType32>> maxAttentionWindowVec = std::nullopt,
std::optional<SizeType32> sinkTokenLength = std::nullopt,
std::optional<float> freeGpuMemoryFraction = std::nullopt, bool enableBlockReuse = false, bool useUvm = false,
std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true)
std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true,
std::optional<float> crossKvCacheFraction = std::nullopt,
std::optional<SizeType32> secondaryOffloadMinPriority = std::nullopt)
: maxTokens{maxTokens}
, maxAttentionWindowVec{maxAttentionWindowVec}
, sinkTokenLength{sinkTokenLength}
@ -50,6 +52,8 @@ public:
, useUvm(useUvm)
, hostCacheSize(hostCacheSize)
, onboardBlocks(onboardBlocks)
, crossKvCacheFraction{crossKvCacheFraction}
, secondaryOffloadMinPriority(secondaryOffloadMinPriority)
{
}
@ -57,7 +61,7 @@ public:
: KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindowVec(),
kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(),
kvCacheConfig.getEnableBlockReuse(), false, kvCacheConfig.getHostCacheSize(),
kvCacheConfig.getOnboardBlocks())
kvCacheConfig.getOnboardBlocks(), kvCacheConfig.getCrossKvCacheFraction())
{
}
@ -66,7 +70,8 @@ public:
return maxTokens == other.maxTokens && maxAttentionWindowVec == other.maxAttentionWindowVec
&& sinkTokenLength == other.sinkTokenLength && freeGpuMemoryFraction == other.freeGpuMemoryFraction
&& enableBlockReuse == other.enableBlockReuse && useUvm == other.useUvm
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks
&& crossKvCacheFraction == other.crossKvCacheFraction;
}
friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
@ -80,5 +85,9 @@ public:
bool useUvm;
std::optional<size_t> hostCacheSize;
bool onboardBlocks;
// Cross attention will use crossKvCacheFraction of the KV cache and self attention will use the rest.
std::optional<float> crossKvCacheFraction;
// The minimum priority level to allow blocks to be offloaded to secondary memory.
std::optional<SizeType32> secondaryOffloadMinPriority;
};
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
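
A minimal usage sketch of the extended batch-manager KvCacheConfig, assuming an encoder-decoder setup that reserves half of the KV cache memory for cross attention; the include path and the concrete values are illustrative, not defaults introduced by this change.

#include <optional>
#include "tensorrt_llm/batch_manager/kvCacheConfig.h" // include path assumed

using tensorrt_llm::batch_manager::kv_cache_manager::KvCacheConfig;

KvCacheConfig const kvCacheConfig(
    /*maxTokens=*/std::nullopt,
    /*maxAttentionWindowVec=*/std::nullopt,
    /*sinkTokenLength=*/std::nullopt,
    /*freeGpuMemoryFraction=*/0.9f,
    /*enableBlockReuse=*/true,
    /*useUvm=*/false,
    /*hostCacheSize=*/std::nullopt,
    /*onboardBlocks=*/true,
    /*crossKvCacheFraction=*/0.5f);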

View File

@ -60,12 +60,13 @@ using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
struct BlockKey
{
bool hasLora;
LoraTaskIdType loraTaskId;
VecUniqueTokens uniqueTokens;
bool operator==(BlockKey const& other) const noexcept
{
return (loraTaskId == other.loraTaskId && uniqueTokens == other.uniqueTokens);
return (hasLora == other.hasLora && loraTaskId == other.loraTaskId && uniqueTokens == other.uniqueTokens);
}
};
@ -97,8 +98,14 @@ struct BlockKeyHasher
c = (c ^ (c >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
c = (c ^ (c >> 27)) * UINT64_C(0x94d049bb133111eb);
c = c ^ (c >> 31);
seed ^= c + 0x9e3779b9 + (seed << 6) + (seed >> 2);
uint32_t d = static_cast<uint32_t>(blockKey.hasLora);
d = ((d >> 16) ^ d) * 0x45d9f3b;
d = ((d >> 16) ^ d) * 0x45d9f3b;
d = (d >> 16) ^ d;
seed ^= d + 0x9e3779b9 + (seed << 6) + (seed >> 2);
return seed;
}
};
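
The mixing above follows the familiar boost-style hash_combine pattern after first scrambling each field. A minimal standalone sketch of the combine step, shown for clarity and not part of the TensorRT-LLM API:

#include <cstdint>

// Fold a pre-scrambled value into an accumulator seed; the golden-ratio constant and the
// two shifts spread the incoming bits across the seed (same pattern as the hasher above).
inline uint64_t combineHash(uint64_t seed, uint64_t scrambledValue)
{
    return seed ^ (scrambledValue + 0x9e3779b9ULL + (seed << 6) + (seed >> 2));
}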
@ -107,13 +114,24 @@ using NextBlockMap = std::unordered_map<BlockKey, BlockPtr, BlockKeyHasher>;
struct KvCacheStats
{
// Number of maximum available blocks in the primary memory pool. This is determined and set by available primary
// memory. See calculateMaxNumBlocks for details.
SizeType32 maxNumBlocks;
// Number of free blocks in the primary memory pool.
SizeType32 freeNumBlocks;
// Number of used blocks in the primary memory pool. usedNumBlocks = maxNumBlocks - freeNumBlocks.
SizeType32 usedNumBlocks;
SizeType32 toksPerBlock;
// Total number of blocks allocated by all requests.
SizeType32 allocTotalBlocks;
// Number of new blocks that were allocated.
SizeType32 allocNewBlocks;
// Number of blocks that were matched and reused.
SizeType32 reusedBlocks;
// Number of blocks that were not matched and not reused.
SizeType32 missedBlocks;
// Measures the KV cache reuse rate: cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
float cacheHitRate;
};
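
As a quick worked example of the new fields (numbers are illustrative): a run that reuses 30 blocks and misses 10 reports cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks) = 30 / (30 + 10) = 0.75, while a run with no reuse reports 0 by convention.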
// Basic building block of a paged KV cache - a single
@ -152,6 +170,8 @@ public:
[[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;
BlockPtr getPrevBlock() const;
void setPrevBlock(BlockPtr prevBlock);
void addNextBlock(BlockKey const& blockKey, BlockPtr block);
@ -167,6 +187,10 @@ public:
[[nodiscard]] bool isShared() const;
void setPriority(executor::RetentionPriority priority);
executor::RetentionPriority getPriority() const;
private:
// Linear ID of block independent of pool
IdType mBlockId;
@ -195,6 +219,9 @@ private:
// Flag indicating if block is full
bool mIsFull;
// Priority of the block
executor::RetentionPriority mPriority;
};
class GenerationRequest
@ -203,14 +230,17 @@ public:
using SizeType32 = tensorrt_llm::runtime::SizeType32;
explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth,
SizeType32 maxBlocks, SizeType32 numPools = 1)
SizeType32 maxBlocks, SizeType32 numPools = 1,
executor::RetentionPriority decodeRetentionPriority
= executor::KvCacheRetentionConfig::kDefaultRetentionPriority)
: mRequestId(requestId)
, mNumTokens(numTokens)
, mBeamWidth(beamWidth)
, mCacheBlockIds(beamWidth)
, mCacheBlockIndices{
runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
, mCacheBlockIndices{runtime::BufferManager::cpu(
runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
, mDecodeRetentionPriority(decodeRetentionPriority)
{
auto cacheBlockIdsRange = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices);
std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(),
@ -286,6 +316,11 @@ public:
}
}
[[nodiscard]] executor::RetentionPriority getDecodeRetentionPriority() const
{
return mDecodeRetentionPriority;
}
private:
// Request id of the sequence
LlmRequest::RequestIdType mRequestId;
@ -297,6 +332,8 @@ private:
std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds;
// Tensor of block indices allocated for each beam of the sequence
runtime::ITensor::SharedPtr mCacheBlockIndices;
// The retention priority to assign to decode blocks
executor::RetentionPriority mDecodeRetentionPriority;
};
// attach metadata to a pool pointer
@ -344,7 +381,8 @@ public:
explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead,
SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
CacheType cacheType = CacheType::kSELF);
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
~BlockManager();
@ -396,6 +434,11 @@ public:
return getMaxNumBlocks() - getNumFreeBlocks();
}
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
{
return mMissedBlocks;
}
[[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept
{
return getNumFreeBlocks() >= numRequired;
@ -494,20 +537,23 @@ private:
//! \brief Store blocks in cached blocks.
//! \param blockKeys Key of each block.
//! \param blockIds Id of each block.
void storeBlocks(std::list<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);
//! \param isChunkedContext Whether these blocks are being stored for chunked context.
void storeBlocks(std::list<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
bool isChunkedContext = false);
//! \brief Try to load blocks from cache. Allocate new blocks if necessary.
//! \param blockKeys Key of each block.
//! \param sequence Sequence to which blocks are assigned.
//! \return Number of matched tokens from loaded blocks.
SizeType32 loadOrAllocateBlocks(
std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence);
SizeType32 loadOrAllocateBlocks(std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
GenerationRequest& sequence, std::vector<std::optional<executor::RetentionPriority>> blockPriorities);
//! \brief Find block least likely to be reused, free it if necessary and return.
[[nodiscard]] BlockPtr getFreeBlock();
[[nodiscard]] BlockPtr getFreeBlock(
executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority);
//! \brief Free block from previous block and claim it from free blocks list.
void claimLeafBlock(KVCacheBlock& block);
void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt);
//! \brief Compute pointer to raw KV block (K & V, all layers).
[[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
@ -546,13 +592,24 @@ private:
std::vector<BlockPtr> mAllBlocksById;
// Dummy block acting as root for BlockToken searches
BlockPtr mCachedBlocksRoot;
// Statistics for block allocations/reuse
std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks;
// KV cache type (self or cross)
CacheType mCacheType;
// Eviction Policy
std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
// Statistics for block allocations/reuse
// Total number of blocks allocated by all requests
SizeType32 mAllocTotalBlocks;
// Number of new blocks that were allocated
SizeType32 mAllocNewBlocks;
// Number of blocks that were reused
SizeType32 mReusedBlocks;
// Number of unique blocks that were reused
SizeType32 mReusedUniqueBlocks;
// Number of blocks that were not reused
SizeType32 mMissedBlocks;
std::set<KVCacheBlock::IdType> reusedBlockIds;
private:
friend class KVCacheManager;
};
@ -570,13 +627,15 @@ public:
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
CudaStreamPtr stream, bool enableBlockReuse = false, bool onboardBlocks = true,
CacheType cacheType = CacheType::kSELF);
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
CudaStreamPtr stream, bool enableBlockReuse = true, bool onboardBlocks = true,
CacheType cacheType = CacheType::kSELF);
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
void allocatePools(nvinfer1::DataType dtype, bool useUvm = false);
@ -617,6 +676,11 @@ public:
return mBlockManager.getNumReusedBlocks();
}
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
{
return mBlockManager.getNumMissedBlocks();
}
[[nodiscard]] KvCacheStats getKvCacheStats() const
{
KvCacheStats kvCacheStats;
@ -627,7 +691,10 @@ public:
kvCacheStats.allocTotalBlocks = getNumAllocTotalBlocks();
kvCacheStats.allocNewBlocks = getNumAllocNewBlocks();
kvCacheStats.reusedBlocks = getNumReusedBlocks();
kvCacheStats.missedBlocks = getNumMissedBlocks();
kvCacheStats.cacheHitRate = kvCacheStats.reusedBlocks == 0 ? 0
: static_cast<float>(kvCacheStats.reusedBlocks)
/ static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
return kvCacheStats;
}
@ -689,14 +756,14 @@ public:
runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const;
// Sum of numLayers * 2 * numKvHeads * sizePerHead for each pool
[[nodiscard]] static SizeType32 calculateCacheSizePerToken(
tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig)
[[nodiscard]] static SizeType32 calculateCacheSizePerToken(tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, bool isCrossAttention = false)
{
// NOTE: We expect the initialization of modelConfig to have already taken the tp size into account and do not
// address it here
// consider only local layers for the calculation
return modelConfig.getSumLocalKvHeads(
worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank())
worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank(), isCrossAttention)
* 2 * modelConfig.getSizePerHead();
}
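
For a sense of scale (illustrative numbers): a rank that locally holds 32 attention layers with 8 KV heads each and sizePerHead = 128 gives getSumLocalKvHeads = 256, so calculateCacheSizePerToken returns 256 * 2 * 128 = 65536 cache elements per token; passing isCrossAttention = true performs the same sum over the cross attention layers instead.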

View File

@ -95,8 +95,9 @@ public:
std::optional<SizeType32> promptVocabSize = std::nullopt,
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
std::optional<TensorPtr> loraConfig = std::nullopt,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
@ -134,6 +135,7 @@ public:
, mLoraWeights(std::move(loraWeights))
, mLoraConfig(std::move(loraConfig))
, mLookaheadConfig(std::move(lookaheadConfig))
, mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
, mContextChunkSize{mPromptLen}
, mLogProbs(samplingConfig.beamWidth)
, mCumLogProbs(samplingConfig.beamWidth)
@ -188,6 +190,7 @@ public:
, mLoraWeights(std::nullopt)
, mLoraConfig(std::nullopt)
, mLookaheadConfig(std::nullopt)
, mKvCacheRetentionConfig(std::nullopt)
, mContextChunkSize{mPromptLen}
, mLogProbs(mSamplingConfig.beamWidth)
, mCumLogProbs(mSamplingConfig.beamWidth)
@ -827,6 +830,16 @@ public:
mLookaheadConfig = config;
}
[[nodiscard]] std::optional<executor::KvCacheRetentionConfig> getKvCacheRetentionConfig() const
{
return mKvCacheRetentionConfig;
}
void setKvCacheRetentionConfig(executor::KvCacheRetentionConfig config)
{
mKvCacheRetentionConfig = config;
}
void clearLookaheadConfig()
{
mLookaheadConfig = std::nullopt;
@ -1502,6 +1515,23 @@ public:
return mReusedBlocksPerRequest;
}
void updateMissedBlocksPerRequest(SizeType32 missedBlocksPerRequest)
{
mMissedBlocksPerRequest += missedBlocksPerRequest;
}
[[nodiscard]] SizeType32 getMissedBlocksPerRequest() const
{
return mMissedBlocksPerRequest;
}
[[nodiscard]] float getKVCacheHitRatePerRequest() const
{
return mReusedBlocksPerRequest == 0 ? 0
: static_cast<float>(mReusedBlocksPerRequest)
/ (static_cast<float>(mReusedBlocksPerRequest + mMissedBlocksPerRequest));
}
RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
@ -1552,6 +1582,7 @@ protected:
std::optional<TensorPtr> mLoraConfig;
std::optional<executor::LookaheadDecodingConfig> mLookaheadConfig;
std::optional<executor::KvCacheRetentionConfig> mKvCacheRetentionConfig;
// To enable chunked context, the FHMA paged kv-cache also needs to be enabled. Except for the last one,
// the size of the context chunk needs to be an integer multiple of the kv-cache block size. The meaning
// of null value is that the context is not chunked.
@ -1613,6 +1644,7 @@ protected:
SizeType32 mAllocTotalBlocksPerRequest{0};
SizeType32 mAllocNewBlocksPerRequest{0};
SizeType32 mReusedBlocksPerRequest{0};
SizeType32 mMissedBlocksPerRequest{0};
private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
@ -1757,8 +1789,9 @@ public:
std::optional<SizeType32> promptVocabSize = std::nullopt,
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
std::optional<TensorPtr> loraConfig = std::nullopt,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
@ -1775,11 +1808,11 @@ public:
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
std::move(lookaheadConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
returnGenerationLogits, std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput,
std::move(logitsPostProcessor), applyLogitsPostProcessorBatched, std::move(encoderInputTokens),
returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures), std::move(encoderOutputLength),
std::move(crossAttentionMask), llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
{
}
@ -1791,6 +1824,7 @@ public:
mLogitsPostProcessor = std::move(logitsPostProcessor);
mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched;
mLookaheadConfig = request.getLookaheadConfig();
mKvCacheRetentionConfig = request.getKvCacheRetentionConfig();
}
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)

View File

@ -1,13 +1,17 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

View File

@ -28,6 +28,7 @@
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
namespace tensorrt_llm::mpi
@ -348,6 +349,71 @@ public:
bool fastLogits;
};
using RetentionPriority = SizeType32;
/// @brief Configuration for the request's retention in the KV Cache
class KvCacheRetentionConfig
{
public:
static constexpr RetentionPriority kMinRetentionPriority = 0;
static constexpr RetentionPriority kMaxRetentionPriority = 100;
static constexpr RetentionPriority kDefaultRetentionPriority = 35;
/// @brief A single entry to set block priorities over a token range. Earlier ranges always take priority over later
/// ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.
struct TokenRangeRetentionPriority
{
public:
explicit TokenRangeRetentionPriority(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt,
RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority)
: tokenStart{tokenStart}
, tokenEnd{tokenEnd}
, priority{priority}
{
TLLM_CHECK_WITH_INFO(priority >= KvCacheRetentionConfig::kMinRetentionPriority
&& priority <= KvCacheRetentionConfig::kMaxRetentionPriority,
"Invalid priority value. Must be between %d and %d", KvCacheRetentionConfig::kMinRetentionPriority,
KvCacheRetentionConfig::kMaxRetentionPriority);
};
/// @brief The first token of this range.
SizeType32 tokenStart;
/// @brief The final token of this range. The end is not included in the range. This can be set to std::nullopt
/// to extend the range to the end of the sequence.
std::optional<SizeType32> tokenEnd;
/// @brief The priority of this token range. Higher priorities are less likely to be evicted or offloaded.
RetentionPriority priority;
bool operator==(TokenRangeRetentionPriority const& other) const
{
return tokenStart == other.tokenStart && tokenEnd == other.tokenEnd && priority == other.priority;
}
};
explicit KvCacheRetentionConfig()
: KvCacheRetentionConfig({}, kDefaultRetentionPriority)
{
}
KvCacheRetentionConfig(std::vector<TokenRangeRetentionPriority> const& tokenRangeRetentionPriorities,
RetentionPriority decodeRetentionPriority);
[[nodiscard]] std::vector<TokenRangeRetentionPriority> getTokenRangeRetentionPriorities() const;
[[nodiscard]] RetentionPriority getDecodeRetentionPriority() const;
/// @brief Convert the token range data into an entry per kv cache block for a given seqLen
std::vector<std::optional<RetentionPriority>> getPerBlockEvictionPolicy(SizeType32 blockSize, SizeType32 seqLen);
private:
/// @brief The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64),
/// (100, 128), (70, 80)] is valid, whereas
/// [(0, 64), (60, 128)] is not.
std::vector<TokenRangeRetentionPriority> mTokenRangeRetentionPriorities;
/// @brief The priority level to assign to blocks allocated in the decode phase
RetentionPriority mDecodeRetentionPriority;
};
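
A minimal usage sketch of the new retention configuration, assuming a request whose first 64 prompt tokens should be kept at high priority while the rest of the prompt and the decode-phase blocks are cheap to evict; the concrete priority values are illustrative within the documented [0, 100] range.

#include "tensorrt_llm/executor/executor.h"

using namespace tensorrt_llm::executor;

KvCacheRetentionConfig const retentionConfig(
    {KvCacheRetentionConfig::TokenRangeRetentionPriority(/*tokenStart=*/0, /*tokenEnd=*/64, /*priority=*/90),
        KvCacheRetentionConfig::TokenRangeRetentionPriority(
            /*tokenStart=*/64, /*tokenEnd=*/std::nullopt, KvCacheRetentionConfig::kMinRetentionPriority)},
    /*decodeRetentionPriority=*/10);

getPerBlockEvictionPolicy then expands these ranges into one optional priority per KV cache block, so with a block size of 16 the first four blocks of this request would carry priority 90.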
/// @brief A class that holds information about the request
class Request
{
@ -371,6 +437,7 @@ public:
/// @param pTuningConfig The prompt tuning configuration
/// @param loraConfig The LoRA configuration
/// @param kvCacheRetentionConfig The configuration used for KV cache block eviction.
/// @param logitsPostProcessorName The logits postprocessor name. Must correspond to one of the logits postprocessor
/// names provided to the ExecutorConfig.
/// @param encoderInputTokenIds The encoder input token ids for encoder-decoder models, or encoder-only models
/// @param returnAllGeneratedTokens Indicates whether to return the full beams or just the newly generated tokens
@ -394,6 +461,7 @@ public:
std::optional<PromptTuningConfig> pTuningConfig = std::nullopt,
std::optional<LoraConfig> loraConfig = std::nullopt,
std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
std::optional<std::string> logitsPostProcessorName = std::nullopt,
std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt,
bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority,
@ -428,6 +496,7 @@ public:
[[nodiscard]] std::optional<PromptTuningConfig> getPromptTuningConfig() const;
[[nodiscard]] std::optional<LoraConfig> getLoraConfig() const;
[[nodiscard]] std::optional<LookaheadDecodingConfig> getLookaheadConfig() const;
[[nodiscard]] std::optional<KvCacheRetentionConfig> getKvCacheRetentionConfig() const;
[[nodiscard]] std::optional<std::string> getLogitsPostProcessorName() const;
[[nodiscard]] std::optional<VecTokens> getEncoderInputTokenIds() const;
[[nodiscard]] std::optional<IdType> getClientId() const;
@ -453,6 +522,7 @@ public:
void setPromptTuningConfig(PromptTuningConfig const& pTuningConfig);
void setLoraConfig(LoraConfig const& loraConfig);
void setLookaheadConfig(LookaheadDecodingConfig const& lookaheadConfig);
void setKvCacheRetentionConfig(KvCacheRetentionConfig const& kvCacheRetentionConfig);
void setLogitsPostProcessorName(std::string const& logitsPostProcessorName);
void setEncoderInputTokenIds(VecTokens const& encoderInputTokenIds);
void setClientId(IdType clientId);
@ -598,23 +668,29 @@ public:
std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true);
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
[[nodiscard]] bool getEnableBlockReuse() const;
[[nodiscard]] std::optional<SizeType32> getMaxTokens() const;
[[nodiscard]] std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const;
[[nodiscard]] std::optional<SizeType32> getSinkTokenLength() const;
[[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
[[nodiscard]] std::optional<FloatType> getCrossKvCacheFraction() const;
[[nodiscard]] std::optional<size_t> getHostCacheSize() const;
[[nodiscard]] bool getOnboardBlocks() const;
[[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
void setEnableBlockReuse(bool enableBlockReuse);
void setMaxTokens(SizeType32 maxTokens);
void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec);
void setSinkTokenLength(SizeType32 sinkTokenLength);
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
void setHostCacheSize(size_t hostCacheSize);
void setOnboardBlocks(bool onboardBlocks);
void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
private:
friend class Serialization;
@ -641,12 +717,21 @@ private:
/// allocated.
std::optional<FloatType> mFreeGpuMemoryFraction;
/// @brief The fraction of KV cache memory that should be reserved for cross attention.
/// If set to p, self attention will use 1-p of the KV cache memory and cross attention
/// will use p of the KV cache memory. Default is 50%.
/// Should only be set when using an encoder-decoder model.
std::optional<FloatType> mCrossKvCacheFraction;
/// @brief Size of secondary memory pool in bytes. Default is 0.
/// Having a secondary memory pool increases KV cache block reuse potential.
std::optional<size_t> mHostCacheSize;
/// @brief Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
bool mOnboardBlocks;
/// @brief Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.
std::optional<RetentionPriority> mSecondaryOffloadMinPriority;
};
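
At the executor level the same knobs are exposed on KvCacheConfig; a usage sketch assuming the setter-based workflow and default construction (the sizes and priority below are illustrative):

#include "tensorrt_llm/executor/executor.h"

using namespace tensorrt_llm::executor;

KvCacheConfig kvCacheConfig;
kvCacheConfig.setFreeGpuMemoryFraction(0.9);
kvCacheConfig.setCrossKvCacheFraction(0.5);                // half of KV cache memory for cross attention
kvCacheConfig.setHostCacheSize(4ULL * 1024 * 1024 * 1024); // 4 GiB secondary (host) pool
kvCacheConfig.setSecondaryOffloadMinPriority(50);          // only blocks above priority 50 may be offloaded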
/// @brief Configuration class for the runtime perf knobs

View File

@ -154,6 +154,18 @@ public:
static void serialize(LookaheadDecodingConfig const& lookaheadDecodingConfig, std::ostream& os);
static size_t serializedSize(LookaheadDecodingConfig const& lookaheadDecodingConfig);
// KvCacheRetentionConfig
static KvCacheRetentionConfig deserializeKvCacheRetentionConfig(std::istream& is);
static void serialize(KvCacheRetentionConfig const& kvCacheRetentionConfig, std::ostream& os);
static size_t serializedSize(KvCacheRetentionConfig const& kvCacheRetentionConfig);
// TokenRangeRetentionPriority
static KvCacheRetentionConfig::TokenRangeRetentionPriority deserializeTokenRangeRetentionPriority(std::istream& is);
static void serialize(
KvCacheRetentionConfig::TokenRangeRetentionPriority const& tokenRangeRetentionPriority, std::ostream& os);
static size_t serializedSize(
KvCacheRetentionConfig::TokenRangeRetentionPriority const& tokenRangeRetentionPriority);
// DecodingConfig
static DecodingConfig deserializeDecodingConfig(std::istream& is);
static void serialize(DecodingConfig const& decodingConfig, std::ostream& os);

View File

@ -233,7 +233,8 @@ enum class CommunicationMode
// execution of the model
};
/// @brief Struct that holds the stats of a KV cache manager
/// @brief Struct that holds the stats of a KV cache manager.
// See KvCacheStats definition in kvCacheManager.h for more information about each field.
struct KvCacheStats
{
/// @brief Max number of blocks
@ -250,6 +251,10 @@ struct KvCacheStats
SizeType32 allocNewBlocks;
/// @brief Number of reused blocks
SizeType32 reusedBlocks;
/// @brief Number of blocks that were not reused
SizeType32 missedBlocks;
/// @brief Measures the KV cache reuse rate: cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
float cacheHitRate;
};
/// @brief Struct that holds the stats of static batching models for a single iteration
@ -372,6 +377,10 @@ struct RequestStats
SizeType32 allocNewBlocksPerRequest;
/// @brief Number of reused blocks per request
SizeType32 reusedBlocksPerRequest;
/// @brief Number of missed blocks per request
SizeType32 missedBlocksPerRequest;
/// @brief KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)
SizeType32 kvCacheHitRatePerRequest;
};
/// @brief Struct that holds the stats of all requests in an iteration
@ -459,6 +468,11 @@ public:
return DecodingMode{kExternalDraftTokens | kUsePenalties | kUseBanTokens | kStandardStopCriteria};
}
static auto constexpr Eagle()
{
return DecodingMode{kEagle | kStandardStopCriteria | kUseExplicitEosStop};
}
auto constexpr useTemperature(bool useTemp)
{
mState = setBitTo(kUseTemperature, useTemp);
@ -581,6 +595,11 @@ public:
return anyBitSet(kExternalDraftTokens);
}
[[nodiscard]] bool constexpr isEagle() const
{
return anyBitSet(kEagle);
}
[[nodiscard]] bool constexpr isUseTemperature() const
{
return anyBitSet(kUseTemperature);
@ -695,6 +714,7 @@ private:
static UnderlyingType constexpr kLookahead{1u << (kNumFlags + 5)};
static UnderlyingType constexpr kExplicitDraftTokens{1u << (kNumFlags + 6)};
static UnderlyingType constexpr kExternalDraftTokens{1u << (kNumFlags + 7)};
static UnderlyingType constexpr kEagle{1u << (kNumFlags + 8)};
static UnderlyingType constexpr kTopKTopP{kTopK | kTopP};
[[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const
@ -726,6 +746,7 @@ static_assert(!DecodingMode::Auto().isMedusa());
static_assert(!DecodingMode::Auto().isLookahead());
static_assert(!DecodingMode::Auto().isExplicitDraftTokens());
static_assert(!DecodingMode::Auto().isExternalDraftTokens());
static_assert(!DecodingMode::Auto().isEagle());
static_assert(DecodingMode::TopK().isTopK());
static_assert(DecodingMode::TopK().isTopKorTopP());
@ -747,6 +768,7 @@ static_assert(!DecodingMode::TopK().isMedusa());
static_assert(!DecodingMode::TopK().isLookahead());
static_assert(!DecodingMode::TopK().isExplicitDraftTokens());
static_assert(!DecodingMode::TopK().isExternalDraftTokens());
static_assert(!DecodingMode::TopK().isEagle());
static_assert(DecodingMode::TopP().isTopP());
static_assert(DecodingMode::TopP().isTopKorTopP());
@ -760,7 +782,7 @@ static_assert(!DecodingMode::TopP().isBeamSearch());
static_assert(!DecodingMode::TopP().isMedusa());
static_assert(!DecodingMode::TopP().isLookahead());
static_assert(!DecodingMode::TopP().isExplicitDraftTokens());
static_assert(!DecodingMode::TopP().isExternalDraftTokens());
static_assert(!DecodingMode::TopP().isEagle());
static_assert(DecodingMode::TopKTopP().isTopK());
static_assert(DecodingMode::TopKTopP().isTopP());
@ -775,6 +797,7 @@ static_assert(!DecodingMode::TopKTopP().isMedusa());
static_assert(!DecodingMode::TopKTopP().isLookahead());
static_assert(!DecodingMode::TopKTopP().isExplicitDraftTokens());
static_assert(!DecodingMode::TopKTopP().isExternalDraftTokens());
static_assert(!DecodingMode::TopKTopP().isEagle());
static_assert(DecodingMode::BeamSearch().isBeamSearch());
static_assert(DecodingMode::BeamSearch().isUseStopCriteria());
@ -784,6 +807,7 @@ static_assert(!DecodingMode::BeamSearch().isMedusa());
static_assert(!DecodingMode::BeamSearch().isLookahead());
static_assert(!DecodingMode::BeamSearch().isExplicitDraftTokens());
static_assert(!DecodingMode::BeamSearch().isExternalDraftTokens());
static_assert(!DecodingMode::BeamSearch().isEagle());
static_assert(!DecodingMode::Medusa().isAuto());
static_assert(!DecodingMode::Medusa().isTopK());
@ -800,6 +824,7 @@ static_assert(DecodingMode::Medusa().isUsePenalty());
static_assert(DecodingMode::Medusa().isUseMinLength());
static_assert(DecodingMode::Medusa().isMedusa());
static_assert(!DecodingMode::Medusa().isExternalDraftTokens());
static_assert(!DecodingMode::Medusa().isEagle());
static_assert(!DecodingMode::Lookahead().isAuto());
static_assert(!DecodingMode::Lookahead().isTopK());
@ -814,6 +839,7 @@ static_assert(DecodingMode::Lookahead().isUseStopWords());
static_assert(DecodingMode::Lookahead().isUseExplicitEosStop());
static_assert(DecodingMode::Lookahead().isLookahead());
static_assert(!DecodingMode::Lookahead().isExternalDraftTokens());
static_assert(!DecodingMode::Lookahead().isEagle());
static_assert(!DecodingMode::ExplicitDraftTokens().isAuto());
static_assert(!DecodingMode::ExplicitDraftTokens().isTopK());
@ -828,6 +854,7 @@ static_assert(DecodingMode::ExplicitDraftTokens().isUseStopCriteria());
static_assert(!DecodingMode::ExplicitDraftTokens().isUseBanWords());
static_assert(DecodingMode::ExplicitDraftTokens().isExplicitDraftTokens());
static_assert(!DecodingMode::ExplicitDraftTokens().isExternalDraftTokens());
static_assert(!DecodingMode::ExplicitDraftTokens().isEagle());
static_assert(!DecodingMode::ExternalDraftTokens().isTopK());
static_assert(!DecodingMode::ExternalDraftTokens().isTopP());
@ -841,5 +868,21 @@ static_assert(!DecodingMode::ExternalDraftTokens().isBeamSearch());
static_assert(!DecodingMode::ExternalDraftTokens().isMedusa());
static_assert(!DecodingMode::ExternalDraftTokens().isLookahead());
static_assert(!DecodingMode::ExternalDraftTokens().isExplicitDraftTokens());
static_assert(!DecodingMode::ExternalDraftTokens().isEagle());
static_assert(DecodingMode::ExternalDraftTokens().isExternalDraftTokens());
static_assert(!DecodingMode::Eagle().isTopK());
static_assert(!DecodingMode::Eagle().isTopP());
static_assert(!DecodingMode::Eagle().isTopKorTopP());
static_assert(!DecodingMode::Eagle().isTopKandTopP());
static_assert(!DecodingMode::Eagle().isUseBanWords());
static_assert(!DecodingMode::Eagle().isUseOccurrencePenalty());
static_assert(DecodingMode::Eagle().isUseStopCriteria());
static_assert(!DecodingMode::Eagle().isAuto());
static_assert(!DecodingMode::Eagle().isBeamSearch());
static_assert(!DecodingMode::Eagle().isMedusa());
static_assert(!DecodingMode::Eagle().isLookahead());
static_assert(!DecodingMode::Eagle().isExplicitDraftTokens());
static_assert(!DecodingMode::Eagle().isExternalDraftTokens());
static_assert(DecodingMode::Eagle().isEagle());
} // namespace tensorrt_llm::executor
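
An illustrative sketch (not library code) of branching on the new mode with the accessors asserted above; the include path is assumed.

#include "tensorrt_llm/executor/types.h" // include path assumed

void selectDecodingPath(tensorrt_llm::executor::DecodingMode const& mode)
{
    if (mode.isEagle())
    {
        // set up EagleBuffers and dispatch the Eagle speculative decoding path
    }
    else if (mode.isExplicitDraftTokens())
    {
        // existing explicit-draft-tokens path
    }
    else
    {
        // standard sampling / beam search paths
    }
}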

View File

@ -18,6 +18,7 @@
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
@ -121,6 +122,8 @@ public:
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers;
std::optional<LookaheadDecodingBuffers> lookaheadOutputs;
std::optional<EagleBuffers::Inputs> eagleBuffers;
};
} // namespace tensorrt_llm::runtime

View File

@ -0,0 +1,134 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/eagleModule.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include <cstddef>
namespace tensorrt_llm::runtime
{
class EagleBuffers
{
public:
using SizeType32 = runtime::SizeType32;
using ITensor = runtime::ITensor;
using BufferPtr = runtime::IBuffer::SharedPtr;
using TensorPtr = runtime::ITensor::SharedPtr;
using TensorMap = runtime::StringPtrMap<runtime::ITensor>;
// This data structure is used both for runtime buffers that hold per-request runtime state (shape starts with
// maxBatchSize) and for engine inputs (shape starts with numSequences).
class Inputs
{
public:
//! [maxBatchSize] or [numSequences]
TensorPtr temperatures;
//! [maxBatchSize] or [numSequences]
TensorPtr randomDataSample;
//! [maxBatchSize, maxNumPaths, maxPathDraftLen] or [numSequences, maxNumPaths, maxPathDraftLen]
TensorPtr randomDataValidation;
//! [maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
TensorPtr draftTokens;
//! [maxBatchSize] or [numSequences]
TensorPtr draftLens;
//! [maxBatchSize, maxNumPaths, maxPathLen]
//! or [numSequences, maxNumPaths, maxPathLen]
TensorPtr draftPaths;
//! [maxBatchSize] or [numGenSequences]
TensorPtr specDecodingGenerationLengths;
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
//! or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
TensorPtr specDecodingPackedMasks;
//! [maxBatchSize] or [numGenSequences]
TensorPtr specDecodingPositionOffsets;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetCtxRequestTypesHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetCtxContextLengthsHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetCtxPastKeyValueLengthsHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetGenRequestTypesHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetGenContextLengthsHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetGenPastKeyValueLengthsHost;
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const& runtime,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);
};
Inputs engineInputs;
class EngineOutputs
{
public:
//! [batchSize, maxDecodingDraftTokens]
TensorPtr nextDraftTokens;
//! [batchSize]
TensorPtr nextDraftLens;
//! [batchSize, maxNumPaths, maxPathLen]
TensorPtr nextDraftPaths;
//! [batchSize, maxPathLen]
TensorPtr acceptedTokens;
//! [batchSize]
TensorPtr acceptedLens;
//! [batchSize]
TensorPtr acceptedPaths;
} engineOutputs;
public:
EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime);
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig);
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const& requestTypes,
ITensor const& seqSlots, EagleBuffers::Inputs const& decoderBuffers, ITensor const& contextPositionIds,
runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
runtime::WorldConfig const& worldConfig) const;
void insertInputTensors(
TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& worldConfig) const;
private:
template <typename T>
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded,
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
runtime::EagleModule const& eagleModule, runtime::CudaStream const& stream) const;
private:
// helper tensors
std::size_t scanTempStorageBytes{0};
std::size_t reduceTempStorageBytes{0};
BufferPtr scanReduceTempStorage;
TensorPtr cumSumGenerationLengths;
TensorPtr maxGenerationLength;
};
} // namespace tensorrt_llm::runtime

View File

@ -54,6 +54,8 @@ public:
void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override;
void setupEagle(EagleBuffers::Inputs eagleBuffers) override;
void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override;
void newBatch(
@ -270,12 +272,18 @@ private:
//! @brief Setups decoder internal tensors for new Explicit draft tokens request
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const& request);
//! @brief Setups decoder internal tensors for new Eagle request
void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const& request);
//! @brief Updates finished state on host for all active requests
void updateFinished(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent);
//! @brief Sets inputs for explicit draft tokens.
void setExplicitDraftTokensInputs(decoder_batch::Input const& input);
//! @brief Sets inputs for eagle decoding.
void setEagleInputs(decoder_batch::Input const& input);
//! @brief Calls decoders for tokens per engine step
void forwardDispatch(decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType);

View File

@ -18,6 +18,7 @@
#include "tensorrt_llm/runtime/cudaEvent.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iStatefulGptDecoder.h"
#include "tensorrt_llm/runtime/iTensor.h"
@ -101,6 +102,9 @@ public:
//! @brief Setup buffers for ExplicitDraftTokens decoding.
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0;
//! @brief Setup buffers for Eagle decoding.
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) = 0;
//! @brief Setup buffers for Lookahead decoding.
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0;

View File

@ -227,6 +227,12 @@ public:
mNumKvHeadsPerAttentionLayer = std::vector<SizeType32>(mNbAttentionLayers, nbKvHeads);
}
// set the number of kv heads for all cross attention layers
void setNbCrossKvHeads(SizeType32 nbKvHeads)
{
mNumKvHeadsPerCrossAttentionLayer = std::vector<SizeType32>(mNbAttentionLayers, nbKvHeads);
}
[[nodiscard]] SizeType32 constexpr getHiddenSize() const noexcept
{
return mHiddenSize;
@ -712,13 +718,18 @@ public:
}
[[nodiscard]] std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator>
getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
getNumKvHeadsPerLayerLocalRange(
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
{
TLLM_LOG_TRACE("%s start: %d", __PRETTY_FUNCTION__);
TLLM_CHECK_WITH_INFO(pipelineParallelism > 0, "Invalid pipelineParallelism: %d", pipelineParallelism);
// count number of previous non-local attention layers
auto const numPrevAttnLayers
= countLowerRankLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank);
auto const firstLocalAttentionLayerIt = mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers;
auto const firstLocalAttentionLayerIt = isCrossAttention
? mNumKvHeadsPerCrossAttentionLayer.cbegin()
: mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers;
auto const numLocalAttentionLayers
= countLocalLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank);
return std::make_pair(firstLocalAttentionLayerIt, firstLocalAttentionLayerIt + numLocalAttentionLayers);
@ -732,10 +743,19 @@ public:
mNumKvHeadsPerAttentionLayer = headsPerLayer;
}
[[nodiscard]] SizeType32 getSumLocalKvHeads(
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
void setNumKvHeadsPerCrossLayer(std::vector<SizeType32> const& headsPerLayer)
{
auto [cbegin, cend] = getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank);
auto const numElems = static_cast<SizeType32>(headsPerLayer.size());
TLLM_CHECK_WITH_INFO(numElems == mNbAttentionLayers,
"Length of head_per_layer (%d) must match number of attention layers (%d)", numElems, mNbAttentionLayers);
mNumKvHeadsPerCrossAttentionLayer = headsPerLayer;
}
[[nodiscard]] SizeType32 getSumLocalKvHeads(
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
{
auto [cbegin, cend]
= getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank, isCrossAttention);
auto const sumLocalHeads = std::reduce(cbegin, cend);
return sumLocalHeads;
}
@ -800,6 +820,7 @@ private:
ManageWeightsType mManageWeightsType;
std::string mModelName;
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer;
std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer;
};
} // namespace tensorrt_llm::runtime

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9cbefbfec3c95fba9ffae469c63e29594488a2419bfd5dfd3abbd234da7e7eed
size 5203208
oid sha256:e7cbfb1f4f7a74411bb349b54add6c2f0b645674835dacce89b39ea9fcff53b7
size 5307422

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a5430463d420db41a6024a3d7a16d06a607f0f48c991066fbaa056b756f1f06
size 5318774
oid sha256:e52f067f1a5127c512eb0fc298d6608644f3cde5079dd78b6fc6258be353afbe
size 5422318

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:569c6009be0bf4b404d752468c771a551480d3785ac73678f9e393df2a05ba93
size 1982856
oid sha256:941a0dcdf53fe2952baf73720ae3afe8556e629f1873ecf5e0a4235ef074a0ed
size 1993520

View File

@ -1,2 +1,2 @@
68b4fbc2bb6ed68a051b11a006b8984f libtensorrt_llm_ucx_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
93eb7c6098fe8327cd90ce1cd564e9a9 libtensorrt_llm_ucx_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b382d9cbfc7b4a4dd55b44c87d8dc196726e94d0a0e74c11aa9e2673e7445d3b
size 5053722
oid sha256:f16aa394aaeb2fd6e7f61cf1e171fac868ee1857b9df582d640c1bb03574c942
size 5152714

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:385cd17e249762358be090d392087f299462ab478e0581b3a5d29d1f7d12d769
size 5021342
oid sha256:7962f37b09d99eec224b5c7a96c4fb895c983b35cb2abb00cf498fb671c183c4
size 5121480

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4b143227c742854cd898d1460b5fe6bc1088d38532947c775dfaa27361040909
oid sha256:5446aea5178d0d0a7b4f8be4a750065b3bfcdd35f91730c5bcb2dff079fbe287
size 11648

View File

@ -1,2 +1,2 @@
9f28106626687e336dba9e9ce51b6353 libtensorrt_llm_ucx_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
1fe3fbf1da6fb9689237743d902f59e7 libtensorrt_llm_ucx_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fefbf87e36ee6b652bb52df28f06b47edbb64f16af5ac4497318d3af23dc6e84
size 32610670
oid sha256:d3fc5de5061f30e7875f0d86bfecfe1c42f98319f7a6a4339b3027bfe39e9eb5
size 33892906

View File

@ -1,2 +1,2 @@
e82c837214de4ebf6941beced3d528e1 tensorrt_llm_batch_manager_static.lib
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
19f7f28149bccf535e56ba0bc96a5954 tensorrt_llm_batch_manager_static.lib
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -157,4 +157,42 @@ bool getEnvEnablePDL()
return enablePDL;
}
bool getEnvUseUCXKvCache()
{
static bool init = false;
static bool useUCXKVCache = false;
if (!init)
{
init = true;
{
char const* use_ucx_kv_cache = std::getenv("TRTLLM_USE_UCX_KVCACHE");
if (use_ucx_kv_cache)
{
if (use_ucx_kv_cache[0] == '1' && use_ucx_kv_cache[1] == '\0')
{
useUCXKVCache = true;
}
}
}
}
return useUCXKVCache;
}
std::string getEnvUCXInterface()
{
static bool init = false;
static std::string ucxInterface;
if (!init)
{
init = true;
{
char const* ucx_interface = std::getenv("TRTLLM_UCX_INTERFACE");
if (ucx_interface)
{
ucxInterface = ucx_interface;
}
}
}
return ucxInterface;
}
} // namespace tensorrt_llm::common
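Both helpers read their environment variable once and cache the result, and only the exact value "1" enables the UCX KV cache. A minimal standalone sketch of the same lookups (the variable names TRTLLM_USE_UCX_KVCACHE and TRTLLM_UCX_INTERFACE come from the code above; everything else is illustrative):
#include <cstdio>
#include <cstdlib>
#include <string>
int main()
{
    // Mirrors getEnvUseUCXKvCache: only the exact value "1" turns the feature on.
    char const* useUcx = std::getenv("TRTLLM_USE_UCX_KVCACHE");
    bool const useUcxKvCache = useUcx != nullptr && std::string(useUcx) == "1";
    // Mirrors getEnvUCXInterface: empty string when the variable is unset.
    char const* iface = std::getenv("TRTLLM_UCX_INTERFACE");
    std::string const ucxInterface = iface != nullptr ? iface : "";
    std::printf("UCX KV cache: %s, interface: '%s'\n", useUcxKvCache ? "on" : "off", ucxInterface.c_str());
    return 0;
}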

View File

@ -18,6 +18,7 @@
#pragma once
#include <cstdint>
#include <optional>
#include <string>
namespace tensorrt_llm::common
{
@ -40,4 +41,8 @@ int getEnvMmhaKernelBlockSize();
// Whether PDL is enabled.
bool getEnvEnablePDL();
bool getEnvUseUCXKvCache();
std::string getEnvUCXInterface();
} // namespace tensorrt_llm::common

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a88c97950343f3bb6e630c032858099c118a24b086aa97b4d0aefd3f50439497
size 3382878
oid sha256:7c39ae574064fae76e804f9ca65321c00a3ee1400ed84ed5e5ab4098c335acfa
size 2277310

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aefeea1ece28c2a9e5e87150b2c0dbbfa88333470ebaf424dc8ea1f5ede69a07
size 3408900
oid sha256:45b2680b72e076323e36b36fa4946d9d043136cf05818afe626d3645b8bf1896
size 2308950

View File

@ -1,3 +1,3 @@
e69c2223ff214d2339b7466115346146 libtensorrt_llm_executor_static.a
65deeb71a548a81e57153fa3ad21669c libtensorrt_llm_executor_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
ce1a318e5b898bddb293de47729b4be4 libtensorrt_llm_executor_static.a
9370519464da3b7dfc95c5fcb3c40d13 libtensorrt_llm_executor_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:22b35d0a4dc2de4c25873d162a5b1edf2d3d16fcbef413aee5a80bbe69db2e37
size 7880014
oid sha256:9fcdfc9bc55d856e532cf351145ca98e4b351c41452fdeec8b4d880a592316e2
size 3356696

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ebf47f01dd8235116ac28beea016600bffb2fa36cf3b568ebfb2da2b91469379
size 7796554
oid sha256:8a8d6e3c6d24decec3b9b352a36d7d898d32bfadf3a0645424bd8ae426b04c4f
size 3278590

View File

@ -1,3 +1,3 @@
4c2d8fce7081ac6148b13717636be5ff libtensorrt_llm_executor_static.a
3341fe285c45cd3b86ea512099bda6a0 libtensorrt_llm_executor_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
b56fd65be76f98bca18ef4e6e1a2744a libtensorrt_llm_executor_static.a
9f7fc03c3b2a9e7690663adb23a4f036 libtensorrt_llm_executor_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:69b4f3070af978cb53036e7370f59e0a6902f5aca782a8a0a2bc34246cae0637
size 24707382
oid sha256:bd6c3e401f6fd0636f43337ddcfb2bd5c7d5d21c1ea41d421152b9c2e8fec810
size 21794946

View File

@ -1,2 +1,2 @@
520287b802be61e0d36ba8b1df8f06a0 tensorrt_llm_executor_static.lib
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
054197bdbdeace4f818ca8c1bc1f7308 tensorrt_llm_executor_static.lib
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1565,10 +1565,17 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
if constexpr (DO_CROSS_ATTENTION)
{
auto const k_idx = QK_VEC_SIZE * tidx;
int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
Tcache* k_cache = reinterpret_cast<Tcache*>(kvCacheBuffer.getKBlockPtr(batch_beam_idx, cyclic_tlength));
int const inBlockIdx = pastKCache.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
Tcache* k_cache = reinterpret_cast<Tcache*>(pastKCache.getKBlockPtr(batch_beam_idx, cyclic_tlength));
k = vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<Qk_vec_m const*>(&k_cache[inBlockIdx]));
if constexpr (ENABLE_8BITS_K_CACHE)
{
load_8bits_kv_cache_vec(&k, k_cache, inBlockIdx, k_scale_quant_orig_f);
}
else
{
k = vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<Qk_vec_m const*>(&k_cache[inBlockIdx]));
}
}
else
{
@ -2359,7 +2366,14 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
V_vec_k v;
if (DO_CROSS_ATTENTION)
{
v = vec_conversion<V_vec_k, V_vec_k>(*reinterpret_cast<V_vec_k const*>(&v_cache_base[inBlockIdx]));
if constexpr (ENABLE_8BITS_KV_CACHE)
{
load_8bits_kv_cache_vec(&v, v_cache_base, inBlockIdx, kv_scale_quant_orig_f);
}
else
{
v = vec_conversion<V_vec_k, V_vec_k>(*reinterpret_cast<V_vec_k const*>(&v_cache_base[inBlockIdx]));
}
}
else
{
@ -2401,7 +2415,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
// Store the values with bias back to global memory in the cache for V.
//*reinterpret_cast<V_vec_k*>(&v_cache[params.timestep*Dh]) = v;
// For MQA/GQA mode, write only with the first Q head of each group per KV head.
if (hi == (hi_kv * qhead_per_kv))
if (hi == (hi_kv * qhead_per_kv) && !DO_CROSS_ATTENTION)
{
if (ENABLE_8BITS_KV_CACHE)
{

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e73dd3a8859cd67c62ab89a98381028bd20ac9e756f0346bbbaab0fb6c566eb7
size 81578760
oid sha256:1397678ac1cab957f7d272750035ffd88b9e2b3b9d4f132073119d21c288b5da
size 82262624

View File

@ -1,2 +1,2 @@
88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
5ea3eabf1c58887230ba5ebc583e0d3c libtensorrt_llm_nvrtc_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c11e0550552f4cc3568ac11de47079d5c6bd88aeb34ebbd52b39f4f732afbd7d
size 84839528
oid sha256:17481ff01045ac335223451c83943a7f97b5c63ca2ab5da3e71d0909c8f4e68b
size 84578328

View File

@ -1,2 +1,2 @@
95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
270f246f5eccb170a759cda4787216f4 libtensorrt_llm_nvrtc_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f703220b82bac0368893a9df301a8e756e92546248bf0212e804b92fc6cd593e
oid sha256:c641f66a5f508d75ca8ac1d74b99fd61a8f9b6f96fe9820620520bb0a0beda23
size 1128448

View File

@ -1,3 +1,3 @@
c5f36e093e875c8ea84523fb1566d986 tensorrt_llm_nvrtc_wrapper.lib
54a96b4243b8fbcb0a331986b3bf70b9 tensorrt_llm_nvrtc_wrapper.dll
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
f7224da82447d5396ce952c4edb0b044 tensorrt_llm_nvrtc_wrapper.dll
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -624,18 +624,19 @@ void invokeCopyBeamHypotheses(DecodingOutput::BeamHypotheses const& src, Decodin
copyBeamHypotheses<<<numSMs, 256, 0, stream.get()>>>(copyStruct);
}
__global__ void initializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const nMaxSeqLen)
__global__ void initializeOutput(
TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const beam, SizeType32 const nMaxSeqLen)
{
for (int i = threadIdx.x; i < nMaxSeqLen; i += blockDim.x)
{
finalOutputIds[blockIdx.x * nMaxSeqLen + i] = endIds[blockIdx.x];
finalOutputIds[blockIdx.x * nMaxSeqLen + i] = endIds[blockIdx.x / beam];
}
}
void invokeInitializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const batchBeam,
SizeType32 const nMaxSeqLen, cudaStream_t stream)
void invokeInitializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const batch,
SizeType32 const beam, SizeType32 const nMaxSeqLen, cudaStream_t stream)
{
initializeOutput<<<batchBeam, 256, 0, stream>>>(finalOutputIds, endIds, nMaxSeqLen);
initializeOutput<<<batch * beam, 256, 0, stream>>>(finalOutputIds, endIds, beam, nMaxSeqLen);
}
__global__ void copyNextStepIds(TokenIdType* nextStepIds, TokenIdType const* const* outputIdsPtr,
@ -742,7 +743,7 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod
// prefill finalOutputIds with the EOS tokens from decodingInput.endIds
tensorrt_llm::kernels::invokeInitializeOutput(bufferCast<TokenIdType>(finalOutputIds),
bufferCast<TokenIdType>(*decodingInput.endIds), batchSize * beamWidth, maxSeqLength, stream);
bufferCast<TokenIdType>(*decodingInput.endIds), batchSize, beamWidth, maxSeqLength, stream);
sync_check_cuda_error();
std::vector<float> lengthPenaltyVec;
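With the grid now launched as batch * beam blocks, each block fills one (batch, beam) row of finalOutputIds, while endIds holds one entry per request, hence the blockIdx.x / beam lookup. A host-side sketch of the same prefill with hypothetical sizes:
#include <cstdio>
#include <vector>
int main()
{
    int const batch = 2, beam = 3, maxSeqLen = 4;
    std::vector<int> endIds{100, 200}; // one end id per request
    std::vector<int> finalOutputIds(batch * beam * maxSeqLen);
    // Same indexing as initializeOutput: row plays the role of blockIdx.x in [0, batch * beam),
    // and endIds is indexed by row / beam, i.e. one value shared by all beams of a request.
    for (int row = 0; row < batch * beam; ++row)
        for (int i = 0; i < maxSeqLen; ++i)
            finalOutputIds[row * maxSeqLen + i] = endIds[row / beam];
    std::printf("row 4 is prefilled with %d\n", finalOutputIds[4 * maxSeqLen]); // 200
    return 0;
}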

View File

@ -72,7 +72,7 @@ void invokeFinalize(BeamHypotheses& bh, cudaStream_t stream);
//! \param maxSeqLen The maximum sequence length, inferred from the finalOutputIds.shape[3]
//! \param stream The CUDA stream on which to perform the operation.
void invokeInitializeOutput(runtime::TokenIdType* finalOutputIds, runtime::TokenIdType const* endIds,
runtime::SizeType32 batchBeam, runtime::SizeType32 maxSeqLen, cudaStream_t stream);
runtime::SizeType32 batch, runtime::SizeType32 beam, runtime::SizeType32 maxSeqLen, cudaStream_t stream);
//! \brief Copies the data from the buffers in src to dst to reduce the kernel launch overhead of individual memcpy.
//! for streaming + beam search, where we need to avoid overwriting the beam search buffers.

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5081f260dc93b1caf996803253ab27fb290229e914feebfb8bd76b57e4e0a33e
size 25294842
oid sha256:60f54be9c7f7517b76112dcf9185cb8f4b2c69c11aec3b7afaaf86f7d0cc9b70
size 25257850

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2d4225f5c76786ee808df56277cbe9328cd4da1a6265a15fc9c730353cdcdbf
size 25699742
oid sha256:2e2f1a38e7df380dd6048158565ffdc269dea2251078b3c8f2265f8e533337a4
size 25663182

View File

@ -1,3 +1,3 @@
87ec6b3278d52f9e2d8a23a20c38560b libtensorrt_llm_internal_cutlass_kernels_static.a
d8d2c5cb915dcac97f2279166f7eae58 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
74f70e14a884d8e29144595686634296 libtensorrt_llm_internal_cutlass_kernels_static.a
2de60e6d3211395ce4481ffede35c2f4 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed0568d70c32b592a1a9fbd5c129b7c19ae1eca5fba2b68375f29b75bca32499
size 44104384
oid sha256:68ff5e652a2ec0d295583f4e3a8696df5d3606c2ffde50e4f205b87aea2f6c91
size 44055616

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:135a32f33cec8c10c7d780df3d3f55ea8b9d2b1fe0a553ad6834cff81d42cbaf
size 43491958
oid sha256:b598263e5df63ad110ab82bd2c3ea86cf4760350488122c8d14d908c1b84c045
size 43443190

View File

@ -1,3 +1,3 @@
2bb339277f872a112fc9c373128a7685 libtensorrt_llm_internal_cutlass_kernels_static.a
d212bad6fa0f17d00ee90376a2a250c0 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
6c4a716c6aefe9a68501d4bd6901cc95 libtensorrt_llm_internal_cutlass_kernels_static.a
0134143d17d8ff5b7326235f85f664f5 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2de729c93894e86ad9cae674a567c02677acb90bb6c970f2ce5dfcd04c489816
size 88140802
oid sha256:e72bd01eced686736f3e96d445e80964b83da77067c0c622f512630f625c2f40
size 88140804

View File

@ -1,2 +1,2 @@
8bf62183399a59548962a18ab2dec352 tensorrt_llm_internal_cutlass_kernels_static.lib
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
d908f9e8ed1928239467a78b30ae7b35 tensorrt_llm_internal_cutlass_kernels_static.lib
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -127,12 +127,10 @@ size_t LoraImpl::getWorkspaceSize(
{
TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
auto const typeSize = tensorrt_llm::common::getDTypeSize(type);
TLLM_CHECK_WITH_INFO(
numTokens >= numReqs, fmtstr("num tokens %ld should be greater than num reqs %ld", numTokens, numReqs));
return (size_t) getGemmWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, typeSize)
+ getGemmParamsWorkSpaceSize(numReqs * mNumLoraModules);
+ getGemmParamsWorkSpaceSize(std::min(numReqs, numTokens) * mNumLoraModules);
}
void LoraImpl::setBestTactic(std::optional<Config> config)
@ -163,7 +161,7 @@ int LoraImpl::run(int64_t numTokens, int64_t numReqs, void const* input, int32_t
setGemmConfig();
int64_t GemmWorkSpaceSize = getGemmWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices);
int64_t groupGemmParamsWorkSpaceSize = getGemmParamsWorkSpaceSize(numReqs * mNumLoraModules);
int64_t groupGemmParamsWorkSpaceSize = getGemmParamsWorkSpaceSize(std::min(numReqs, numTokens) * mNumLoraModules);
void* gemmWorkSpace = workspace; // [gemmWorkSpace, lowrankWorkSpace, groupGemmParamsWorkSpace]
void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + GemmWorkSpaceSize;
void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace)
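The workspace passed to run() is carved into three consecutive regions in the order given by the comment above, with the grouped-GEMM parameter region now sized by min(numReqs, numTokens). A standalone sketch of that partitioning with hypothetical byte sizes (the 256-byte per-entry figure is illustrative only):
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>
int main()
{
    int64_t const numTokens = 64, numReqs = 96; // hypothetical counts
    size_t const gemmWorkSpaceSize = 4096;      // hypothetical region sizes in bytes
    size_t const lowRankWorkSpaceSize = 2048;
    size_t const gemmParamsWorkSpaceSize = 256 * static_cast<size_t>(std::min(numReqs, numTokens));
    std::vector<char> workspace(gemmWorkSpaceSize + lowRankWorkSpaceSize + gemmParamsWorkSpaceSize);
    // Layout: [gemmWorkSpace | lowRankWorkSpace | groupGemmParamsWorkSpace]
    void* gemmWorkSpace = workspace.data();
    void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + gemmWorkSpaceSize;
    void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace) + lowRankWorkSpaceSize;
    std::printf("total: %zu bytes, params region at offset %td\n", workspace.size(),
        static_cast<char*>(groupGemmParamsWorkSpace) - workspace.data());
    return 0;
}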

View File

@ -1033,4 +1033,104 @@ void invokeCopyOutputTokensIds(runtime::TokenIdType** tmpOutputIdsPtrs, runtime:
batchSize, numInputLogits, maxDecodingDraftTokens);
}
namespace
{
__global__ void packEagleGenerationLengths(PackEagleParams params)
{
auto const batchIdx = static_cast<SizeType32>(blockIdx.x);
auto const batchSlot = params.batchSlots[batchIdx];
auto const isGenerationRequest = batchIdx >= params.numContextRequests;
auto const genIdx = batchIdx - params.numContextRequests;
if (threadIdx.x == 0 && isGenerationRequest)
{
params.outputSpecDecodingGenerationLengths[genIdx] = params.inputNextDraftLens[batchSlot];
}
}
__global__ void packEagleTensors(PackEagleParams params)
{
auto const batchIdx = static_cast<SizeType32>(blockIdx.x);
auto const batchSlot = params.batchSlots[batchIdx];
auto const isGenerationRequest = batchIdx >= params.numContextRequests;
auto const genIdx = batchIdx - params.numContextRequests;
// Copy data that is 1 elem per request
if (threadIdx.x == 0)
{
params.outputRandomDataSample[batchIdx] = params.inputRandomDataSample[batchSlot];
params.outputTemperatures[batchIdx] = params.inputTemperatures[batchSlot];
// FIXME we need 1 value per draft token
params.outputRandomDataValidation[batchIdx] = params.inputRandomDataValidation[batchSlot];
// 0 for ctx request and actual draft len for gen requests.
params.outputNextDraftLens[batchIdx] = isGenerationRequest ? params.inputNextDraftLens[batchSlot] : 0;
}
// Copy draft paths
auto const numPathElts = params.maxNumPaths * params.maxPathLength;
auto outputNextDraftPaths = params.outputNextDraftPaths + batchIdx * numPathElts;
auto const inputNextDraftPaths = params.inputNextDraftPaths + batchSlot * numPathElts;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < numPathElts; ti += static_cast<SizeType32>(blockDim.x))
{
outputNextDraftPaths[ti] = inputNextDraftPaths[ti];
}
if (isGenerationRequest)
{
// Copy draft tokens. We do it only for gen requests as for ctx requests outputNextDraftLens is 0.
auto const maxDecodingDraftTokens = params.maxDecodingTokens - 1;
auto outputNextDraftTokens = params.outputNextDraftTokens + batchIdx * maxDecodingDraftTokens;
auto const inputNextDraftTokens = params.inputNextDraftTokens + batchSlot * maxDecodingDraftTokens;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < maxDecodingDraftTokens;
ti += static_cast<SizeType32>(blockDim.x))
{
outputNextDraftTokens[ti] = inputNextDraftTokens[ti];
}
auto const maxGenerationLength = params.maxGenerationLength[0];
auto const numPackedMasks = divUp(params.maxDecodingTokens, 32);
auto const outputStartId = (genIdx == 0) ? 0 : params.cumSumGenerationLengths[genIdx - 1];
auto const numTokens = (genIdx == 0)
? params.cumSumGenerationLengths[0]
: params.cumSumGenerationLengths[genIdx] - params.cumSumGenerationLengths[genIdx - 1];
// Copy packed masks.
// Masks are placed next to each other with offsets of cumSumGenerationLengths[bi-1]
auto const inputPackedMask
= params.inputSpecDecodingPackedMasks + batchSlot * numPackedMasks * params.maxDecodingTokens;
auto outputPackedMask = params.outputSpecDecodingPackedMasks + outputStartId * numPackedMasks;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < numTokens * numPackedMasks;
ti += static_cast<SizeType32>(blockDim.x))
{
outputPackedMask[ti] = inputPackedMask[ti];
}
// Copy position offsets. Only the first maxGenerationLength entries are copied.
auto const inputPositionOffsets
= params.inputSpecDecodingPositionOffsets + batchSlot * params.maxDecodingTokens;
auto outputPositionOffsets = params.outputSpecDecodingPositionOffsets + genIdx * maxGenerationLength;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < maxGenerationLength;
ti += static_cast<SizeType32>(blockDim.x))
{
outputPositionOffsets[ti] = inputPositionOffsets[ti];
}
}
}
} // namespace
void invokePackEagleGenerationLengths(PackEagleParams const& params, cudaStream_t stream)
{
SizeType32 constexpr BLOCK_SIZE = 32;
packEagleGenerationLengths<<<params.batchSize, BLOCK_SIZE, 0, stream>>>(params);
}
void invokePackEagle(PackEagleParams const& params, cudaStream_t stream)
{
SizeType32 constexpr BLOCK_SIZE = 128;
packEagleTensors<<<params.batchSize, BLOCK_SIZE, 0, stream>>>(params);
}
} // namespace tensorrt_llm::kernels::speculative_decoding
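Both kernels perform the same gather: output position i is filled from input position batchSlots[i], so per-request tensors stored at scattered batch-slot indices become contiguous for the engine. A host-side sketch of that gather with hypothetical slots and temperatures:
#include <cstddef>
#include <cstdio>
#include <vector>
int main()
{
    // Hypothetical mapping: request i of this batch lives at slot batchSlots[i].
    std::vector<int> batchSlots{5, 0, 3};
    std::vector<float> inputTemperatures{0.7f, 0.f, 0.f, 1.0f, 0.f, 0.9f}; // indexed by slot
    std::vector<float> outputTemperatures(batchSlots.size());
    // Same gather as packEagleTensors: out[i] = in[batchSlots[i]].
    for (std::size_t i = 0; i < batchSlots.size(); ++i)
        outputTemperatures[i] = inputTemperatures[batchSlots[i]];
    for (float t : outputTemperatures)
        std::printf("%.1f ", t); // prints 0.9 0.7 1.0
    std::printf("\n");
    return 0;
}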

View File

@ -295,4 +295,103 @@ struct PrepareGenEagleNetInputsParams
//! \brief Prepares inputs for the gen stage EagleNet iteration (layerIdx > 0).
//! For input/output examples visit test/model/eagle/test_prepare_drafter_inputs_plugin.py (gen Eagle Net examples)
void invokePrepareGenEagleNetInputs(PrepareGenEagleNetInputsParams const& params);
struct PackEagleParams
{
runtime::SizeType32 batchSize{0};
runtime::SizeType32 maxNumPaths{0};
runtime::SizeType32 maxDecodingTokens{0};
runtime::SizeType32 maxPathLength{0};
runtime::SizeType32 numContextRequests{0};
runtime::SizeType32 numGenerationRequests{0};
//! inputs
//! [batchSize]
runtime::SizeType32 const* batchSlots{nullptr};
//! [maxBatchSize]
float const* inputTemperatures{nullptr};
//! [maxBatchSize]
float const* inputRandomDataSample{nullptr};
//! [maxBatchSize]
float const* inputRandomDataValidation{nullptr};
//! [maxBatchSize, maxDecodingDraftTokens]
runtime::TokenIdType const* inputNextDraftTokens{nullptr};
//! [maxBatchSize]
runtime::SizeType32 const* inputNextDraftLens{nullptr};
//! [maxBatchSize, maxDecodingTokens, maxPathLen]
runtime::SizeType32 const* inputNextDraftPaths{nullptr};
//! [maxBatchSize]
runtime::SizeType32 const* inputSpecDecodingGenerationLengths{nullptr};
//! [maxBatchSize]
runtime::SizeType32 const* inputSpecDecodingPositionOffsets{nullptr};
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
int32_t const* inputSpecDecodingPackedMasks{nullptr};
//! outputs
//! [batchSize]
float* outputTemperatures{nullptr};
//! [batchSize]
float* outputRandomDataSample{nullptr};
//! [batchSize]
float* outputRandomDataValidation{nullptr};
//! [batchSize, maxDecodingDraftTokens]
runtime::TokenIdType* outputNextDraftTokens{nullptr};
//! [batchSize]
runtime::SizeType32* outputNextDraftLens{nullptr};
//! [batchSize, maxDecodingTokens, maxPathLen]
runtime::SizeType32* outputNextDraftPaths{nullptr};
//! [batchSize]
runtime::SizeType32* outputSpecDecodingGenerationLengths{nullptr};
//! [batchSize]
runtime::SizeType32* outputSpecDecodingPositionOffsets{nullptr};
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
int32_t* outputSpecDecodingPackedMasks{nullptr};
// workspace
//! [1]
runtime::SizeType32* maxGenerationLength{nullptr};
//! [batchSize + 1]
runtime::SizeType32* cumSumGenerationLengths{nullptr};
void checkParams()
{
TLLM_CHECK(batchSlots);
TLLM_CHECK(inputTemperatures);
TLLM_CHECK(inputRandomDataSample);
TLLM_CHECK(inputRandomDataValidation);
TLLM_CHECK(inputNextDraftTokens);
TLLM_CHECK(inputNextDraftLens);
TLLM_CHECK(inputNextDraftPaths);
TLLM_CHECK(inputSpecDecodingGenerationLengths);
TLLM_CHECK(inputSpecDecodingPositionOffsets);
TLLM_CHECK(inputSpecDecodingPackedMasks);
TLLM_CHECK(outputTemperatures);
TLLM_CHECK(outputRandomDataSample);
TLLM_CHECK(outputRandomDataValidation);
TLLM_CHECK(outputNextDraftTokens);
TLLM_CHECK(outputNextDraftLens);
TLLM_CHECK(outputNextDraftPaths);
TLLM_CHECK(outputSpecDecodingGenerationLengths);
TLLM_CHECK(outputSpecDecodingPositionOffsets);
TLLM_CHECK(outputSpecDecodingPackedMasks);
TLLM_CHECK(maxGenerationLength);
TLLM_CHECK(cumSumGenerationLengths);
TLLM_CHECK(batchSize > 0);
TLLM_CHECK(batchSize == numContextRequests + numGenerationRequests);
TLLM_CHECK(maxDecodingTokens > 0);
TLLM_CHECK(maxPathLength > 0);
TLLM_CHECK(maxNumPaths > 0);
}
};
//! \brief Packs outputSpecDecodingGenerationLengths from batch-slot positions into contiguous memory.
void invokePackEagleGenerationLengths(PackEagleParams const& params, cudaStream_t stream);
//! \brief Packs the rest of the output tensors from batch-slot positions into contiguous memory.
void invokePackEagle(PackEagleParams const& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::speculative_decoding

View File

@ -1063,7 +1063,7 @@ void kernelV2DispatchHeadSize(QKVPreprocessingParams<T, KVCacheBuffer> params, c
///////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T, typename TCache, typename KVCacheBuffer, int BLOCK_SIZE, int Dh>
template <typename T, typename TCache, typename KVCacheBuffer, int BLOCK_SIZE, int Dh, bool FP8_OUTPUT>
__global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCacheBuffer> params)
{
// For cross-attention,
@ -1121,6 +1121,14 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
int const src_k_offset = params.q_hidden_size;
int const src_v_offset = src_k_offset + params.kv_hidden_size;
// Cast float scale to dst data type.
using TScale = typename mmha::kv_cache_scale_type_t<T, TCache>::Type;
[[maybe_unused]] TScale scale_orig_quant;
if constexpr (sizeof(TCache) == 1 || FP8_OUTPUT)
{
mmha::convert_from_float(&scale_orig_quant, params.kvScaleOrigQuant ? params.kvScaleOrigQuant[0] : 1.0f);
}
// For loop in the sequence length dimension.
// There might be multiple blocks (blockIdx.x) that process the same sequence in order to fully utilize
for (int token_idx = blockIdx.x * TOKENS_PER_BLOCK + (threadIdx.x / VECS_PER_HEAD); token_idx < max_seq_len;
@ -1139,8 +1147,18 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
// Only load Q tokens from decoder qkv input.
auto q = *reinterpret_cast<VecT const*>(params.qkv_input + src_q_idx);
// Store it to a separate q output.
*reinterpret_cast<VecT*>(params.q_output + dst_q_idx) = q;
// Quantize the output to fp8.
if constexpr (FP8_OUTPUT)
{
using OutputType = __nv_fp8_e4m3;
OutputType* quantized_q_ptr = reinterpret_cast<OutputType*>(params.q_output) + dst_q_idx;
mmha::store_8bits_vec(quantized_q_ptr, q, 0, scale_orig_quant);
}
else
{
// Store it to a separate q output.
*reinterpret_cast<VecT*>(params.q_output + dst_q_idx) = q;
}
}
// Encoder tokens (i.e. KV tokens).
@ -1175,10 +1193,6 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
{
// The element index inside the block.
auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
// Cast float scale to dst data type.
using TScale = typename mmha::kv_cache_scale_type_t<T, TCache>::Type;
TScale scale_orig_quant;
mmha::convert_from_float(&scale_orig_quant, params.kvScaleOrigQuant[0]);
// Store 8bits kv cache.
mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
@ -1217,7 +1231,16 @@ void invokeUpdateKvCacheForCrossAttention(QKVPreprocessingParams<T, KVCacheBuffe
dim3 grid(num_seq_blocks, params.head_num, params.batch_size);
// Launch the kernel.
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh><<<grid, block, 0, stream>>>(params);
if (params.quantized_fp8_output)
{
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh, true>
<<<grid, block, 0, stream>>>(params);
}
else
{
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh, false>
<<<grid, block, 0, stream>>>(params);
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
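The launcher turns the runtime flag params.quantized_fp8_output into the compile-time FP8_OUTPUT template argument with a plain if/else, so the quantization branch inside the kernel is resolved at compile time. A minimal standalone sketch of that dispatch pattern (the function bodies here are purely illustrative):
#include <cstdio>
// Illustrative stand-in for the kernel: the quantization choice is a compile-time constant.
template <bool FP8_OUTPUT>
void updateCache(float value)
{
    if constexpr (FP8_OUTPUT)
        std::printf("store quantized %.2f\n", value);
    else
        std::printf("store raw %.2f\n", value);
}
// Runtime flag -> compile-time template argument, mirroring the launcher above.
void dispatch(bool quantizedFp8Output, float value)
{
    if (quantizedFp8Output)
        updateCache<true>(value);
    else
        updateCache<false>(value);
}
int main()
{
    dispatch(true, 1.5f);
    dispatch(false, 1.5f);
    return 0;
}
This keeps the per-token inner loop free of runtime branches at the cost of compiling one kernel instantiation per flag value.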

View File

@ -785,7 +785,7 @@ int GPTAttentionPluginCommon::enqueueContext(EnqueueContextParams<T> const& para
= mEnableContextFMHA ? 0 : sizeof(T) * params.batch_size * params.input_seq_length * kv_seq_length;
size_t const cu_seqlens_size = sizeof(int) * (params.batch_size + 1);
size_t const rotary_inv_freq_size = sizeof(float) * params.batch_size * mRotaryEmbeddingDim / 2;
size_t const q_buf_2_size = mFMHARunner->isSeparateQAndKvInput() || !mEnableContextFMHA
size_t const q_buf_2_size = !mEnableContextFMHA || mFMHARunner->isSeparateQAndKvInput()
? sizeof(T) * params.batch_size * params.input_seq_length * local_hidden_units_qo
: 0;
size_t const k_buf_2_size

View File

@ -86,10 +86,10 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
return std::make_shared<tb::LlmRequest>(mRequestId, mMaxNewTokens,
std::make_shared<std::vector<TokenIdType>>(mTokens.at(0)), mSamplingConfig, mIsStreaming, mEndId, mPadId,
embeddingBias, badWordsList, stopWordsList, mPositionIds, promptEmbeddingTable, mPromptVocabSize, mLoraTaskId,
loraWeights, loraConfig, mLookaheadConfig, returnLogProbs(), mReturnContextLogits, mReturnGenerationLogits,
mDraftTokens, draftLogits, mExcludeInputFromOutput, callbackAdapter(mLogitsPostProcessor),
mApplyLogitsPostProcessorBatched, mEncoderTokens, mReturnEncoderOutput, mClientId, mPriority,
encoderInputFeatures, mEncoderOutputLength, crossAttentionMask,
loraWeights, loraConfig, mLookaheadConfig, mKvCacheRetentionConfig, returnLogProbs(), mReturnContextLogits,
mReturnGenerationLogits, mDraftTokens, draftLogits, mExcludeInputFromOutput,
callbackAdapter(mLogitsPostProcessor), mApplyLogitsPostProcessorBatched, mEncoderTokens, mReturnEncoderOutput,
mClientId, mPriority, encoderInputFeatures, mEncoderOutputLength, crossAttentionMask,
tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, mInputTokenExtraIds, mNumReturnSequences);
}
@ -102,8 +102,8 @@ void LlmRequest::initBindings(py::module_& m)
std::optional<LlmRequest::TensorPtr>, std::optional<std::vector<LlmRequest::SizeType32>>,
std::optional<LlmRequest::TensorPtr>, std::optional<LlmRequest::SizeType32>, std::optional<uint64_t>,
std::optional<LlmRequest::TensorPtr>, std::optional<LlmRequest::TensorPtr>,
std::optional<executor::LookaheadDecodingConfig>, bool, bool, bool,
std::optional<LlmRequest::VecTokens>, std::optional<LlmRequest::TensorPtr>, bool,
std::optional<executor::LookaheadDecodingConfig>, std::optional<executor::KvCacheRetentionConfig>,
bool, bool, bool, std::optional<LlmRequest::VecTokens>, std::optional<LlmRequest::TensorPtr>, bool,
std::optional<LlmRequest::LogitsPostProcessor>, bool, std::optional<LlmRequest::VecTokens>, bool,
std::optional<RequestIdType>, executor::PriorityType, std::optional<LlmRequest::TensorPtr>,
std::optional<LlmRequest::SizeType32>, std::optional<LlmRequest::TensorPtr>,
@ -115,15 +115,15 @@ void LlmRequest::initBindings(py::module_& m)
py::arg("prompt_embedding_table") = std::nullopt, py::arg("prompt_vocab_size") = std::nullopt,
py::arg("lora_task_id") = std::nullopt, py::arg("lora_weights") = std::nullopt,
py::arg("lora_config") = std::nullopt, py::arg("lookahead_config") = std::nullopt,
py::arg("return_log_probs") = false, py::arg("return_context_logits") = false,
py::arg("return_generation_logits") = false, py::arg("draft_tokens") = std::nullopt,
py::arg("draft_logits") = std::nullopt, py::arg("exclude_input_from_output") = false,
py::arg("logits_post_processor") = std::nullopt, py::arg("apply_logits_post_processor_batched") = false,
py::arg("encoder_input_tokens") = std::nullopt, py::arg("return_encoder_output") = false,
py::arg("client_id") = std::nullopt, py::arg("priority") = executor::Request::kDefaultPriority,
py::arg("encoder_input_features") = std::nullopt, py::arg("encoder_output_length") = std::nullopt,
py::arg("cross_attention_mask") = std::nullopt, py::arg("input_token_extra_ids") = std::nullopt,
py::arg("num_return_sequences") = 1)
py::arg("kv_cache_retention_config") = std::nullopt, py::arg("return_log_probs") = false,
py::arg("return_context_logits") = false, py::arg("return_generation_logits") = false,
py::arg("draft_tokens") = std::nullopt, py::arg("draft_logits") = std::nullopt,
py::arg("exclude_input_from_output") = false, py::arg("logits_post_processor") = std::nullopt,
py::arg("apply_logits_post_processor_batched") = false, py::arg("encoder_input_tokens") = std::nullopt,
py::arg("return_encoder_output") = false, py::arg("client_id") = std::nullopt,
py::arg("priority") = executor::Request::kDefaultPriority, py::arg("encoder_input_features") = std::nullopt,
py::arg("encoder_output_length") = std::nullopt, py::arg("cross_attention_mask") = std::nullopt,
py::arg("input_token_extra_ids") = std::nullopt, py::arg("num_return_sequences") = 1)
.def("get_num_tokens", &LlmRequest::getNumTokens, py::arg("beam"))
.def_property_readonly("max_beam_num_tokens", &LlmRequest::getMaxBeamNumTokens)
.def("get_token", &LlmRequest::getToken, py::arg("beam"), py::arg("pos"))

View File

@ -68,8 +68,9 @@ public:
std::optional<SizeType32> promptVocabSize = std::nullopt,
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
std::optional<TensorPtr> loraConfig = std::nullopt,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
@ -83,8 +84,8 @@ public:
samplingConfig, isStreaming, endId, padId, embeddingBias, badWordsList, stopWordsList,
positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
: std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
promptEmbeddingTable, promptVocabSize, loraTaskId, loraWeights, loraConfig, lookaheadConfig, returnLogProbs,
returnContextLogits, returnGenerationLogits,
promptEmbeddingTable, promptVocabSize, loraTaskId, loraWeights, loraConfig, lookaheadConfig,
kvCacheRetentionConfig, returnLogProbs, returnContextLogits, returnGenerationLogits,
draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
: std::make_shared<VecTokens>(),
draftLogits, excludeInputFromOutput, logitsPostProcessor, applyLogitsPostProcessorBatched,

View File

@ -224,7 +224,9 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
.def_property("compute_generation_logits",
py::overload_cast<>(&tr::ModelConfig::computeGenerationLogits, py::const_),
py::overload_cast<bool>(&tr::ModelConfig::computeGenerationLogits))
.def_property("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant);
.def_property("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant)
.def_property(
"use_cross_attention", &tr::ModelConfig::useCrossAttention, &tr::ModelConfig::setUseCrossAttention);
py::class_<tr::WorldConfig>(m, "WorldConfig")
.def(py::init<SizeType32, SizeType32, SizeType32, SizeType32, std::optional<std::vector<SizeType32>> const&>(),

View File

@ -114,7 +114,9 @@ void InitBindings(pybind11::module_& m)
.def_readwrite("tokens_per_block", &tle::KvCacheStats::tokensPerBlock)
.def_readwrite("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks)
.def_readwrite("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks)
.def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks);
.def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks)
.def_readwrite("missed_blocks", &tle::KvCacheStats::missedBlocks)
.def_readwrite("cache_hit_rate", &tle::KvCacheStats::cacheHitRate);
py::class_<tle::StaticBatchingStats>(m, "StaticBatchingStats")
.def(py::init<>())
@ -184,6 +186,8 @@ void InitBindings(pybind11::module_& m)
.def_readwrite("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest)
.def_readwrite("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest)
.def_readwrite("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest)
.def_readwrite("missed_blocks_per_request", &tle::RequestStats::missedBlocksPerRequest)
.def_readwrite("kv_cache_hit_rate_per_request", &tle::RequestStats::kvCacheHitRatePerRequest)
.def("to_json_str",
[](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); });
@ -307,6 +311,24 @@ void InitBindings(pybind11::module_& m)
.def_property_readonly("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize)
.def_property_readonly("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize);
auto kvCacheRetentionConfig
= py::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig")
.def(py::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionPriority>,
tle::RetentionPriority>(),
py::arg("token_range_retention_priorities"), py::arg("decode_retention_priority"))
.def_property_readonly(
"token_range_retention_priorities", &tle::KvCacheRetentionConfig::getTokenRangeRetentionPriorities)
.def_property_readonly(
"decode_retention_priority", &tle::KvCacheRetentionConfig::getDecodeRetentionPriority);
py::class_<tle::KvCacheRetentionConfig::TokenRangeRetentionPriority>(
kvCacheRetentionConfig, "TokenRangeRetentionPriority")
.def(py::init<SizeType32, std::optional<SizeType32>, tle::RetentionPriority>(), py::arg("token_start"),
py::arg("token_end"), py::arg("priority"))
.def_readwrite("token_start", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::tokenStart)
.def_readwrite("token_end", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::tokenEnd)
.def_readwrite("priority", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::priority);
py::class_<tle::ContextPhaseParams>(m, "ContextPhaseParams")
.def(py::init<VecTokens, tle::ContextPhaseParams::RequestIdType>(), py::arg("first_gen_tokens"),
py::arg("req_id"));
@ -327,6 +349,7 @@ void InitBindings(pybind11::module_& m)
std::optional<tle::ExternalDraftTokensConfig> externalDraftTokensConfig,
std::optional<tle::PromptTuningConfig> pTuningConfig, std::optional<tle::LoraConfig> loraConfig,
std::optional<tle::LookaheadDecodingConfig> lookaheadConfig,
std::optional<tle::KvCacheRetentionConfig> kvCacheRetentionConfig,
std::optional<std::string> logitsPostProcessorName,
std::optional<tle::VecTokens> encoderInputTokenIds, std::optional<tle::IdType> clientId,
bool returnAllGeneratedTokens, tle::PriorityType priority, tle::RequestType type,
@ -343,12 +366,12 @@ void InitBindings(pybind11::module_& m)
}
}
TLLM_CHECK_WITH_INFO(maxTokens.has_value(), "missing required argument max_tokens");
return std::make_unique<tle::Request>(inputTokenIds, maxTokens.value(), streaming, samplingConfig,
outputConfig, endId, padId, positionIds, badWords, stopWords, embeddingBias,
externalDraftTokensConfig, pTuningConfig, loraConfig, lookaheadConfig, logitsPostProcessorName,
encoderInputTokenIds, clientId, returnAllGeneratedTokens, priority, type, contextPhaseParams,
encoderInputFeatures, encoderOutputLength, crossAttentionMask, numReturnSequences);
externalDraftTokensConfig, pTuningConfig, loraConfig, lookaheadConfig, kvCacheRetentionConfig,
logitsPostProcessorName, encoderInputTokenIds, clientId, returnAllGeneratedTokens, priority,
type, contextPhaseParams, encoderInputFeatures, encoderOutputLength, crossAttentionMask,
numReturnSequences);
}),
py::arg("input_token_ids"), py::kw_only(), py::arg("max_tokens") = py::none(),
py::arg("max_new_tokens") = py::none(), py::arg("streaming") = false,
@ -358,9 +381,9 @@ void InitBindings(pybind11::module_& m)
py::arg("stop_words") = py::none(), py::arg("embedding_bias") = py::none(),
py::arg("external_draft_tokens_config") = py::none(), py::arg("prompt_tuning_config") = py::none(),
py::arg("lora_config") = py::none(), py::arg("lookahead_config") = py::none(),
py::arg("logits_post_processor_name") = py::none(), py::arg("encoder_input_token_ids") = py::none(),
py::arg("client_id") = py::none(), py::arg("return_all_generated_tokens") = false,
py::arg("priority") = tle::Request::kDefaultPriority,
py::arg("kv_cache_retention_config") = py::none(), py::arg("logits_post_processor_name") = py::none(),
py::arg("encoder_input_token_ids") = py::none(), py::arg("client_id") = py::none(),
py::arg("return_all_generated_tokens") = false, py::arg("priority") = tle::Request::kDefaultPriority,
py::arg_v("type", tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
"RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION"),
py::arg("context_phase_params") = py::none(), py::arg("encoder_input_features") = py::none(),
@ -384,6 +407,8 @@ void InitBindings(pybind11::module_& m)
"prompt_tuning_config", &tle::Request::getPromptTuningConfig, &tle::Request::setPromptTuningConfig)
.def_property("lora_config", &tle::Request::getLoraConfig, &tle::Request::setLoraConfig)
.def_property("lookahead_config", &tle::Request::getLookaheadConfig, &tle::Request::setLookaheadConfig)
.def_property("kv_cache_retention_config", &tle::Request::getKvCacheRetentionConfig,
&tle::Request::setKvCacheRetentionConfig)
.def_property("logits_post_processor_name", &tle::Request::getLogitsPostProcessorName,
&tle::Request::setLogitsPostProcessorName)
.def_property(
@ -417,7 +442,9 @@ void InitBindings(pybind11::module_& m)
.def_readwrite("encoder_output", &tle::Result::encoderOutput)
.def_readwrite("finish_reasons", &tle::Result::finishReasons)
.def_readwrite("sequence_index", &tle::Result::sequenceIndex)
.def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal);
.def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal)
.def_readwrite("decoding_iter", &tle::Result::decodingIter)
.def_readwrite("context_phase_params", &tle::Result::contextPhaseParams);
py::class_<tle::Response>(m, "Response")
.def(py::init<IdType, std::string, std::optional<IdType>>(), py::arg("request_id"), py::arg("error_msg"),
@ -457,25 +484,28 @@ void InitBindings(pybind11::module_& m)
{
return py::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindowVec(),
self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
self.getOnboardBlocks());
self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority());
};
auto kvCacheConfigSetstate = [](py::tuple state)
{
if (state.size() != 7)
if (state.size() != 9)
{
throw std::runtime_error("Invalid state!");
}
return tle::KvCacheConfig(state[0].cast<bool>(), state[1].cast<std::optional<SizeType32>>(),
state[2].cast<std::optional<std::vector<SizeType32>>>(), state[3].cast<std::optional<SizeType32>>(),
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>());
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>(),
state[7].cast<std::optional<float>>(), state[8].cast<std::optional<tle::RetentionPriority>>());
};
py::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
.def(py::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool>(),
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
std::optional<float> const&, std::optional<tle::RetentionPriority>>(),
py::arg("enable_block_reuse") = false, py::arg("max_tokens") = py::none(),
py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(),
py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(),
py::arg("onboard_blocks") = true)
py::arg("onboard_blocks") = true, py::arg("cross_kv_cache_fraction") = py::none(),
py::arg("secondary_offload_min_priority") = py::none())
.def_property(
"enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
.def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
@ -487,6 +517,10 @@ void InitBindings(pybind11::module_& m)
&tle::KvCacheConfig::setFreeGpuMemoryFraction)
.def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
.def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
.def_property("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
&tle::KvCacheConfig::setCrossKvCacheFraction)
.def_property("secondary_offload_min_priority", &tle::KvCacheConfig::getSecondaryOffloadMinPriority,
&tle::KvCacheConfig::setSecondaryOffloadMinPriority)
.def(py::pickle(kvCacheConfigGetstate, kvCacheConfigSetstate));
py::class_<tle::OrchestratorConfig>(m, "OrchestratorConfig")
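On the C++ side the same options map directly onto tle::KvCacheConfig; a sketch of constructing it with the new cross-KV-cache fraction, following the argument order exposed by the binding above (the executor header path is an assumption, and all values shown are illustrative):
#include "tensorrt_llm/executor/executor.h" // assumed header for the executor API
#include <optional>
namespace tle = tensorrt_llm::executor;
int main()
{
    // Argument order mirrors the binding above: enable_block_reuse, max_tokens,
    // max_attention_window, sink_token_length, free_gpu_memory_fraction,
    // host_cache_size, onboard_blocks, cross_kv_cache_fraction, secondary_offload_min_priority.
    tle::KvCacheConfig kvCacheConfig(
        /*enableBlockReuse=*/true,
        /*maxTokens=*/std::nullopt,
        /*maxAttentionWindowVec=*/std::nullopt,
        /*sinkTokenLength=*/std::nullopt,
        /*freeGpuMemoryFraction=*/0.9f,
        /*hostCacheSize=*/std::nullopt,
        /*onboardBlocks=*/true,
        /*crossKvCacheFraction=*/0.5f, // share of the KV pool reserved for cross attention
        /*secondaryOffloadMinPriority=*/std::nullopt);
    (void) kvCacheConfig;
    return 0;
}
The cross fraction is presumably only relevant for engines with cross attention; self-attention-only models can leave it unset.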

View File

@ -21,6 +21,7 @@ set(SRCS
bufferManager.cpp
cudaMemPool.cpp
decodingLayerWorkspace.cpp
eagleBuffers.cpp
explicitDraftTokensBuffers.cpp
lookaheadBuffers.cpp
layerProfiler.cpp

View File

@ -0,0 +1,351 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"
#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
namespace tksd = tensorrt_llm::kernels::speculative_decoding;
namespace tensorrt_llm::runtime
{
void EagleBuffers::Inputs::create(SizeType32 maxNumSequences, TllmRuntime const& runtime,
ModelConfig const& modelConfig, WorldConfig const& worldConfig)
{
auto const& manager = runtime.getBufferManager();
auto const& speculativeDecodingModule = modelConfig.getSpeculativeDecodingModule();
auto const maxNumPaths = speculativeDecodingModule.getMaxNumPaths();
auto const maxDraftPathLen = speculativeDecodingModule.getMaxDraftPathLen();
auto const maxPathLen = speculativeDecodingModule.getMaxPathLen();
auto const maxDecodingTokens = speculativeDecodingModule.getMaxDecodingTokens();
auto const maxDecodingDraftTokens = speculativeDecodingModule.getMaxDecodingDraftTokens();
auto constexpr TRTTokenIdType = runtime::TRTDataType<runtime::TokenIdType>::value;
temperatures = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kFLOAT);
randomDataSample = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kFLOAT);
randomDataValidation
= manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxDraftPathLen}), nvinfer1::DataType::kFLOAT);
draftTokens = manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingDraftTokens}), TRTTokenIdType);
draftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
draftPaths
= manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxPathLen}), nvinfer1::DataType::kINT32);
specDecodingGenerationLengths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
specDecodingPackedMasks
= manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}),
nvinfer1::DataType::kINT32);
specDecodingPositionOffsets
= manager.gpu(ITensor::makeShape({maxNumSequences * maxDecodingTokens}), nvinfer1::DataType::kINT32);
eagleNetCtxRequestTypesHost = manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetCtxContextLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetCtxPastKeyValueLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetGenRequestTypesHost = manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetGenContextLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetGenPastKeyValueLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
}
EagleBuffers::EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_CHECK_WITH_INFO(maxBeamWidth == 1, "EAGLE does not support beam search");
auto const maxNumSequences = maxBatchSize;
auto const eagleModule = std::dynamic_pointer_cast<tensorrt_llm::runtime::EagleModule const>(
modelConfig.getSpeculativeDecodingModulePtr());
auto const numPaths = eagleModule->getMaxNumPaths();
auto const pathLen = eagleModule->getMaxPathLen();
auto const maxDecodingDraftTokens = eagleModule->getMaxDecodingDraftTokens();
auto constexpr TRTTokenIdType = runtime::TRTDataType<runtime::TokenIdType>::value;
// input tensors
engineInputs.temperatures = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
engineInputs.draftTokens
= manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingDraftTokens}), TRTTokenIdType);
engineInputs.draftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
engineInputs.draftPaths
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), nvinfer1::DataType::kINT32);
engineInputs.specDecodingGenerationLengths
= manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
engineInputs.specDecodingPositionOffsets
= manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
engineInputs.specDecodingPackedMasks = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
engineInputs.randomDataSample = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
engineInputs.randomDataValidation = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
engineInputs.eagleNetCtxRequestTypesHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetCtxContextLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetCtxPastKeyValueLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetGenRequestTypesHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetGenContextLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetGenPastKeyValueLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
// output tensors
engineOutputs.nextDraftTokens
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), TRTTokenIdType);
engineOutputs.nextDraftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
engineOutputs.nextDraftPaths
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), nvinfer1::DataType::kINT32);
engineOutputs.acceptedTokens
= manager.gpu(ITensor::makeShape({maxNumSequences, pathLen}), nvinfer1::DataType::kINT32);
engineOutputs.acceptedLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
engineOutputs.acceptedPaths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
// helper tensors
auto const& stream = manager.getStream();
scanTempStorageBytes
= tksd::invokeScanGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get());
reduceTempStorageBytes
= tksd::invokeReduceMaxGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get());
scanReduceTempStorage = manager.gpu(std::max(reduceTempStorageBytes, scanTempStorageBytes));
cumSumGenerationLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
maxGenerationLength = manager.gpu(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
// pre-allocate empty tensors
reshape(0, maxNumSequences, modelConfig);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void EagleBuffers::reshape(
SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto const numSequences = numCtxSequences + numGenSequences;
auto const eagleModule = std::dynamic_pointer_cast<tensorrt_llm::runtime::EagleModule const>(
modelConfig.getSpeculativeDecodingModulePtr());
auto const maxDecodingTokens = eagleModule->getMaxDecodingTokens();
// input tensors
engineInputs.temperatures->reshape(ITensor::makeShape({numSequences}));
auto draftTokensShape = engineInputs.draftTokens->getShape();
draftTokensShape.d[0] = numSequences;
engineInputs.draftTokens->reshape(draftTokensShape);
auto draftLensShape = engineInputs.draftLens->getShape();
draftLensShape.d[0] = numSequences;
engineInputs.draftLens->reshape(draftLensShape);
auto draftPathsShape = engineInputs.draftPaths->getShape();
draftPathsShape.d[0] = numSequences;
engineInputs.draftPaths->reshape(draftPathsShape);
engineInputs.specDecodingGenerationLengths->reshape(ITensor::makeShape({numGenSequences}));
engineInputs.specDecodingPositionOffsets->reshape(ITensor::makeShape({numGenSequences, maxDecodingTokens}));
engineInputs.specDecodingPackedMasks->reshape(
ITensor::makeShape({numGenSequences * maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}));
engineInputs.randomDataSample->reshape(ITensor::makeShape({numSequences}));
engineInputs.randomDataValidation->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetCtxRequestTypesHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetCtxContextLengthsHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetCtxPastKeyValueLengthsHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetGenRequestTypesHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetGenContextLengthsHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetGenPastKeyValueLengthsHost->reshape(ITensor::makeShape({numSequences}));
cumSumGenerationLengths->reshape(ITensor::makeShape({numSequences}));
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
template <typename T>
void EagleBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded,
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
runtime::EagleModule const& eagleModule, runtime::CudaStream const& stream) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
using runtime::bufferCast;
tksd::PackEagleParams params;
params.batchSize = numCtxSequences + numGenSequences;
params.maxNumPaths = eagleModule.getMaxNumPaths();
params.maxDecodingTokens = eagleModule.getMaxDecodingTokens();
params.maxPathLength = eagleModule.getMaxPathLen();
params.numContextRequests = numCtxSequences;
params.numGenerationRequests = numGenSequences;
params.batchSlots = bufferCast<SizeType32>(seqSlots);
// Outputs from decoder -- inputs to the packing kernel
params.inputTemperatures = bufferCast<float>(*draftBuffers.temperatures);
params.inputRandomDataSample = bufferCast<float>(*draftBuffers.randomDataSample);
params.inputRandomDataValidation = bufferCast<float>(*draftBuffers.randomDataValidation);
params.inputNextDraftTokens = bufferCast<runtime::TokenIdType>(*draftBuffers.draftTokens);
params.inputNextDraftLens = bufferCast<SizeType32>(*draftBuffers.draftLens);
params.inputNextDraftPaths = bufferCast<SizeType32>(*draftBuffers.draftPaths);
params.inputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*draftBuffers.specDecodingGenerationLengths);
params.inputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*draftBuffers.specDecodingPositionOffsets);
params.inputSpecDecodingPackedMasks = bufferCast<int32_t>(*draftBuffers.specDecodingPackedMasks);
// Outputs of the packing kernel -- inputs to the engine
params.outputTemperatures = bufferCast<float>(*engineInputs.temperatures);
params.outputRandomDataSample = bufferCast<float>(*engineInputs.randomDataSample);
params.outputRandomDataValidation = bufferCast<float>(*engineInputs.randomDataValidation);
params.outputNextDraftTokens = bufferCast<runtime::TokenIdType>(*engineInputs.draftTokens);
params.outputNextDraftLens = bufferCast<SizeType32>(*engineInputs.draftLens);
params.outputNextDraftPaths = bufferCast<SizeType32>(*engineInputs.draftPaths);
params.outputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*engineInputs.specDecodingGenerationLengths);
params.outputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*engineInputs.specDecodingPositionOffsets);
params.outputSpecDecodingPackedMasks = bufferCast<int32_t>(*engineInputs.specDecodingPackedMasks);
params.maxGenerationLength = bufferCast<SizeType32>(*maxGenerationLength);
params.cumSumGenerationLengths = bufferCast<SizeType32>(*cumSumGenerationLengths);
params.checkParams();
// Pack tensors from batch-slot positions into a contiguous array
tksd::invokePackEagleGenerationLengths(params, stream.get());
if (numGenSequences)
{
// Compute inclusive sum and max
tksd::invokeScanReduceGenerationLengths(numGenSequences,
bufferCast<SizeType32>(*engineInputs.specDecodingGenerationLengths),
bufferCast<uint8_t>(*scanReduceTempStorage), scanTempStorageBytes,
bufferCast<SizeType32>(*cumSumGenerationLengths), bufferCast<uint8_t>(*scanReduceTempStorage),
reduceTempStorageBytes, bufferCast<SizeType32>(*maxGenerationLength), stream.get());
}
// Pack tensors from batch-slot positions into a contiguous array
tksd::invokePackEagle(params, stream.get());
// Pack host data.
for (SizeType32 bi = 0; bi < params.batchSize; ++bi)
{
auto const batchSlot = params.batchSlots[bi];
bufferCast<SizeType32>(*engineInputs.eagleNetCtxRequestTypesHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxRequestTypesHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetCtxContextLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxContextLengthsHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetCtxPastKeyValueLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxPastKeyValueLengthsHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetGenRequestTypesHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenRequestTypesHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetGenContextLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenContextLengthsHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetGenPastKeyValueLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenPastKeyValueLengthsHost)[batchSlot];
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void EagleBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, ITensor const& requestTypes,
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
runtime::WorldConfig const& worldConfig) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto const& stream = runtime.getStream();
auto const eagleModule
= std::dynamic_pointer_cast<runtime::EagleModule const>(modelConfig.getSpeculativeDecodingModulePtr());
auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
auto const dtype = modelConfig.getDataType();
switch (dtype)
{
case nvinfer1::DataType::kFLOAT:
setFromInputs<float>(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers,
contextPositionIds, *eagleModule, stream);
break;
case nvinfer1::DataType::kHALF:
setFromInputs<half>(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers,
contextPositionIds, *eagleModule, stream);
break;
default: TLLM_THROW("DataType %d not supported in EagleBuffers", static_cast<SizeType32>(dtype)); break;
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void EagleBuffers::insertInputTensors(
TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& /* worldConfig */) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
// inputs
inputBuffers.insert_or_assign("eagle_temperature", engineInputs.temperatures);
inputBuffers.insert_or_assign("spec_decoding_generation_lengths", engineInputs.specDecodingGenerationLengths);
inputBuffers.insert_or_assign("spec_decoding_position_offsets", engineInputs.specDecodingPositionOffsets);
inputBuffers.insert_or_assign("spec_decoding_packed_mask", engineInputs.specDecodingPackedMasks);
inputBuffers.insert_or_assign("rand_data_sample", engineInputs.randomDataSample);
inputBuffers.insert_or_assign("rand_data_validation", engineInputs.randomDataValidation);
inputBuffers.insert_or_assign("draft_tokens", engineInputs.draftTokens);
inputBuffers.insert_or_assign("draft_lens", engineInputs.draftLens);
inputBuffers.insert_or_assign("draft_paths", engineInputs.draftPaths);
inputBuffers.insert_or_assign("host_ctx_eagle_net_request_types", engineInputs.eagleNetCtxRequestTypesHost);
inputBuffers.insert_or_assign("host_ctx_eagle_net_context_lengths", engineInputs.eagleNetCtxContextLengthsHost);
inputBuffers.insert_or_assign(
"host_ctx_eagle_net_past_key_value_lengths", engineInputs.eagleNetCtxPastKeyValueLengthsHost);
inputBuffers.insert_or_assign("host_gen_eagle_net_request_types", engineInputs.eagleNetGenRequestTypesHost);
inputBuffers.insert_or_assign("host_gen_eagle_net_context_lengths", engineInputs.eagleNetGenContextLengthsHost);
inputBuffers.insert_or_assign(
"host_gen_eagle_net_past_key_value_lengths", engineInputs.eagleNetGenPastKeyValueLengthsHost);
// outputs
outputBuffers.insert_or_assign("next_draft_tokens", engineOutputs.nextDraftTokens);
outputBuffers.insert_or_assign("next_draft_lens", engineOutputs.nextDraftLens);
outputBuffers.insert_or_assign("accepted_tokens", engineOutputs.acceptedTokens);
outputBuffers.insert_or_assign("num_accepted_tokens", engineOutputs.acceptedLens);
outputBuffers.insert_or_assign("accepted_paths", engineOutputs.acceptedPaths);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
} // namespace tensorrt_llm::runtime

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
namespace tensorrt_llm::runtime
{
class EagleModule : public SpeculativeDecodingModule
{
public:
explicit EagleModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxNumPaths)
{
TLLM_CHECK(maxNumPaths * maxDraftPathLen == maxDecodingDraftTokens);
}
explicit EagleModule() noexcept
: EagleModule(0, 0, 0)
{
}
};
} // namespace tensorrt_llm::runtime

View File

@ -233,6 +233,16 @@ void GptDecoderBatched::setupLookahead(LookaheadDecodingBuffers lookaheadDecodin
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setupEagle(EagleBuffers::Inputs eagleBuffers)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_CHECK(mSpeculativeDecodingMode.isEagle());
mJointDecodingOutput->eagleBuffers = std::move(eagleBuffers);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength,
SizeType32 maxTokensPerEngineStep, nvinfer1::DataType dtype, ModelConfig const& modelConfig)
@ -628,6 +638,10 @@ void GptDecoderBatched::newRequestSpeculativeDecoding(
{
newRequestExplicitDraftTokens(batchIdx, request);
}
else if (mSpeculativeDecodingMode.isEagle())
{
newRequestEagle(batchIdx, request);
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
@ -739,6 +753,17 @@ void GptDecoderBatched::newRequestExplicitDraftTokens(SizeType32 batchIdx, decod
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const& request)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_CHECK(mJointDecodingOutput->eagleBuffers);
// TODO fill me
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setExplicitDraftTokensInputs(decoder_batch::Input const& input)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -767,6 +792,15 @@ void GptDecoderBatched::setExplicitDraftTokensInputs(decoder_batch::Input const&
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setEagleInputs(decoder_batch::Input const& input)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
// TODO fill me
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::newRequests(std::vector<SizeType32> const& seqSlots,
std::vector<decoder_batch::Request> const& requests, std::vector<SamplingConfig> const& samplingConfigs)
{
@ -854,6 +888,10 @@ void GptDecoderBatched::forwardDecoder(
{
setExplicitDraftTokensInputs(input);
}
else if (mSpeculativeDecodingMode.isEagle())
{
setEagleInputs(input);
}
bool const async = forwardType == ForwardType::kASYNC;

View File

@ -160,6 +160,8 @@ ModelConfig createModelConfig(
auto numKvHeadsPerAttentionLayer
= parseJsonFieldOr<std::vector<SizeType32>>(config, "num_kv_heads_per_layer", std::vector<SizeType32>());
auto numKvHeadsPerCrossAttentionLayer = parseJsonFieldOr<std::vector<SizeType32>>(
config, "num_kv_heads_per_cross_attn_layer", std::vector<SizeType32>());
auto modelConfig
= ModelConfig{vocabSize, numLayers, numAttentionLayers, numRnnLayers, numHeads, hiddenSize, dataType};
@ -167,7 +169,8 @@ ModelConfig createModelConfig(
{
std::transform(numKvHeadsPerAttentionLayer.cbegin(), numKvHeadsPerAttentionLayer.cend(),
numKvHeadsPerAttentionLayer.begin(),
[tensorParallelism](SizeType32 const numKvHeads) { return std::max(numKvHeads / tensorParallelism, 1); });
[tensorParallelism](SizeType32 const numKvHeads)
{ return ((numKvHeads + tensorParallelism - 1) / tensorParallelism); });
modelConfig.setNumKvHeadsPerLayer(numKvHeadsPerAttentionLayer);
}
else
@ -175,6 +178,19 @@ ModelConfig createModelConfig(
modelConfig.setNbKvHeads(numKvHeads);
}
if (!numKvHeadsPerCrossAttentionLayer.empty())
{
std::transform(numKvHeadsPerCrossAttentionLayer.cbegin(), numKvHeadsPerCrossAttentionLayer.cend(),
numKvHeadsPerCrossAttentionLayer.begin(),
[tensorParallelism](SizeType32 const numKvHeads)
{ return ((numKvHeads + tensorParallelism - 1) / tensorParallelism); });
modelConfig.setNumKvHeadsPerCrossLayer(numKvHeadsPerCrossAttentionLayer);
}
else
{
modelConfig.setNbCrossKvHeads(numKvHeads);
}
modelConfig.setSizePerHead(sizePerHead);
modelConfig.setLayerTypes(layerTypes);

View File

@ -60,7 +60,7 @@ th::Tensor gatherTree( // BS: batch_size, BM:
{
int32_t* final_output_ids_ptr = get_ptr<int32_t>(final_output_ids);
tk::invokeInitializeOutput(
final_output_ids_ptr, get_ptr<int32_t>(end_ids), batch_size * beam_width, max_seq_len, stream);
final_output_ids_ptr, get_ptr<int32_t>(end_ids), batch_size, beam_width, max_seq_len, stream);
tk::BeamHypotheses bh;
bh.nBatchSize = batch_size;

View File

@ -130,6 +130,7 @@ if(NOT ENABLE_MULTI_DEVICE EQUAL 0)
add_gtest(allReduceKernelTest kernels/allReduce/allReduceKernelTest.cu)
endif()
add_gtest(decodingKernelsTest kernels/decodingKernelTest.cpp)
add_gtest(eaglePackDataTest kernels/eaglePackDataTest.cpp)
add_gtest(banRepeatNGramsKernelsTest kernels/banRepeatNGramsKernelsTest.cpp)
add_gtest(stopCriteriaKernelsTest kernels/stopCriteriaKernelsTest.cpp)
add_gtest(shiftKCacheKernelTest kernels/shiftKCacheKernelTest.cu)

View File

@ -426,6 +426,10 @@ public:
TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)};
mBufferManager->copy(len.data(),*inputLengths);
std::vector<SizeType32> eid = {0};
TensorPtr endIds{ITensor::slice(constPointerCast(decodingInput->endIds), 0, 1)};
mBufferManager->copy(eid.data(),*endIds);
std::vector<std::vector<float>> logProbs =
{
{-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -2.41985},
@ -555,6 +559,10 @@ public:
TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)};
mBufferManager->copy(len.data(),*inputLengths);
std::vector<SizeType32> eid = {0};
TensorPtr endIds{ITensor::slice(constPointerCast(decodingInput->endIds), 0, 1)};
mBufferManager->copy(eid.data(),*endIds);
std::vector<std::vector<float>> logProbs =
{
{-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524},

View File

@ -0,0 +1,521 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"
#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/runtimeKernels.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include <NvInferRuntimeBase.h>
#include <algorithm>
#include <cstdint>
#include <random>
namespace
{
using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace trk = tensorrt_llm::runtime::kernels;
namespace tksd = tensorrt_llm::kernels::speculative_decoding;
class SamplingParams
{
public:
SamplingParams() {}
inline void setNumCtxRequests(SizeType32 numCtxRequests)
{
mNumCtxRequests = numCtxRequests;
}
inline void setNumGenRequests(SizeType32 numGenRequests)
{
mNumGenRequests = numGenRequests;
}
inline void setMaxPathLen(SizeType32 maxPathLen)
{
mMaxPathLen = maxPathLen;
}
[[nodiscard]] inline SizeType32 getNumCtxRequests() const
{
return mNumCtxRequests;
}
[[nodiscard]] inline SizeType32 getNumGenRequests() const
{
return mNumGenRequests;
}
[[nodiscard]] inline SizeType32 getBatchSize() const
{
return getNumCtxRequests() + getNumGenRequests();
}
[[nodiscard]] inline SizeType32 getVocabSize() const
{
return mVocabSize;
}
[[nodiscard]] inline SizeType32 getMaxBatchSize() const
{
return 2 * getBatchSize();
}
[[nodiscard]] inline SizeType32 getMaxPathLen() const
{
return mMaxPathLen;
}
[[nodiscard]] inline SizeType32 getMaxDecodingTokens() const
{
return mMaxDecodingTokens;
}
[[nodiscard]] inline SizeType32 getMaxDecodingDraftTokens() const
{
return getMaxDecodingTokens() - 1;
}
[[nodiscard]] inline SizeType32 getMaxSeqLen() const
{
return getMaxDecodingTokens() * 2;
}
private:
SizeType32 mNumCtxRequests{6};
SizeType32 mNumGenRequests{6};
SizeType32 mMaxPathLen{4};
SizeType32 mMaxDecodingTokens{32};
SizeType32 mVocabSize{256};
};
class EaglePackDataTest : public ::testing::Test
{
public:
using BufferPtr = IBuffer::SharedPtr;
using TensorPtr = ITensor::SharedPtr;
void SetUp() override
{
mStream = std::make_shared<CudaStream>();
mBufferManager = std::make_shared<BufferManager>(mStream);
}
void allocateBuffers()
{
// inputs
mBatchSlots = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
mInputTemperatures = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
mInputRandomDataSample = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
mInputRandomDataValidation = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
mInputNextDraftTokens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingDraftTokens()}),
nvinfer1::DataType::kINT32);
mInputNextDraftLens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32);
mInputNextDraftPaths
= BufferManager::pinnedPool(ITensor::makeShape({mSamplingParams.getMaxBatchSize(),
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen()}),
nvinfer1::DataType::kINT32);
mInputSpecDecodingGenerationLengths = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32);
mInputSpecDecodingPositionOffsets = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingTokens()}),
nvinfer1::DataType::kINT32);
auto const numPackedMasks
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
mInputSpecDecodingPackedMasks = BufferManager::pinnedPool(
ITensor::makeShape(
{mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingTokens(), numPackedMasks}),
nvinfer1::DataType::kINT32);
// outputs
mOutputTemperatures = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
mOutputRandomDataSample = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
mOutputRandomDataValidation = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
mOutputNextDraftTokens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingDraftTokens()}),
nvinfer1::DataType::kINT32);
mOutputNextDraftLens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
mOutputNextDraftPaths
= BufferManager::pinnedPool(ITensor::makeShape({mSamplingParams.getBatchSize(),
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen()}),
nvinfer1::DataType::kINT32);
mOutputSpecDecodingGenerationLengths = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
mOutputSpecDecodingPositionOffsets = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens()}),
nvinfer1::DataType::kINT32);
mOutputSpecDecodingPackedMasks = BufferManager::pinnedPool(
ITensor::makeShape(
{mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens(), numPackedMasks}),
nvinfer1::DataType::kINT32);
// workspace
mMaxGenerationLength = BufferManager::pinnedPool(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
mCumSumGenerationLengths = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize() + 1}), nvinfer1::DataType::kINT32);
mScanTempStorageBytes = tksd::invokeScanGenerationLengths(
nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get());
mReduceTempStorageBytes = tksd::invokeReduceMaxGenerationLengths(
nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get());
mScanReduceTempStorage = mBufferManager->gpu(std::max(mReduceTempStorageBytes, mScanTempStorageBytes));
}
void initBuffers()
{
trk::invokeFill(*mOutputTemperatures, float{0}, *mStream);
trk::invokeFill(*mOutputRandomDataSample, float{0}, *mStream);
trk::invokeFill(*mOutputRandomDataValidation, float{0}, *mStream);
trk::invokeFill(*mOutputNextDraftTokens, TokenIdType{-1}, *mStream);
trk::invokeFill(*mOutputNextDraftLens, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputNextDraftPaths, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputSpecDecodingGenerationLengths, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputSpecDecodingPositionOffsets, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputSpecDecodingPackedMasks, SizeType32{0}, *mStream);
auto batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
batchSlotsPtr[bi] = 2 * bi;
}
std::mt19937 gen(42);
std::uniform_real_distribution<float> distr(0.0, 1.0);
std::uniform_int_distribution<SizeType32> intDistr(0, 1000);
std::uniform_int_distribution<SizeType32> lenDistr(0, 32);
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
bufferCast<float>(*mInputTemperatures)[batchSlotsPtr[bi]] = distr(gen);
bufferCast<float>(*mInputRandomDataSample)[batchSlotsPtr[bi]] = distr(gen);
bufferCast<float>(*mInputRandomDataValidation)[batchSlotsPtr[bi]] = distr(gen);
}
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingDraftTokens(); ++ti)
{
bufferCast<SizeType32>(*mInputNextDraftTokens)[flat_index2(
batchSlotsPtr[bi], ti, mSamplingParams.getMaxDecodingDraftTokens())]
= intDistr(gen);
}
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingTokens(); ++ti)
{
for (SizeType32 pi = 0; pi < mSamplingParams.getMaxPathLen(); ++pi)
{
bufferCast<SizeType32>(*mInputNextDraftPaths)[flat_index3(batchSlotsPtr[bi], ti, pi,
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())]
= intDistr(gen);
}
auto const numPackedMasks
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
for (SizeType32 mi = 0; mi < numPackedMasks; ++mi)
{
bufferCast<SizeType32>(*mInputSpecDecodingPackedMasks)[flat_index3(
batchSlotsPtr[bi], ti, mi, mSamplingParams.getMaxDecodingTokens(), numPackedMasks)]
= intDistr(gen);
}
bufferCast<SizeType32>(*mInputSpecDecodingPositionOffsets)[flat_index2(
batchSlotsPtr[bi], ti, mSamplingParams.getMaxDecodingTokens())]
= intDistr(gen);
}
bufferCast<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]] = lenDistr(gen);
bufferCast<SizeType32>(*mInputSpecDecodingGenerationLengths)[batchSlotsPtr[bi]]
= bufferCast<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]];
}
}
void callPackData()
{
tksd::PackEagleParams params;
params.batchSize = mSamplingParams.getBatchSize();
params.maxNumPaths = mSamplingParams.getMaxDecodingTokens();
params.maxDecodingTokens = mSamplingParams.getMaxDecodingTokens();
params.maxPathLength = mSamplingParams.getMaxPathLen();
params.numContextRequests = mSamplingParams.getNumCtxRequests();
params.numGenerationRequests = mSamplingParams.getNumGenRequests();
params.batchSlots = bufferCast<SizeType32>(*mBatchSlots);
// Outputs from decoder -- inputs to the packing kernel
params.inputTemperatures = bufferCast<float>(*mInputTemperatures);
params.inputRandomDataSample = bufferCast<float>(*mInputRandomDataSample);
params.inputRandomDataValidation = bufferCast<float>(*mInputRandomDataValidation);
params.inputNextDraftTokens = bufferCast<TokenIdType>(*mInputNextDraftTokens);
params.inputNextDraftLens = bufferCast<SizeType32>(*mInputNextDraftLens);
params.inputNextDraftPaths = bufferCast<SizeType32>(*mInputNextDraftPaths);
params.inputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*mInputSpecDecodingGenerationLengths);
params.inputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*mInputSpecDecodingPositionOffsets);
params.inputSpecDecodingPackedMasks = bufferCast<int32_t>(*mInputSpecDecodingPackedMasks);
// Outputs of the packing kernel -- inputs to the engine
params.outputTemperatures = bufferCast<float>(*mOutputTemperatures);
params.outputRandomDataSample = bufferCast<float>(*mOutputRandomDataSample);
params.outputRandomDataValidation = bufferCast<float>(*mOutputRandomDataValidation);
params.outputNextDraftTokens = bufferCast<TokenIdType>(*mOutputNextDraftTokens);
params.outputNextDraftLens = bufferCast<SizeType32>(*mOutputNextDraftLens);
params.outputNextDraftPaths = bufferCast<SizeType32>(*mOutputNextDraftPaths);
params.outputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*mOutputSpecDecodingGenerationLengths);
params.outputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*mOutputSpecDecodingPositionOffsets);
params.outputSpecDecodingPackedMasks = bufferCast<int32_t>(*mOutputSpecDecodingPackedMasks);
params.maxGenerationLength = bufferCast<SizeType32>(*mMaxGenerationLength);
params.cumSumGenerationLengths = bufferCast<SizeType32>(*mCumSumGenerationLengths);
params.checkParams();
if (mSamplingParams.getNumGenRequests())
{
// Pack tensors from batch slot position to continuous array
tksd::invokePackEagleGenerationLengths(params, mStream->get());
sync_check_cuda_error();
// Compute inclusive sum and max
tksd::invokeScanReduceGenerationLengths(mSamplingParams.getNumGenRequests(),
bufferCast<SizeType32>(*mOutputSpecDecodingGenerationLengths),
bufferCast<uint8_t>(*mScanReduceTempStorage), mScanTempStorageBytes,
bufferCast<SizeType32>(*mCumSumGenerationLengths), bufferCast<uint8_t>(*mScanReduceTempStorage),
mReduceTempStorageBytes, bufferCast<SizeType32>(*mMaxGenerationLength), mStream->get());
sync_check_cuda_error();
}
mStream->synchronize();
// Pack tensors from batch slot position to continuous array
tksd::invokePackEagle(params, mStream->get());
sync_check_cuda_error();
}
void verifyResults()
{
auto batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
EXPECT_EQ(BufferRange<float>(*mInputTemperatures)[batchSlotsPtr[bi]],
BufferRange<float>(*mOutputTemperatures)[bi]);
EXPECT_EQ(BufferRange<float>(*mInputRandomDataSample)[batchSlotsPtr[bi]],
BufferRange<float>(*mOutputRandomDataSample)[bi]);
EXPECT_EQ(BufferRange<float>(*mInputRandomDataValidation)[batchSlotsPtr[bi]],
BufferRange<float>(*mOutputRandomDataValidation)[bi]);
}
auto const numCtxRequests = mSamplingParams.getNumCtxRequests();
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingTokens(); ++ti)
{
for (SizeType32 pi = 0; pi < mSamplingParams.getMaxPathLen(); ++pi)
{
EXPECT_EQ(BufferRange<SizeType32>(*mInputNextDraftPaths)[flat_index3(batchSlotsPtr[bi], ti, pi,
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())],
BufferRange<SizeType32>(*mOutputNextDraftPaths)[flat_index3(
bi, ti, pi, mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())]);
}
}
EXPECT_EQ(BufferRange<SizeType32>(*mOutputNextDraftLens)[bi],
bi < numCtxRequests ? 0 : BufferRange<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]]);
}
auto const maxGenerationLength = bufferCast<SizeType32>(*mMaxGenerationLength)[0];
for (SizeType32 bi = 0; bi < mSamplingParams.getNumGenRequests(); ++bi)
{
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingDraftTokens(); ++ti)
{
EXPECT_EQ(BufferRange<SizeType32>(*mInputNextDraftTokens)[flat_index2(
batchSlotsPtr[numCtxRequests + bi], ti, mSamplingParams.getMaxDecodingDraftTokens())],
BufferRange<SizeType32>(*mOutputNextDraftTokens)[flat_index2(
numCtxRequests + bi, ti, mSamplingParams.getMaxDecodingDraftTokens())]);
}
EXPECT_EQ(BufferRange<SizeType32>(*mInputSpecDecodingGenerationLengths)[batchSlotsPtr[numCtxRequests + bi]],
BufferRange<SizeType32>(*mOutputSpecDecodingGenerationLengths)[bi]);
for (SizeType32 ti = 0; ti < maxGenerationLength; ++ti)
{
EXPECT_EQ(BufferRange<SizeType32>(*mInputSpecDecodingPositionOffsets)[flat_index2(
batchSlotsPtr[numCtxRequests + bi], ti, mSamplingParams.getMaxDecodingTokens())],
BufferRange<SizeType32>(
*mOutputSpecDecodingPositionOffsets)[flat_index2(bi, ti, maxGenerationLength)])
<< "bi: " << bi << " ti: " << ti;
}
auto const numTokens = (bi == 0) ? bufferCast<SizeType32>(*mCumSumGenerationLengths)[0]
: bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi]
- bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi - 1];
auto const outputStartId = (bi == 0) ? 0 : bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi - 1];
auto const numPackedMasks
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
for (SizeType32 ti = 0; ti < numTokens * numPackedMasks; ++ti)
{
EXPECT_EQ(BufferRange<SizeType32>(
*mInputSpecDecodingPackedMasks)[flat_index2(batchSlotsPtr[numCtxRequests + bi], ti,
mSamplingParams.getMaxDecodingTokens() * numPackedMasks)],
BufferRange<SizeType32>(
*mOutputSpecDecodingPackedMasks)[flat_index2(outputStartId, ti, numPackedMasks)])
<< "bi: " << bi << " ti: " << ti;
}
}
}
void run(SamplingParams samplingParams)
{
mSamplingParams = samplingParams;
allocateBuffers();
initBuffers();
callPackData();
mStream->synchronize();
verifyResults();
}
private:
std::shared_ptr<tensorrt_llm::runtime::CudaStream> mStream;
std::shared_ptr<tensorrt_llm::runtime::BufferManager> mBufferManager;
// input
TensorPtr mBatchSlots;
TensorPtr mInputTemperatures;
TensorPtr mInputRandomDataSample;
TensorPtr mInputRandomDataValidation;
TensorPtr mInputNextDraftTokens;
TensorPtr mInputNextDraftLens;
TensorPtr mInputNextDraftPaths;
TensorPtr mInputSpecDecodingGenerationLengths;
TensorPtr mInputSpecDecodingPositionOffsets;
TensorPtr mInputSpecDecodingPackedMasks;
// output
TensorPtr mOutputTemperatures;
TensorPtr mOutputRandomDataSample;
TensorPtr mOutputRandomDataValidation;
TensorPtr mOutputNextDraftTokens;
TensorPtr mOutputNextDraftLens;
TensorPtr mOutputNextDraftPaths;
TensorPtr mOutputSpecDecodingGenerationLengths;
TensorPtr mOutputSpecDecodingPositionOffsets;
TensorPtr mOutputSpecDecodingPackedMasks;
// workspace
TensorPtr mMaxGenerationLength;
TensorPtr mCumSumGenerationLengths;
BufferPtr mScanReduceTempStorage;
SizeType32 mScanTempStorageBytes;
SizeType32 mReduceTempStorageBytes;
SamplingParams mSamplingParams;
};
TEST_F(EaglePackDataTest, Ctx1Gen0)
{
SamplingParams params;
params.setNumCtxRequests(1);
params.setNumGenRequests(0);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx0Gen1)
{
SamplingParams params;
params.setNumCtxRequests(0);
params.setNumGenRequests(1);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx100Gen0)
{
SamplingParams params;
params.setNumCtxRequests(100);
params.setNumGenRequests(0);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx0Gen100)
{
SamplingParams params;
params.setNumCtxRequests(0);
params.setNumGenRequests(100);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx100Gen100)
{
SamplingParams params;
params.setNumCtxRequests(100);
params.setNumGenRequests(100);
this->run(params);
}
} // namespace

View File

@ -137,7 +137,6 @@
"layernorm_quantization_plugin": null,
"rmsnorm_quantization_plugin": null,
"nccl_plugin": null,
"lookup_plugin": null,
"lora_plugin": "auto",
"weight_only_groupwise_quant_matmul_plugin": null,
"weight_only_quant_matmul_plugin": null,

View File

@ -139,7 +139,8 @@ def parallel_run_ctest(
# Some catastrophic fail happened that there's no report generated
raise
parallel_report = 'parallel-' + report
# Avoid .xml extension to prevent CI from reading failures from it
parallel_report = 'parallel-' + report + ".intermediate"
_os.rename(cwd / report, cwd / parallel_report)
try:
@ -153,7 +154,7 @@ def parallel_run_ctest(
# Use parallel result as final report
_os.rename(cwd / parallel_report, cwd / report)
else:
retry_report = 'retry-' + report
retry_report = 'retry-' + report + ".intermediate"
_os.rename(cwd / report, cwd / retry_report)
merge_report(cwd / parallel_report, cwd / retry_report,
cwd / report)
@ -472,6 +473,12 @@ def prepare_multi_gpu_model_tests(python_exe: str,
model_cache_arg=model_cache_arg,
only_multi_gpu_arg=only_multi_gpu_arg)
prepare_model_tests(model_name="llama",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)
prepare_model_tests(model_name="t5",
python_exe=python_exe,
root_dir=root_dir,
@ -704,6 +711,19 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
]
run_command(cache_trans_test, cwd=tests_dir, env=cpp_env, timeout=300)
# Cache transceiver tests
cache_trans_test_8_proc = [
"mpirun",
"-n",
"8",
"--allow-run-as-root",
"batch_manager/cacheTransceiverTest",
]
run_command(cache_trans_test_8_proc,
cwd=tests_dir,
env=cpp_env,
timeout=600)
# UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
if _os.path.exists(
_os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
@ -843,6 +863,19 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
new_env = copy.copy(cpp_env)
new_env["RUN_LLAMA_MULTI_GPU"] = "true"
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=8,
local_commands=[
"executor/executorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path,
build_dir: _pl.Path, resources_dir: _pl.Path,

View File

@ -1,6 +1,6 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=24.07-py3
ARG BASE_TAG=24.09-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} as base

View File

@ -155,7 +155,7 @@ centos7_%: BASE_TAG = 12.4.0-devel-centos7
# For x86_64 and aarch64
ubuntu22_%: STAGE = devel
ubuntu22_%: BASE_IMAGE = nvidia/cuda
ubuntu22_%: BASE_TAG = 12.5.1-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 12.6.1-devel-ubuntu22.04
trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0

View File

@ -4,7 +4,7 @@ set -ex
# This script is used for reinstalling CUDA on CentOS 7 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
CUDA_VER="12.5.1_555.42.06"
CUDA_VER="12.6.1_560.35.03"
CUDA_VER_SHORT="${CUDA_VER%_*}"
NVCC_VERSION_OUTPUT=$(nvcc --version)

View File

@ -4,12 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07
TORCH_VERSION="2.4.0"
# Check the compatible torchvision from
# https://github.com/pytorch/vision/tree/main?tab=readme-ov-file#installation
# and also confirm with https://pypi.org/pypi/torchvision/0.19.0/json
TORCHVISION_VERSION="0.19.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-09.html#rel-24-09
TORCH_VERSION="2.4.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {
@ -61,6 +57,9 @@ install_from_source() {
python3 setup.py install
cd /tmp && rm -rf /tmp/pytorch
# Get torchvision version by dry run
TORCHVISION_VERSION=$(pip3 install --dry-run torch==${TORCH_VERSION} torchvision | grep "Would install" | tr ' ' '\n' | grep torchvision | cut -d "-" -f 2)
export PYTORCH_VERSION=${PYTORCH_BUILD_VERSION}
export FORCE_CUDA=1
export BUILD_VERSION=${TORCHVISION_VERSION}
@ -76,7 +75,7 @@ install_from_source() {
install_from_pypi() {
pip3 uninstall -y torch torchvision
pip3 install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION}
pip3 install torch==${TORCH_VERSION} torchvision
}
case "$1" in

View File

@ -2,19 +2,20 @@
set -ex
TRT_VER="10.4.0.26"
TRT_VER="10.5.0.18"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07
CUDA_VER="12.5" # 12.5.1
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-09.html#rel-24-09
CUDA_VER="12.6" # 12.6.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.2.1.18-1"
NCCL_VER="2.22.3-1+cuda12.5"
CUBLAS_VER="12.5.3.2-1"
CUDA_DRIVER_VERSION="560.35.03-1"
CUDNN_VER="9.4.0.58-1"
NCCL_VER="2.22.3-1+cuda12.6"
CUBLAS_VER="12.6.1.4-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/archive/12.5.1/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.5.82-1"
CUDA_RUNTIME="12.5.82-1"
# https://docs.nvidia.com/cuda/archive/12.6.1/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.6.68-1"
CUDA_RUNTIME="12.6.68-1"
for i in "$@"; do
case $i in
@ -76,7 +77,9 @@ install_centos_requirements() {
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.x86_64.rpm
yum remove -y "cuda-toolkit*" && yum -y localinstall cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm
yum remove -y "cuda-compat*" && yum -y localinstall cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.x86_64.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
yum remove -y "libcublas*" && yum -y localinstall libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
@ -96,7 +99,7 @@ install_tensorrt() {
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu24_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

View File

@ -46,6 +46,12 @@ myst_url_schemes = {
"https://github.com/NVIDIA/TensorRT-LLM/tree/" + branch_name + "/{{path}}",
}
myst_heading_anchors = 4
myst_enable_extensions = [
"deflist",
]
autosummary_generate = True
copybutton_exclude = '.linenos, .gp, .go'
copybutton_prompt_text = ">>> |$ |# "

View File

@ -111,7 +111,8 @@ Welcome to TensorRT-LLM's Documentation!
:name: Performance
performance/perf-overview.md
performance/perf-best-practices.md
Benchmarking <performance/perf-benchmarking.md>
Best Practices <performance/perf-best-practices.md>
performance/perf-analysis.md

View File

@ -185,12 +185,15 @@ Building from source produces the following library files.
- `th_common.exp`
- `th_common.lib`
The locations of the DLLs, in addition to some `torch` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
```bash
%USERPROFILE%\inference\TensorRT\lib
%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib
```
Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder.
Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.

View File

@ -4,7 +4,7 @@
```{note}
The Windows release of TensorRT-LLM is currently in beta.
We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.13.0) for the most stable experience.
We recommend checking out the [v0.14.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.14.0) for the most stable experience.
```
**Prerequisites**
@ -52,7 +52,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L
before installing TensorRT-LLM with the following command.
```bash
pip install tensorrt_llm==0.13.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/
pip install tensorrt_llm==0.14.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/
```
Run the following command to verify that your TensorRT-LLM installation is working properly.
@ -70,7 +70,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L
This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install
[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version)
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSCV DLL).
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL).
2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies.

View File

@ -1,26 +1,36 @@
(perf-benchmarking)=
# TensorRT-LLM Benchmarking
> [!WARNING] Work in Progress
> This benchmarking suite is a current work in progress and is prone to large changes.
```{important}
This benchmarking suite is a work in progress.
Expect breaking API changes.
```
TensorRT-LLM provides a packaged benchmarking utility that is accessible via the `trtllm-bench` CLI tool.
TensorRT-LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility.
#### Supported Networks for Benchmarking
- [`tiiuae/falcon-180B`](https://huggingface.co/tiiuae/falcon-180B)
- [`meta-llama/Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf)
- [`meta-llama/Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf)
- [`meta-llama/Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
- [`meta-llama/Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
- [`EleutherAI/gpt-j-6b`](https://huggingface.co/EleutherAI/gpt-j-6b)
- [`mistralai/Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [`mistralai/Mixtral-8x7B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
- [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
- [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)
- [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)
- [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
- [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B)
- [meta-llama/Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- [meta-llama/Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
- [meta-llama/Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
#### Supported Quantization Modes
TensorRT-LLM supports a number of quanization modes. For more information about quantization, see the
[documentation](https://nvidia.github.io/TensorRT-LLM/reference/precision.html).
TensorRT-LLM supports a number of quantization modes:
- None (no quantization applied)
- W8A16
@ -31,7 +41,8 @@ TensorRT-LLM supports a number of quanization modes. For more information about
- FP8
- INT8
> [!NOTE] Please see the supported quantization methods for each network [here](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)
For more information about quantization, refer to [](../reference/precision.md) and
the [support matrix](../reference/precision.md#support-matrix), which lists the supported quantization methods for each network.
## Inflight Benchmarking with a Dataset
@ -41,9 +52,10 @@ This section covers how to benchmark TensorRT-LLM using inflight batching.
### Quickstart
For this quick start guide, we will focus on running a short max throughput benchmark on
This quick start focuses on running a short max throughput benchmark on
`meta-llama/Llama-2-7b-hf` on a synthetic dataset with a uniform distribution of prompts with ISL:OSL
of 128:128. In order to run the benchmark from start to finish simply run the following commands:
of 128:128.
To run the benchmark from start to finish, run the following commands:
```shell
python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-2-7b-hf token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt
@ -51,7 +63,8 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --dataset /tmp/synthetic_128
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
```
And that's it! Once the benchmark completes, a summary will be printed with summary metrics.
And that's it!
After the benchmark completes, `trtllm-bench` prints a summary of the collected metrics.
```shell
===========================================================
@ -108,28 +121,31 @@ straightforward to specify requests. The schema is defined as follows:
| `logits` | N* | List[Integer] | List of logits that make up the request prompt. |
| `output_tokens` | Y | Integer | Number of generated tokens for this request. |
> [!NOTE] Prompt and logits are mutually exclusive*
> While having both `prompt` and `logits` is not required, at least one is required.
> If `logits` are specified, the `prompt` entry is ignored for request generation.
Prompt and logits are mutually exclusive, but one of `prompt` or `logits` is required.
If you specify `logits`, the `prompt` entry is ignored for request generation.
Examples of valid entries for the inflight benchmark are:
Refer to the following examples of valid entries for the inflight benchmark:
- Entries with a human-readable prompt and no logits.
```json
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
```
```json
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
```
- Entries which contain logits.
```json
{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
```
> [!INFO] A whole entry is on a line!
> To make the passing of data simpler, a complete JSON entry is on each line so that the benchmarker
> can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
> JSON entry is on every line.
```json
{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
```
```{tip}
Specify each entry on one line.
To simplify passing the data, a complete JSON entry is on each line so that the benchmarker
can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
JSON entry is on every line.
```
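For illustration only, the following minimal Python sketch writes a dataset in this line-delimited format. It is not part of TensorRT-LLM; the file path, prompts, and token IDs are arbitrary placeholders.
```python
import json

# Placeholder entries that follow the request schema described above.
entries = [
    {"task_id": 0, "prompt": "Write a short story about a robot.", "output_tokens": 128},
    {"task_id": 1, "logits": [101, 2023, 2003, 1037, 3231], "output_tokens": 128},
]

# Write one complete JSON entry per line so the benchmarker can read the file line by line.
with open("/tmp/my_dataset.txt", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```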
#### Using `prepare_dataset` to Create Synthetic Datasets
@ -162,12 +178,12 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --max_seq_len 256 --quantiza
> [!NOTE] `trtllm-bench build` reproduces benchmark engines for performance study. These engine
configurations are not guaranteed to be optimal for all cases and should be viewed as reproducers
for the benchmark data we provide on our [Performance Overview](../docs/source/performance/perf-overview.md).
for the benchmark data we provide on our [Performance Overview](./perf-overview.md).
Looking a little closer, the `build` sub-command
will perform a lookup and build an engine using those reference settings. The
lookup table directly corresponds to the performance table found in our
[Performance Overview](../docs/source/performance/perf-overview.md#throughput-measurements). The
[Performance Overview](./perf-overview.md#throughput-measurements). The
output of the `build` sub-command looks similar to the snippet below (for `meta-llama/Llama-2-7b-hf`):
```shell
@ -236,16 +252,17 @@ upper bound throughput number.
#### How the Benchmarker Works
The benchmarker will read in a data file or standard input (stdin) as a stream where a single line contains
a complete JSON request entry. The process that the benchmarker is as follows:
The benchmarker reads a data file where a single line contains
a complete JSON request entry as specified in [](#preparing-a-dataset).
The benchmarker proceeds as follows (a brief sketch of this flow appears after the list):
1. Iterate over all input requests. If `logits` is specified, construct the request using the specified
list of logits. Otherwise, tokenize the `prompt` with the tokenizer specified by `--model $HF_MODEL_NAME`.
3. Submit the dataset to the TensorRT-LLM `Executor` API at as fast of a rate as possible (offline mode).
4. Wait for all requests to return, compute statistics, then report out results.
1. Submit the dataset to the TensorRT-LLM `Executor` API as fast as possible (offline mode).
1. Wait for all requests to return, compute statistics, and then report results.
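The following Python sketch illustrates this flow at a high level. It is not the actual `trtllm-bench` implementation; `tokenize`, `submit_request`, and `wait_for_all` are hypothetical callables that stand in for the tokenizer and the `Executor` bindings.
```python
import json

def run_benchmark(dataset_path, tokenize, submit_request, wait_for_all):
    """Minimal sketch of the benchmarking flow; all three callables are hypothetical."""
    requests = []
    with open(dataset_path) as f:
        for line in f:  # one complete JSON entry per line
            entry = json.loads(line)
            # If logits are given, use them directly; otherwise tokenize the prompt.
            input_ids = entry["logits"] if "logits" in entry else tokenize(entry["prompt"])
            requests.append((input_ids, entry["output_tokens"]))

    # Offline mode: submit every request as fast as possible, then wait for all of them.
    handles = [submit_request(ids, max_new_tokens) for ids, max_new_tokens in requests]
    results = wait_for_all(handles)
    return results  # statistics are computed from these results and reported
```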
To run the benchmarker, run the following with the [engine](#building-a-benchmark-engine) and
[dataset](#preparing-a-dataset) generated above:
To run the benchmarker, run the following commands with the [engine](#building-a-benchmark-engine) and
[dataset](#preparing-a-dataset) generated from previous steps:
```shell
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
@ -316,16 +333,157 @@ Total Latency (seconds): 20.331645167
[TensorRT-LLM][INFO] Refreshed the MPI local session
```
## Low Latency Benchmark
The low latency benchmark follows a similar workflow to the [throughput benchmark](#running-a-max-throughput-benchmark)
but requires building the engine separately from `trtllm-bench`. The low latency benchmark has the following modes:
- A single-request low-latency engine
- A Medusa-enabled speculative-decoding engine
### Low Latency TensorRT-LLM Engine for Llama-3 70B
To build a low-latency engine for the latency benchmark, run the following quantize and build commands.
The `$checkpoint_dir` is the path to the [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) Hugging Face checkpoint in your cache or downloaded to a specific location with the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
To prepare a dataset, follow the same process as specified in [](#preparing-a-dataset).
#### Benchmarking a non-Medusa Low Latency Engine
To quantize the checkpoint:
```shell
cd tensorrt_llm/examples/llama
python ../quantization/quantize.py \
--model_dir $checkpoint_dir \
--dtype bfloat16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
--calib_size 512 \
--tp_size $tp_size
```
Then build the engine:
```shell
trtllm-build \
--checkpoint_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
--use_fused_mlp enable \
--gpt_attention_plugin bfloat16 \
--output_dir /tmp/meta-llama/Meta-Llama-3-70B/engine \
--max_batch_size 1 \
--max_seq_len $(($isl+$osl)) \
--reduce_fusion enable \
--gemm_plugin fp8 \
--workers $tp_size \
--use_fp8_context_fmha enable \
--max_num_tokens $isl \
--use_paged_context_fmha disable \
--multiple_profiles enable
```
After the engine is built, run the low-latency benchmark:
```shell
env TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG=1 \
TRTLLM_MMHA_KERNEL_BLOCK_SIZE=256 \
TRTLLM_MMHA_BLOCKS_PER_SEQUENCE=32 \
FORCE_MULTI_BLOCK_MODE=ON \
TRTLLM_ENABLE_FDL=1 \
trtllm-bench --model meta-llama/Meta-Llama-3-70B \
latency \
--dataset $DATASET_PATH \
--engine_dir /tmp/meta-llama/Meta-Llama-3-70B/engine
```
#### Building a Medusa Low-Latency Engine
Building a Medusa-enabled engine requires checkpoints that contain Medusa heads.
NVIDIA provides TensorRT-LLM checkpoints on the [NVIDIA](https://huggingface.co/nvidia) page on Hugging Face.
The checkpoints are pre-quantized and can be directly built after downloading them with the
[huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
After you download the checkpoints, run the following command and specify the `$tp_size` supported by your Medusa checkpoint:
```shell
trtllm-build --checkpoint_dir $checkpoint_dir \
--speculative_decoding_mode medusa \
--max_batch_size 1 \
--gpt_attention_plugin bfloat16 \
--max_seq_len $(($isl+$osl)) \
--output_dir /tmp/meta-llama/Meta-Llama-3-70B/medusa/engine \
--use_fused_mlp enable \
--paged_kv_cache enable \
--use_paged_context_fmha disable \
--multiple_profiles enable \
--reduce_fusion enable \
--use_fp8_context_fmha enable \
--workers $tp_size \
--low_latency_gemm_plugin fp8
```
After the engine is built, you need to define the Medusa choices.
The choices are specified in a YAML file like the following example (`medusa.yaml`):
```yaml
- [0]
- [0, 0]
- [1]
- [0, 1]
- [2]
- [0, 0, 0]
- [1, 0]
- [0, 2]
- [3]
- [0, 3]
- [4]
- [0, 4]
- [2, 0]
- [0, 5]
- [0, 0, 1]
```
To run the Medusa-enabled engine, run the following command:
```shell
env TRTLLM_ENABLE_PDL=1 \
UB_ONESHOT=1 \
UB_TP_SIZE=$tp_size \
TRTLLM_PDL_OVERLAP_RATIO=0.15 \
TRTLLM_PREFETCH_RATIO=-1 \
trtllm-bench --model meta-llama/Meta-Llama-3-70B \
latency \
--dataset $DATASET_PATH \
--engine_dir /tmp/meta-llama/Meta-Llama-3-70B/medusa/engine \
--medusa_choices medusa.yaml
```
## Summary
In summary, the general process for reproducing a benchmark point is as follows:
The following table summarizes the commands needed for running benchmarks:
- Prepare a dataset: `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH`
- Build engine: `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH`
- Benchmark engine: trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR`
| Scenario | Phase | Command |
| - | - | - |
| Dataset | Preparation | `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH` |
| Throughput | Build | `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH` |
| Throughput | Benchmark | `trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
| Latency | Build | See [section about building low latency engines](#low-latency-tensorrt-llm-engine-for-llama-3-70b) |
| Non-Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
| Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR --medusa_choices $MEDUSA_CHOICES` |
where,
- `$HF_MODEL` is the Huggingface name of a model.
- `$NUM_REQUESTS` is the number of requests to generate.
- `$DATASET_PATH` is the path where the dataset was written when preparing the dataset.
- `$ENGINE_DIR` the engine directory as printed by `trtllm-bench build`.
`$HF_MODEL`
: The Hugging Face name of a model.
`$NUM_REQUESTS`
: The number of requests to generate.
`$DATASET_PATH`
: The path where the dataset was written when preparing the dataset.
`$ENGINE_DIR`
: The engine directory as printed by `trtllm-bench build`.
`$MEDUSA_CHOICES`
: A YAML config representing the Medusa tree for the benchmark.

View File

@ -149,24 +149,16 @@ only supported for the llama model. It is recommended to enable this feature whe
The embedding parallelism feature enables the sharding of the embedding table
across multiple GPUs, so that the memory usage could be reduced and the
throughput improved. The embedding sharing feature enables the sharing of the
embedding table between `look_up` and `lm_head` layers.
embedding table between `look_up` and `lm_head` layers to reduce memory usage.
The look-up plugin implements the embedding sharing feature and is required to
enable the aforementioned features for now (until TensorRT native layers
support embedding sharing).
It is recommended to enable embedding parallelism to improve throughput with `--use_parallel_embedding` and `--embedding_sharding_dim` in `convert_checkpoint.py`.
It is recommended to enable the embedding parallelism and sharing features to
improve throughput. However, the following conditions have to be satisfied:
Embedding sharing is enabled by default if the following conditions are met:
1. `look_up` and `lm_head` layers have identical weights.
2. `--gemm_plugin` is not used when building the engine.
3. For tensor parallelism cases, `--embedding_sharding_dim 0` must be set. In other words, embedding parallelism must be enabled along the vocab dimension.
1. The model shares the embedding table between `look_up` and `lm_head` layers,
2. Both look_up plugin and gemm plugin are enabled,
3. The sharding dimension of the embedding lookup table is set correctly.
To enable the features, use the `--use_parallel_embedding`, `--embedding_sharding_dim` and
`--use_embedding_sharing` arguments in `convert_checkpoint.py`, and use the
`--lookup_plugin`, `--gemm_plugin` arguments in `trtllm-build` command. See those
[Examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#embedding-parallelism-and-sharing)
for details.
See the [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#embedding-parallelism) for details.
### Horizontal Fusion in Gated-MLP
@ -226,7 +226,7 @@ The performance numbers below were collected using the steps described in this d
> [!NOTE] The only models supported in this workflow are those listed in the table above.
The following tables are references for commands that are used as part of the benchmarking process. For a more detailed
description of this benchmarking workflow, see the [Benchmarking Suite README](../../../benchmarks/Suite.md).
description of this benchmarking workflow, see the [benchmarking suite documentation](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html).
### Commands
@ -105,9 +105,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [24.07](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [24.09](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.4](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.5](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4
@ -12,27 +12,6 @@ In most occasions, these problems are caused by the workflow like: an old compil
Solution: try running the build script with `--clean`, or run `rm -r build cpp/build` before running the build script again.
## cuDNN Linking Errors
Errors such as "Entry Point Not Found" (for example [#1062](https://github.com/NVIDIA/TensorRT-LLM/issues/1062)).
Solution: the issue might be a mismatch in the `cuDNN` libraries shipped from `torch` and `tensorrt`. To rectify this, please try the following steps
```bash
python -m pip uninstall -y tensorrt_llm
python -m pip install --upgrade pip
python -m pip install nvidia-cudnn-cu11==8.9.4.25 --no-cache-dir
python -m pip install --pre --extra-index-url https://pypi.nvidia.com/ tensorrt==9.2.0.post12.dev5 --no-cache-dir
python -m pip uninstall -y nvidia-cudnn-cu11
python -m pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com/ --extra-index-url https://download.pytorch.org/whl/cu121
```
## Model Debug
When debugging a TensorRT-LLM model, we usually want to print the values of intermediate tensors.
We should mark the intermediate tensors of interest as network outputs and then print their values at runtime, since TensorRT-LLM follows the define-and-run paradigm.
## Debug on Unit Tests
Here is an example of printing the values of the MLP output tensor in a unit test ([full example](../../../tests/test_debugging_api.py)).
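A minimal sketch of that pattern is shown below; it assumes the `register_network_output` helper and the `MLP` layer exposed by TensorRT-LLM, and exact names or signatures may differ between versions:
```python
# Sketch only: expose an MLP's output as a network output so its values can be
# inspected at runtime. Class and method names follow the usual TensorRT-LLM
# debugging pattern but are assumptions and may vary across versions.
from tensorrt_llm.functional import Tensor
from tensorrt_llm.layers import MLP


class DebuggableMLP(MLP):

    def forward(self, hidden_states: Tensor) -> Tensor:
        out = super().forward(hidden_states)
        # Because TensorRT-LLM follows a define-and-run paradigm, the tensor has
        # to be registered as a network output at definition time; it can then be
        # read back (and printed) when the engine runs with debug output enabled.
        self.register_network_output('mlp_output', out)
        return out
```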
@ -5,6 +5,49 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
## TensorRT-LLM Release 0.14.0
### Key Features and Enhancements
- Enhanced the `LLM` class in the [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/index.html).
- Added support for calibration with offline dataset.
- Added support for Mamba2.
- Added support for `finish_reason` and `stop_reason`.
- Added FP8 support for CodeLlama.
- Added `__repr__` methods for class `Module`, thanks to the contribution from @1ytic in #2191.
- Added BFloat16 support for fused gated MLP.
- Updated ReDrafter beam search logic to match Apple ReDrafter v1.1.
- Improved `customAllReduce` performance.
- Draft model now can copy logits directly over MPI to the target model's process in `orchestrator` mode. This fast logits copy reduces the delay between draft token generation and the beginning of target model inference.
- NVIDIA Volta GPU support is deprecated and will be removed in a future release.
### API Changes
- [BREAKING CHANGE] The default `max_batch_size` of the `trtllm-build` command is set to `2048`.
- [BREAKING CHANGE] Removed `builder_opt` from the `BuildConfig` class and the `trtllm-build` command.
- Added logits post-processor support to the `ModelRunnerCpp` class.
- Added `isParticipant` method to the C++ `Executor` API to check if the current process is a participant in the executor instance.
### Model Updates
- Added support for NemotronNas, see `examples/nemotron_nas/README.md`.
- Added support for Deepseek-v1, see `examples/deepseek_v1/README.md`.
- Added support for Phi-3.5 models, see `examples/phi/README.md`.
### Fixed Issues
- Fixed a typo in `tensorrt_llm/models/model_weights_loader.py`, thanks to the contribution from @wangkuiyi in #2152.
- Fixed duplicated import module in `tensorrt_llm/runtime/generation.py`, thanks to the contribution from @lkm2835 in #2182.
- Enabled `share_embedding` for the models that have no `lm_head` in legacy checkpoint conversion path, thanks to the contribution from @lkm2835 in #2232.
- Fixed `kv_cache_type` issue in the Python benchmark, thanks to the contribution from @qingquansong in #2219.
- Fixed an issue with SmoothQuant calibration with custom datasets. Thanks to the contribution by @Bhuvanesh09 in #2243.
- Fixed an issue surrounding `trtllm-build --fast-build` with fake or random weights. Thanks to @ZJLi2013 for flagging it in #2135.
- Fixed missing `use_fused_mlp` when constructing `BuildConfig` from dict, thanks for the fix from @ethnzhng in #2081.
- Fixed lookahead batch layout for `numNewTokensCumSum`. (#2263)
### Infrastructure Changes
- The dependent ModelOpt version is updated to v0.17.
### Documentation
- @Sherlock113 added a [tech blog](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) to the latest news in #2169, thanks for the contribution.
## TensorRT-LLM Release 0.13.0
### Key Features and Enhancements
@ -2,6 +2,9 @@
import asyncio
import json
import logging
import signal
from contextlib import asynccontextmanager
from http import HTTPStatus
from typing import AsyncGenerator, Optional
import click
@ -9,6 +12,7 @@ import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from tensorrt_llm.executor import CppExecutorError, RequestError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
TIMEOUT_KEEP_ALIVE = 5 # seconds.
@ -16,11 +20,16 @@ TIMEOUT_KEEP_ALIVE = 5 # seconds.
class LlmServer:
def __init__(self, llm: LLM, kv_cache_config: KvCacheConfig):
def __init__(self, llm: LLM):
self.llm = llm
self.kv_cache_config = kv_cache_config
self.app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
# terminate rank0 worker
yield
self.llm._shutdown()
self.app = FastAPI(lifespan=lifespan)
self.register_routes()
def register_routes(self):
@ -50,20 +59,27 @@ class LlmServer:
sampling_params = SamplingParams(**request_dict)
promise = self.llm.generate_async(prompt,
streaming=streaming,
sampling_params=sampling_params)
try:
promise = self.llm.generate_async(prompt,
streaming=streaming,
sampling_params=sampling_params)
async def stream_results() -> AsyncGenerator[bytes, None]:
async for output in promise:
yield output.outputs[0].text_diff.encode("utf-8")
async def stream_results() -> AsyncGenerator[bytes, None]:
async for output in promise:
yield output.outputs[0].text_diff.encode("utf-8")
if streaming:
return StreamingResponse(stream_results())
if streaming:
return StreamingResponse(stream_results())
# Non-streaming case
await promise.aresult()
return JSONResponse({"text": promise.outputs[0].text})
# Non-streaming case
await promise.aresult()
return JSONResponse({"text": promise.outputs[0].text})
except RequestError as e:
return JSONResponse(content=str(e),
status_code=HTTPStatus.BAD_REQUEST)
except CppExecutorError:
# If internal executor error is raised, shutdown the server
signal.raise_signal(signal.SIGINT)
async def __call__(self, host, port):
config = uvicorn.Config(self.app,
@ -82,28 +98,32 @@ class LlmServer:
@click.option("--max_beam_width", type=int, default=1)
@click.option("--tp_size", type=int, default=1)
@click.option("--pp_size", type=int, default=1)
@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8)
def entrypoint(model_dir: str,
tokenizer: Optional[str] = None,
host: Optional[str] = None,
port: int = 8000,
max_beam_width: int = 1,
tp_size: int = 1,
pp_size: int = 1):
pp_size: int = 1,
kv_cache_free_gpu_memory_fraction: float = 0.8):
host = host or "0.0.0.0"
port = port or 8000
logging.info(f"Starting server at {host}:{port}")
build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width)
kv_cache_config = KvCacheConfig(
free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)
llm = LLM(model_dir,
tokenizer,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
build_config=build_config)
build_config=build_config,
kv_cache_config=kv_cache_config)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
server = LlmServer(llm=llm, kv_cache_config=kv_cache_config)
server = LlmServer(llm=llm)
asyncio.run(server(host, port))
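For reference, a request against this server could look like the sketch below; the `/generate` route name and the payload fields (`prompt`, `streaming`, plus `SamplingParams` keywords such as `max_tokens`) are assumptions inferred from the handler above rather than a documented API:
```python
# Hypothetical client sketch; route path and payload shape are assumptions based
# on the generate handler shown in the diff above.
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Where is New York?", "max_tokens": 32},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["text"])
```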
@ -1,6 +1,8 @@
#!/usr/bin/env python
import asyncio
import logging
import signal
from contextlib import asynccontextmanager
from http import HTTPStatus
from pathlib import Path
from typing import (AsyncGenerator, AsyncIterator, List, Optional, Tuple,
@ -15,6 +17,7 @@ from openai.types.chat import ChatCompletionMessageParam
from transformers import AutoTokenizer, PreTrainedTokenizer
# yapf: disable
from tensorrt_llm.executor import CppExecutorError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.openai_protocol import (
@ -66,10 +69,8 @@ class OpenaiServer:
def __init__(self,
llm: LLM,
model: str,
kv_cache_config: KvCacheConfig,
hf_tokenizer: PreTrainedTokenizer = None):
self.llm = llm
self.kv_cache_config = kv_cache_config
self.tokenizer = hf_tokenizer
model_dir = Path(model)
@ -78,7 +79,13 @@ class OpenaiServer:
else:
self.model = model
self.app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
# terminate rank0 worker
yield
self.llm._shutdown()
self.app = FastAPI(lifespan=lifespan)
@self.app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
@ -326,7 +333,9 @@ class OpenaiServer:
else:
response = await create_chat_response(promise)
return JSONResponse(content=response.model_dump())
except CppExecutorError:
# If internal executor error is raised, shutdown the server
signal.raise_signal(signal.SIGINT)
except Exception as e:
return self.create_error_response(str(e))
@ -432,6 +441,9 @@ class OpenaiServer:
else:
response = await create_completion_response(generator, num_choices)
return JSONResponse(content=response.model_dump())
except CppExecutorError:
# If internal executor error is raised, shutdown the server
signal.raise_signal(signal.SIGINT)
except Exception as e:
return self.create_error_response(str(e))
@ -453,6 +465,7 @@ class OpenaiServer:
@click.option("--max_seq_len", type=int, default=512)
@click.option("--tp_size", type=int, default=1)
@click.option("--pp_size", type=int, default=1)
@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8)
def entrypoint(model_dir: str,
tokenizer: Optional[str] = None,
host: Optional[str] = None,
@ -460,25 +473,27 @@ def entrypoint(model_dir: str,
max_beam_width: int = 1,
max_seq_len: int = 512,
tp_size: int = 1,
pp_size: int = 1):
pp_size: int = 1,
kv_cache_free_gpu_memory_fraction: float = 0.8):
host = host or "0.0.0.0"
port = port or 8000
logging.info(f"Starting server at {host}:{port}")
build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width, max_seq_len=max_seq_len)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)
llm = LLM(model_dir,
tokenizer,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
build_config=build_config)
build_config=build_config,
kv_cache_config=kv_cache_config)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer or model_dir)
server = OpenaiServer(llm=llm,
model=model_dir,
kv_cache_config=kv_cache_config,
hf_tokenizer=hf_tokenizer)
asyncio.run(server(host, port))
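Since this server builds on the OpenAI protocol types, a client call could look like the sketch below; the `/v1` route prefix, port, placeholder model name, and the unused API key are assumptions rather than documented behavior:
```python
# Hypothetical usage sketch for the OpenAI-compatible server above; base URL,
# model name, and API key handling are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-70B",  # placeholder model name
    messages=[{"role": "user", "content": "Where is New York?"}],
    max_tokens=32,
)
print(completion.choices[0].message.content)
```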
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2
@ -106,7 +106,6 @@ python convert_checkpoint.py --model_dir ./bloom/176B/ \
--embedding_sharding_dim 0
trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
--gemm_plugin float16 \
--lookup_plugin float16 \
--output_dir ./bloom/176B/trt_engines/fp16/8-gpu/ \
--workers 2
@ -123,7 +122,7 @@ trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
--workers 2
# share embedding table between embedding() and lm_head() layers
# To reduce the generated engine size, we have to use the gemm and lookup plugins (--gemm_plugin --lookup_plugin) and must shard the embedding table in the vocab dimension.
# To reduce the generated engine size, we can turn off the gemm plugin and shard the embedding table in the vocab dimension.
python convert_checkpoint.py --model_dir ./bloom/176B/ \
--dtype float16 \
--output_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
@ -132,8 +131,6 @@ python convert_checkpoint.py --model_dir ./bloom/176B/ \
--embedding_sharding_dim 0 \
--use_embedding_sharing
trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
--gemm_plugin float16 \
--lookup_plugin float16 \
--output_dir ./bloom/176B/trt_engines/fp16/8-gpu/ \
--workers 2
```
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2
@ -1,8 +1,10 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
protobuf
rouge_score~=0.1.2
sentencepiece
tiktoken
# https://github.com/THUDM/ChatGLM3/issues/1324
transformers<=4.43.0
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99