Mirror of https://github.com/NVIDIA/TensorRT-LLM.git, synced 2026-01-13 22:18:36 +08:00
Update TensorRT-LLM (#2389)
* Update TensorRT-LLM

Co-authored-by: Alessio Netti <netti.alessio@gmail.com>
parent 3c46c2794e
commit f14d1d433c
.gitmodules (vendored, 2 changes)
@@ -13,7 +13,7 @@
 url = https://github.com/NVIDIA/NVTX.git
 [submodule "3rdparty/ucxx"]
 path = 3rdparty/ucxx
-url = https://github.com/GuanLuo/ucxx.git
+url = https://github.com/rapidsai/ucxx.git
 [submodule "3rdparty/pybind11"]
 path = 3rdparty/pybind11
 url = https://github.com/pybind/pybind11.git
3rdparty/ucxx (vendored, 2 changes)
@@ -1 +1 @@
-Subproject commit b99181779672965c6f325a95a29eb433b6e9cbbd
+Subproject commit 5c745102c26df11e68f11368bcd9649e81e981da
@@ -6,8 +6,8 @@ TensorRT-LLM
 [](https://nvidia.github.io/TensorRT-LLM/)
 [](https://www.python.org/downloads/release/python-31012/)
-[](https://developer.nvidia.com/cuda-downloads)
-[](https://developer.nvidia.com/tensorrt)
+[](https://developer.nvidia.com/cuda-downloads)
+[](https://developer.nvidia.com/tensorrt)
 [](./tensorrt_llm/version.py)
 [](./LICENSE)
@@ -7,5 +7,6 @@ There are currently three workflows to benchmark TensorRT-LLM:
   - The recommended workflow that uses the TensorRT-LLM C++ API and can take advantage of the latest features of TensorRT-LLM.
 * [Python benchmarks](./python)
   - The Python benchmarking scripts can only benchmark the Python runtime, which does not support the latest features, such as in-flight batching.
-* [The Python benchmarking suite](./Suite.md)
-  - This benchmarking suite is a current work in progress and is prone to large changes.
+* [The Python benchmarking suite](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html)
+  - This benchmarker is native to TensorRT-LLM and is a Python-based tool for reproducing and testing the performance of TensorRT-LLM.
+  - _NOTE_: This benchmarking suite is a current work in progress and is prone to large changes.
@ -145,6 +145,7 @@ struct BenchmarkParams
|
||||
{
|
||||
std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt};
|
||||
std::optional<float> freeGpuMemoryFraction{std::nullopt};
|
||||
std::optional<float> crossKvCacheFraction{std::nullopt};
|
||||
bool enableTrtOverlap{false};
|
||||
bool enableBlockReuse{false};
|
||||
bool enableChunkedContext{false};
|
||||
@ -882,7 +883,8 @@ public:
|
||||
texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
|
||||
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
|
||||
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
|
||||
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
|
||||
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
|
||||
benchmarkParams.crossKvCacheFraction);
|
||||
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
|
||||
std::nullopt, benchmarkParams.loraHostCacheSize);
|
||||
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
|
||||
@ -1486,6 +1488,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
|
||||
std::nullopt, // pTuning
|
||||
loraConfig, // loraConfig
|
||||
lookaheadConfig, // lookaheadConfig
|
||||
std::nullopt, // kvCacheRetentionConfig
|
||||
std::nullopt, // logitsPostProcessorName
|
||||
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
|
||||
}
|
||||
@ -1509,6 +1512,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
|
||||
{
|
||||
optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
|
||||
}
|
||||
if (benchmarkParams.crossKvCacheFraction)
|
||||
{
|
||||
optionalParams.kvCacheConfig.crossKvCacheFraction = benchmarkParams.crossKvCacheFraction;
|
||||
}
|
||||
if (benchmarkParams.maxAttentionWindowVec)
|
||||
{
|
||||
optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
|
||||
@ -1953,6 +1960,8 @@ int main(int argc, char* argv[])
|
||||
"random_seed", "integer random seed for exponential time delays.", cxxopts::value<int>()->default_value("420"));
|
||||
options.add_options()(
|
||||
"kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
|
||||
options.add_options()(
|
||||
"cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value<float>());
|
||||
options.add_options()("request_rate",
|
||||
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
|
||||
cxxopts::value<float>());
|
||||
@ -2126,6 +2135,20 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
|
||||
}
|
||||
// Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
|
||||
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
|
||||
{
|
||||
if (result.count("cross_kv_cache_fraction"))
|
||||
{
|
||||
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
|
||||
}
|
||||
else
|
||||
{
|
||||
benchmarkParams.crossKvCacheFraction
    = 0.5f; // Default value if not set; non enc-dec models should not have this parameter set at all.
|
||||
}
|
||||
}
|
||||
|
||||
// Argument: Enable TRT overlap
|
||||
benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
|
||||
|
||||
@ -2342,14 +2365,14 @@ int main(int argc, char* argv[])
|
||||
{
|
||||
texec::ModelType executorModelType;
|
||||
std::optional<std::string> decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt;
|
||||
if (result.count("encoder_engine_dir") && result.count("engine_dir"))
|
||||
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
|
||||
{
|
||||
TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
|
||||
TLLM_CHECK_WITH_INFO(
|
||||
modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
|
||||
executorModelType = texec::ModelType::kENCODER_DECODER;
|
||||
decoderEngineDir = result["engine_dir"].as<std::string>();
|
||||
encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
|
||||
decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
|
||||
}
|
||||
else if (result.count("engine_dir"))
|
||||
{
|
||||
|
||||
@ -30,45 +30,47 @@ class BaseEvictionPolicy
|
||||
public:
|
||||
virtual ~BaseEvictionPolicy() = default;
|
||||
|
||||
virtual void initialize(
|
||||
std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks)
|
||||
// TODO(TRTLLM-1564): Don't use a separate `initialize` function. Ensure eviction policies can't be in-between a
|
||||
// state of construction and initialization.
|
||||
virtual void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
|
||||
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt)
|
||||
= 0;
|
||||
|
||||
// Get a free block from the primary memory pool
|
||||
virtual BlockPtr getFreePrimaryBlock() = 0;
|
||||
// Get a free block from the secondary memory pool
|
||||
virtual BlockPtr getFreeSecondaryBlock() = 0;
|
||||
// Release a block. Prioritize the block for eviction if toFront=true
|
||||
/// @brief Get a free block from the specified cache level
|
||||
/// @returns The pointer to the free block, along with whether it can be offloaded
|
||||
virtual std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) = 0;
|
||||
/// @brief Release a block. Prioritize the block for eviction if toFront=true
|
||||
virtual void releaseBlock(BlockPtr block, bool toFront = false) = 0;
|
||||
// Get the amount of free blocks in the primary memory pool
|
||||
virtual SizeType32 getNumFreePrimaryBlocks() = 0;
|
||||
// Get the amount of free blocks in the secondary memory pool
|
||||
virtual SizeType32 getNumFreeSecondaryBlocks() = 0;
|
||||
// Claim a free block. Called when the cache manager allocates or reuses a new block
|
||||
virtual void claimBlock(KVCacheBlock block) = 0;
|
||||
/// @brief Get the amount of free blocks in the primary memory pool
|
||||
virtual SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) = 0;
|
||||
/// @brief Claim a free block. Called when the cache manager allocates or reuses a new block
|
||||
virtual void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt) = 0;
|
||||
};
|
||||
|
||||
class LRUEvictionPolicy : public BaseEvictionPolicy
|
||||
{
|
||||
public:
|
||||
void initialize(
|
||||
std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) override;
|
||||
BlockPtr getFreePrimaryBlock() override;
|
||||
BlockPtr getFreeSecondaryBlock() override;
|
||||
void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
|
||||
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt) override;
|
||||
std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) override;
|
||||
void releaseBlock(BlockPtr block, bool toFront = false) override;
|
||||
SizeType32 getNumFreePrimaryBlocks() override;
|
||||
SizeType32 getNumFreeSecondaryBlocks() override;
|
||||
|
||||
void claimBlock(KVCacheBlock block);
|
||||
SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) override;
|
||||
void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt) override;
|
||||
|
||||
private:
|
||||
FreeBlocksQueue mFreePrimaryBlocks;
|
||||
FreeBlocksQueue mFreeSecondaryBlocks;
|
||||
// Check if the block should be added to mFreeQueues.
|
||||
bool isReleasedLeafBlock(BlockPtr block);
|
||||
|
||||
// Queues of available leaf blocks, split by cache level and priority level
|
||||
std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
|
||||
// All blocks that have been released, along with the amount of released children
|
||||
std::vector<std::unordered_set<SizeType32>> mReleasedBlocks;
|
||||
// Iterators to block entries in mFreeQueues
|
||||
std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
|
||||
|
||||
SizeType32 mFreePrimaryBlocksSize;
|
||||
SizeType32 mFreeSecondaryBlocksSize;
|
||||
// Amount of free blocks at each cache level
|
||||
std::vector<SizeType32> mNumFreeBlocksPerLevel;
|
||||
// Secondary offload threshold. Blocks below this priority won't be offloaded to secondary memory.
|
||||
executor::RetentionPriority mSecondaryOffloadMinPriority;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::batch_manager::eviction_policy
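The refactor above replaces the per-pool getters (getFreePrimaryBlock / getFreeSecondaryBlock) with level-indexed calls. Below is a minimal caller-side sketch of the new interface; the level numbering (0 = primary, 1 = secondary), the helper name allocateFromPrimary, and the overall flow are illustrative assumptions rather than the actual BlockManager logic.

// Sketch only: exercising the level-based eviction-policy API declared above.
// Assumes cache level 0 is the primary pool and level 1 the secondary pool.
#include <tuple>

using namespace tensorrt_llm::batch_manager::eviction_policy;

BlockPtr allocateFromPrimary(BaseEvictionPolicy& policy)
{
    constexpr SizeType32 kPrimaryLevel = 0; // assumed numbering
    if (policy.getNumFreeBlocks(kPrimaryLevel) == 0)
    {
        return nullptr; // no primary block available; the caller must handle this
    }
    // The returned flag says whether the evicted block may be offloaded to the
    // secondary pool instead of being dropped; that decision is left to the caller here.
    auto [block, canOffload] = policy.getFreeBlock(kPrimaryLevel);
    (void) canOffload;
    policy.claimBlock(block); // claim the block for the new sequence (no explicit priority)
    return block;
}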
@ -41,7 +41,9 @@ public:
|
||||
std::optional<std::vector<SizeType32>> maxAttentionWindowVec = std::nullopt,
|
||||
std::optional<SizeType32> sinkTokenLength = std::nullopt,
|
||||
std::optional<float> freeGpuMemoryFraction = std::nullopt, bool enableBlockReuse = false, bool useUvm = false,
|
||||
std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true)
|
||||
std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true,
|
||||
std::optional<float> crossKvCacheFraction = std::nullopt,
|
||||
std::optional<SizeType32> secondaryOffloadMinPriority = std::nullopt)
|
||||
: maxTokens{maxTokens}
|
||||
, maxAttentionWindowVec{maxAttentionWindowVec}
|
||||
, sinkTokenLength{sinkTokenLength}
|
||||
@ -50,6 +52,8 @@ public:
|
||||
, useUvm(useUvm)
|
||||
, hostCacheSize(hostCacheSize)
|
||||
, onboardBlocks(onboardBlocks)
|
||||
, crossKvCacheFraction{crossKvCacheFraction}
|
||||
, secondaryOffloadMinPriority(secondaryOffloadMinPriority)
|
||||
{
|
||||
}
|
||||
|
||||
@ -57,7 +61,7 @@ public:
|
||||
: KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindowVec(),
|
||||
kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(),
|
||||
kvCacheConfig.getEnableBlockReuse(), false, kvCacheConfig.getHostCacheSize(),
|
||||
kvCacheConfig.getOnboardBlocks())
|
||||
kvCacheConfig.getOnboardBlocks(), kvCacheConfig.getCrossKvCacheFraction())
|
||||
{
|
||||
}
|
||||
|
||||
@ -66,7 +70,8 @@ public:
|
||||
return maxTokens == other.maxTokens && maxAttentionWindowVec == other.maxAttentionWindowVec
|
||||
&& sinkTokenLength == other.sinkTokenLength && freeGpuMemoryFraction == other.freeGpuMemoryFraction
|
||||
&& enableBlockReuse == other.enableBlockReuse && useUvm == other.useUvm
|
||||
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
|
||||
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks
|
||||
&& crossKvCacheFraction == other.crossKvCacheFraction;
|
||||
}
|
||||
|
||||
friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
|
||||
@ -80,5 +85,9 @@ public:
|
||||
bool useUvm;
|
||||
std::optional<size_t> hostCacheSize;
|
||||
bool onboardBlocks;
|
||||
// Cross attention will use crossKvCacheFraction of the KV cache and self attention will use the rest.
|
||||
std::optional<float> crossKvCacheFraction;
|
||||
// The minimum priority level to allow blocks to be offloaded to secondary memory.
|
||||
std::optional<SizeType32> secondaryOffloadMinPriority;
|
||||
};
|
||||
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
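crossKvCacheFraction above is only meaningful for encoder-decoder models: cross attention gets that fraction of the KV cache and self attention gets the remainder. The helper below is a hypothetical sketch of the implied arithmetic (its name, types, and rounding are assumptions), not TensorRT-LLM's actual pool-sizing code.

#include <cstdint>

using SizeType32 = std::int32_t;

struct KvCachePoolSplit
{
    SizeType32 selfAttentionBlocks;  // blocks left for the self-attention cache
    SizeType32 crossAttentionBlocks; // blocks reserved for the cross-attention cache
};

// With crossKvCacheFraction = 0.5f and 1000 total blocks this yields 500 / 500.
inline KvCachePoolSplit splitKvCacheBlocks(SizeType32 totalBlocks, float crossKvCacheFraction)
{
    auto const crossBlocks = static_cast<SizeType32>(totalBlocks * crossKvCacheFraction);
    return {totalBlocks - crossBlocks, crossBlocks};
}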
@ -60,12 +60,13 @@ using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
|
||||
|
||||
struct BlockKey
|
||||
{
|
||||
bool hasLora;
|
||||
LoraTaskIdType loraTaskId;
|
||||
VecUniqueTokens uniqueTokens;
|
||||
|
||||
bool operator==(BlockKey const& other) const noexcept
|
||||
{
|
||||
return (loraTaskId == other.loraTaskId && uniqueTokens == other.uniqueTokens);
|
||||
return (hasLora == other.hasLora && loraTaskId == other.loraTaskId && uniqueTokens == other.uniqueTokens);
|
||||
}
|
||||
};
|
||||
|
||||
@ -97,8 +98,14 @@ struct BlockKeyHasher
|
||||
c = (c ^ (c >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
|
||||
c = (c ^ (c >> 27)) * UINT64_C(0x94d049bb133111eb);
|
||||
c = c ^ (c >> 31);
|
||||
|
||||
seed ^= c + 0x9e3779b9 + (seed << 6) + (seed >> 2);
|
||||
|
||||
uint32_t d = static_cast<uint32_t>(blockKey.hasLora);
|
||||
d = ((d >> 16) ^ d) * 0x45d9f3b;
|
||||
d = ((d >> 16) ^ d) * 0x45d9f3b;
|
||||
d = (d >> 16) ^ d;
|
||||
seed ^= d + 0x9e3779b9 + (seed << 6) + (seed >> 2);
|
||||
|
||||
return seed;
|
||||
}
|
||||
};
|
||||
@ -107,13 +114,24 @@ using NextBlockMap = std::unordered_map<BlockKey, BlockPtr, BlockKeyHasher>;
|
||||
|
||||
struct KvCacheStats
|
||||
{
|
||||
// Maximum number of available blocks in the primary memory pool. This is determined and set by the available primary
|
||||
// memory. See calculateMaxNumBlocks for details.
|
||||
SizeType32 maxNumBlocks;
|
||||
// Number of free blocks in the primary memory pool.
|
||||
SizeType32 freeNumBlocks;
|
||||
// Number of used blocks in the primary memory pool. usedNumBlocks = maxNumBlocks - freeNumBlocks.
|
||||
SizeType32 usedNumBlocks;
|
||||
SizeType32 toksPerBlock;
|
||||
// Total number of blocks allocated by all requests.
|
||||
SizeType32 allocTotalBlocks;
|
||||
// Number of new blocks that were allocated.
|
||||
SizeType32 allocNewBlocks;
|
||||
// Number of blocks that were matched and reused.
|
||||
SizeType32 reusedBlocks;
|
||||
// Number of blocks that were not matched and not reused.
|
||||
SizeType32 missedBlocks;
|
||||
// Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
|
||||
float cacheHitRate;
|
||||
};
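The reuse counters above combine into cacheHitRate exactly as documented. The helper below is only a worked illustration of that formula (the function itself is hypothetical), guarding the all-zero case the same way the manager does further down.

// cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks);
// e.g. 30 reused and 10 missed blocks give a hit rate of 0.75.
inline float computeCacheHitRate(SizeType32 reusedBlocks, SizeType32 missedBlocks)
{
    return reusedBlocks == 0
        ? 0.0f
        : static_cast<float>(reusedBlocks) / static_cast<float>(reusedBlocks + missedBlocks);
}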
// Basic building block of a paged KV cache - a single
|
||||
@ -152,6 +170,8 @@ public:
|
||||
|
||||
[[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;
|
||||
|
||||
BlockPtr getPrevBlock() const;
|
||||
|
||||
void setPrevBlock(BlockPtr prevBlock);
|
||||
|
||||
void addNextBlock(BlockKey const& blockKey, BlockPtr block);
|
||||
@ -167,6 +187,10 @@ public:
|
||||
|
||||
[[nodiscard]] bool isShared() const;
|
||||
|
||||
void setPriority(executor::RetentionPriority priority);
|
||||
|
||||
executor::RetentionPriority getPriority() const;
|
||||
|
||||
private:
|
||||
// Linear ID of block independent of pool
|
||||
IdType mBlockId;
|
||||
@ -195,6 +219,9 @@ private:
|
||||
|
||||
// Flag indicating if block is full
|
||||
bool mIsFull;
|
||||
|
||||
// Priority of the block
|
||||
executor::RetentionPriority mPriority;
|
||||
};
|
||||
|
||||
class GenerationRequest
|
||||
@ -203,14 +230,17 @@ public:
|
||||
using SizeType32 = tensorrt_llm::runtime::SizeType32;
|
||||
|
||||
explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth,
|
||||
SizeType32 maxBlocks, SizeType32 numPools = 1)
|
||||
SizeType32 maxBlocks, SizeType32 numPools = 1,
|
||||
executor::RetentionPriority decodeRetentionPriority
|
||||
= executor::KvCacheRetentionConfig::kDefaultRetentionPriority)
|
||||
: mRequestId(requestId)
|
||||
, mNumTokens(numTokens)
|
||||
, mBeamWidth(beamWidth)
|
||||
, mCacheBlockIds(beamWidth)
|
||||
, mCacheBlockIndices{
|
||||
runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
|
||||
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
|
||||
, mCacheBlockIndices{runtime::BufferManager::cpu(
|
||||
runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
|
||||
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
|
||||
, mDecodeRetentionPriority(decodeRetentionPriority)
|
||||
{
|
||||
auto cacheBlockIdsRange = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices);
|
||||
std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(),
|
||||
@ -286,6 +316,11 @@ public:
|
||||
}
|
||||
}
|
||||
|
||||
[[nodiscard]] executor::RetentionPriority getDecodeRetentionPriority() const
|
||||
{
|
||||
return mDecodeRetentionPriority;
|
||||
}
|
||||
|
||||
private:
|
||||
// Request id of the sequence
|
||||
LlmRequest::RequestIdType mRequestId;
|
||||
@ -297,6 +332,8 @@ private:
|
||||
std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds;
|
||||
// Tensor of block indices allocated for each beam of the sequence
|
||||
runtime::ITensor::SharedPtr mCacheBlockIndices;
|
||||
// The retention priority to assign to decode blocks
|
||||
executor::RetentionPriority mDecodeRetentionPriority;
|
||||
};
|
||||
|
||||
// attach metadata to a pool pointer
|
||||
@ -344,7 +381,8 @@ public:
|
||||
explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead,
|
||||
SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
|
||||
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
|
||||
CacheType cacheType = CacheType::kSELF);
|
||||
CacheType cacheType = CacheType::kSELF,
|
||||
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
|
||||
|
||||
~BlockManager();
|
||||
|
||||
@ -396,6 +434,11 @@ public:
|
||||
return getMaxNumBlocks() - getNumFreeBlocks();
|
||||
}
|
||||
|
||||
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
|
||||
{
|
||||
return mMissedBlocks;
|
||||
}
|
||||
|
||||
[[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept
|
||||
{
|
||||
return getNumFreeBlocks() >= numRequired;
|
||||
@ -494,20 +537,23 @@ private:
|
||||
//! \brief Store blocks in cached blocks.
|
||||
//! \param blockKeys Key of each block.
|
||||
//! \param blockIds Id of each block.
|
||||
void storeBlocks(std::list<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);
|
||||
//! \param isChunkedContext Whether these blocks are being stored for chunked context.
|
||||
void storeBlocks(std::list<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
|
||||
bool isChunkedContext = false);
|
||||
|
||||
//! \brief Try to load blocks from cache. Allocate new blocks if necessary.
|
||||
//! \param blockKeys Key of each block.
|
||||
//! \param sequence Sequence to which blocks are assigned.
|
||||
//! \return Number of matched tokens from loaded blocks.
|
||||
SizeType32 loadOrAllocateBlocks(
|
||||
std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence);
|
||||
SizeType32 loadOrAllocateBlocks(std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
|
||||
GenerationRequest& sequence, std::vector<std::optional<executor::RetentionPriority>> blockPriorities);
|
||||
|
||||
//! \brief Find block least likely to be reused, free it if necessary and return.
|
||||
[[nodiscard]] BlockPtr getFreeBlock();
|
||||
[[nodiscard]] BlockPtr getFreeBlock(
|
||||
executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority);
|
||||
|
||||
//! \brief Free block from previous block and claim it from free blocks list.
|
||||
void claimLeafBlock(KVCacheBlock& block);
|
||||
void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt);
|
||||
|
||||
//! \brief Compute pointer to raw KV block (K & V, all layers).
|
||||
[[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
|
||||
@ -546,13 +592,24 @@ private:
|
||||
std::vector<BlockPtr> mAllBlocksById;
|
||||
// Dummy block acting as root for BlockToken searches
|
||||
BlockPtr mCachedBlocksRoot;
|
||||
// Statistics for block allocations/reuse
|
||||
std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks;
|
||||
// KV cache type (self or cross)
|
||||
CacheType mCacheType;
|
||||
// Eviction Policy
|
||||
std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
|
||||
|
||||
// Statistics for block allocations/reuse
|
||||
// Total number of blocks allocated by all requests
|
||||
SizeType32 mAllocTotalBlocks;
|
||||
// Number of new blocks that were allocated
|
||||
SizeType32 mAllocNewBlocks;
|
||||
// Number of blocks that were reused
|
||||
SizeType32 mReusedBlocks;
|
||||
// Number of unique blocks that were reused
|
||||
SizeType32 mReusedUniqueBlocks;
|
||||
// Number of blocks that were not reused
|
||||
SizeType32 mMissedBlocks;
|
||||
std::set<KVCacheBlock::IdType> reusedBlockIds;
|
||||
|
||||
private:
|
||||
friend class KVCacheManager;
|
||||
};
|
||||
@ -570,13 +627,15 @@ public:
|
||||
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
|
||||
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
|
||||
CudaStreamPtr stream, bool enableBlockReuse = false, bool onboardBlocks = true,
|
||||
CacheType cacheType = CacheType::kSELF);
|
||||
CacheType cacheType = CacheType::kSELF,
|
||||
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
|
||||
|
||||
KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
|
||||
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
|
||||
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
|
||||
CudaStreamPtr stream, bool enableBlockReuse = true, bool onboardBlocks = true,
|
||||
CacheType cacheType = CacheType::kSELF);
|
||||
CacheType cacheType = CacheType::kSELF,
|
||||
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
|
||||
|
||||
void allocatePools(nvinfer1::DataType dtype, bool useUvm = false);
|
||||
|
||||
@ -617,6 +676,11 @@ public:
|
||||
return mBlockManager.getNumReusedBlocks();
|
||||
}
|
||||
|
||||
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
|
||||
{
|
||||
return mBlockManager.getNumMissedBlocks();
|
||||
}
|
||||
|
||||
[[nodiscard]] KvCacheStats getKvCacheStats() const
|
||||
{
|
||||
KvCacheStats kvCacheStats;
|
||||
@ -627,7 +691,10 @@ public:
|
||||
kvCacheStats.allocTotalBlocks = getNumAllocTotalBlocks();
|
||||
kvCacheStats.allocNewBlocks = getNumAllocNewBlocks();
|
||||
kvCacheStats.reusedBlocks = getNumReusedBlocks();
|
||||
|
||||
kvCacheStats.missedBlocks = getNumMissedBlocks();
|
||||
kvCacheStats.cacheHitRate = kvCacheStats.reusedBlocks == 0 ? 0
|
||||
: static_cast<float>(kvCacheStats.reusedBlocks)
|
||||
/ static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
|
||||
return kvCacheStats;
|
||||
}
|
||||
|
||||
@ -689,14 +756,14 @@ public:
|
||||
runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const;
|
||||
|
||||
// Sum of numLayers * 2 * numKvHeads * sizePerHead for each pool
|
||||
[[nodiscard]] static SizeType32 calculateCacheSizePerToken(
|
||||
tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig)
|
||||
[[nodiscard]] static SizeType32 calculateCacheSizePerToken(tensorrt_llm::runtime::ModelConfig const& modelConfig,
|
||||
tensorrt_llm::runtime::WorldConfig const& worldConfig, bool isCrossAttention = false)
|
||||
{
|
||||
// NOTE: We expect the initialization of modelConfig to have already taken the tp size into account and do not
|
||||
// address it here
|
||||
// consider only local layers for the calculation
|
||||
return modelConfig.getSumLocalKvHeads(
|
||||
worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank())
|
||||
worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank(), isCrossAttention)
|
||||
* 2 * modelConfig.getSizePerHead();
|
||||
}
|
||||
|
||||
|
||||
@ -95,8 +95,9 @@ public:
|
||||
std::optional<SizeType32> promptVocabSize = std::nullopt,
|
||||
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
|
||||
std::optional<TensorPtr> loraConfig = std::nullopt,
|
||||
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
|
||||
bool returnContextLogits = false, bool returnGenerationLogits = false,
|
||||
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
|
||||
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
|
||||
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
|
||||
std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
|
||||
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
|
||||
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
|
||||
@ -134,6 +135,7 @@ public:
|
||||
, mLoraWeights(std::move(loraWeights))
|
||||
, mLoraConfig(std::move(loraConfig))
|
||||
, mLookaheadConfig(std::move(lookaheadConfig))
|
||||
, mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
|
||||
, mContextChunkSize{mPromptLen}
|
||||
, mLogProbs(samplingConfig.beamWidth)
|
||||
, mCumLogProbs(samplingConfig.beamWidth)
|
||||
@ -188,6 +190,7 @@ public:
|
||||
, mLoraWeights(std::nullopt)
|
||||
, mLoraConfig(std::nullopt)
|
||||
, mLookaheadConfig(std::nullopt)
|
||||
, mKvCacheRetentionConfig(std::nullopt)
|
||||
, mContextChunkSize{mPromptLen}
|
||||
, mLogProbs(mSamplingConfig.beamWidth)
|
||||
, mCumLogProbs(mSamplingConfig.beamWidth)
|
||||
@ -827,6 +830,16 @@ public:
|
||||
mLookaheadConfig = config;
|
||||
}
|
||||
|
||||
[[nodiscard]] std::optional<executor::KvCacheRetentionConfig> getKvCacheRetentionConfig() const
|
||||
{
|
||||
return mKvCacheRetentionConfig;
|
||||
}
|
||||
|
||||
void setKvCacheRetentionConfig(executor::KvCacheRetentionConfig config)
|
||||
{
|
||||
mKvCacheRetentionConfig = config;
|
||||
}
|
||||
|
||||
void clearLookaheadConfig()
|
||||
{
|
||||
mLookaheadConfig = std::nullopt;
|
||||
@ -1502,6 +1515,23 @@ public:
|
||||
return mReusedBlocksPerRequest;
|
||||
}
|
||||
|
||||
void updateMissedBlocksPerRequest(SizeType32 missedBlocksPerRequest)
|
||||
{
|
||||
mMissedBlocksPerRequest += missedBlocksPerRequest;
|
||||
}
|
||||
|
||||
[[nodiscard]] SizeType32 getMissedBlocksPerRequest() const
|
||||
{
|
||||
return mMissedBlocksPerRequest;
|
||||
}
|
||||
|
||||
[[nodiscard]] float getKVCacheHitRatePerRequest() const
|
||||
{
|
||||
return mReusedBlocksPerRequest == 0 ? 0
|
||||
: static_cast<float>(mReusedBlocksPerRequest)
|
||||
/ (static_cast<float>(mReusedBlocksPerRequest + mMissedBlocksPerRequest));
|
||||
}
|
||||
|
||||
RequestIdType mRequestId;
|
||||
SizeType32 mPromptLen;
|
||||
SizeType32 mMaxNewTokens;
|
||||
@ -1552,6 +1582,7 @@ protected:
|
||||
std::optional<TensorPtr> mLoraConfig;
|
||||
std::optional<executor::LookaheadDecodingConfig> mLookaheadConfig;
|
||||
|
||||
std::optional<executor::KvCacheRetentionConfig> mKvCacheRetentionConfig;
|
||||
// To enable chunked context, the FHMA paged kv-cache also needs to be enabled. Except for the last one,
|
||||
// the size of the context chunk needs to be an integer multiple of the kv-cache block size. The meaning
|
||||
// of null value is that the context is not chunked.
|
||||
@ -1613,6 +1644,7 @@ protected:
|
||||
SizeType32 mAllocTotalBlocksPerRequest{0};
|
||||
SizeType32 mAllocNewBlocksPerRequest{0};
|
||||
SizeType32 mReusedBlocksPerRequest{0};
|
||||
SizeType32 mMissedBlocksPerRequest{0};
|
||||
|
||||
private:
|
||||
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
|
||||
@ -1757,8 +1789,9 @@ public:
|
||||
std::optional<SizeType32> promptVocabSize = std::nullopt,
|
||||
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
|
||||
std::optional<TensorPtr> loraConfig = std::nullopt,
|
||||
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
|
||||
bool returnContextLogits = false, bool returnGenerationLogits = false,
|
||||
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
|
||||
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
|
||||
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
|
||||
std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
|
||||
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
|
||||
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
|
||||
@ -1775,11 +1808,11 @@ public:
|
||||
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
|
||||
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
|
||||
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
|
||||
std::move(lookaheadConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
|
||||
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
|
||||
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
|
||||
std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
|
||||
llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
|
||||
std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
|
||||
returnGenerationLogits, std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput,
|
||||
std::move(logitsPostProcessor), applyLogitsPostProcessorBatched, std::move(encoderInputTokens),
|
||||
returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures), std::move(encoderOutputLength),
|
||||
std::move(crossAttentionMask), llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
|
||||
{
|
||||
}
|
||||
|
||||
@ -1791,6 +1824,7 @@ public:
|
||||
mLogitsPostProcessor = std::move(logitsPostProcessor);
|
||||
mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched;
|
||||
mLookaheadConfig = request.getLookaheadConfig();
|
||||
mKvCacheRetentionConfig = request.getKvCacheRetentionConfig();
|
||||
}
|
||||
|
||||
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)
|
||||
|
||||
@ -1,13 +1,17 @@
|
||||
/*
|
||||
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
|
||||
* property and proprietary rights in and to this material, related
|
||||
* documentation and any modifications thereto. Any use, reproduction,
|
||||
* disclosure or distribution of this material and related documentation
|
||||
* without an express license agreement from NVIDIA CORPORATION or
|
||||
* its affiliates is strictly prohibited.
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
@ -28,6 +28,7 @@
|
||||
#include <memory>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
namespace tensorrt_llm::mpi
|
||||
@ -348,6 +349,71 @@ public:
|
||||
bool fastLogits;
|
||||
};
|
||||
|
||||
using RetentionPriority = SizeType32;
|
||||
|
||||
/// @brief Configuration for the request's retention in the KV Cache
|
||||
class KvCacheRetentionConfig
|
||||
{
|
||||
|
||||
public:
|
||||
static constexpr RetentionPriority kMinRetentionPriority = 0;
|
||||
static constexpr RetentionPriority kMaxRetentionPriority = 100;
|
||||
static constexpr RetentionPriority kDefaultRetentionPriority = 35;
|
||||
|
||||
/// @brief A single entry to set block priorities over a token range. Earlier ranges always take priority over later
|
||||
/// ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.
|
||||
struct TokenRangeRetentionPriority
|
||||
{
|
||||
public:
|
||||
explicit TokenRangeRetentionPriority(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt,
|
||||
RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority)
|
||||
: tokenStart{tokenStart}
|
||||
, tokenEnd{tokenEnd}
|
||||
, priority{priority}
|
||||
{
|
||||
TLLM_CHECK_WITH_INFO(priority >= KvCacheRetentionConfig::kMinRetentionPriority
|
||||
&& priority <= KvCacheRetentionConfig::kMaxRetentionPriority,
|
||||
"Invalid priority value. Must be between %d and %d", KvCacheRetentionConfig::kMinRetentionPriority,
|
||||
KvCacheRetentionConfig::kMaxRetentionPriority);
|
||||
};
|
||||
|
||||
/// @brief The first token of this range.
|
||||
SizeType32 tokenStart;
|
||||
/// @brief The final token of this range. The end is not included in the range. This can be set to std::nullopt
|
||||
/// to extend the range to the end of the sequence.
|
||||
std::optional<SizeType32> tokenEnd;
|
||||
/// @brief The priority of this token range. Higher priorities are less likely to be evicted or offloaded.
|
||||
RetentionPriority priority;
|
||||
|
||||
bool operator==(TokenRangeRetentionPriority const& other) const
|
||||
{
|
||||
return tokenStart == other.tokenStart && tokenEnd == other.tokenEnd && priority == other.priority;
|
||||
}
|
||||
};
|
||||
|
||||
explicit KvCacheRetentionConfig()
|
||||
: KvCacheRetentionConfig({}, kDefaultRetentionPriority)
|
||||
{
|
||||
}
|
||||
|
||||
KvCacheRetentionConfig(std::vector<TokenRangeRetentionPriority> const& tokenRangeRetentionPriorities,
|
||||
RetentionPriority decodeRetentionPriority);
|
||||
|
||||
[[nodiscard]] std::vector<TokenRangeRetentionPriority> getTokenRangeRetentionPriorities() const;
|
||||
[[nodiscard]] RetentionPriority getDecodeRetentionPriority() const;
|
||||
|
||||
/// @brief Convert the token range data into an entry per kv cache block for a given seqLen
|
||||
std::vector<std::optional<RetentionPriority>> getPerBlockEvictionPolicy(SizeType32 blockSize, SizeType32 seqLen);
|
||||
|
||||
private:
|
||||
/// @brief The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64),
|
||||
/// (100, 128), (70, 80)] is valid, whereas
|
||||
/// [(0, 64), (60, 128)] is not.
|
||||
std::vector<TokenRangeRetentionPriority> mTokenRangeRetentionPriorities;
|
||||
/// @brief The priority level to assign to blocks allocated in the decode phase
|
||||
RetentionPriority mDecodeRetentionPriority;
|
||||
};
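A minimal usage sketch of the retention API above, based only on the declarations shown here; the priority values, block size, and sequence length are arbitrary examples rather than recommended settings.

#include "tensorrt_llm/executor/executor.h"

#include <optional>
#include <vector>

namespace texec = tensorrt_llm::executor;

std::vector<std::optional<texec::RetentionPriority>> exampleRetention()
{
    // Keep the first 64 prompt tokens at high priority, let the rest of the prompt
    // drop to the minimum priority, and use the default priority for decode blocks.
    texec::KvCacheRetentionConfig retentionConfig(
        {texec::KvCacheRetentionConfig::TokenRangeRetentionPriority(0, 64, 90),
         texec::KvCacheRetentionConfig::TokenRangeRetentionPriority(64, std::nullopt, 0)},
        texec::KvCacheRetentionConfig::kDefaultRetentionPriority);

    // With a block size of 16 and a 100-token sequence this yields one entry per
    // KV cache block, i.e. ceil(100 / 16) = 7 entries.
    return retentionConfig.getPerBlockEvictionPolicy(/*blockSize=*/16, /*seqLen=*/100);
}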
/// @brief A class that holds information about the request
|
||||
class Request
|
||||
{
|
||||
@ -371,6 +437,7 @@ public:
|
||||
/// @param pTuningConfig The prompt tuning configuration
|
||||
/// @param loraConfig The LoRA configuration
|
||||
/// @param logitsPostProcessorName The logits postprocessor name. Must correspond to one of the logits postprocessor
|
||||
/// name provided to the ExecutorConfig.
/// @param kvCacheRetentionConfig The configuration used for KV cache block eviction.
|
||||
/// @param encoderInputTokenIds The encoder input token ids for encoder-decoder models, or encoder-only models
|
||||
/// @param returnAllGeneratedTokens Indicates whether to return the full beams or just the newly generated tokens
|
||||
@ -394,6 +461,7 @@ public:
|
||||
std::optional<PromptTuningConfig> pTuningConfig = std::nullopt,
|
||||
std::optional<LoraConfig> loraConfig = std::nullopt,
|
||||
std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
|
||||
std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
|
||||
std::optional<std::string> logitsPostProcessorName = std::nullopt,
|
||||
std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt,
|
||||
bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority,
|
||||
@ -428,6 +496,7 @@ public:
|
||||
[[nodiscard]] std::optional<PromptTuningConfig> getPromptTuningConfig() const;
|
||||
[[nodiscard]] std::optional<LoraConfig> getLoraConfig() const;
|
||||
[[nodiscard]] std::optional<LookaheadDecodingConfig> getLookaheadConfig() const;
|
||||
[[nodiscard]] std::optional<KvCacheRetentionConfig> getKvCacheRetentionConfig() const;
|
||||
[[nodiscard]] std::optional<std::string> getLogitsPostProcessorName() const;
|
||||
[[nodiscard]] std::optional<VecTokens> getEncoderInputTokenIds() const;
|
||||
[[nodiscard]] std::optional<IdType> getClientId() const;
|
||||
@ -453,6 +522,7 @@ public:
|
||||
void setPromptTuningConfig(PromptTuningConfig const& pTuningConfig);
|
||||
void setLoraConfig(LoraConfig const& loraConfig);
|
||||
void setLookaheadConfig(LookaheadDecodingConfig const& lookaheadConfig);
|
||||
void setKvCacheRetentionConfig(KvCacheRetentionConfig const& kvCacheRetentionConfig);
|
||||
void setLogitsPostProcessorName(std::string const& logitsPostProcessorName);
|
||||
void setEncoderInputTokenIds(VecTokens const& encoderInputTokenIds);
|
||||
void setClientId(IdType clientId);
|
||||
@ -598,23 +668,29 @@ public:
|
||||
std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
|
||||
std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
|
||||
std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
|
||||
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true);
|
||||
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
|
||||
std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
|
||||
std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
|
||||
|
||||
[[nodiscard]] bool getEnableBlockReuse() const;
|
||||
[[nodiscard]] std::optional<SizeType32> getMaxTokens() const;
|
||||
[[nodiscard]] std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const;
|
||||
[[nodiscard]] std::optional<SizeType32> getSinkTokenLength() const;
|
||||
[[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
|
||||
[[nodiscard]] std::optional<FloatType> getCrossKvCacheFraction() const;
|
||||
[[nodiscard]] std::optional<size_t> getHostCacheSize() const;
|
||||
[[nodiscard]] bool getOnboardBlocks() const;
|
||||
[[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
|
||||
|
||||
void setEnableBlockReuse(bool enableBlockReuse);
|
||||
void setMaxTokens(SizeType32 maxTokens);
|
||||
void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec);
|
||||
void setSinkTokenLength(SizeType32 sinkTokenLength);
|
||||
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
|
||||
void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
|
||||
void setHostCacheSize(size_t hostCacheSize);
|
||||
void setOnboardBlocks(bool onboardBlocks);
|
||||
void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
|
||||
|
||||
private:
|
||||
friend class Serialization;
|
||||
@ -641,12 +717,21 @@ private:
|
||||
/// allocated.
|
||||
std::optional<FloatType> mFreeGpuMemoryFraction;
|
||||
|
||||
/// @brief The fraction of the KV cache memory that should be reserved for cross attention.
|
||||
/// If set to p, self attention will use 1-p of KV Cache memory and cross attention
|
||||
/// will use p of KV Cache memory. Default is 50%.
|
||||
/// Should only be set when using an encoder-decoder model.
|
||||
std::optional<FloatType> mCrossKvCacheFraction;
|
||||
|
||||
/// @brief Size of secondary memory pool in bytes. Default is 0.
|
||||
/// Having a secondary memory pool increases KV cache block reuse potential.
|
||||
std::optional<size_t> mHostCacheSize;
|
||||
|
||||
/// @brief Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
|
||||
bool mOnboardBlocks;
|
||||
|
||||
/// @brief Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.
|
||||
std::optional<RetentionPriority> mSecondaryOffloadMinPriority;
|
||||
};
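The setters declared above are enough to opt into the new cross-KV-cache and offload-priority behaviour. A short sketch follows; it assumes KvCacheConfig remains constructible with its defaulted constructor arguments, and the specific values are arbitrary examples.

#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

texec::KvCacheConfig makeEncDecKvCacheConfig()
{
    texec::KvCacheConfig config; // assumes the leading constructor arguments keep their defaults
    config.setFreeGpuMemoryFraction(0.9f);
    // Encoder-decoder model: reserve half of the KV cache for cross attention.
    config.setCrossKvCacheFraction(0.5f);
    // Restrict offloading to secondary (host) memory to higher-priority blocks.
    config.setSecondaryOffloadMinPriority(50);
    return config;
}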
/// @brief Configuration class for the runtime perf knobs
|
||||
|
||||
@ -154,6 +154,18 @@ public:
|
||||
static void serialize(LookaheadDecodingConfig const& lookaheadDecodingConfig, std::ostream& os);
|
||||
static size_t serializedSize(LookaheadDecodingConfig const& lookaheadDecodingConfig);
|
||||
|
||||
// KvCacheRetentionConfig
|
||||
static KvCacheRetentionConfig deserializeKvCacheRetentionConfig(std::istream& is);
|
||||
static void serialize(KvCacheRetentionConfig const& kvCacheRetentionConfig, std::ostream& os);
|
||||
static size_t serializedSize(KvCacheRetentionConfig const& kvCacheRetentionConfig);
|
||||
|
||||
// TokenRangeRetentionPriority
|
||||
static KvCacheRetentionConfig::TokenRangeRetentionPriority deserializeTokenRangeRetentionPriority(std::istream& is);
|
||||
static void serialize(
|
||||
KvCacheRetentionConfig::TokenRangeRetentionPriority const& tokenRangeRetentionPriority, std::ostream& os);
|
||||
static size_t serializedSize(
|
||||
KvCacheRetentionConfig::TokenRangeRetentionPriority const& tokenRangeRetentionPriority);
|
||||
|
||||
// DecodingConfig
|
||||
static DecodingConfig deserializeDecodingConfig(std::istream& is);
|
||||
static void serialize(DecodingConfig const& decodingConfig, std::ostream& os);
|
||||
|
||||
@ -233,7 +233,8 @@ enum class CommunicationMode
|
||||
// execution of the model
|
||||
};
|
||||
|
||||
/// @brief Struct that holds the stats of a KV cache manager
|
||||
/// @brief Struct that holds the stats of a KV cache manager.
|
||||
// See KvCacheStats definition in kvCacheManager.h for more information about each field.
|
||||
struct KvCacheStats
|
||||
{
|
||||
/// @brief Max number of blocks
|
||||
@ -250,6 +251,10 @@ struct KvCacheStats
|
||||
SizeType32 allocNewBlocks;
|
||||
/// @brief Number of reused block
|
||||
SizeType32 reusedBlocks;
|
||||
/// @brief Number of blocks that were not reused
|
||||
SizeType32 missedBlocks;
|
||||
/// @brief Measuring the KV Cache reuse rate. cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
|
||||
float cacheHitRate;
|
||||
};
|
||||
|
||||
/// @brief Struct that holds the stats of static batching models for a single iteration
|
||||
@ -372,6 +377,10 @@ struct RequestStats
|
||||
SizeType32 allocNewBlocksPerRequest;
|
||||
/// @brief Number of reused blocks per request
|
||||
SizeType32 reusedBlocksPerRequest;
|
||||
/// @brief Number of missed blocks per request
|
||||
SizeType32 missedBlocksPerRequest;
|
||||
/// @brief KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)
|
||||
SizeType32 kvCacheHitRatePerRequest;
|
||||
};
|
||||
|
||||
/// @brief Struct that holds the stats of all requests in an iteration
|
||||
@ -459,6 +468,11 @@ public:
|
||||
return DecodingMode{kExternalDraftTokens | kUsePenalties | kUseBanTokens | kStandardStopCriteria};
|
||||
}
|
||||
|
||||
static auto constexpr Eagle()
|
||||
{
|
||||
return DecodingMode{kEagle | kStandardStopCriteria | kUseExplicitEosStop};
|
||||
}
|
||||
|
||||
auto constexpr useTemperature(bool useTemp)
|
||||
{
|
||||
mState = setBitTo(kUseTemperature, useTemp);
|
||||
@ -581,6 +595,11 @@ public:
|
||||
return anyBitSet(kExternalDraftTokens);
|
||||
}
|
||||
|
||||
[[nodiscard]] bool constexpr isEagle() const
|
||||
{
|
||||
return anyBitSet(kEagle);
|
||||
}
|
||||
|
||||
[[nodiscard]] bool constexpr isUseTemperature() const
|
||||
{
|
||||
return anyBitSet(kUseTemperature);
|
||||
@ -695,6 +714,7 @@ private:
|
||||
static UnderlyingType constexpr kLookahead{1u << (kNumFlags + 5)};
|
||||
static UnderlyingType constexpr kExplicitDraftTokens{1u << (kNumFlags + 6)};
|
||||
static UnderlyingType constexpr kExternalDraftTokens{1u << (kNumFlags + 7)};
|
||||
static UnderlyingType constexpr kEagle{1u << (kNumFlags + 8)};
|
||||
static UnderlyingType constexpr kTopKTopP{kTopK | kTopP};
|
||||
|
||||
[[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const
|
||||
@ -726,6 +746,7 @@ static_assert(!DecodingMode::Auto().isMedusa());
|
||||
static_assert(!DecodingMode::Auto().isLookahead());
|
||||
static_assert(!DecodingMode::Auto().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::Auto().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::Auto().isEagle());
|
||||
|
||||
static_assert(DecodingMode::TopK().isTopK());
|
||||
static_assert(DecodingMode::TopK().isTopKorTopP());
|
||||
@ -747,6 +768,7 @@ static_assert(!DecodingMode::TopK().isMedusa());
|
||||
static_assert(!DecodingMode::TopK().isLookahead());
|
||||
static_assert(!DecodingMode::TopK().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::TopK().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::TopK().isEagle());
|
||||
|
||||
static_assert(DecodingMode::TopP().isTopP());
|
||||
static_assert(DecodingMode::TopP().isTopKorTopP());
|
||||
@ -760,7 +782,7 @@ static_assert(!DecodingMode::TopP().isBeamSearch());
|
||||
static_assert(!DecodingMode::TopP().isMedusa());
|
||||
static_assert(!DecodingMode::TopP().isLookahead());
|
||||
static_assert(!DecodingMode::TopP().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::TopP().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::TopP().isEagle());
|
||||
|
||||
static_assert(DecodingMode::TopKTopP().isTopK());
|
||||
static_assert(DecodingMode::TopKTopP().isTopP());
|
||||
@ -775,6 +797,7 @@ static_assert(!DecodingMode::TopKTopP().isMedusa());
|
||||
static_assert(!DecodingMode::TopKTopP().isLookahead());
|
||||
static_assert(!DecodingMode::TopKTopP().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::TopKTopP().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::TopKTopP().isEagle());
|
||||
|
||||
static_assert(DecodingMode::BeamSearch().isBeamSearch());
|
||||
static_assert(DecodingMode::BeamSearch().isUseStopCriteria());
|
||||
@ -784,6 +807,7 @@ static_assert(!DecodingMode::BeamSearch().isMedusa());
|
||||
static_assert(!DecodingMode::BeamSearch().isLookahead());
|
||||
static_assert(!DecodingMode::BeamSearch().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::BeamSearch().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::BeamSearch().isEagle());
|
||||
|
||||
static_assert(!DecodingMode::Medusa().isAuto());
|
||||
static_assert(!DecodingMode::Medusa().isTopK());
|
||||
@ -800,6 +824,7 @@ static_assert(DecodingMode::Medusa().isUsePenalty());
|
||||
static_assert(DecodingMode::Medusa().isUseMinLength());
|
||||
static_assert(DecodingMode::Medusa().isMedusa());
|
||||
static_assert(!DecodingMode::Medusa().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::Medusa().isEagle());
|
||||
|
||||
static_assert(!DecodingMode::Lookahead().isAuto());
|
||||
static_assert(!DecodingMode::Lookahead().isTopK());
|
||||
@ -814,6 +839,7 @@ static_assert(DecodingMode::Lookahead().isUseStopWords());
|
||||
static_assert(DecodingMode::Lookahead().isUseExplicitEosStop());
|
||||
static_assert(DecodingMode::Lookahead().isLookahead());
|
||||
static_assert(!DecodingMode::Lookahead().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::Lookahead().isEagle());
|
||||
|
||||
static_assert(!DecodingMode::ExplicitDraftTokens().isAuto());
|
||||
static_assert(!DecodingMode::ExplicitDraftTokens().isTopK());
|
||||
@ -828,6 +854,7 @@ static_assert(DecodingMode::ExplicitDraftTokens().isUseStopCriteria());
|
||||
static_assert(!DecodingMode::ExplicitDraftTokens().isUseBanWords());
|
||||
static_assert(DecodingMode::ExplicitDraftTokens().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::ExplicitDraftTokens().isExternalDraftTokens());
|
||||
static_assert(!DecodingMode::ExplicitDraftTokens().isEagle());
|
||||
|
||||
static_assert(!DecodingMode::ExternalDraftTokens().isTopK());
|
||||
static_assert(!DecodingMode::ExternalDraftTokens().isTopP());
|
||||
@ -841,5 +868,21 @@ static_assert(!DecodingMode::ExternalDraftTokens().isBeamSearch());
|
||||
static_assert(!DecodingMode::ExternalDraftTokens().isMedusa());
|
||||
static_assert(!DecodingMode::ExternalDraftTokens().isLookahead());
|
||||
static_assert(!DecodingMode::ExternalDraftTokens().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::ExternalDraftTokens().isEagle());
|
||||
static_assert(DecodingMode::ExternalDraftTokens().isExternalDraftTokens());
|
||||
|
||||
static_assert(!DecodingMode::Eagle().isTopK());
|
||||
static_assert(!DecodingMode::Eagle().isTopP());
|
||||
static_assert(!DecodingMode::Eagle().isTopKorTopP());
|
||||
static_assert(!DecodingMode::Eagle().isTopKandTopP());
|
||||
static_assert(!DecodingMode::Eagle().isUseBanWords());
|
||||
static_assert(!DecodingMode::Eagle().isUseOccurrencePenalty());
|
||||
static_assert(DecodingMode::Eagle().isUseStopCriteria());
|
||||
static_assert(!DecodingMode::Eagle().isAuto());
|
||||
static_assert(!DecodingMode::Eagle().isBeamSearch());
|
||||
static_assert(!DecodingMode::Eagle().isMedusa());
|
||||
static_assert(!DecodingMode::Eagle().isLookahead());
|
||||
static_assert(!DecodingMode::Eagle().isExplicitDraftTokens());
|
||||
static_assert(!DecodingMode::Eagle().isExternalDraftTokens());
|
||||
static_assert(DecodingMode::Eagle().isEagle());
|
||||
} // namespace tensorrt_llm::executor
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
|
||||
#include "tensorrt_llm/runtime/bufferManager.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include "tensorrt_llm/runtime/eagleBuffers.h"
|
||||
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
|
||||
#include "tensorrt_llm/runtime/iTensor.h"
|
||||
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
|
||||
@ -121,6 +122,8 @@ public:
|
||||
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers;
|
||||
|
||||
std::optional<LookaheadDecodingBuffers> lookaheadOutputs;
|
||||
|
||||
std::optional<EagleBuffers::Inputs> eagleBuffers;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::runtime
|
||||
|
||||
cpp/include/tensorrt_llm/runtime/eagleBuffers.h (new file, 134 additions)
@ -0,0 +1,134 @@
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include "tensorrt_llm/executor/executor.h"
|
||||
#include "tensorrt_llm/runtime/eagleModule.h"
|
||||
#include "tensorrt_llm/runtime/iBuffer.h"
|
||||
#include "tensorrt_llm/runtime/iTensor.h"
|
||||
#include "tensorrt_llm/runtime/modelConfig.h"
|
||||
#include "tensorrt_llm/runtime/tllmRuntime.h"
|
||||
#include "tensorrt_llm/runtime/worldConfig.h"
|
||||
|
||||
#include <cstddef>
|
||||
|
||||
namespace tensorrt_llm::runtime
|
||||
{
|
||||
|
||||
class EagleBuffers
|
||||
{
|
||||
public:
|
||||
using SizeType32 = runtime::SizeType32;
|
||||
using ITensor = runtime::ITensor;
|
||||
using BufferPtr = runtime::IBuffer::SharedPtr;
|
||||
using TensorPtr = runtime::ITensor::SharedPtr;
|
||||
using TensorMap = runtime::StringPtrMap<runtime::ITensor>;
|
||||
|
||||
// This data structure is used both for runtime buffers holding per-request runtime state (shape starts with
// maxBatchSize) and for engine inputs (shape starts with numSequences).
|
||||
class Inputs
|
||||
{
|
||||
public:
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr temperatures;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr randomDataSample;
|
||||
//! [maxBatchSize, maxNumPaths, maxPathDraftLen] or [numSequences, maxNumPaths, maxPathDraftLen]
|
||||
TensorPtr randomDataValidation;
|
||||
//! [maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
|
||||
TensorPtr draftTokens;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr draftLens;
|
||||
//! [maxBatchSize, maxNumPaths, maxPathLen]
|
||||
//! or [numSequences, maxNumPaths, maxPathLen]
|
||||
TensorPtr draftPaths;
|
||||
//! [maxBatchSize] or [numGenSequences]
|
||||
TensorPtr specDecodingGenerationLengths;
|
||||
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
|
||||
//! or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
|
||||
TensorPtr specDecodingPackedMasks;
|
||||
//! [maxBatchSize] or [numGenSequences]
|
||||
TensorPtr specDecodingPositionOffsets;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr eagleNetCtxRequestTypesHost;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr eagleNetCtxContextLengthsHost;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr eagleNetCtxPastKeyValueLengthsHost;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr eagleNetGenRequestTypesHost;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr eagleNetGenContextLengthsHost;
|
||||
//! [maxBatchSize] or [numSequences]
|
||||
TensorPtr eagleNetGenPastKeyValueLengthsHost;
|
||||
|
||||
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const& runtime,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);
|
||||
};
|
||||
|
||||
Inputs engineInputs;
|
||||
|
||||
class EngineOutputs
|
||||
{
|
||||
public:
|
||||
//! [batchSize, maxDecodingDraftTokens]
|
||||
TensorPtr nextDraftTokens;
|
||||
//! [batchSize]
|
||||
TensorPtr nextDraftLens;
|
||||
//! [batchSize, maxNumPaths, maxPathLen]
|
||||
TensorPtr nextDraftPaths;
|
||||
|
||||
//! [batchSize, maxPathLen]
|
||||
TensorPtr acceptedTokens;
|
||||
//! [batchSize]
|
||||
TensorPtr acceptedLens;
|
||||
//! [batchSize]
|
||||
TensorPtr acceptedPaths;
|
||||
|
||||
} engineOutputs;
|
||||
|
||||
public:
|
||||
EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
|
||||
executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime);
|
||||
|
||||
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig);
|
||||
|
||||
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const& requestTypes,
|
||||
ITensor const& seqSlots, EagleBuffers::Inputs const& decoderBuffers, ITensor const& contextPositionIds,
|
||||
runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
|
||||
runtime::WorldConfig const& worldConfig) const;
|
||||
|
||||
void insertInputTensors(
|
||||
TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& worldConfig) const;
|
||||
|
||||
private:
|
||||
template <typename T>
|
||||
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded,
|
||||
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
|
||||
runtime::EagleModule const& eagleModule, runtime::CudaStream const& stream) const;
|
||||
|
||||
private:
|
||||
// helper tensors
|
||||
std::size_t scanTempStorageBytes{0};
|
||||
std::size_t reduceTempStorageBytes{0};
|
||||
BufferPtr scanReduceTempStorage;
|
||||
TensorPtr cumSumGenerationLengths;
|
||||
TensorPtr maxGenerationLength;
|
||||
};
|
||||
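A minimal usage sketch of the interface declared above, assuming the EagleBuffers instance, runtime objects, and tensors are created elsewhere by the batch manager; it only illustrates the intended per-step call order (reshape, setFromInputs, insertInputTensors):

namespace tr = tensorrt_llm::runtime;

void runEagleStep(tr::EagleBuffers& buffers, tr::SizeType32 numCtxSequences, tr::SizeType32 numGenSequences,
    tr::ITensor const& requestTypes, tr::ITensor const& seqSlots, tr::EagleBuffers::Inputs const& decoderBuffers,
    tr::ITensor const& contextPositionIds, tr::TllmRuntime const& runtime, tr::ModelConfig const& modelConfig,
    tr::WorldConfig const& worldConfig, tr::EagleBuffers::TensorMap& inputBuffers,
    tr::EagleBuffers::TensorMap& outputBuffers)
{
    // Resize the engine-facing tensors for the current numbers of context and generation sequences.
    buffers.reshape(numCtxSequences, numGenSequences, modelConfig);
    // Gather per-request state (draft tokens, paths, packed masks, ...) into the engine inputs.
    buffers.setFromInputs(numCtxSequences, numGenSequences, requestTypes, seqSlots, decoderBuffers,
        contextPositionIds, runtime, modelConfig, worldConfig);
    // Expose the tensors to the engine through its named input/output maps.
    buffers.insertInputTensors(inputBuffers, outputBuffers, worldConfig);
}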
|
||||
} // namespace tensorrt_llm::runtime
|
||||
@ -54,6 +54,8 @@ public:
|
||||
|
||||
void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override;
|
||||
|
||||
void setupEagle(EagleBuffers::Inputs eagleBuffers) override;
|
||||
|
||||
void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override;
|
||||
|
||||
void newBatch(
|
||||
@ -270,12 +272,18 @@ private:
|
||||
//! @brief Sets up decoder internal tensors for a new Explicit draft tokens request
|
||||
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const& request);
|
||||
|
||||
//! @brief Sets up decoder internal tensors for a new Eagle request
|
||||
void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const& request);
|
||||
|
||||
//! @brief Updates finished state on host for all active requests
|
||||
void updateFinished(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent);
|
||||
|
||||
//! @brief Sets inputs for explicit draft tokens.
|
||||
void setExplicitDraftTokensInputs(decoder_batch::Input const& input);
|
||||
|
||||
//! @brief Sets inputs for eagle decoding.
|
||||
void setEagleInputs(decoder_batch::Input const& input);
|
||||
|
||||
//! @brief Calls decoders for tokens per engine step
|
||||
void forwardDispatch(decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType);
|
||||
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
|
||||
#include "tensorrt_llm/runtime/cudaEvent.h"
|
||||
#include "tensorrt_llm/runtime/cudaStream.h"
|
||||
#include "tensorrt_llm/runtime/eagleBuffers.h"
|
||||
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
|
||||
#include "tensorrt_llm/runtime/iStatefulGptDecoder.h"
|
||||
#include "tensorrt_llm/runtime/iTensor.h"
|
||||
@ -101,6 +102,9 @@ public:
|
||||
//! @brief Setup buffers for ExplicitDraftTokens decoding.
|
||||
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0;
|
||||
|
||||
//! @brief Setup buffers for Eagle decoding.
|
||||
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) = 0;
|
||||
|
||||
//! @brief Setup buffers for Lookahead decoding.
|
||||
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0;
|
||||
|
||||
|
||||
@ -227,6 +227,12 @@ public:
|
||||
mNumKvHeadsPerAttentionLayer = std::vector<SizeType32>(mNbAttentionLayers, nbKvHeads);
|
||||
}
|
||||
|
||||
// set the number of cross-attention kv heads for all layers
|
||||
void setNbCrossKvHeads(SizeType32 nbKvHeads)
|
||||
{
|
||||
mNumKvHeadsPerCrossAttentionLayer = std::vector<SizeType32>(mNbAttentionLayers, nbKvHeads);
|
||||
}
|
||||
|
||||
[[nodiscard]] SizeType32 constexpr getHiddenSize() const noexcept
|
||||
{
|
||||
return mHiddenSize;
|
||||
@ -712,13 +718,18 @@ public:
|
||||
}
|
||||
|
||||
[[nodiscard]] std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator>
|
||||
getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
|
||||
getNumKvHeadsPerLayerLocalRange(
|
||||
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start: %d", __PRETTY_FUNCTION__);
|
||||
TLLM_CHECK_WITH_INFO(pipelineParallelism > 0, "Invalid pipelineParallelism: %d", pipelineParallelism);
|
||||
|
||||
// count number of previous non-local attention layers
|
||||
auto const numPrevAttnLayers
|
||||
= countLowerRankLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank);
|
||||
auto const firstLocalAttentionLayerIt = mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers;
|
||||
auto const firstLocalAttentionLayerIt = isCrossAttention
|
||||
? mNumKvHeadsPerCrossAttentionLayer.cbegin()
|
||||
: mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers;
|
||||
auto const numLocalAttentionLayers
|
||||
= countLocalLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank);
|
||||
return std::make_pair(firstLocalAttentionLayerIt, firstLocalAttentionLayerIt + numLocalAttentionLayers);
|
||||
@ -732,10 +743,19 @@ public:
|
||||
mNumKvHeadsPerAttentionLayer = headsPerLayer;
|
||||
}
|
||||
|
||||
[[nodiscard]] SizeType32 getSumLocalKvHeads(
|
||||
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
|
||||
void setNumKvHeadsPerCrossLayer(std::vector<SizeType32> const& headsPerLayer)
|
||||
{
|
||||
auto [cbegin, cend] = getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank);
|
||||
auto const numElems = static_cast<SizeType32>(headsPerLayer.size());
|
||||
TLLM_CHECK_WITH_INFO(numElems == mNbAttentionLayers,
|
||||
"Length of head_per_layer (%d) must match number of attention layers (%d)", numElems, mNbAttentionLayers);
|
||||
mNumKvHeadsPerCrossAttentionLayer = headsPerLayer;
|
||||
}
|
||||
|
||||
[[nodiscard]] SizeType32 getSumLocalKvHeads(
|
||||
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
|
||||
{
|
||||
auto [cbegin, cend]
|
||||
= getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank, isCrossAttention);
|
||||
auto const sumLocalHeads = std::reduce(cbegin, cend);
|
||||
return sumLocalHeads;
|
||||
}
|
||||
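A short sketch of how the new cross-attention accessors declared above might be used; the head counts and pipeline configuration are illustrative, and the ModelConfig instance is assumed to be initialized elsewhere:

void configureCrossKvHeads(tensorrt_llm::runtime::ModelConfig& modelConfig)
{
    using SizeType32 = tensorrt_llm::runtime::SizeType32;

    // Either give every cross-attention layer the same KV head count...
    modelConfig.setNbCrossKvHeads(8);
    // ...or set them per layer; the vector length must equal the number of attention layers.
    // modelConfig.setNumKvHeadsPerCrossLayer(std::vector<SizeType32>{8, 8, 4, 4});

    // Sum the cross-attention KV heads owned by rank 0 of a 2-stage pipeline, e.g. to size the
    // cross KV cache for that rank.
    SizeType32 const crossKvHeads = modelConfig.getSumLocalKvHeads(
        /*pipelineParallelism=*/2, /*pipelineParallelismRank=*/0, /*isCrossAttention=*/true);
    static_cast<void>(crossKvHeads);
}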
@ -800,6 +820,7 @@ private:
|
||||
ManageWeightsType mManageWeightsType;
|
||||
std::string mModelName;
|
||||
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer;
|
||||
std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::runtime
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9cbefbfec3c95fba9ffae469c63e29594488a2419bfd5dfd3abbd234da7e7eed
|
||||
size 5203208
|
||||
oid sha256:e7cbfb1f4f7a74411bb349b54add6c2f0b645674835dacce89b39ea9fcff53b7
|
||||
size 5307422
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2a5430463d420db41a6024a3d7a16d06a607f0f48c991066fbaa056b756f1f06
|
||||
size 5318774
|
||||
oid sha256:e52f067f1a5127c512eb0fc298d6608644f3cde5079dd78b6fc6258be353afbe
|
||||
size 5422318
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:569c6009be0bf4b404d752468c771a551480d3785ac73678f9e393df2a05ba93
|
||||
size 1982856
|
||||
oid sha256:941a0dcdf53fe2952baf73720ae3afe8556e629f1873ecf5e0a4235ef074a0ed
|
||||
size 1993520
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
68b4fbc2bb6ed68a051b11a006b8984f libtensorrt_llm_ucx_wrapper.so
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
93eb7c6098fe8327cd90ce1cd564e9a9 libtensorrt_llm_ucx_wrapper.so
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b382d9cbfc7b4a4dd55b44c87d8dc196726e94d0a0e74c11aa9e2673e7445d3b
|
||||
size 5053722
|
||||
oid sha256:f16aa394aaeb2fd6e7f61cf1e171fac868ee1857b9df582d640c1bb03574c942
|
||||
size 5152714
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:385cd17e249762358be090d392087f299462ab478e0581b3a5d29d1f7d12d769
|
||||
size 5021342
|
||||
oid sha256:7962f37b09d99eec224b5c7a96c4fb895c983b35cb2abb00cf498fb671c183c4
|
||||
size 5121480
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:4b143227c742854cd898d1460b5fe6bc1088d38532947c775dfaa27361040909
|
||||
oid sha256:5446aea5178d0d0a7b4f8be4a750065b3bfcdd35f91730c5bcb2dff079fbe287
|
||||
size 11648
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
9f28106626687e336dba9e9ce51b6353 libtensorrt_llm_ucx_wrapper.so
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
1fe3fbf1da6fb9689237743d902f59e7 libtensorrt_llm_ucx_wrapper.so
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:fefbf87e36ee6b652bb52df28f06b47edbb64f16af5ac4497318d3af23dc6e84
|
||||
size 32610670
|
||||
oid sha256:d3fc5de5061f30e7875f0d86bfecfe1c42f98319f7a6a4339b3027bfe39e9eb5
|
||||
size 33892906
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
e82c837214de4ebf6941beced3d528e1 tensorrt_llm_batch_manager_static.lib
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
19f7f28149bccf535e56ba0bc96a5954 tensorrt_llm_batch_manager_static.lib
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -157,4 +157,42 @@ bool getEnvEnablePDL()
|
||||
return enablePDL;
|
||||
}
|
||||
|
||||
bool getEnvUseUCXKvCache()
|
||||
{
|
||||
static bool init = false;
|
||||
static bool useUCXKVCache = false;
|
||||
if (!init)
|
||||
{
|
||||
init = true;
|
||||
{
|
||||
char const* use_ucx_kv_cache = std::getenv("TRTLLM_USE_UCX_KVCACHE");
|
||||
if (use_ucx_kv_cache)
|
||||
{
|
||||
if (use_ucx_kv_cache[0] == '1' && use_ucx_kv_cache[1] == '\0')
|
||||
{
|
||||
useUCXKVCache = true;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
return useUCXKVCache;
|
||||
}
|
||||
|
||||
std::string getEnvUCXInterface()
|
||||
{
|
||||
static bool init = false;
|
||||
static std::string ucxInterface;
|
||||
if (!init)
|
||||
{
|
||||
init = true;
|
||||
{
|
||||
char const* ucx_interface = std::getenv("TRTLLM_UCX_INTERFACE");
|
||||
if (ucx_interface)
|
||||
{
|
||||
ucxInterface = ucx_interface;
|
||||
}
|
||||
}
|
||||
}
|
||||
return ucxInterface;
|
||||
}
|
||||
} // namespace tensorrt_llm::common
|
||||
|
||||
@ -18,6 +18,7 @@
|
||||
#pragma once
|
||||
#include <cstdint>
|
||||
#include <optional>
|
||||
#include <string>
|
||||
|
||||
namespace tensorrt_llm::common
|
||||
{
|
||||
@ -40,4 +41,8 @@ int getEnvMmhaKernelBlockSize();
|
||||
// Whether PDL is enabled.
|
||||
bool getEnvEnablePDL();
|
||||
|
||||
bool getEnvUseUCXKvCache();
|
||||
|
||||
std::string getEnvUCXInterface();
|
||||
|
||||
} // namespace tensorrt_llm::common
|
||||
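A hedged usage sketch of the two new environment-variable getters: TRTLLM_USE_UCX_KVCACHE and TRTLLM_UCX_INTERFACE are read once and cached in static locals on the first call, so they must be set before the first query; the envUtils.h include path is an assumption.

#include "tensorrt_llm/common/envUtils.h" // assumed header path for the declarations above
#include <cstdio>

void logUcxKvCacheSettings()
{
    if (tensorrt_llm::common::getEnvUseUCXKvCache())
    {
        std::printf("UCX KV-cache transfer enabled, interface: '%s'\n",
            tensorrt_llm::common::getEnvUCXInterface().c_str());
    }
}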
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:a88c97950343f3bb6e630c032858099c118a24b086aa97b4d0aefd3f50439497
|
||||
size 3382878
|
||||
oid sha256:7c39ae574064fae76e804f9ca65321c00a3ee1400ed84ed5e5ab4098c335acfa
|
||||
size 2277310
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:aefeea1ece28c2a9e5e87150b2c0dbbfa88333470ebaf424dc8ea1f5ede69a07
|
||||
size 3408900
|
||||
oid sha256:45b2680b72e076323e36b36fa4946d9d043136cf05818afe626d3645b8bf1896
|
||||
size 2308950
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
e69c2223ff214d2339b7466115346146 libtensorrt_llm_executor_static.a
|
||||
65deeb71a548a81e57153fa3ad21669c libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
ce1a318e5b898bddb293de47729b4be4 libtensorrt_llm_executor_static.a
|
||||
9370519464da3b7dfc95c5fcb3c40d13 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:22b35d0a4dc2de4c25873d162a5b1edf2d3d16fcbef413aee5a80bbe69db2e37
|
||||
size 7880014
|
||||
oid sha256:9fcdfc9bc55d856e532cf351145ca98e4b351c41452fdeec8b4d880a592316e2
|
||||
size 3356696
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ebf47f01dd8235116ac28beea016600bffb2fa36cf3b568ebfb2da2b91469379
|
||||
size 7796554
|
||||
oid sha256:8a8d6e3c6d24decec3b9b352a36d7d898d32bfadf3a0645424bd8ae426b04c4f
|
||||
size 3278590
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
4c2d8fce7081ac6148b13717636be5ff libtensorrt_llm_executor_static.a
|
||||
3341fe285c45cd3b86ea512099bda6a0 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
b56fd65be76f98bca18ef4e6e1a2744a libtensorrt_llm_executor_static.a
|
||||
9f7fc03c3b2a9e7690663adb23a4f036 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:69b4f3070af978cb53036e7370f59e0a6902f5aca782a8a0a2bc34246cae0637
|
||||
size 24707382
|
||||
oid sha256:bd6c3e401f6fd0636f43337ddcfb2bd5c7d5d21c1ea41d421152b9c2e8fec810
|
||||
size 21794946
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
520287b802be61e0d36ba8b1df8f06a0 tensorrt_llm_executor_static.lib
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
054197bdbdeace4f818ca8c1bc1f7308 tensorrt_llm_executor_static.lib
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1565,10 +1565,17 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
|
||||
if constexpr (DO_CROSS_ATTENTION)
|
||||
{
|
||||
auto const k_idx = QK_VEC_SIZE * tidx;
|
||||
int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
|
||||
Tcache* k_cache = reinterpret_cast<Tcache*>(kvCacheBuffer.getKBlockPtr(batch_beam_idx, cyclic_tlength));
|
||||
int const inBlockIdx = pastKCache.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
|
||||
Tcache* k_cache = reinterpret_cast<Tcache*>(pastKCache.getKBlockPtr(batch_beam_idx, cyclic_tlength));
|
||||
|
||||
k = vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<Qk_vec_m const*>(&k_cache[inBlockIdx]));
|
||||
if constexpr (ENABLE_8BITS_K_CACHE)
|
||||
{
|
||||
load_8bits_kv_cache_vec(&k, k_cache, inBlockIdx, k_scale_quant_orig_f);
|
||||
}
|
||||
else
|
||||
{
|
||||
k = vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<Qk_vec_m const*>(&k_cache[inBlockIdx]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -2359,7 +2366,14 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
|
||||
V_vec_k v;
|
||||
if (DO_CROSS_ATTENTION)
|
||||
{
|
||||
v = vec_conversion<V_vec_k, V_vec_k>(*reinterpret_cast<V_vec_k const*>(&v_cache_base[inBlockIdx]));
|
||||
if constexpr (ENABLE_8BITS_KV_CACHE)
|
||||
{
|
||||
load_8bits_kv_cache_vec(&v, v_cache_base, inBlockIdx, kv_scale_quant_orig_f);
|
||||
}
|
||||
else
|
||||
{
|
||||
v = vec_conversion<V_vec_k, V_vec_k>(*reinterpret_cast<V_vec_k const*>(&v_cache_base[inBlockIdx]));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -2401,7 +2415,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
|
||||
// Store the values with bias back to global memory in the cache for V.
|
||||
//*reinterpret_cast<V_vec_k*>(&v_cache[params.timestep*Dh]) = v;
|
||||
// For MQA/GQA mode, write only with the first Q head of each group per KV head.
|
||||
if (hi == (hi_kv * qhead_per_kv))
|
||||
if (hi == (hi_kv * qhead_per_kv) && !DO_CROSS_ATTENTION)
|
||||
{
|
||||
if (ENABLE_8BITS_KV_CACHE)
|
||||
{
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:e73dd3a8859cd67c62ab89a98381028bd20ac9e756f0346bbbaab0fb6c566eb7
|
||||
size 81578760
|
||||
oid sha256:1397678ac1cab957f7d272750035ffd88b9e2b3b9d4f132073119d21c288b5da
|
||||
size 82262624
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
5ea3eabf1c58887230ba5ebc583e0d3c libtensorrt_llm_nvrtc_wrapper.so
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:c11e0550552f4cc3568ac11de47079d5c6bd88aeb34ebbd52b39f4f732afbd7d
|
||||
size 84839528
|
||||
oid sha256:17481ff01045ac335223451c83943a7f97b5c63ca2ab5da3e71d0909c8f4e68b
|
||||
size 84578328
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
270f246f5eccb170a759cda4787216f4 libtensorrt_llm_nvrtc_wrapper.so
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f703220b82bac0368893a9df301a8e756e92546248bf0212e804b92fc6cd593e
|
||||
oid sha256:c641f66a5f508d75ca8ac1d74b99fd61a8f9b6f96fe9820620520bb0a0beda23
|
||||
size 1128448
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
c5f36e093e875c8ea84523fb1566d986 tensorrt_llm_nvrtc_wrapper.lib
|
||||
54a96b4243b8fbcb0a331986b3bf70b9 tensorrt_llm_nvrtc_wrapper.dll
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
f7224da82447d5396ce952c4edb0b044 tensorrt_llm_nvrtc_wrapper.dll
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -624,18 +624,19 @@ void invokeCopyBeamHypotheses(DecodingOutput::BeamHypotheses const& src, Decodin
|
||||
copyBeamHypotheses<<<numSMs, 256, 0, stream.get()>>>(copyStruct);
|
||||
}
|
||||
|
||||
__global__ void initializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const nMaxSeqLen)
|
||||
__global__ void initializeOutput(
|
||||
TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const beam, SizeType32 const nMaxSeqLen)
|
||||
{
|
||||
for (int i = threadIdx.x; i < nMaxSeqLen; i += blockDim.x)
|
||||
{
|
||||
finalOutputIds[blockIdx.x * nMaxSeqLen + i] = endIds[blockIdx.x];
|
||||
finalOutputIds[blockIdx.x * nMaxSeqLen + i] = endIds[blockIdx.x / beam];
|
||||
}
|
||||
}
|
||||
|
||||
void invokeInitializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const batchBeam,
|
||||
SizeType32 const nMaxSeqLen, cudaStream_t stream)
|
||||
void invokeInitializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const batch,
|
||||
SizeType32 const beam, SizeType32 const nMaxSeqLen, cudaStream_t stream)
|
||||
{
|
||||
initializeOutput<<<batchBeam, 256, 0, stream>>>(finalOutputIds, endIds, nMaxSeqLen);
|
||||
initializeOutput<<<batch * beam, 256, 0, stream>>>(finalOutputIds, endIds, beam, nMaxSeqLen);
|
||||
}
|
||||
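The hunk above changes end-token prefilling so that endIds is indexed per batch entry rather than per batch*beam row. A minimal host-side reference of that indexing, with int standing in for TokenIdType:

#include <cstddef>
#include <vector>

std::vector<int> initializeOutputRef(std::vector<int> const& endIds, int batch, int beam, int maxSeqLen)
{
    // finalOutputIds has batch * beam rows of length maxSeqLen, while endIds holds one end token
    // per batch entry, so row r is prefilled with endIds[r / beam] rather than endIds[r].
    std::vector<int> finalOutputIds(static_cast<std::size_t>(batch) * beam * maxSeqLen);
    for (int row = 0; row < batch * beam; ++row)
    {
        for (int i = 0; i < maxSeqLen; ++i)
        {
            finalOutputIds[static_cast<std::size_t>(row) * maxSeqLen + i] = endIds[row / beam];
        }
    }
    return finalOutputIds;
}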
|
||||
__global__ void copyNextStepIds(TokenIdType* nextStepIds, TokenIdType const* const* outputIdsPtr,
|
||||
@ -742,7 +743,7 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod
|
||||
|
||||
// prefill finalOutputIds with the EOS tokens from decodingInput.endIds
|
||||
tensorrt_llm::kernels::invokeInitializeOutput(bufferCast<TokenIdType>(finalOutputIds),
|
||||
bufferCast<TokenIdType>(*decodingInput.endIds), batchSize * beamWidth, maxSeqLength, stream);
|
||||
bufferCast<TokenIdType>(*decodingInput.endIds), batchSize, beamWidth, maxSeqLength, stream);
|
||||
sync_check_cuda_error();
|
||||
|
||||
std::vector<float> lengthPenaltyVec;
|
||||
|
||||
@ -72,7 +72,7 @@ void invokeFinalize(BeamHypotheses& bh, cudaStream_t stream);
|
||||
//! \param maxSeqLen The maximum sequence length, inferred from the finalOutputIds.shape[3]
|
||||
//! \param stream The CUDA stream on which to perform the operation.
|
||||
void invokeInitializeOutput(runtime::TokenIdType* finalOutputIds, runtime::TokenIdType const* endIds,
|
||||
runtime::SizeType32 batchBeam, runtime::SizeType32 maxSeqLen, cudaStream_t stream);
|
||||
runtime::SizeType32 batch, runtime::SizeType32 beam, runtime::SizeType32 maxSeqLen, cudaStream_t stream);
|
||||
|
||||
//! \brief Copies the data from the buffers in src to dst to reduce the kernel launch overhead of individual memcpy.
|
||||
//! for streaming + beam search, where we need to avoid overwriting the beam search buffers.
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:5081f260dc93b1caf996803253ab27fb290229e914feebfb8bd76b57e4e0a33e
|
||||
size 25294842
|
||||
oid sha256:60f54be9c7f7517b76112dcf9185cb8f4b2c69c11aec3b7afaaf86f7d0cc9b70
|
||||
size 25257850
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:b2d4225f5c76786ee808df56277cbe9328cd4da1a6265a15fc9c730353cdcdbf
|
||||
size 25699742
|
||||
oid sha256:2e2f1a38e7df380dd6048158565ffdc269dea2251078b3c8f2265f8e533337a4
|
||||
size 25663182
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
87ec6b3278d52f9e2d8a23a20c38560b libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
d8d2c5cb915dcac97f2279166f7eae58 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
74f70e14a884d8e29144595686634296 libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
2de60e6d3211395ce4481ffede35c2f4 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:ed0568d70c32b592a1a9fbd5c129b7c19ae1eca5fba2b68375f29b75bca32499
|
||||
size 44104384
|
||||
oid sha256:68ff5e652a2ec0d295583f4e3a8696df5d3606c2ffde50e4f205b87aea2f6c91
|
||||
size 44055616
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:135a32f33cec8c10c7d780df3d3f55ea8b9d2b1fe0a553ad6834cff81d42cbaf
|
||||
size 43491958
|
||||
oid sha256:b598263e5df63ad110ab82bd2c3ea86cf4760350488122c8d14d908c1b84c045
|
||||
size 43443190
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
2bb339277f872a112fc9c373128a7685 libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
d212bad6fa0f17d00ee90376a2a250c0 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
6c4a716c6aefe9a68501d4bd6901cc95 libtensorrt_llm_internal_cutlass_kernels_static.a
|
||||
0134143d17d8ff5b7326235f85f664f5 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:2de729c93894e86ad9cae674a567c02677acb90bb6c970f2ce5dfcd04c489816
|
||||
size 88140802
|
||||
oid sha256:e72bd01eced686736f3e96d445e80964b83da77067c0c622f512630f625c2f40
|
||||
size 88140804
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
8bf62183399a59548962a18ab2dec352 tensorrt_llm_internal_cutlass_kernels_static.lib
|
||||
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
|
||||
d908f9e8ed1928239467a78b30ae7b35 tensorrt_llm_internal_cutlass_kernels_static.lib
|
||||
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit
|
||||
@ -127,12 +127,10 @@ size_t LoraImpl::getWorkspaceSize(
|
||||
{
|
||||
TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
|
||||
auto const typeSize = tensorrt_llm::common::getDTypeSize(type);
|
||||
TLLM_CHECK_WITH_INFO(
    numTokens >= numReqs, fmtstr("num tokens %ld should be at least num reqs %ld", numTokens, numReqs));
|
||||
|
||||
return (size_t) getGemmWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices)
|
||||
+ getLowRankWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, typeSize)
|
||||
+ getGemmParamsWorkSpaceSize(numReqs * mNumLoraModules);
|
||||
+ getGemmParamsWorkSpaceSize(std::min(numReqs, numTokens) * mNumLoraModules);
|
||||
}
|
||||
|
||||
void LoraImpl::setBestTactic(std::optional<Config> config)
|
||||
@ -163,7 +161,7 @@ int LoraImpl::run(int64_t numTokens, int64_t numReqs, void const* input, int32_t
|
||||
setGemmConfig();
|
||||
|
||||
int64_t GemmWorkSpaceSize = getGemmWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices);
|
||||
int64_t groupGemmParamsWorkSpaceSize = getGemmParamsWorkSpaceSize(numReqs * mNumLoraModules);
|
||||
int64_t groupGemmParamsWorkSpaceSize = getGemmParamsWorkSpaceSize(std::min(numReqs, numTokens) * mNumLoraModules);
|
||||
void* gemmWorkSpace = workspace; // [gemmWorkSpace, lowrankWorkSpace, groupGemmParamsWorkSpace]
|
||||
void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + GemmWorkSpaceSize;
|
||||
void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace)
|
||||
|
||||
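For reference, the workspace sized above is a single allocation carved into three consecutive regions (GEMM workspace, low-rank intermediate, grouped-GEMM parameters), matching the layout comment in run(). A minimal sketch of that partitioning, with illustrative size parameters:

#include <cstddef>

struct LoraWorkspaceView
{
    void* gemm;            // [0, gemmBytes)
    void* lowRank;         // [gemmBytes, gemmBytes + lowRankBytes)
    void* groupGemmParams; // [gemmBytes + lowRankBytes, ...)
};

LoraWorkspaceView carveLoraWorkspace(void* workspace, std::size_t gemmBytes, std::size_t lowRankBytes)
{
    auto* base = static_cast<char*>(workspace);
    return LoraWorkspaceView{base, base + gemmBytes, base + gemmBytes + lowRankBytes};
}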
@ -1033,4 +1033,104 @@ void invokeCopyOutputTokensIds(runtime::TokenIdType** tmpOutputIdsPtrs, runtime:
|
||||
batchSize, numInputLogits, maxDecodingDraftTokens);
|
||||
}
|
||||
|
||||
namespace
|
||||
{
|
||||
__global__ void packEagleGenerationLengths(PackEagleParams params)
|
||||
{
|
||||
auto const batchIdx = static_cast<SizeType32>(blockIdx.x);
|
||||
auto const batchSlot = params.batchSlots[batchIdx];
|
||||
|
||||
auto const isGenerationRequest = batchIdx >= params.numContextRequests;
|
||||
auto const genIdx = batchIdx - params.numContextRequests;
|
||||
|
||||
if (threadIdx.x == 0 && isGenerationRequest)
|
||||
{
|
||||
params.outputSpecDecodingGenerationLengths[genIdx] = params.inputNextDraftLens[batchSlot];
|
||||
}
|
||||
}
|
||||
|
||||
__global__ void packEagleTensors(PackEagleParams params)
|
||||
{
|
||||
auto const batchIdx = static_cast<SizeType32>(blockIdx.x);
|
||||
auto const batchSlot = params.batchSlots[batchIdx];
|
||||
|
||||
auto const isGenerationRequest = batchIdx >= params.numContextRequests;
|
||||
auto const genIdx = batchIdx - params.numContextRequests;
|
||||
|
||||
// Copy data that is 1 elem per request
|
||||
if (threadIdx.x == 0)
|
||||
{
|
||||
params.outputRandomDataSample[batchIdx] = params.inputRandomDataSample[batchSlot];
|
||||
params.outputTemperatures[batchIdx] = params.inputTemperatures[batchSlot];
|
||||
// FIXME we need 1 value per draft token
|
||||
params.outputRandomDataValidation[batchIdx] = params.inputRandomDataValidation[batchSlot];
|
||||
|
||||
// 0 for ctx requests and the actual draft length for gen requests.
|
||||
params.outputNextDraftLens[batchIdx] = isGenerationRequest ? params.inputNextDraftLens[batchSlot] : 0;
|
||||
}
|
||||
|
||||
// Copy draft paths
|
||||
auto const numPathElts = params.maxNumPaths * params.maxPathLength;
|
||||
auto outputNextDraftPaths = params.outputNextDraftPaths + batchIdx * numPathElts;
|
||||
auto const inputNextDraftPaths = params.inputNextDraftPaths + batchSlot * numPathElts;
|
||||
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < numPathElts; ti += static_cast<SizeType32>(blockDim.x))
|
||||
{
|
||||
outputNextDraftPaths[ti] = inputNextDraftPaths[ti];
|
||||
}
|
||||
|
||||
if (isGenerationRequest)
|
||||
{
|
||||
// Copy draft tokens. We do it only for gen requests as for ctx requests outputNextDraftLens is 0.
|
||||
auto const maxDecodingDraftTokens = params.maxDecodingTokens - 1;
|
||||
auto outputNextDraftTokens = params.outputNextDraftTokens + batchIdx * maxDecodingDraftTokens;
|
||||
auto const inputNextDraftTokens = params.inputNextDraftTokens + batchSlot * maxDecodingDraftTokens;
|
||||
|
||||
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < maxDecodingDraftTokens;
|
||||
ti += static_cast<SizeType32>(blockDim.x))
|
||||
{
|
||||
outputNextDraftTokens[ti] = inputNextDraftTokens[ti];
|
||||
}
|
||||
|
||||
auto const maxGenerationLength = params.maxGenerationLength[0];
|
||||
auto const numPackedMasks = divUp(params.maxDecodingTokens, 32);
|
||||
auto const outputStartId = (genIdx == 0) ? 0 : params.cumSumGenerationLengths[genIdx - 1];
|
||||
auto const numTokens = (genIdx == 0)
|
||||
? params.cumSumGenerationLengths[0]
|
||||
: params.cumSumGenerationLengths[genIdx] - params.cumSumGenerationLengths[genIdx - 1];
|
||||
// Copy packed masks.
|
||||
// Masks are placed next to each other with offsets of cumSumGenerationLengths[bi-1]
|
||||
auto const inputPackedMask
|
||||
= params.inputSpecDecodingPackedMasks + batchSlot * numPackedMasks * params.maxDecodingTokens;
|
||||
auto outputPackedMask = params.outputSpecDecodingPackedMasks + outputStartId * numPackedMasks;
|
||||
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < numTokens * numPackedMasks;
|
||||
ti += static_cast<SizeType32>(blockDim.x))
|
||||
{
|
||||
outputPackedMask[ti] = inputPackedMask[ti];
|
||||
}
|
||||
|
||||
// Copy position offsets; only the first maxGenerationLength offsets are copied per request.
|
||||
auto const inputPositionOffsets
|
||||
= params.inputSpecDecodingPositionOffsets + batchSlot * params.maxDecodingTokens;
|
||||
auto outputPositionOffsets = params.outputSpecDecodingPositionOffsets + genIdx * maxGenerationLength;
|
||||
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < maxGenerationLength;
|
||||
ti += static_cast<SizeType32>(blockDim.x))
|
||||
{
|
||||
outputPositionOffsets[ti] = inputPositionOffsets[ti];
|
||||
}
|
||||
}
|
||||
}
|
||||
} // namespace
|
||||
|
||||
void invokePackEagleGenerationLengths(PackEagleParams const& params, cudaStream_t stream)
|
||||
{
|
||||
SizeType32 constexpr BLOCK_SIZE = 32;
|
||||
packEagleGenerationLengths<<<params.batchSize, BLOCK_SIZE, 0, stream>>>(params);
|
||||
}
|
||||
|
||||
void invokePackEagle(PackEagleParams const& params, cudaStream_t stream)
|
||||
{
|
||||
SizeType32 constexpr BLOCK_SIZE = 128;
|
||||
packEagleTensors<<<params.batchSize, BLOCK_SIZE, 0, stream>>>(params);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::kernels::speculative_decoding
|
||||
|
||||
@ -295,4 +295,103 @@ struct PrepareGenEagleNetInputsParams
|
||||
//! \brief Prepares inputs for the gen stage EagleNet iteration (layerIdx > 0).
|
||||
//! For input/output examples visit test/model/eagle/test_prepare_drafter_inputs_plugin.py (gen Eagle Net examples)
|
||||
void invokePrepareGenEagleNetInputs(PrepareGenEagleNetInputsParams const& params);
|
||||
|
||||
struct PackEagleParams
|
||||
{
|
||||
runtime::SizeType32 batchSize{0};
|
||||
runtime::SizeType32 maxNumPaths{0};
|
||||
runtime::SizeType32 maxDecodingTokens{0};
|
||||
runtime::SizeType32 maxPathLength{0};
|
||||
runtime::SizeType32 numContextRequests{0};
|
||||
runtime::SizeType32 numGenerationRequests{0};
|
||||
|
||||
//! inputs
|
||||
//! [batchSize]
|
||||
runtime::SizeType32 const* batchSlots{nullptr};
|
||||
|
||||
//! [maxBatchSize]
|
||||
float const* inputTemperatures{nullptr};
|
||||
//! [maxBatchSize]
|
||||
float const* inputRandomDataSample{nullptr};
|
||||
//! [maxBatchSize]
|
||||
float const* inputRandomDataValidation{nullptr};
|
||||
//! [maxBatchSize, maxDecodingDraftTokens]
|
||||
runtime::TokenIdType const* inputNextDraftTokens{nullptr};
|
||||
//! [maxBatchSize]
|
||||
runtime::SizeType32 const* inputNextDraftLens{nullptr};
|
||||
//! [maxBatchSize, maxDecodingTokens, maxPathLen]
|
||||
runtime::SizeType32 const* inputNextDraftPaths{nullptr};
|
||||
//! [maxBatchSize]
|
||||
runtime::SizeType32 const* inputSpecDecodingGenerationLengths{nullptr};
|
||||
//! [maxBatchSize]
|
||||
runtime::SizeType32 const* inputSpecDecodingPositionOffsets{nullptr};
|
||||
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
|
||||
int32_t const* inputSpecDecodingPackedMasks{nullptr};
|
||||
|
||||
//! outputs
|
||||
//! [batchSize]
|
||||
float* outputTemperatures{nullptr};
|
||||
//! [batchSize]
|
||||
float* outputRandomDataSample{nullptr};
|
||||
//! [batchSize]
|
||||
float* outputRandomDataValidation{nullptr};
|
||||
//! [batchSize, maxDecodingDraftTokens]
|
||||
runtime::TokenIdType* outputNextDraftTokens{nullptr};
|
||||
//! [batchSize]
|
||||
runtime::SizeType32* outputNextDraftLens{nullptr};
|
||||
//! [batchSize, maxDecodingTokens, maxPathLen]
|
||||
runtime::SizeType32* outputNextDraftPaths{nullptr};
|
||||
//! [batchSize]
|
||||
runtime::SizeType32* outputSpecDecodingGenerationLengths{nullptr};
|
||||
//! [batchSize]
|
||||
runtime::SizeType32* outputSpecDecodingPositionOffsets{nullptr};
|
||||
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
|
||||
int32_t* outputSpecDecodingPackedMasks{nullptr};
|
||||
|
||||
// workspace
|
||||
//! [1]
|
||||
runtime::SizeType32* maxGenerationLength{nullptr};
|
||||
//! [batchSize + 1]
|
||||
runtime::SizeType32* cumSumGenerationLengths{nullptr};
|
||||
|
||||
void checkParams()
|
||||
{
|
||||
TLLM_CHECK(batchSlots);
|
||||
|
||||
TLLM_CHECK(inputTemperatures);
|
||||
TLLM_CHECK(inputRandomDataSample);
|
||||
TLLM_CHECK(inputRandomDataValidation);
|
||||
TLLM_CHECK(inputNextDraftTokens);
|
||||
TLLM_CHECK(inputNextDraftLens);
|
||||
TLLM_CHECK(inputNextDraftPaths);
|
||||
TLLM_CHECK(inputSpecDecodingGenerationLengths);
|
||||
TLLM_CHECK(inputSpecDecodingPositionOffsets);
|
||||
TLLM_CHECK(inputSpecDecodingPackedMasks);
|
||||
|
||||
TLLM_CHECK(outputTemperatures);
|
||||
TLLM_CHECK(outputRandomDataSample);
|
||||
TLLM_CHECK(outputRandomDataValidation);
|
||||
TLLM_CHECK(outputNextDraftTokens);
|
||||
TLLM_CHECK(outputNextDraftLens);
|
||||
TLLM_CHECK(outputNextDraftPaths);
|
||||
TLLM_CHECK(outputSpecDecodingGenerationLengths);
|
||||
TLLM_CHECK(outputSpecDecodingPositionOffsets);
|
||||
TLLM_CHECK(outputSpecDecodingPackedMasks);
|
||||
|
||||
TLLM_CHECK(maxGenerationLength);
|
||||
TLLM_CHECK(cumSumGenerationLengths);
|
||||
|
||||
TLLM_CHECK(batchSize > 0);
|
||||
TLLM_CHECK(batchSize == numContextRequests + numGenerationRequests);
|
||||
TLLM_CHECK(maxDecodingTokens > 0);
|
||||
TLLM_CHECK(maxPathLength > 0);
|
||||
TLLM_CHECK(maxNumPaths > 0);
|
||||
}
|
||||
};
|
||||
|
||||
//! \brief Packs outputSpecDecodingGenerationLengths from batch-slot positions into contiguous memory.
|
||||
void invokePackEagleGenerationLengths(PackEagleParams const& params, cudaStream_t stream);
|
||||
//! \brief Packs the rest of the output tensors from batch-slot positions into contiguous memory.
|
||||
void invokePackEagle(PackEagleParams const& params, cudaStream_t stream);
|
||||
|
||||
} // namespace tensorrt_llm::kernels::speculative_decoding
|
||||
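A scalar host-side reference of what invokePackEagle does for the per-request fields, with plain std::vector standing in for the device tensors: values scattered at batchSlots[i] in maxBatchSize-sized inputs are gathered into the first batchSize entries of the outputs, and draft lengths are zeroed for context requests.

#include <cstddef>
#include <vector>

void packEagleScalarsRef(std::vector<int> const& batchSlots, std::vector<float> const& inTemperatures,
    std::vector<int> const& inNextDraftLens, int numContextRequests, std::vector<float>& outTemperatures,
    std::vector<int>& outNextDraftLens)
{
    for (std::size_t bi = 0; bi < batchSlots.size(); ++bi)
    {
        auto const slot = static_cast<std::size_t>(batchSlots[bi]);
        // Gather from the slot position into compact, batch-ordered outputs.
        outTemperatures[bi] = inTemperatures[slot];
        // Context requests report a draft length of 0; generation requests keep their real length.
        outNextDraftLens[bi] = static_cast<int>(bi) >= numContextRequests ? inNextDraftLens[slot] : 0;
    }
}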
|
||||
@ -1063,7 +1063,7 @@ void kernelV2DispatchHeadSize(QKVPreprocessingParams<T, KVCacheBuffer> params, c
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
template <typename T, typename TCache, typename KVCacheBuffer, int BLOCK_SIZE, int Dh>
|
||||
template <typename T, typename TCache, typename KVCacheBuffer, int BLOCK_SIZE, int Dh, bool FP8_OUTPUT>
|
||||
__global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCacheBuffer> params)
|
||||
{
|
||||
// For cross-attention,
|
||||
@ -1121,6 +1121,14 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
|
||||
int const src_k_offset = params.q_hidden_size;
|
||||
int const src_v_offset = src_k_offset + params.kv_hidden_size;
|
||||
|
||||
// Cast float scale to dst data type.
|
||||
using TScale = typename mmha::kv_cache_scale_type_t<T, TCache>::Type;
|
||||
[[maybe_unused]] TScale scale_orig_quant;
|
||||
if constexpr (sizeof(TCache) == 1 || FP8_OUTPUT)
|
||||
{
|
||||
mmha::convert_from_float(&scale_orig_quant, params.kvScaleOrigQuant ? params.kvScaleOrigQuant[0] : 1.0f);
|
||||
}
|
||||
|
||||
// For loop in the sequence length dimension.
|
||||
// There might be multiple blocks (blockIdx.x) that process the same sequence in order to fully utilize
|
||||
for (int token_idx = blockIdx.x * TOKENS_PER_BLOCK + (threadIdx.x / VECS_PER_HEAD); token_idx < max_seq_len;
|
||||
@ -1139,8 +1147,18 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
|
||||
// Only load Q tokens from decoder qkv input.
|
||||
auto q = *reinterpret_cast<VecT const*>(params.qkv_input + src_q_idx);
|
||||
|
||||
// Store it to a separate q output.
|
||||
*reinterpret_cast<VecT*>(params.q_output + dst_q_idx) = q;
|
||||
// Quantize the output to fp8.
|
||||
if constexpr (FP8_OUTPUT)
|
||||
{
|
||||
using OutputType = __nv_fp8_e4m3;
|
||||
OutputType* quantized_q_ptr = reinterpret_cast<OutputType*>(params.q_output) + dst_q_idx;
|
||||
mmha::store_8bits_vec(quantized_q_ptr, q, 0, scale_orig_quant);
|
||||
}
|
||||
else
|
||||
{
|
||||
// Store it to a separate q output.
|
||||
*reinterpret_cast<VecT*>(params.q_output + dst_q_idx) = q;
|
||||
}
|
||||
}
|
||||
|
||||
// Encoder tokens (i.e. KV tokens).
|
||||
@ -1175,10 +1193,6 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
|
||||
{
|
||||
// The element index inside the block.
|
||||
auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
|
||||
// Cast float scale to dst data type.
|
||||
using TScale = typename mmha::kv_cache_scale_type_t<T, TCache>::Type;
|
||||
TScale scale_orig_quant;
|
||||
mmha::convert_from_float(&scale_orig_quant, params.kvScaleOrigQuant[0]);
|
||||
// Store 8bits kv cache.
|
||||
mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
|
||||
mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
|
||||
@ -1217,7 +1231,16 @@ void invokeUpdateKvCacheForCrossAttention(QKVPreprocessingParams<T, KVCacheBuffe
|
||||
dim3 grid(num_seq_blocks, params.head_num, params.batch_size);
|
||||
|
||||
// Launch the kernel.
|
||||
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh><<<grid, block, 0, stream>>>(params);
|
||||
if (params.quantized_fp8_output)
|
||||
{
|
||||
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh, true>
|
||||
<<<grid, block, 0, stream>>>(params);
|
||||
}
|
||||
else
|
||||
{
|
||||
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh, false>
|
||||
<<<grid, block, 0, stream>>>(params);
|
||||
}
|
||||
}
|
||||
|
||||
///////////////////////////////////////////////////////////////////////////////////////////////////
|
||||
|
||||
@ -785,7 +785,7 @@ int GPTAttentionPluginCommon::enqueueContext(EnqueueContextParams<T> const& para
|
||||
= mEnableContextFMHA ? 0 : sizeof(T) * params.batch_size * params.input_seq_length * kv_seq_length;
|
||||
size_t const cu_seqlens_size = sizeof(int) * (params.batch_size + 1);
|
||||
size_t const rotary_inv_freq_size = sizeof(float) * params.batch_size * mRotaryEmbeddingDim / 2;
|
||||
size_t const q_buf_2_size = mFMHARunner->isSeparateQAndKvInput() || !mEnableContextFMHA
|
||||
size_t const q_buf_2_size = !mEnableContextFMHA || mFMHARunner->isSeparateQAndKvInput()
|
||||
? sizeof(T) * params.batch_size * params.input_seq_length * local_hidden_units_qo
|
||||
: 0;
|
||||
size_t const k_buf_2_size
|
||||
|
||||
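The operand reorder above presumably ensures mFMHARunner is only dereferenced when context FMHA is enabled, i.e. when the runner is expected to exist. A small, self-contained illustration of that short-circuit guard (the FmhaRunner type here is hypothetical):

#include <memory>

struct FmhaRunner
{
    bool isSeparateQAndKvInput() const
    {
        return true;
    }
};

inline bool needsQBuf2(bool enableContextFMHA, std::unique_ptr<FmhaRunner> const& runner)
{
    // With this operand order, runner is dereferenced only when enableContextFMHA is true,
    // i.e. only when the runner is expected to have been created.
    return !enableContextFMHA || runner->isSeparateQAndKvInput();
}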
@ -86,10 +86,10 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
|
||||
return std::make_shared<tb::LlmRequest>(mRequestId, mMaxNewTokens,
|
||||
std::make_shared<std::vector<TokenIdType>>(mTokens.at(0)), mSamplingConfig, mIsStreaming, mEndId, mPadId,
|
||||
embeddingBias, badWordsList, stopWordsList, mPositionIds, promptEmbeddingTable, mPromptVocabSize, mLoraTaskId,
|
||||
loraWeights, loraConfig, mLookaheadConfig, returnLogProbs(), mReturnContextLogits, mReturnGenerationLogits,
|
||||
mDraftTokens, draftLogits, mExcludeInputFromOutput, callbackAdapter(mLogitsPostProcessor),
|
||||
mApplyLogitsPostProcessorBatched, mEncoderTokens, mReturnEncoderOutput, mClientId, mPriority,
|
||||
encoderInputFeatures, mEncoderOutputLength, crossAttentionMask,
|
||||
loraWeights, loraConfig, mLookaheadConfig, mKvCacheRetentionConfig, returnLogProbs(), mReturnContextLogits,
|
||||
mReturnGenerationLogits, mDraftTokens, draftLogits, mExcludeInputFromOutput,
|
||||
callbackAdapter(mLogitsPostProcessor), mApplyLogitsPostProcessorBatched, mEncoderTokens, mReturnEncoderOutput,
|
||||
mClientId, mPriority, encoderInputFeatures, mEncoderOutputLength, crossAttentionMask,
|
||||
tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, mInputTokenExtraIds, mNumReturnSequences);
|
||||
}
|
||||
|
||||
@ -102,8 +102,8 @@ void LlmRequest::initBindings(py::module_& m)
|
||||
std::optional<LlmRequest::TensorPtr>, std::optional<std::vector<LlmRequest::SizeType32>>,
|
||||
std::optional<LlmRequest::TensorPtr>, std::optional<LlmRequest::SizeType32>, std::optional<uint64_t>,
|
||||
std::optional<LlmRequest::TensorPtr>, std::optional<LlmRequest::TensorPtr>,
|
||||
std::optional<executor::LookaheadDecodingConfig>, bool, bool, bool,
|
||||
std::optional<LlmRequest::VecTokens>, std::optional<LlmRequest::TensorPtr>, bool,
|
||||
std::optional<executor::LookaheadDecodingConfig>, std::optional<executor::KvCacheRetentionConfig>,
|
||||
bool, bool, bool, std::optional<LlmRequest::VecTokens>, std::optional<LlmRequest::TensorPtr>, bool,
|
||||
std::optional<LlmRequest::LogitsPostProcessor>, bool, std::optional<LlmRequest::VecTokens>, bool,
|
||||
std::optional<RequestIdType>, executor::PriorityType, std::optional<LlmRequest::TensorPtr>,
|
||||
std::optional<LlmRequest::SizeType32>, std::optional<LlmRequest::TensorPtr>,
|
||||
@ -115,15 +115,15 @@ void LlmRequest::initBindings(py::module_& m)
|
||||
py::arg("prompt_embedding_table") = std::nullopt, py::arg("prompt_vocab_size") = std::nullopt,
|
||||
py::arg("lora_task_id") = std::nullopt, py::arg("lora_weights") = std::nullopt,
|
||||
py::arg("lora_config") = std::nullopt, py::arg("lookahead_config") = std::nullopt,
|
||||
py::arg("return_log_probs") = false, py::arg("return_context_logits") = false,
|
||||
py::arg("return_generation_logits") = false, py::arg("draft_tokens") = std::nullopt,
|
||||
py::arg("draft_logits") = std::nullopt, py::arg("exclude_input_from_output") = false,
|
||||
py::arg("logits_post_processor") = std::nullopt, py::arg("apply_logits_post_processor_batched") = false,
|
||||
py::arg("encoder_input_tokens") = std::nullopt, py::arg("return_encoder_output") = false,
|
||||
py::arg("client_id") = std::nullopt, py::arg("priority") = executor::Request::kDefaultPriority,
|
||||
py::arg("encoder_input_features") = std::nullopt, py::arg("encoder_output_length") = std::nullopt,
|
||||
py::arg("cross_attention_mask") = std::nullopt, py::arg("input_token_extra_ids") = std::nullopt,
|
||||
py::arg("num_return_sequences") = 1)
|
||||
py::arg("kv_cache_retention_config") = std::nullopt, py::arg("return_log_probs") = false,
|
||||
py::arg("return_context_logits") = false, py::arg("return_generation_logits") = false,
|
||||
py::arg("draft_tokens") = std::nullopt, py::arg("draft_logits") = std::nullopt,
|
||||
py::arg("exclude_input_from_output") = false, py::arg("logits_post_processor") = std::nullopt,
|
||||
py::arg("apply_logits_post_processor_batched") = false, py::arg("encoder_input_tokens") = std::nullopt,
|
||||
py::arg("return_encoder_output") = false, py::arg("client_id") = std::nullopt,
|
||||
py::arg("priority") = executor::Request::kDefaultPriority, py::arg("encoder_input_features") = std::nullopt,
|
||||
py::arg("encoder_output_length") = std::nullopt, py::arg("cross_attention_mask") = std::nullopt,
|
||||
py::arg("input_token_extra_ids") = std::nullopt, py::arg("num_return_sequences") = 1)
|
||||
.def("get_num_tokens", &LlmRequest::getNumTokens, py::arg("beam"))
|
||||
.def_property_readonly("max_beam_num_tokens", &LlmRequest::getMaxBeamNumTokens)
|
||||
.def("get_token", &LlmRequest::getToken, py::arg("beam"), py::arg("pos"))
|
||||
|
||||
@ -68,8 +68,9 @@ public:
|
||||
std::optional<SizeType32> promptVocabSize = std::nullopt,
|
||||
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
|
||||
std::optional<TensorPtr> loraConfig = std::nullopt,
|
||||
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
|
||||
bool returnContextLogits = false, bool returnGenerationLogits = false,
|
||||
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
|
||||
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
|
||||
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
|
||||
std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
|
||||
bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
|
||||
bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
|
||||
@ -83,8 +84,8 @@ public:
|
||||
samplingConfig, isStreaming, endId, padId, embeddingBias, badWordsList, stopWordsList,
|
||||
positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
|
||||
: std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
|
||||
promptEmbeddingTable, promptVocabSize, loraTaskId, loraWeights, loraConfig, lookaheadConfig, returnLogProbs,
|
||||
returnContextLogits, returnGenerationLogits,
|
||||
promptEmbeddingTable, promptVocabSize, loraTaskId, loraWeights, loraConfig, lookaheadConfig,
|
||||
kvCacheRetentionConfig, returnLogProbs, returnContextLogits, returnGenerationLogits,
|
||||
draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
|
||||
: std::make_shared<VecTokens>(),
|
||||
draftLogits, excludeInputFromOutput, logitsPostProcessor, applyLogitsPostProcessorBatched,
|
||||
|
||||
@ -224,7 +224,9 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
|
||||
.def_property("compute_generation_logits",
|
||||
py::overload_cast<>(&tr::ModelConfig::computeGenerationLogits, py::const_),
|
||||
py::overload_cast<bool>(&tr::ModelConfig::computeGenerationLogits))
|
||||
.def_property("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant);
|
||||
.def_property("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant)
|
||||
.def_property(
|
||||
"use_cross_attention", &tr::ModelConfig::useCrossAttention, &tr::ModelConfig::setUseCrossAttention);
|
||||
|
||||
py::class_<tr::WorldConfig>(m, "WorldConfig")
|
||||
.def(py::init<SizeType32, SizeType32, SizeType32, SizeType32, std::optional<std::vector<SizeType32>> const&>(),
|
||||
|
||||
@ -114,7 +114,9 @@ void InitBindings(pybind11::module_& m)
|
||||
.def_readwrite("tokens_per_block", &tle::KvCacheStats::tokensPerBlock)
|
||||
.def_readwrite("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks)
|
||||
.def_readwrite("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks)
|
||||
.def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks);
|
||||
.def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks)
|
||||
.def_readwrite("missed_blocks", &tle::KvCacheStats::missedBlocks)
|
||||
.def_readwrite("cache_hit_rate", &tle::KvCacheStats::cacheHitRate);
|
||||
|
||||
py::class_<tle::StaticBatchingStats>(m, "StaticBatchingStats")
|
||||
.def(py::init<>())
|
||||
@ -184,6 +186,8 @@ void InitBindings(pybind11::module_& m)
|
||||
.def_readwrite("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest)
|
||||
.def_readwrite("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest)
|
||||
.def_readwrite("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest)
|
||||
.def_readwrite("missed_blocks_per_request", &tle::RequestStats::missedBlocksPerRequest)
|
||||
.def_readwrite("kv_cache_hit_rate_per_request", &tle::RequestStats::kvCacheHitRatePerRequest)
|
||||
.def("to_json_str",
|
||||
[](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); });
|
||||
|
||||
@ -307,6 +311,24 @@ void InitBindings(pybind11::module_& m)
|
||||
.def_property_readonly("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize)
|
||||
.def_property_readonly("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize);
|
||||
|
||||
auto kvCacheRetentionConfig
|
||||
= py::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig")
|
||||
.def(py::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionPriority>,
|
||||
tle::RetentionPriority>(),
|
||||
py::arg("token_range_retention_priorities"), py::arg("decode_retention_priority"))
|
||||
.def_property_readonly(
|
||||
"token_range_retention_priorities", &tle::KvCacheRetentionConfig::getTokenRangeRetentionPriorities)
|
||||
.def_property_readonly(
|
||||
"decode_retention_priority", &tle::KvCacheRetentionConfig::getDecodeRetentionPriority);
|
||||
|
||||
py::class_<tle::KvCacheRetentionConfig::TokenRangeRetentionPriority>(
|
||||
kvCacheRetentionConfig, "TokenRangeRetentionPriority")
|
||||
.def(py::init<SizeType32, std::optional<SizeType32>, tle::RetentionPriority>(), py::arg("token_start"),
|
||||
py::arg("token_end"), py::arg("priority"))
|
||||
.def_readwrite("token_start", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::tokenStart)
|
||||
.def_readwrite("token_end", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::tokenEnd)
|
||||
.def_readwrite("priority", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::priority);
|
||||
|
||||
py::class_<tle::ContextPhaseParams>(m, "ContextPhaseParams")
|
||||
.def(py::init<VecTokens, tle::ContextPhaseParams::RequestIdType>(), py::arg("first_gen_tokens"),
|
||||
py::arg("req_id"));
|
||||
@ -327,6 +349,7 @@ void InitBindings(pybind11::module_& m)
|
||||
std::optional<tle::ExternalDraftTokensConfig> externalDraftTokensConfig,
|
||||
std::optional<tle::PromptTuningConfig> pTuningConfig, std::optional<tle::LoraConfig> loraConfig,
|
||||
std::optional<tle::LookaheadDecodingConfig> lookaheadConfig,
|
||||
std::optional<tle::KvCacheRetentionConfig> kvCacheRetentionConfig,
|
||||
std::optional<std::string> logitsPostProcessorName,
|
||||
std::optional<tle::VecTokens> encoderInputTokenIds, std::optional<tle::IdType> clientId,
|
||||
bool returnAllGeneratedTokens, tle::PriorityType priority, tle::RequestType type,
|
||||
@ -343,12 +366,12 @@ void InitBindings(pybind11::module_& m)
|
||||
}
|
||||
}
|
||||
TLLM_CHECK_WITH_INFO(maxTokens.has_value(), "missing required argument max_tokens");
|
||||
|
||||
return std::make_unique<tle::Request>(inputTokenIds, maxTokens.value(), streaming, samplingConfig,
|
||||
outputConfig, endId, padId, positionIds, badWords, stopWords, embeddingBias,
|
||||
externalDraftTokensConfig, pTuningConfig, loraConfig, lookaheadConfig, logitsPostProcessorName,
|
||||
encoderInputTokenIds, clientId, returnAllGeneratedTokens, priority, type, contextPhaseParams,
|
||||
encoderInputFeatures, encoderOutputLength, crossAttentionMask, numReturnSequences);
|
||||
externalDraftTokensConfig, pTuningConfig, loraConfig, lookaheadConfig, kvCacheRetentionConfig,
|
||||
logitsPostProcessorName, encoderInputTokenIds, clientId, returnAllGeneratedTokens, priority,
|
||||
type, contextPhaseParams, encoderInputFeatures, encoderOutputLength, crossAttentionMask,
|
||||
numReturnSequences);
|
||||
}),
|
||||
py::arg("input_token_ids"), py::kw_only(), py::arg("max_tokens") = py::none(),
|
||||
py::arg("max_new_tokens") = py::none(), py::arg("streaming") = false,
|
||||
@ -358,9 +381,9 @@ void InitBindings(pybind11::module_& m)
|
||||
py::arg("stop_words") = py::none(), py::arg("embedding_bias") = py::none(),
|
||||
py::arg("external_draft_tokens_config") = py::none(), py::arg("prompt_tuning_config") = py::none(),
|
||||
py::arg("lora_config") = py::none(), py::arg("lookahead_config") = py::none(),
|
||||
py::arg("logits_post_processor_name") = py::none(), py::arg("encoder_input_token_ids") = py::none(),
|
||||
py::arg("client_id") = py::none(), py::arg("return_all_generated_tokens") = false,
|
||||
py::arg("priority") = tle::Request::kDefaultPriority,
|
||||
py::arg("kv_cache_retention_config") = py::none(), py::arg("logits_post_processor_name") = py::none(),
|
||||
py::arg("encoder_input_token_ids") = py::none(), py::arg("client_id") = py::none(),
|
||||
py::arg("return_all_generated_tokens") = false, py::arg("priority") = tle::Request::kDefaultPriority,
|
||||
py::arg_v("type", tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
|
||||
"RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION"),
|
||||
py::arg("context_phase_params") = py::none(), py::arg("encoder_input_features") = py::none(),
|
||||
@ -384,6 +407,8 @@ void InitBindings(pybind11::module_& m)
|
||||
"prompt_tuning_config", &tle::Request::getPromptTuningConfig, &tle::Request::setPromptTuningConfig)
|
||||
.def_property("lora_config", &tle::Request::getLoraConfig, &tle::Request::setLoraConfig)
|
||||
.def_property("lookahead_config", &tle::Request::getLookaheadConfig, &tle::Request::setLookaheadConfig)
|
||||
.def_property("kv_cache_retention_config", &tle::Request::getKvCacheRetentionConfig,
|
||||
&tle::Request::setKvCacheRetentionConfig)
|
||||
.def_property("logits_post_processor_name", &tle::Request::getLogitsPostProcessorName,
|
||||
&tle::Request::setLogitsPostProcessorName)
|
||||
.def_property(
|
||||
@ -417,7 +442,9 @@ void InitBindings(pybind11::module_& m)
|
||||
.def_readwrite("encoder_output", &tle::Result::encoderOutput)
|
||||
.def_readwrite("finish_reasons", &tle::Result::finishReasons)
|
||||
.def_readwrite("sequence_index", &tle::Result::sequenceIndex)
|
||||
.def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal);
|
||||
.def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal)
|
||||
.def_readwrite("decoding_iter", &tle::Result::decodingIter)
|
||||
.def_readwrite("context_phase_params", &tle::Result::contextPhaseParams);
|
||||
|
||||
py::class_<tle::Response>(m, "Response")
|
||||
.def(py::init<IdType, std::string, std::optional<IdType>>(), py::arg("request_id"), py::arg("error_msg"),
|
||||
@ -457,25 +484,28 @@ void InitBindings(pybind11::module_& m)
|
||||
{
|
||||
return py::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindowVec(),
|
||||
self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
|
||||
self.getOnboardBlocks());
|
||||
self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority());
|
||||
};
|
||||
auto kvCacheConfigSetstate = [](py::tuple state)
|
||||
{
|
||||
if (state.size() != 7)
|
||||
if (state.size() != 9)
|
||||
{
|
||||
throw std::runtime_error("Invalid state!");
|
||||
}
|
||||
return tle::KvCacheConfig(state[0].cast<bool>(), state[1].cast<std::optional<SizeType32>>(),
|
||||
state[2].cast<std::optional<std::vector<SizeType32>>>(), state[3].cast<std::optional<SizeType32>>(),
|
||||
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>());
|
||||
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>(),
|
||||
state[7].cast<std::optional<float>>(), state[8].cast<std::optional<tle::RetentionPriority>>());
|
||||
};
|
||||
py::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
|
||||
.def(py::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
|
||||
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool>(),
|
||||
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
|
||||
std::optional<float> const&, std::optional<tle::RetentionPriority>>(),
|
||||
py::arg("enable_block_reuse") = false, py::arg("max_tokens") = py::none(),
|
||||
py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(),
|
||||
py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(),
|
||||
py::arg("onboard_blocks") = true)
|
||||
py::arg("onboard_blocks") = true, py::arg("cross_kv_cache_fraction") = py::none(),
|
||||
py::arg("secondary_offload_min_priority") = py::none())
|
||||
.def_property(
|
||||
"enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
|
||||
.def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
|
||||
@ -487,6 +517,10 @@ void InitBindings(pybind11::module_& m)
|
||||
&tle::KvCacheConfig::setFreeGpuMemoryFraction)
|
||||
.def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
|
||||
.def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
|
||||
.def_property("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
|
||||
&tle::KvCacheConfig::setCrossKvCacheFraction)
|
||||
.def_property("secondary_offload_min_priority", &tle::KvCacheConfig::getSecondaryOffloadMinPriority,
|
||||
&tle::KvCacheConfig::setSecondaryOffloadMinPriority)
|
||||
.def(py::pickle(kvCacheConfigGetstate, kvCacheConfigSetstate));
|
||||
|
||||
py::class_<tle::OrchestratorConfig>(m, "OrchestratorConfig")
|
||||
|
||||
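For illustration only (not part of this commit): a minimal C++ sketch of how the extended KvCacheConfig constructor bound above could be called, mirroring the new argument order; the header path and the concrete fraction values are assumptions.

#include <optional>
#include "tensorrt_llm/executor/executor.h"

namespace tle = tensorrt_llm::executor;

// Enables block reuse and reserves a portion of the paged KV memory for cross
// attention via the newly added cross_kv_cache_fraction argument; the
// secondary offload minimum priority is left unset.
tle::KvCacheConfig makeKvCacheConfig()
{
    return tle::KvCacheConfig(/*enableBlockReuse=*/true,
        /*maxTokens=*/std::nullopt,
        /*maxAttentionWindowVec=*/std::nullopt,
        /*sinkTokenLength=*/std::nullopt,
        /*freeGpuMemoryFraction=*/0.9f,
        /*hostCacheSize=*/std::nullopt,
        /*onboardBlocks=*/true,
        /*crossKvCacheFraction=*/0.5f,
        /*secondaryOffloadMinPriority=*/std::nullopt);
}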
@ -21,6 +21,7 @@ set(SRCS
bufferManager.cpp
cudaMemPool.cpp
decodingLayerWorkspace.cpp
eagleBuffers.cpp
explicitDraftTokensBuffers.cpp
lookaheadBuffers.cpp
layerProfiler.cpp
351 cpp/tensorrt_llm/runtime/eagleBuffers.cpp Normal file
@ -0,0 +1,351 @@
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include "tensorrt_llm/runtime/eagleBuffers.h"
|
||||
|
||||
#include "tensorrt_llm/common/assert.h"
|
||||
#include "tensorrt_llm/common/cudaUtils.h"
|
||||
#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"
|
||||
#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include "tensorrt_llm/runtime/iBuffer.h"
|
||||
|
||||
namespace tksd = tensorrt_llm::kernels::speculative_decoding;
|
||||
|
||||
namespace tensorrt_llm::runtime
|
||||
{
|
||||
|
||||
void EagleBuffers::Inputs::create(SizeType32 maxNumSequences, TllmRuntime const& runtime,
|
||||
ModelConfig const& modelConfig, WorldConfig const& worldConfig)
|
||||
{
|
||||
auto const& manager = runtime.getBufferManager();
|
||||
|
||||
auto const& speculativeDecodingModule = modelConfig.getSpeculativeDecodingModule();
|
||||
auto const maxNumPaths = speculativeDecodingModule.getMaxNumPaths();
|
||||
auto const maxDraftPathLen = speculativeDecodingModule.getMaxDraftPathLen();
|
||||
auto const maxPathLen = speculativeDecodingModule.getMaxPathLen();
|
||||
auto const maxDecodingTokens = speculativeDecodingModule.getMaxDecodingTokens();
|
||||
auto const maxDecodingDraftTokens = speculativeDecodingModule.getMaxDecodingDraftTokens();
|
||||
|
||||
auto constexpr TRTTokenIdType = runtime::TRTDataType<runtime::TokenIdType>::value;
|
||||
|
||||
temperatures = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kFLOAT);
|
||||
randomDataSample = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kFLOAT);
|
||||
randomDataValidation
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxDraftPathLen}), nvinfer1::DataType::kFLOAT);
|
||||
draftTokens = manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingDraftTokens}), TRTTokenIdType);
|
||||
draftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
draftPaths
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxPathLen}), nvinfer1::DataType::kINT32);
|
||||
specDecodingGenerationLengths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
specDecodingPackedMasks
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
specDecodingPositionOffsets
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences * maxDecodingTokens}), nvinfer1::DataType::kINT32);
|
||||
|
||||
eagleNetCtxRequestTypesHost = manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
eagleNetCtxContextLengthsHost
|
||||
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
eagleNetCtxPastKeyValueLengthsHost
|
||||
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
eagleNetGenRequestTypesHost = manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
eagleNetGenContextLengthsHost
|
||||
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
eagleNetGenPastKeyValueLengthsHost
|
||||
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
}
|
||||
|
||||
EagleBuffers::EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager,
|
||||
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
|
||||
executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
TLLM_CHECK_WITH_INFO(maxBeamWidth == 1, "EAGLE does not support beam search");
|
||||
|
||||
auto const maxNumSequences = maxBatchSize;
|
||||
|
||||
auto const eagleModule = std::dynamic_pointer_cast<tensorrt_llm::runtime::EagleModule const>(
|
||||
modelConfig.getSpeculativeDecodingModulePtr());
|
||||
|
||||
auto const numPaths = eagleModule->getMaxNumPaths();
|
||||
auto const pathLen = eagleModule->getMaxPathLen();
|
||||
auto const maxDecodingDraftTokens = eagleModule->getMaxDecodingDraftTokens();
|
||||
|
||||
auto constexpr TRTTokenIdType = runtime::TRTDataType<runtime::TokenIdType>::value;
|
||||
|
||||
// input tensors
|
||||
engineInputs.temperatures = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
|
||||
|
||||
engineInputs.draftTokens
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingDraftTokens}), TRTTokenIdType);
|
||||
engineInputs.draftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
engineInputs.draftPaths
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), nvinfer1::DataType::kINT32);
|
||||
|
||||
engineInputs.specDecodingGenerationLengths
|
||||
= manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
|
||||
engineInputs.specDecodingPositionOffsets
|
||||
= manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
|
||||
engineInputs.specDecodingPackedMasks = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
|
||||
|
||||
engineInputs.randomDataSample = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
|
||||
engineInputs.randomDataValidation = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
|
||||
|
||||
engineInputs.eagleNetCtxRequestTypesHost
|
||||
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
|
||||
engineInputs.eagleNetCtxContextLengthsHost
|
||||
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
|
||||
engineInputs.eagleNetCtxPastKeyValueLengthsHost
|
||||
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
|
||||
engineInputs.eagleNetGenRequestTypesHost
|
||||
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
|
||||
engineInputs.eagleNetGenContextLengthsHost
|
||||
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
|
||||
engineInputs.eagleNetGenPastKeyValueLengthsHost
|
||||
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
|
||||
|
||||
// output tensors
|
||||
engineOutputs.nextDraftTokens
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), TRTTokenIdType);
|
||||
engineOutputs.nextDraftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
engineOutputs.nextDraftPaths
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), nvinfer1::DataType::kINT32);
|
||||
|
||||
engineOutputs.acceptedTokens
|
||||
= manager.gpu(ITensor::makeShape({maxNumSequences, pathLen}), nvinfer1::DataType::kINT32);
|
||||
engineOutputs.acceptedLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
engineOutputs.acceptedPaths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
|
||||
|
||||
// helper tensors
|
||||
auto const& stream = manager.getStream();
|
||||
scanTempStorageBytes
|
||||
= tksd::invokeScanGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get());
|
||||
reduceTempStorageBytes
|
||||
= tksd::invokeReduceMaxGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get());
|
||||
scanReduceTempStorage = manager.gpu(std::max(reduceTempStorageBytes, scanTempStorageBytes));
|
||||
cumSumGenerationLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
|
||||
maxGenerationLength = manager.gpu(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
|
||||
|
||||
// pre-allocate empty tensors
|
||||
reshape(0, maxNumSequences, modelConfig);
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void EagleBuffers::reshape(
|
||||
SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
auto const numSequences = numCtxSequences + numGenSequences;
|
||||
|
||||
auto const eagleModule = std::dynamic_pointer_cast<tensorrt_llm::runtime::EagleModule const>(
|
||||
modelConfig.getSpeculativeDecodingModulePtr());
|
||||
|
||||
auto const maxDecodingTokens = eagleModule->getMaxDecodingTokens();
|
||||
|
||||
// input tensors
|
||||
engineInputs.temperatures->reshape(ITensor::makeShape({numSequences}));
|
||||
|
||||
auto draftTokensShape = engineInputs.draftTokens->getShape();
|
||||
draftTokensShape.d[0] = numSequences;
|
||||
engineInputs.draftTokens->reshape(draftTokensShape);
|
||||
auto draftLensShape = engineInputs.draftLens->getShape();
|
||||
draftLensShape.d[0] = numSequences;
|
||||
engineInputs.draftLens->reshape(draftLensShape);
|
||||
auto draftPathsShape = engineInputs.draftPaths->getShape();
|
||||
draftPathsShape.d[0] = numSequences;
|
||||
engineInputs.draftPaths->reshape(draftPathsShape);
|
||||
|
||||
engineInputs.specDecodingGenerationLengths->reshape(ITensor::makeShape({numGenSequences}));
|
||||
engineInputs.specDecodingPositionOffsets->reshape(ITensor::makeShape({numGenSequences, maxDecodingTokens}));
|
||||
engineInputs.specDecodingPackedMasks->reshape(
|
||||
ITensor::makeShape({numGenSequences * maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}));
|
||||
|
||||
engineInputs.randomDataSample->reshape(ITensor::makeShape({numSequences}));
|
||||
engineInputs.randomDataValidation->reshape(ITensor::makeShape({numSequences}));
|
||||
|
||||
engineInputs.eagleNetCtxRequestTypesHost->reshape(ITensor::makeShape({numSequences}));
|
||||
engineInputs.eagleNetCtxContextLengthsHost->reshape(ITensor::makeShape({numSequences}));
|
||||
engineInputs.eagleNetCtxPastKeyValueLengthsHost->reshape(ITensor::makeShape({numSequences}));
|
||||
engineInputs.eagleNetGenRequestTypesHost->reshape(ITensor::makeShape({numSequences}));
|
||||
engineInputs.eagleNetGenContextLengthsHost->reshape(ITensor::makeShape({numSequences}));
|
||||
engineInputs.eagleNetGenPastKeyValueLengthsHost->reshape(ITensor::makeShape({numSequences}));
|
||||
|
||||
cumSumGenerationLengths->reshape(ITensor::makeShape({numSequences}));
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void EagleBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded,
|
||||
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
|
||||
runtime::EagleModule const& eagleModule, runtime::CudaStream const& stream) const
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
using runtime::bufferCast;
|
||||
|
||||
tksd::PackEagleParams params;
|
||||
params.batchSize = numCtxSequences + numGenSequences;
|
||||
params.maxNumPaths = eagleModule.getMaxNumPaths();
|
||||
params.maxDecodingTokens = eagleModule.getMaxDecodingTokens();
|
||||
params.maxPathLength = eagleModule.getMaxPathLen();
|
||||
params.numContextRequests = numCtxSequences;
|
||||
params.numGenerationRequests = numGenSequences;
|
||||
|
||||
params.batchSlots = bufferCast<SizeType32>(seqSlots);
|
||||
|
||||
// Outputs from decoder -- inputs to the packing kernel
|
||||
params.inputTemperatures = bufferCast<float>(*draftBuffers.temperatures);
|
||||
params.inputRandomDataSample = bufferCast<float>(*draftBuffers.randomDataSample);
|
||||
params.inputRandomDataValidation = bufferCast<float>(*draftBuffers.randomDataValidation);
|
||||
|
||||
params.inputNextDraftTokens = bufferCast<runtime::TokenIdType>(*draftBuffers.draftTokens);
|
||||
params.inputNextDraftLens = bufferCast<SizeType32>(*draftBuffers.draftLens);
|
||||
params.inputNextDraftPaths = bufferCast<SizeType32>(*draftBuffers.draftPaths);
|
||||
|
||||
params.inputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*draftBuffers.specDecodingGenerationLengths);
|
||||
params.inputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*draftBuffers.specDecodingPositionOffsets);
|
||||
params.inputSpecDecodingPackedMasks = bufferCast<int32_t>(*draftBuffers.specDecodingPackedMasks);
|
||||
|
||||
// Outputs of the packing kernel -- inputs to the engine
|
||||
params.outputTemperatures = bufferCast<float>(*engineInputs.temperatures);
|
||||
params.outputRandomDataSample = bufferCast<float>(*engineInputs.randomDataSample);
|
||||
params.outputRandomDataValidation = bufferCast<float>(*engineInputs.randomDataValidation);
|
||||
|
||||
params.outputNextDraftTokens = bufferCast<runtime::TokenIdType>(*engineInputs.draftTokens);
|
||||
params.outputNextDraftLens = bufferCast<SizeType32>(*engineInputs.draftLens);
|
||||
params.outputNextDraftPaths = bufferCast<SizeType32>(*engineInputs.draftPaths);
|
||||
|
||||
params.outputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*engineInputs.specDecodingGenerationLengths);
|
||||
params.outputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*engineInputs.specDecodingPositionOffsets);
|
||||
params.outputSpecDecodingPackedMasks = bufferCast<int32_t>(*engineInputs.specDecodingPackedMasks);
|
||||
|
||||
params.maxGenerationLength = bufferCast<SizeType32>(*maxGenerationLength);
|
||||
params.cumSumGenerationLengths = bufferCast<SizeType32>(*cumSumGenerationLengths);
|
||||
|
||||
params.checkParams();
|
||||
|
||||
// Pack tensors from batch slot position to continuous array
|
||||
tksd::invokePackEagleGenerationLengths(params, stream.get());
|
||||
|
||||
if (numGenSequences)
|
||||
{
|
||||
// Compute inclusive sum and max
|
||||
tksd::invokeScanReduceGenerationLengths(numGenSequences,
|
||||
bufferCast<SizeType32>(*engineInputs.specDecodingGenerationLengths),
|
||||
bufferCast<uint8_t>(*scanReduceTempStorage), scanTempStorageBytes,
|
||||
bufferCast<SizeType32>(*cumSumGenerationLengths), bufferCast<uint8_t>(*scanReduceTempStorage),
|
||||
reduceTempStorageBytes, bufferCast<SizeType32>(*maxGenerationLength), stream.get());
|
||||
}
|
||||
|
||||
// Pack tensors from batch slot position to continuous array
|
||||
tksd::invokePackEagle(params, stream.get());
|
||||
|
||||
// Pack host data.
|
||||
for (SizeType32 bi = 0; bi < params.batchSize; ++bi)
|
||||
{
|
||||
auto const batchSlot = params.batchSlots[bi];
|
||||
bufferCast<SizeType32>(*engineInputs.eagleNetCtxRequestTypesHost)[bi]
|
||||
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxRequestTypesHost)[batchSlot];
|
||||
bufferCast<SizeType32>(*engineInputs.eagleNetCtxContextLengthsHost)[bi]
|
||||
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxContextLengthsHost)[batchSlot];
|
||||
bufferCast<SizeType32>(*engineInputs.eagleNetCtxPastKeyValueLengthsHost)[bi]
|
||||
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxPastKeyValueLengthsHost)[batchSlot];
|
||||
bufferCast<SizeType32>(*engineInputs.eagleNetGenRequestTypesHost)[bi]
|
||||
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenRequestTypesHost)[batchSlot];
|
||||
bufferCast<SizeType32>(*engineInputs.eagleNetGenContextLengthsHost)[bi]
|
||||
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenContextLengthsHost)[batchSlot];
|
||||
bufferCast<SizeType32>(*engineInputs.eagleNetGenPastKeyValueLengthsHost)[bi]
|
||||
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenPastKeyValueLengthsHost)[batchSlot];
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void EagleBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, ITensor const& requestTypes,
|
||||
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
|
||||
runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
|
||||
runtime::WorldConfig const& worldConfig) const
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
auto const& stream = runtime.getStream();
|
||||
|
||||
auto const eagleModule
|
||||
= std::dynamic_pointer_cast<runtime::EagleModule const>(modelConfig.getSpeculativeDecodingModulePtr());
|
||||
|
||||
auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
|
||||
|
||||
auto const dtype = modelConfig.getDataType();
|
||||
|
||||
switch (dtype)
|
||||
{
|
||||
case nvinfer1::DataType::kFLOAT:
|
||||
setFromInputs<float>(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers,
|
||||
contextPositionIds, *eagleModule, stream);
|
||||
break;
|
||||
case nvinfer1::DataType::kHALF:
|
||||
setFromInputs<half>(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers,
|
||||
contextPositionIds, *eagleModule, stream);
|
||||
break;
|
||||
default: TLLM_THROW("DataType %d not supported in EagleBuffers", static_cast<SizeType32>(dtype)); break;
|
||||
}
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void EagleBuffers::insertInputTensors(
|
||||
TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& /* worldConfig */) const
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
// inputs
|
||||
inputBuffers.insert_or_assign("eagle_temperature", engineInputs.temperatures);
|
||||
|
||||
inputBuffers.insert_or_assign("spec_decoding_generation_lengths", engineInputs.specDecodingGenerationLengths);
|
||||
inputBuffers.insert_or_assign("spec_decoding_position_offsets", engineInputs.specDecodingPositionOffsets);
|
||||
inputBuffers.insert_or_assign("spec_decoding_packed_mask", engineInputs.specDecodingPackedMasks);
|
||||
|
||||
inputBuffers.insert_or_assign("rand_data_sample", engineInputs.randomDataSample);
|
||||
inputBuffers.insert_or_assign("rand_data_validation", engineInputs.randomDataValidation);
|
||||
|
||||
inputBuffers.insert_or_assign("draft_tokens", engineInputs.draftTokens);
|
||||
inputBuffers.insert_or_assign("draft_lens", engineInputs.draftLens);
|
||||
inputBuffers.insert_or_assign("draft_paths", engineInputs.draftPaths);
|
||||
|
||||
inputBuffers.insert_or_assign("host_ctx_eagle_net_request_types", engineInputs.eagleNetCtxRequestTypesHost);
|
||||
inputBuffers.insert_or_assign("host_ctx_eagle_net_context_lengths", engineInputs.eagleNetCtxContextLengthsHost);
|
||||
inputBuffers.insert_or_assign(
|
||||
"host_ctx_eagle_net_past_key_value_lengths", engineInputs.eagleNetCtxPastKeyValueLengthsHost);
|
||||
inputBuffers.insert_or_assign("host_gen_eagle_net_request_types", engineInputs.eagleNetGenRequestTypesHost);
|
||||
inputBuffers.insert_or_assign("host_gen_eagle_net_context_lengths", engineInputs.eagleNetGenContextLengthsHost);
|
||||
inputBuffers.insert_or_assign(
|
||||
"host_gen_eagle_net_past_key_value_lengths", engineInputs.eagleNetGenPastKeyValueLengthsHost);
|
||||
|
||||
// outputs
|
||||
outputBuffers.insert_or_assign("next_draft_tokens", engineOutputs.nextDraftTokens);
|
||||
outputBuffers.insert_or_assign("next_draft_lens", engineOutputs.nextDraftLens);
|
||||
|
||||
outputBuffers.insert_or_assign("accepted_tokens", engineOutputs.acceptedTokens);
|
||||
outputBuffers.insert_or_assign("num_accepted_tokens", engineOutputs.acceptedLens);
|
||||
outputBuffers.insert_or_assign("accepted_paths", engineOutputs.acceptedPaths);
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
} // namespace tensorrt_llm::runtime
|
||||
39 cpp/tensorrt_llm/runtime/eagleModule.h Normal file
@ -0,0 +1,39 @@
/*
 * Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#pragma once

#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"

namespace tensorrt_llm::runtime
{

class EagleModule : public SpeculativeDecodingModule
{
public:
    explicit EagleModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
        : SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxNumPaths)
    {
        TLLM_CHECK(maxNumPaths * maxDraftPathLen == maxDecodingDraftTokens);
    }

    explicit EagleModule() noexcept
        : EagleModule(0, 0, 0)
    {
    }
};
} // namespace tensorrt_llm::runtime
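Illustrative usage sketch (not part of the diff): constructing an EagleModule that satisfies the TLLM_CHECK invariant above, maxNumPaths * maxDraftPathLen == maxDecodingDraftTokens; the concrete sizes are assumptions.

#include "tensorrt_llm/runtime/eagleModule.h"

using tensorrt_llm::runtime::EagleModule;

// 8 paths of up to 4 draft tokens each -> 32 decoding draft tokens in total,
// so the constructor's consistency check passes.
EagleModule const eagleModule(/*maxDraftPathLen=*/4, /*maxDecodingDraftTokens=*/32, /*maxNumPaths=*/8);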
@ -233,6 +233,16 @@ void GptDecoderBatched::setupLookahead(LookaheadDecodingBuffers lookaheadDecodin
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void GptDecoderBatched::setupEagle(EagleBuffers::Inputs eagleBuffers)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
TLLM_CHECK(mSpeculativeDecodingMode.isEagle());
|
||||
mJointDecodingOutput->eagleBuffers = std::move(eagleBuffers);
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
|
||||
SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength,
|
||||
SizeType32 maxTokensPerEngineStep, nvinfer1::DataType dtype, ModelConfig const& modelConfig)
|
||||
@ -628,6 +638,10 @@ void GptDecoderBatched::newRequestSpeculativeDecoding(
|
||||
{
|
||||
newRequestExplicitDraftTokens(batchIdx, request);
|
||||
}
|
||||
else if (mSpeculativeDecodingMode.isEagle())
|
||||
{
|
||||
newRequestEagle(batchIdx, request);
|
||||
}
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
@ -739,6 +753,17 @@ void GptDecoderBatched::newRequestExplicitDraftTokens(SizeType32 batchIdx, decod
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void GptDecoderBatched::newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const& request)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
TLLM_CHECK(mJointDecodingOutput->eagleBuffers);
|
||||
|
||||
// TODO fill me
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void GptDecoderBatched::setExplicitDraftTokensInputs(decoder_batch::Input const& input)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
@ -767,6 +792,15 @@ void GptDecoderBatched::setExplicitDraftTokensInputs(decoder_batch::Input const&
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void GptDecoderBatched::setEagleInputs(decoder_batch::Input const& input)
|
||||
{
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
// TODO fill me
|
||||
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
void GptDecoderBatched::newRequests(std::vector<SizeType32> const& seqSlots,
|
||||
std::vector<decoder_batch::Request> const& requests, std::vector<SamplingConfig> const& samplingConfigs)
|
||||
{
|
||||
@ -854,6 +888,10 @@ void GptDecoderBatched::forwardDecoder(
|
||||
{
|
||||
setExplicitDraftTokensInputs(input);
|
||||
}
|
||||
else if (mSpeculativeDecodingMode.isEagle())
|
||||
{
|
||||
setEagleInputs(input);
|
||||
}
|
||||
|
||||
bool const async = forwardType == ForwardType::kASYNC;
|
||||
|
||||
|
||||
@ -160,6 +160,8 @@ ModelConfig createModelConfig(
auto numKvHeadsPerAttentionLayer
= parseJsonFieldOr<std::vector<SizeType32>>(config, "num_kv_heads_per_layer", std::vector<SizeType32>());

auto numKvHeadsPerCrossAttentionLayer = parseJsonFieldOr<std::vector<SizeType32>>(
config, "num_kv_heads_per_cross_attn_layer", std::vector<SizeType32>());
auto modelConfig
= ModelConfig{vocabSize, numLayers, numAttentionLayers, numRnnLayers, numHeads, hiddenSize, dataType};

@ -167,7 +169,8 @@ ModelConfig createModelConfig(
{
std::transform(numKvHeadsPerAttentionLayer.cbegin(), numKvHeadsPerAttentionLayer.cend(),
numKvHeadsPerAttentionLayer.begin(),
[tensorParallelism](SizeType32 const numKvHeads) { return std::max(numKvHeads / tensorParallelism, 1); });
[tensorParallelism](SizeType32 const numKvHeads)
{ return ((numKvHeads + tensorParallelism - 1) / tensorParallelism); });
modelConfig.setNumKvHeadsPerLayer(numKvHeadsPerAttentionLayer);
}
else
@ -175,6 +178,19 @@ ModelConfig createModelConfig(
modelConfig.setNbKvHeads(numKvHeads);
}

if (!numKvHeadsPerCrossAttentionLayer.empty())
{
std::transform(numKvHeadsPerCrossAttentionLayer.cbegin(), numKvHeadsPerCrossAttentionLayer.cend(),
numKvHeadsPerCrossAttentionLayer.begin(),
[tensorParallelism](SizeType32 const numKvHeads)
{ return ((numKvHeads + tensorParallelism - 1) / tensorParallelism); });
modelConfig.setNumKvHeadsPerCrossLayer(numKvHeadsPerCrossAttentionLayer);
}
else
{
modelConfig.setNbCrossKvHeads(numKvHeads);
}

modelConfig.setSizePerHead(sizePerHead);
modelConfig.setLayerTypes(layerTypes);
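A small self-contained sketch (not from the commit) of the revised KV-head partitioning shown above: per-layer head counts are now split across tensor-parallel ranks with a ceiling division instead of max(numKvHeads / tensorParallelism, 1). The helper name splitKvHeadsAcrossRanks is made up for illustration.

#include <algorithm>
#include <cstdint>
#include <vector>

using SizeType32 = std::int32_t;

// Divide each layer's KV-head count across the tensor-parallel ranks,
// rounding up so a layer with more heads than one rank's even share still
// allocates the extra head somewhere.
std::vector<SizeType32> splitKvHeadsAcrossRanks(
    std::vector<SizeType32> numKvHeadsPerLayer, SizeType32 tensorParallelism)
{
    std::transform(numKvHeadsPerLayer.cbegin(), numKvHeadsPerLayer.cend(), numKvHeadsPerLayer.begin(),
        [tensorParallelism](SizeType32 const numKvHeads)
        { return (numKvHeads + tensorParallelism - 1) / tensorParallelism; });
    return numKvHeadsPerLayer;
}

// Example: 3 KV heads on 2 ranks now maps to 2 heads per rank, where the old
// max(numKvHeads / tensorParallelism, 1) formula gave 1.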
@ -60,7 +60,7 @@ th::Tensor gatherTree( // BS: batch_size, BM:
{
int32_t* final_output_ids_ptr = get_ptr<int32_t>(final_output_ids);
tk::invokeInitializeOutput(
final_output_ids_ptr, get_ptr<int32_t>(end_ids), batch_size * beam_width, max_seq_len, stream);
final_output_ids_ptr, get_ptr<int32_t>(end_ids), batch_size, beam_width, max_seq_len, stream);

tk::BeamHypotheses bh;
bh.nBatchSize = batch_size;
@ -130,6 +130,7 @@ if(NOT ENABLE_MULTI_DEVICE EQUAL 0)
add_gtest(allReduceKernelTest kernels/allReduce/allReduceKernelTest.cu)
endif()
add_gtest(decodingKernelsTest kernels/decodingKernelTest.cpp)
add_gtest(eaglePackDataTest kernels/eaglePackDataTest.cpp)
add_gtest(banRepeatNGramsKernelsTest kernels/banRepeatNGramsKernelsTest.cpp)
add_gtest(stopCriteriaKernelsTest kernels/stopCriteriaKernelsTest.cpp)
add_gtest(shiftKCacheKernelTest kernels/shiftKCacheKernelTest.cu)
@ -426,6 +426,10 @@ public:
|
||||
TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)};
|
||||
mBufferManager->copy(len.data(),*inputLengths);
|
||||
|
||||
std::vector<SizeType32> eid = {0};
|
||||
TensorPtr endIds{ITensor::slice(constPointerCast(decodingInput->endIds), 0, 1)};
|
||||
mBufferManager->copy(eid.data(),*endIds);
|
||||
|
||||
std::vector<std::vector<float>> logProbs =
|
||||
{
|
||||
{-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -2.41985},
|
||||
@ -555,6 +559,10 @@ public:
|
||||
TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)};
|
||||
mBufferManager->copy(len.data(),*inputLengths);
|
||||
|
||||
std::vector<SizeType32> eid = {0};
|
||||
TensorPtr endIds{ITensor::slice(constPointerCast(decodingInput->endIds), 0, 1)};
|
||||
mBufferManager->copy(eid.data(),*endIds);
|
||||
|
||||
std::vector<std::vector<float> >logProbs =
|
||||
{
|
||||
{-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524},
|
||||
|
||||
521 cpp/tests/kernels/eaglePackDataTest.cpp Normal file
@ -0,0 +1,521 @@
|
||||
/*
|
||||
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <gtest/gtest.h>
|
||||
|
||||
#include "tensorrt_llm/common/memoryUtils.h"
|
||||
#include "tensorrt_llm/kernels/decodingCommon.h"
|
||||
#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"
|
||||
#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h"
|
||||
#include "tensorrt_llm/runtime/bufferManager.h"
|
||||
#include "tensorrt_llm/runtime/common.h"
|
||||
#include "tensorrt_llm/runtime/iBuffer.h"
|
||||
#include "tensorrt_llm/runtime/iTensor.h"
|
||||
#include "tensorrt_llm/runtime/runtimeKernels.h"
|
||||
#include "tensorrt_llm/runtime/tllmLogger.h"
|
||||
|
||||
#include <NvInferRuntimeBase.h>
|
||||
|
||||
#include <algorithm>
|
||||
#include <cstdint>
|
||||
#include <random>
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
using namespace tensorrt_llm::runtime;
|
||||
using namespace tensorrt_llm::common;
|
||||
|
||||
namespace tk = tensorrt_llm::kernels;
|
||||
namespace trk = tensorrt_llm::runtime::kernels;
|
||||
namespace tksd = tensorrt_llm::kernels::speculative_decoding;
|
||||
|
||||
class SamplingParams
|
||||
{
|
||||
public:
|
||||
SamplingParams() {}
|
||||
|
||||
inline void setNumCtxRequests(SizeType32 numCtxRequests)
|
||||
{
|
||||
mNumCtxRequests = numCtxRequests;
|
||||
}
|
||||
|
||||
inline void setNumGenRequests(SizeType32 numGenRequests)
|
||||
{
|
||||
mNumGenRequests = numGenRequests;
|
||||
}
|
||||
|
||||
inline void setMaxPathLen(SizeType32 maxPathLen)
|
||||
{
|
||||
mMaxPathLen = maxPathLen;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getNumCtxRequests() const
|
||||
{
|
||||
return mNumCtxRequests;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getNumGenRequests() const
|
||||
{
|
||||
return mNumGenRequests;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getBatchSize() const
|
||||
{
|
||||
return getNumCtxRequests() + getNumGenRequests();
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getVocabSize() const
|
||||
{
|
||||
return mVocabSize;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getMaxBatchSize() const
|
||||
{
|
||||
return 2 * getBatchSize();
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getMaxPathLen() const
|
||||
{
|
||||
return mMaxPathLen;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getMaxDecodingTokens() const
|
||||
{
|
||||
return mMaxDecodingTokens;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getMaxDecodingDraftTokens() const
|
||||
{
|
||||
return getMaxDecodingTokens() - 1;
|
||||
}
|
||||
|
||||
[[nodiscard]] inline SizeType32 getMaxSeqLen() const
|
||||
{
|
||||
return getMaxDecodingTokens() * 2;
|
||||
}
|
||||
|
||||
private:
|
||||
SizeType32 mNumCtxRequests{6};
|
||||
SizeType32 mNumGenRequests{6};
|
||||
SizeType32 mMaxPathLen{4};
|
||||
SizeType32 mMaxDecodingTokens{32};
|
||||
SizeType32 mVocabSize{256};
|
||||
};
|
||||
|
||||
class EaglePackDataTest : public ::testing::Test
|
||||
{
|
||||
public:
|
||||
using BufferPtr = IBuffer::SharedPtr;
|
||||
using TensorPtr = ITensor::SharedPtr;
|
||||
|
||||
void SetUp() override
|
||||
{
|
||||
mStream = std::make_shared<CudaStream>();
|
||||
mBufferManager = std::make_shared<BufferManager>(mStream);
|
||||
}
|
||||
|
||||
void allocateBuffers()
|
||||
{
|
||||
// inputs
|
||||
mBatchSlots = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mInputTemperatures = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
|
||||
|
||||
mInputRandomDataSample = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
|
||||
|
||||
mInputRandomDataValidation = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
|
||||
|
||||
mInputNextDraftTokens = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingDraftTokens()}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
mInputNextDraftLens = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mInputNextDraftPaths
|
||||
= BufferManager::pinnedPool(ITensor::makeShape({mSamplingParams.getMaxBatchSize(),
|
||||
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen()}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
mInputSpecDecodingGenerationLengths = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mInputSpecDecodingPositionOffsets = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingTokens()}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
auto const numPackedMasks
|
||||
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
|
||||
mInputSpecDecodingPackedMasks = BufferManager::pinnedPool(
|
||||
ITensor::makeShape(
|
||||
{mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingTokens(), numPackedMasks}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
// outputs
|
||||
mOutputTemperatures = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
|
||||
|
||||
mOutputRandomDataSample = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
|
||||
|
||||
mOutputRandomDataValidation = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
|
||||
|
||||
mOutputNextDraftTokens = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingDraftTokens()}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
mOutputNextDraftLens = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mOutputNextDraftPaths
|
||||
= BufferManager::pinnedPool(ITensor::makeShape({mSamplingParams.getBatchSize(),
|
||||
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen()}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
mOutputSpecDecodingGenerationLengths = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mOutputSpecDecodingPositionOffsets = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens()}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
mOutputSpecDecodingPackedMasks = BufferManager::pinnedPool(
|
||||
ITensor::makeShape(
|
||||
{mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens(), numPackedMasks}),
|
||||
nvinfer1::DataType::kINT32);
|
||||
|
||||
// workspace
|
||||
mMaxGenerationLength = BufferManager::pinnedPool(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mCumSumGenerationLengths = BufferManager::pinnedPool(
|
||||
ITensor::makeShape({mSamplingParams.getBatchSize() + 1}), nvinfer1::DataType::kINT32);
|
||||
|
||||
mScanTempStorageBytes = tksd::invokeScanGenerationLengths(
|
||||
nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get());
|
||||
mReduceTempStorageBytes = tksd::invokeReduceMaxGenerationLengths(
|
||||
nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get());
|
||||
mScanReduceTempStorage = mBufferManager->gpu(std::max(mReduceTempStorageBytes, mScanTempStorageBytes));
|
||||
}
|
||||
|
||||
void initBuffers()
|
||||
{
|
||||
trk::invokeFill(*mOutputTemperatures, float{0}, *mStream);
|
||||
trk::invokeFill(*mOutputRandomDataSample, float{0}, *mStream);
|
||||
trk::invokeFill(*mOutputRandomDataValidation, float{0}, *mStream);
|
||||
trk::invokeFill(*mOutputNextDraftTokens, TokenIdType{-1}, *mStream);
|
||||
trk::invokeFill(*mOutputNextDraftLens, SizeType32{0}, *mStream);
|
||||
trk::invokeFill(*mOutputNextDraftPaths, SizeType32{0}, *mStream);
|
||||
trk::invokeFill(*mOutputSpecDecodingGenerationLengths, SizeType32{0}, *mStream);
|
||||
trk::invokeFill(*mOutputSpecDecodingPositionOffsets, SizeType32{0}, *mStream);
|
||||
trk::invokeFill(*mOutputSpecDecodingPackedMasks, SizeType32{0}, *mStream);
|
||||
|
||||
auto batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
|
||||
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
|
||||
{
|
||||
batchSlotsPtr[bi] = 2 * bi;
|
||||
}
|
||||
|
||||
std::mt19937 gen(42);
|
||||
std::uniform_real_distribution<float> distr(0.0, 1.0);
|
||||
std::uniform_int_distribution<SizeType32> intDistr(0, 1000);
|
||||
std::uniform_int_distribution<SizeType32> lenDistr(0, 32);
|
||||
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
|
||||
{
|
||||
bufferCast<float>(*mInputTemperatures)[batchSlotsPtr[bi]] = distr(gen);
|
||||
bufferCast<float>(*mInputRandomDataSample)[batchSlotsPtr[bi]] = distr(gen);
|
||||
bufferCast<float>(*mInputRandomDataValidation)[batchSlotsPtr[bi]] = distr(gen);
|
||||
}
|
||||
|
||||
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
|
||||
{
|
||||
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingDraftTokens(); ++ti)
|
||||
{
|
||||
bufferCast<SizeType32>(*mInputNextDraftTokens)[flat_index2(
|
||||
batchSlotsPtr[bi], ti, mSamplingParams.getMaxDecodingDraftTokens())]
|
||||
= intDistr(gen);
|
||||
}
|
||||
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingTokens(); ++ti)
|
||||
{
|
||||
for (SizeType32 pi = 0; pi < mSamplingParams.getMaxPathLen(); ++pi)
|
||||
{
|
||||
bufferCast<SizeType32>(*mInputNextDraftPaths)[flat_index3(batchSlotsPtr[bi], ti, pi,
|
||||
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())]
|
||||
= intDistr(gen);
|
||||
}
|
||||
auto const numPackedMasks
|
||||
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
|
||||
for (SizeType32 mi = 0; mi < numPackedMasks; ++mi)
|
||||
{
|
||||
bufferCast<SizeType32>(*mInputSpecDecodingPackedMasks)[flat_index3(
|
||||
batchSlotsPtr[bi], ti, mi, mSamplingParams.getMaxDecodingTokens(), numPackedMasks)]
|
||||
= intDistr(gen);
|
||||
}
|
||||
bufferCast<SizeType32>(*mInputSpecDecodingPositionOffsets)[flat_index2(
|
||||
batchSlotsPtr[bi], ti, mSamplingParams.getMaxDecodingTokens())]
|
||||
= intDistr(gen);
|
||||
}
|
||||
bufferCast<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]] = lenDistr(gen);
|
||||
bufferCast<SizeType32>(*mInputSpecDecodingGenerationLengths)[batchSlotsPtr[bi]]
|
||||
= bufferCast<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]];
|
||||
}
|
||||
}
|
||||
|
||||
void callPackData()
|
||||
{
|
||||
tksd::PackEagleParams params;
|
||||
params.batchSize = mSamplingParams.getBatchSize();
|
||||
params.maxNumPaths = mSamplingParams.getMaxDecodingTokens();
|
||||
params.maxDecodingTokens = mSamplingParams.getMaxDecodingTokens();
|
||||
params.maxPathLength = mSamplingParams.getMaxPathLen();
|
||||
params.numContextRequests = mSamplingParams.getNumCtxRequests();
|
||||
params.numGenerationRequests = mSamplingParams.getNumGenRequests();
|
||||
|
||||
params.batchSlots = bufferCast<SizeType32>(*mBatchSlots);
|
||||
|
||||
// Outputs from decoder -- inputs to the packing kernel
|
||||
params.inputTemperatures = bufferCast<float>(*mInputTemperatures);
|
||||
params.inputRandomDataSample = bufferCast<float>(*mInputRandomDataSample);
|
||||
params.inputRandomDataValidation = bufferCast<float>(*mInputRandomDataValidation);
|
||||
|
||||
params.inputNextDraftTokens = bufferCast<TokenIdType>(*mInputNextDraftTokens);
|
||||
params.inputNextDraftLens = bufferCast<SizeType32>(*mInputNextDraftLens);
|
||||
params.inputNextDraftPaths = bufferCast<SizeType32>(*mInputNextDraftPaths);
|
||||
|
||||
params.inputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*mInputSpecDecodingGenerationLengths);
|
||||
params.inputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*mInputSpecDecodingPositionOffsets);
|
||||
params.inputSpecDecodingPackedMasks = bufferCast<int32_t>(*mInputSpecDecodingPackedMasks);
|
||||
|
||||
// Outputs of the packing kernel -- inputs to the engine
|
||||
params.outputTemperatures = bufferCast<float>(*mOutputTemperatures);
|
||||
params.outputRandomDataSample = bufferCast<float>(*mOutputRandomDataSample);
|
||||
params.outputRandomDataValidation = bufferCast<float>(*mOutputRandomDataValidation);
|
||||
|
||||
params.outputNextDraftTokens = bufferCast<TokenIdType>(*mOutputNextDraftTokens);
|
||||
params.outputNextDraftLens = bufferCast<SizeType32>(*mOutputNextDraftLens);
|
||||
params.outputNextDraftPaths = bufferCast<SizeType32>(*mOutputNextDraftPaths);
|
||||
|
||||
params.outputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*mOutputSpecDecodingGenerationLengths);
|
||||
params.outputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*mOutputSpecDecodingPositionOffsets);
|
||||
params.outputSpecDecodingPackedMasks = bufferCast<int32_t>(*mOutputSpecDecodingPackedMasks);
|
||||
|
||||
params.maxGenerationLength = bufferCast<SizeType32>(*mMaxGenerationLength);
|
||||
params.cumSumGenerationLengths = bufferCast<SizeType32>(*mCumSumGenerationLengths);
|
||||
|
||||
params.checkParams();
|
||||
|
||||
if (mSamplingParams.getNumGenRequests())
|
||||
{
|
||||
// Pack tensors from batch slot position to continuous array
|
||||
tksd::invokePackEagleGenerationLengths(params, mStream->get());
|
||||
|
||||
sync_check_cuda_error();
|
||||
|
||||
// Compute inclusive sum and max
|
||||
tksd::invokeScanReduceGenerationLengths(mSamplingParams.getNumGenRequests(),
|
||||
bufferCast<SizeType32>(*mOutputSpecDecodingGenerationLengths),
|
||||
bufferCast<uint8_t>(*mScanReduceTempStorage), mScanTempStorageBytes,
|
||||
bufferCast<SizeType32>(*mCumSumGenerationLengths), bufferCast<uint8_t>(*mScanReduceTempStorage),
|
||||
mReduceTempStorageBytes, bufferCast<SizeType32>(*mMaxGenerationLength), mStream->get());
|
||||
|
||||
sync_check_cuda_error();
|
||||
}
|
||||
|
||||
mStream->synchronize();
|
||||
|
||||
// Pack tensors from batch slot position to continuous array
|
||||
tksd::invokePackEagle(params, mStream->get());
|
||||
|
||||
sync_check_cuda_error();
|
||||
}
|
||||
|
||||
void verifyResults()
|
||||
{
|
||||
auto batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
|
||||
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
|
||||
{
|
||||
EXPECT_EQ(BufferRange<float>(*mInputTemperatures)[batchSlotsPtr[bi]],
|
||||
BufferRange<float>(*mOutputTemperatures)[bi]);
|
||||
EXPECT_EQ(BufferRange<float>(*mInputRandomDataSample)[batchSlotsPtr[bi]],
|
||||
BufferRange<float>(*mOutputRandomDataSample)[bi]);
|
||||
EXPECT_EQ(BufferRange<float>(*mInputRandomDataValidation)[batchSlotsPtr[bi]],
|
||||
BufferRange<float>(*mOutputRandomDataValidation)[bi]);
|
||||
}
|
||||
|
||||
auto const numCtxRequests = mSamplingParams.getNumCtxRequests();
|
||||
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
|
||||
{
|
||||
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingTokens(); ++ti)
|
||||
{
|
||||
for (SizeType32 pi = 0; pi < mSamplingParams.getMaxPathLen(); ++pi)
|
||||
{
|
||||
EXPECT_EQ(BufferRange<SizeType32>(*mInputNextDraftPaths)[flat_index3(batchSlotsPtr[bi], ti, pi,
|
||||
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())],
|
||||
BufferRange<SizeType32>(*mOutputNextDraftPaths)[flat_index3(
|
||||
bi, ti, pi, mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())]);
|
||||
}
|
||||
}
|
||||
EXPECT_EQ(BufferRange<SizeType32>(*mOutputNextDraftLens)[bi],
|
||||
bi < numCtxRequests ? 0 : BufferRange<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]]);
|
||||
}
|
||||
|
||||
auto const maxGenerationLength = bufferCast<SizeType32>(*mMaxGenerationLength)[0];
|
||||
for (SizeType32 bi = 0; bi < mSamplingParams.getNumGenRequests(); ++bi)
|
||||
{
|
||||
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingDraftTokens(); ++ti)
|
||||
{
|
||||
EXPECT_EQ(BufferRange<SizeType32>(*mInputNextDraftTokens)[flat_index2(
|
||||
batchSlotsPtr[numCtxRequests + bi], ti, mSamplingParams.getMaxDecodingDraftTokens())],
|
||||
BufferRange<SizeType32>(*mOutputNextDraftTokens)[flat_index2(
|
||||
numCtxRequests + bi, ti, mSamplingParams.getMaxDecodingDraftTokens())]);
|
||||
}
|
||||
EXPECT_EQ(BufferRange<SizeType32>(*mInputSpecDecodingGenerationLengths)[batchSlotsPtr[numCtxRequests + bi]],
|
||||
BufferRange<SizeType32>(*mOutputSpecDecodingGenerationLengths)[bi]);
|
||||
for (SizeType32 ti = 0; ti < maxGenerationLength; ++ti)
|
||||
{
|
||||
EXPECT_EQ(BufferRange<SizeType32>(*mInputSpecDecodingPositionOffsets)[flat_index2(
|
||||
batchSlotsPtr[numCtxRequests + bi], ti, mSamplingParams.getMaxDecodingTokens())],
|
||||
BufferRange<SizeType32>(
|
||||
*mOutputSpecDecodingPositionOffsets)[flat_index2(bi, ti, maxGenerationLength)])
|
||||
<< "bi: " << bi << " ti: " << ti;
|
||||
}
|
||||
auto const numTokens = (bi == 0) ? bufferCast<SizeType32>(*mCumSumGenerationLengths)[0]
|
||||
: bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi]
|
||||
- bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi - 1];
|
||||
auto const outputStartId = (bi == 0) ? 0 : bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi - 1];
|
||||
auto const numPackedMasks
|
||||
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
|
||||
for (SizeType32 ti = 0; ti < numTokens * numPackedMasks; ++ti)
|
||||
{
|
||||
EXPECT_EQ(BufferRange<SizeType32>(
|
||||
*mInputSpecDecodingPackedMasks)[flat_index2(batchSlotsPtr[numCtxRequests + bi], ti,
|
||||
mSamplingParams.getMaxDecodingTokens() * numPackedMasks)],
|
||||
BufferRange<SizeType32>(
|
||||
*mOutputSpecDecodingPackedMasks)[flat_index2(outputStartId, ti, numPackedMasks)])
|
||||
<< "bi: " << bi << " ti: " << ti;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void run(SamplingParams samplingParams)
|
||||
{
|
||||
mSamplingParams = samplingParams;
|
||||
|
||||
allocateBuffers();
|
||||
|
||||
initBuffers();
|
||||
|
||||
callPackData();
|
||||
|
||||
mStream->synchronize();
|
||||
|
||||
verifyResults();
|
||||
}
|
||||
|
||||
private:
|
||||
std::shared_ptr<tensorrt_llm::runtime::CudaStream> mStream;
|
||||
std::shared_ptr<tensorrt_llm::runtime::BufferManager> mBufferManager;
|
||||
|
||||
// input
|
||||
TensorPtr mBatchSlots;
|
||||
TensorPtr mInputTemperatures;
|
||||
TensorPtr mInputRandomDataSample;
|
||||
TensorPtr mInputRandomDataValidation;
|
||||
TensorPtr mInputNextDraftTokens;
|
||||
TensorPtr mInputNextDraftLens;
|
||||
TensorPtr mInputNextDraftPaths;
|
||||
TensorPtr mInputSpecDecodingGenerationLengths;
|
||||
TensorPtr mInputSpecDecodingPositionOffsets;
|
||||
TensorPtr mInputSpecDecodingPackedMasks;
|
||||
|
||||
// output
|
||||
TensorPtr mOutputTemperatures;
|
||||
TensorPtr mOutputRandomDataSample;
|
||||
TensorPtr mOutputRandomDataValidation;
|
||||
TensorPtr mOutputNextDraftTokens;
|
||||
TensorPtr mOutputNextDraftLens;
|
||||
TensorPtr mOutputNextDraftPaths;
|
||||
TensorPtr mOutputSpecDecodingGenerationLengths;
|
||||
TensorPtr mOutputSpecDecodingPositionOffsets;
|
||||
TensorPtr mOutputSpecDecodingPackedMasks;
|
||||
|
||||
// workspace
|
||||
TensorPtr mMaxGenerationLength;
|
||||
TensorPtr mCumSumGenerationLengths;
|
||||
|
||||
BufferPtr mScanReduceTempStorage;
|
||||
|
||||
SizeType32 mScanTempStorageBytes;
|
||||
SizeType32 mReduceTempStorageBytes;
|
||||
|
||||
SamplingParams mSamplingParams;
|
||||
};
|
||||
|
||||
TEST_F(EaglePackDataTest, Ctx1Gen0)
|
||||
{
|
||||
SamplingParams params;
|
||||
|
||||
params.setNumCtxRequests(1);
|
||||
params.setNumGenRequests(0);
|
||||
|
||||
this->run(params);
|
||||
}
|
||||
|
||||
TEST_F(EaglePackDataTest, Ctx0Gen1)
|
||||
{
|
||||
SamplingParams params;
|
||||
|
||||
params.setNumCtxRequests(0);
|
||||
params.setNumGenRequests(1);
|
||||
|
||||
this->run(params);
|
||||
}
|
||||
|
||||
TEST_F(EaglePackDataTest, Ctx100Gen0)
|
||||
{
|
||||
SamplingParams params;
|
||||
|
||||
params.setNumCtxRequests(100);
|
||||
params.setNumGenRequests(0);
|
||||
|
||||
this->run(params);
|
||||
}
|
||||
|
||||
TEST_F(EaglePackDataTest, Ctx0Gen100)
|
||||
{
|
||||
SamplingParams params;
|
||||
|
||||
params.setNumCtxRequests(0);
|
||||
params.setNumGenRequests(100);
|
||||
|
||||
this->run(params);
|
||||
}
|
||||
|
||||
TEST_F(EaglePackDataTest, Ctx100Gen100)
|
||||
{
|
||||
SamplingParams params;
|
||||
|
||||
params.setNumCtxRequests(100);
|
||||
params.setNumGenRequests(100);
|
||||
|
||||
this->run(params);
|
||||
}
|
||||
} // namespace
|
||||
@ -137,7 +137,6 @@
"layernorm_quantization_plugin": null,
"rmsnorm_quantization_plugin": null,
"nccl_plugin": null,
"lookup_plugin": null,
"lora_plugin": "auto",
"weight_only_groupwise_quant_matmul_plugin": null,
"weight_only_quant_matmul_plugin": null,
@ -139,7 +139,8 @@ def parallel_run_ctest(
# Some catastrophic fail happened that there's no report generated
raise

parallel_report = 'parallel-' + report
# Avoid .xml extension to prevent CI from reading failures from it
parallel_report = 'parallel-' + report + ".intermediate"
_os.rename(cwd / report, cwd / parallel_report)

try:
@ -153,7 +154,7 @@ def parallel_run_ctest(
# Use parallel result as final report
_os.rename(cwd / parallel_report, cwd / report)
else:
retry_report = 'retry-' + report
retry_report = 'retry-' + report + ".intermediate"
_os.rename(cwd / report, cwd / retry_report)
merge_report(cwd / parallel_report, cwd / retry_report,
cwd / report)
@ -472,6 +473,12 @@ def prepare_multi_gpu_model_tests(python_exe: str,
|
||||
model_cache_arg=model_cache_arg,
|
||||
only_multi_gpu_arg=only_multi_gpu_arg)
|
||||
|
||||
prepare_model_tests(model_name="llama",
|
||||
python_exe=python_exe,
|
||||
root_dir=root_dir,
|
||||
resources_dir=resources_dir,
|
||||
model_cache_arg=model_cache_arg)
|
||||
|
||||
prepare_model_tests(model_name="t5",
|
||||
python_exe=python_exe,
|
||||
root_dir=root_dir,
|
||||
@ -704,6 +711,19 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
|
||||
]
|
||||
run_command(cache_trans_test, cwd=tests_dir, env=cpp_env, timeout=300)
|
||||
|
||||
# Cache transceiver tests
|
||||
cache_trans_test_8_proc = [
|
||||
"mpirun",
|
||||
"-n",
|
||||
"8",
|
||||
"--allow-run-as-root",
|
||||
"batch_manager/cacheTransceiverTest",
|
||||
]
|
||||
run_command(cache_trans_test_8_proc,
|
||||
cwd=tests_dir,
|
||||
env=cpp_env,
|
||||
timeout=600)
|
||||
|
||||
# UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
|
||||
if _os.path.exists(
|
||||
_os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
|
||||
@ -843,6 +863,19 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
|
||||
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
|
||||
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
|
||||
|
||||
new_env = copy.copy(cpp_env)
|
||||
new_env["RUN_LLAMA_MULTI_GPU"] = "true"
|
||||
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
|
||||
trt_model_test = produce_mpirun_command(
|
||||
global_commands=["mpirun", "--allow-run-as-root"],
|
||||
nranks=8,
|
||||
local_commands=[
|
||||
"executor/executorTest",
|
||||
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
|
||||
],
|
||||
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
|
||||
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
|
||||
|
||||
|
||||
def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path,
|
||||
build_dir: _pl.Path, resources_dir: _pl.Path,
|
||||
|
||||
@ -1,6 +1,6 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=24.07-py3
ARG BASE_TAG=24.09-py3
ARG DEVEL_IMAGE=devel

FROM ${BASE_IMAGE}:${BASE_TAG} as base

@ -155,7 +155,7 @@ centos7_%: BASE_TAG = 12.4.0-devel-centos7
# For x86_64 and aarch64
ubuntu22_%: STAGE = devel
ubuntu22_%: BASE_IMAGE = nvidia/cuda
ubuntu22_%: BASE_TAG = 12.5.1-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 12.6.1-devel-ubuntu22.04

trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0
@ -4,7 +4,7 @@ set -ex

# This script is used for reinstalling CUDA on CentOS 7 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
CUDA_VER="12.5.1_555.42.06"
CUDA_VER="12.6.1_560.35.03"
CUDA_VER_SHORT="${CUDA_VER%_*}"

NVCC_VERSION_OUTPUT=$(nvcc --version)
@ -4,12 +4,8 @@ set -ex

# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07
TORCH_VERSION="2.4.0"
# Check the compatible torchvision from
# https://github.com/pytorch/vision/tree/main?tab=readme-ov-file#installation
# and also confirm with https://pypi.org/pypi/torchvision/0.19.0/json
TORCHVISION_VERSION="0.19.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-09.html#rel-24-09
TORCH_VERSION="2.4.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')

prepare_environment() {
@ -61,6 +57,9 @@ install_from_source() {
python3 setup.py install
cd /tmp && rm -rf /tmp/pytorch

# Get torchvision version by dry run
TORCHVISION_VERSION=$(pip3 install --dry-run torch==${TORCH_VERSION} torchvision | grep "Would install" | tr ' ' '\n' | grep torchvision | cut -d "-" -f 2)

export PYTORCH_VERSION=${PYTORCH_BUILD_VERSION}
export FORCE_CUDA=1
export BUILD_VERSION=${TORCHVISION_VERSION}
@ -76,7 +75,7 @@ install_from_source() {

install_from_pypi() {
pip3 uninstall -y torch torchvision
pip3 install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION}
pip3 install torch==${TORCH_VERSION} torchvision
}

case "$1" in
@ -2,19 +2,20 @@

set -ex

TRT_VER="10.4.0.26"
TRT_VER="10.5.0.18"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07
CUDA_VER="12.5" # 12.5.1
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-09.html#rel-24-09
CUDA_VER="12.6" # 12.6.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.2.1.18-1"
NCCL_VER="2.22.3-1+cuda12.5"
CUBLAS_VER="12.5.3.2-1"
CUDA_DRIVER_VERSION="560.35.03-1"
CUDNN_VER="9.4.0.58-1"
NCCL_VER="2.22.3-1+cuda12.6"
CUBLAS_VER="12.6.1.4-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/archive/12.5.1/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.5.82-1"
CUDA_RUNTIME="12.5.82-1"
# https://docs.nvidia.com/cuda/archive/12.6.1/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.6.68-1"
CUDA_RUNTIME="12.6.68-1"

for i in "$@"; do
case $i in
@ -76,7 +77,9 @@ install_centos_requirements() {
|
||||
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm
|
||||
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm
|
||||
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm
|
||||
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.x86_64.rpm
|
||||
yum remove -y "cuda-toolkit*" && yum -y localinstall cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm
|
||||
yum remove -y "cuda-compat*" && yum -y localinstall cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.x86_64.rpm
|
||||
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
|
||||
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
|
||||
yum remove -y "libcublas*" && yum -y localinstall libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
|
||||
@ -96,7 +99,7 @@ install_tensorrt() {
|
||||
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
|
||||
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
|
||||
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu24_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
|
||||
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
|
||||
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
|
||||
fi
|
||||
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
|
||||
tar -xf /tmp/TensorRT.tar -C /usr/local/
|
||||
|
||||
@ -46,6 +46,12 @@ myst_url_schemes = {
|
||||
"https://github.com/NVIDIA/TensorRT-LLM/tree/" + branch_name + "/{{path}}",
|
||||
}
|
||||
|
||||
myst_heading_anchors = 4
|
||||
|
||||
myst_enable_extensions = [
|
||||
"deflist",
|
||||
]
|
||||
|
||||
autosummary_generate = True
|
||||
copybutton_exclude = '.linenos, .gp, .go'
|
||||
copybutton_prompt_text = ">>> |$ |# "
|
||||
|
||||
@ -111,7 +111,8 @@ Welcome to TensorRT-LLM's Documentation!
|
||||
:name: Performance
|
||||
|
||||
performance/perf-overview.md
|
||||
performance/perf-best-practices.md
|
||||
Benchmarking <performance/perf-benchmarking.md>
|
||||
Best Practices <performance/perf-best-practices.md>
|
||||
performance/perf-analysis.md
|
||||
|
||||
|
||||
|
||||
@ -185,12 +185,15 @@ Building from source produces the following library files.
|
||||
- `th_common.exp`
|
||||
- `th_common.lib`
|
||||
|
||||
The locations of the DLLs, in addition to some `torch` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
|
||||
The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
|
||||
|
||||
```bash
|
||||
%USERPROFILE%\inference\TensorRT\lib
|
||||
%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm
|
||||
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs
|
||||
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib
|
||||
```
|
||||
|
||||
Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder.
|
||||
|
||||
Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.
|
||||
|
||||
@ -4,7 +4,7 @@
|
||||
|
||||
```{note}
|
||||
The Windows release of TensorRT-LLM is currently in beta.
|
||||
We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.13.0) for the most stable experience.
|
||||
We recommend checking out the [v0.14.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.14.0) for the most stable experience.
|
||||
```
|
||||
|
||||
**Prerequisites**
|
||||
@ -52,7 +52,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L
|
||||
before installing TensorRT-LLM with the following command.
|
||||
|
||||
```bash
|
||||
pip install tensorrt_llm==0.13.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/
|
||||
pip install tensorrt_llm==0.14.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/
|
||||
```
|
||||
|
||||
Run the following command to verify that your TensorRT-LLM installation is working properly.
|
||||
@ -70,7 +70,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L
|
||||
|
||||
This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install
|
||||
[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version)
|
||||
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSCV DLL).
|
||||
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL).
|
||||
|
||||
2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies.
|
||||
|
||||
|
||||
@ -1,26 +1,36 @@
|
||||
(perf-benchmarking)=
|
||||
|
||||
# TensorRT-LLM Benchmarking
|
||||
|
||||
> [!WARNING] Work in Progress
|
||||
> This benchmarking suite is a current work in progress and is prone to large changes.
|
||||
```{important}
|
||||
This benchmarking suite is a work in progress.
|
||||
Expect breaking API changes.
|
||||
```
|
||||
|
||||
TensorRT-LLM provides a packaged benchmarking utility that is accessible via the `trtllm-bench` CLI tool.
|
||||
TensorRT-LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility.
|
||||
|
||||
#### Supported Networks for Benchmarking
|
||||
|
||||
- [`tiiuae/falcon-180B`](https://huggingface.co/tiiuae/falcon-180B)
|
||||
- [`meta-llama/Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf)
|
||||
- [`meta-llama/Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf)
|
||||
- [`meta-llama/Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
|
||||
- [`meta-llama/Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
|
||||
- [`EleutherAI/gpt-j-6b`](https://huggingface.co/EleutherAI/gpt-j-6b)
|
||||
- [`mistralai/Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
||||
- [`mistralai/Mixtral-8x7B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
|
||||
- [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
|
||||
- [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)
|
||||
- [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)
|
||||
- [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
|
||||
- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
|
||||
- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
|
||||
- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
|
||||
- [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B)
|
||||
- [meta-llama/Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B)
|
||||
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
|
||||
- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
|
||||
- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
|
||||
- [meta-llama/Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
|
||||
- [meta-llama/Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
|
||||
- [mistralai/Mixtral-8x7B-v0.1-Instruct](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1-Instruct)
|
||||
|
||||
|
||||
#### Support Quantization Modes
|
||||
|
||||
TensorRT-LLM supports a number of quanization modes. For more information about quantization, see the
|
||||
[documentation](https://nvidia.github.io/TensorRT-LLM/reference/precision.html).
|
||||
TensorRT-LLM supports a number of quantization modes:
|
||||
|
||||
- None (no quantization applied)
|
||||
- W8A16
|
||||
@ -31,7 +41,8 @@ TensorRT-LLM supports a number of quanization modes. For more information about
|
||||
- FP8
|
||||
- INT8
|
||||
|
||||
> [!NOTE] Please see the supported quantization methods for each network [here](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)
|
||||
For more information about quantization, refer to [](../reference/precision.md) and
|
||||
the [support matrix](../reference/precision.md#support-matrix) of the supported quantization methods for each network.
|
||||
|
||||
|
||||
## Inflight Benchmarking with a Dataset
|
||||
@ -41,9 +52,10 @@ This section covers how to benchmark TensorRT-LLM using inflight batching.
|
||||
|
||||
### Quickstart
|
||||
|
||||
For this quick start guide, we will focus on running a short max throughput benchmark on
|
||||
This quick start focuses on running a short max throughput benchmark on
|
||||
`meta-llama/Llama-2-7b-hf` on a synthetic dataset with a uniform distribution of prompts with ISL:OSL
|
||||
of 128:128. In order to run the benchmark from start to finish simply run the following commands:
|
||||
of 128:128.
|
||||
To run the benchmark from start to finish, run the following commands:
|
||||
|
||||
```shell
|
||||
python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-2-7b-hf token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt
|
||||
@ -51,7 +63,8 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --dataset /tmp/synthetic_128
|
||||
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
|
||||
```
|
||||
|
||||
And that's it! Once the benchmark completes, a summary will be printed with summary metrics.
|
||||
And that's it!
|
||||
After the benchmark completes, `trtllm-bench` prints a summary with summary metrics.
|
||||
|
||||
```shell
|
||||
===========================================================
|
||||
@ -108,28 +121,31 @@ straightforward to specify requests. The schema is defined as follows:
|
||||
| `logits` | N* | List[Integer] | List of logits that make up the request prompt. |
|
||||
| `output_tokens` | Y | Integer | Number of generated tokens for this request. |
|
||||
|
||||
> [!NOTE] Prompt and logits are mutually exclusive*
|
||||
> While having both `prompt` and `logits` is not required, at least one is required.
|
||||
> If `logits` are specified, the `prompt` entry is ignored for request generation.
|
||||
Prompt and logits are mutually exclusive, but one of `prompt` or `logits` is required.
|
||||
If you specify `logits`, the `prompt` entry is ignored for request generation.
|
||||
|
||||
Examples of valid entries for the inflight benchmark are:
|
||||
Refer to the following examples of valid entries for the inflight benchmark:
|
||||
|
||||
- Entries with a human-readable prompt and no logits.
|
||||
```json
|
||||
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
|
||||
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
|
||||
```
|
||||
|
||||
```json
|
||||
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
|
||||
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
|
||||
```
|
||||
|
||||
- Entries which contain logits.
|
||||
```json
|
||||
{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
|
||||
{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
|
||||
```
|
||||
|
||||
> [!INFO] A whole entry is on a line!
|
||||
> To make the passing of data simpler, a complete JSON entry is on each line so that the benchmarker
|
||||
> can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
|
||||
> JSON entry is on every line.
|
||||
```json
|
||||
{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
|
||||
{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
|
||||
```
|
||||
|
||||
```{tip}
|
||||
Specify each entry on one line.
|
||||
To simplify passing the data, a complete JSON entry is on each line so that the benchmarker
|
||||
can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
|
||||
JSON entry is on every line.
|
||||
```
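As an aside (not part of the patch itself), the following is a minimal Python sketch for writing a dataset file that follows the schema described above. The file path and the example prompts and token IDs are placeholders; only the `task_id`, `prompt`/`logits`, and `output_tokens` fields come from the schema.

```python
import json

# Hypothetical output path; any writable location works.
DATASET_PATH = "/tmp/custom_dataset.txt"

# One complete JSON entry per line, as the benchmarker expects.
entries = [
    # Prompt-based entry: the benchmarker tokenizes `prompt` with the --model tokenizer.
    {"task_id": 0, "prompt": "Explain how paged KV cache works.", "output_tokens": 128},
    # Logits-based entry: the listed values are used directly and any `prompt` would be ignored.
    {"task_id": 1, "logits": [863, 22056, 25603, 11943, 8932], "output_tokens": 128},
]

with open(DATASET_PATH, "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```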

#### Using `prepare_dataset` to Create Synthetic Datasets

@@ -162,12 +178,12 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --max_seq_len 256 --quantiza

> [!NOTE] `trtllm-bench build` reproduces benchmark engines for performance study. These engine
configurations are not guaranteed to be optimal for all cases and should be viewed as reproducers
for the benchmark data we provide on our [Performance Overview](../docs/source/performance/perf-overview.md).
for the benchmark data we provide on our [Performance Overview](./perf-overview.md).

Looking a little closer, the `build` sub-command
will perform a lookup and build an engine using those reference settings. The
look up table directly corresponds to the performance table found in our
[Performance Overview](../docs/source/performance/perf-overview.md#throughput-measurements). The
[Performance Overview](./perf-overview.md#throughput-measurements). The
output of the `build` sub-command looks similar to the snippet below (for `meta-llama/Llama-2-7b-hf`):

```shell

@@ -236,16 +252,17 @@ upper bound throughput number.

#### How the Benchmarker Works

The benchmarker will read in a data file or standard input (stdin) as a stream where a single line contains
a complete JSON request entry. The process that the benchmarker follows is:
The benchmarker reads a data file where a single line contains
a complete JSON request entry as specified in [](#preparing-a-dataset).
The process the benchmarker follows is:

1. Iterate over all input requests. If `logits` is specified, construct the request using the specified
list of logits. Otherwise, tokenize the `prompt` as specified by `--model $HF_MODEL_NAME`.
3. Submit the dataset to the TensorRT-LLM `Executor` API at as fast of a rate as possible (offline mode).
4. Wait for all requests to return, compute statistics, then report out results.
1. Submit the dataset to the TensorRT-LLM `Executor` API as fast as possible (offline mode).
1. Wait for all requests to return, compute statistics, and then report results.

To run the benchmarker, run the following with the [engine](#building-a-benchmark-engine) and
[dataset](#preparing-a-dataset) generated above:
To run the benchmarker, run the following commands with the [engine](#building-a-benchmark-engine) and
[dataset](#preparing-a-dataset) generated from previous steps:

```shell
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1

@@ -316,16 +333,157 @@ Total Latency (seconds): 20.331645167
[TensorRT-LLM][INFO] Refreshed the MPI local session
```

## Low Latency Benchmark

The low latency benchmark follows a similar workflow to the [throughput benchmark](#running-a-max-throughput-benchmark)
but requires building the engine separately from `trtllm-bench`. The low latency benchmark has the following modes:

- A single-request low-latency engine
- A Medusa-enabled speculative-decoding engine

### Low Latency TensorRT-LLM Engine for Llama-3 70B

To build a low-latency engine for the latency benchmark, run the following quantize and build commands.
The `$checkpoint_dir` is the path to the [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) Hugging Face checkpoint in your cache or downloaded to a specific location with the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
To prepare a dataset, follow the same process as specified in [](#preparing-a-dataset).

#### Benchmarking a non-Medusa Low Latency Engine

To quantize the checkpoint:

```shell
cd tensorrt_llm/examples/llama
python ../quantization/quantize.py \
    --model_dir $checkpoint_dir \
    --dtype bfloat16 \
    --qformat fp8 \
    --kv_cache_dtype fp8 \
    --output_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
    --calib_size 512 \
    --tp_size $tp_size
```

then build,

```shell
trtllm-build \
    --checkpoint_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
    --use_fused_mlp enable \
    --gpt_attention_plugin bfloat16 \
    --output_dir /tmp/meta-llama/Meta-Llama-3-70B/engine \
    --max_batch_size 1 \
    --max_seq_len $(($isl+$osl)) \
    --reduce_fusion enable \
    --gemm_plugin fp8 \
    --workers $tp_size \
    --use_fp8_context_fmha enable \
    --max_num_tokens $isl \
    --use_paged_context_fmha disable \
    --multiple_profiles enable
```

After the engine is built, run the low-latency benchmark:

```shell
env TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG=1 \
  TRTLLM_MMHA_KERNEL_BLOCK_SIZE=256 \
  TRTLLM_MMHA_BLOCKS_PER_SEQUENCE=32 \
  FORCE_MULTI_BLOCK_MODE=ON \
  TRTLLM_ENABLE_FDL=1 \
  trtllm-bench --model meta-llama/Meta-Llama-3-70B \
  latency \
  --dataset $DATASET_PATH \
  --engine_dir /tmp/meta-llama/Meta-Llama-3-70B/engine
```

#### Building a Medusa Low-Latency Engine

Building a Medusa-enabled engine requires checkpoints that contain Medusa heads.
NVIDIA provides TensorRT-LLM checkpoints on the [NVIDIA](https://huggingface.co/nvidia) page on Hugging Face.
The checkpoints are pre-quantized and can be directly built after downloading them with the
[huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
After you download the checkpoints, run the following command and specify the `$tp_size` supported by your Medusa checkpoint:

```shell
trtllm-build --checkpoint_dir $checkpoint_dir \
    --speculative_decoding_mode medusa \
    --max_batch_size 1 \
    --gpt_attention_plugin bfloat16 \
    --max_seq_len $(($isl+$osl)) \
    --output_dir /tmp/meta-llama/Meta-Llama-3-70B/medusa/engine \
    --use_fused_mlp enable \
    --paged_kv_cache enable \
    --use_paged_context_fmha disable \
    --multiple_profiles enable \
    --reduce_fusion enable \
    --use_fp8_context_fmha enable \
    --workers $tp_size \
    --low_latency_gemm_plugin fp8
```

After the engine is built, you need to define the Medusa choices.
The choices are specified with a YAML file like the following example (`medusa.yaml`):

```yaml
- [0]
- [0, 0]
- [1]
- [0, 1]
- [2]
- [0, 0, 0]
- [1, 0]
- [0, 2]
- [3]
- [0, 3]
- [4]
- [0, 4]
- [2, 0]
- [0, 5]
- [0, 0, 1]
```
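As a side note (not part of the patch), the sketch below loads such a choices file and runs a basic consistency check. It assumes the common convention that each entry is a path in the Medusa tree, so every non-empty prefix of a path should itself appear as an entry; treat that assumption, the file name, and the check itself as illustrative only.

```python
import yaml  # requires PyYAML

with open("medusa.yaml") as f:
    choices = yaml.safe_load(f)

assert isinstance(choices, list), "expected a top-level YAML list"
paths = {tuple(path) for path in choices}
for path in choices:
    # Every entry should be a list of integers.
    assert all(isinstance(i, int) for i in path), f"non-integer entry in {path}"
    # Assumed tree convention: each prefix of a path should also be listed.
    for k in range(1, len(path)):
        assert tuple(path[:k]) in paths, f"missing prefix {path[:k]} for {path}"
print(f"{len(paths)} Medusa choices look consistent")
```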

To run the Medusa-enabled engine, run the following command:

```shell
env TRTLLM_ENABLE_PDL=1 \
  UB_ONESHOT=1 \
  UB_TP_SIZE=$tp_size \
  TRTLLM_ENABLE_PDL=1 \
  TRTLLM_PDL_OVERLAP_RATIO=0.15 \
  TRTLLM_PREFETCH_RATIO=-1 \
  trtllm-bench --model meta-llama/Meta-Llama-3-70B \
  latency \
  --dataset $DATASET_PATH \
  --engine_dir /tmp/meta-llama/Meta-Llama-3-70B/medusa/engine \
  --medusa_choices medusa.yaml
```

## Summary

In summary, the general process for reproducing a benchmark point is as follows:
The following table summarizes the commands needed for running benchmarks:

- Prepare a dataset: `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH`
- Build engine: `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH`
- Benchmark engine: `trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR`

| Scenario | Phase | Command |
| - | - | - |
| Dataset | Preparation | `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH` |
| Throughput | Build | `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH` |
| Throughput | Benchmark | `trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
| Latency | Build | See [section about building low latency engines](#low-latency-tensorrt-llm-engine-for-llama-3-70b) |
| Non-Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
| Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR --medusa_choices $MEDUSA_CHOICES` |

where,
- `$HF_MODEL` is the Huggingface name of a model.
- `$NUM_REQUESTS` is the number of requests to generate.
- `$DATASET_PATH` is the path where the dataset was written when preparing the dataset.
- `$ENGINE_DIR` the engine directory as printed by `trtllm-bench build`.

`$HF_MODEL`
: The Hugging Face name of a model.

`$NUM_REQUESTS`
: The number of requests to generate.

`$DATASET_PATH`
: The path where the dataset was written when preparing the dataset.

`$ENGINE_DIR`
: The engine directory as printed by `trtllm-bench build`.

`$MEDUSA_CHOICES`
: A YAML config representing the Medusa tree for the benchmark.
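For convenience (not part of the patch), a small driver can chain the dataset, build, and throughput commands from the table above. In this sketch the model name, sequence lengths, and paths are placeholders, and the engine directory is assumed to follow the `tp_1_pp_1` layout that `trtllm-bench build` prints in the quickstart.

```python
import subprocess

HF_MODEL = "meta-llama/Llama-2-7b-hf"      # placeholder model name
ISL, OSL, NUM_REQUESTS = 128, 128, 3000
DATASET_PATH = "/tmp/synthetic_128_128.txt"
ENGINE_DIR = f"/tmp/{HF_MODEL}/tp_1_pp_1"  # assumed location, as printed by `trtllm-bench build`

# 1. Prepare a synthetic dataset (stdout is redirected into the dataset file).
with open(DATASET_PATH, "w") as dataset:
    subprocess.run(
        ["python", "benchmarks/cpp/prepare_dataset.py", "--stdout",
         "--tokenizer", HF_MODEL, "token-norm-dist",
         "--input-mean", str(ISL), "--output-mean", str(OSL),
         "--input-stdev", "0", "--output-stdev", "0",
         "--num-requests", str(NUM_REQUESTS)],
        stdout=dataset, check=True)

# 2. Build the benchmark engine.
subprocess.run(["trtllm-bench", "--model", HF_MODEL, "build",
                "--dataset", DATASET_PATH], check=True)

# 3. Run the max-throughput benchmark against the built engine.
subprocess.run(["trtllm-bench", "--model", HF_MODEL, "throughput",
                "--dataset", DATASET_PATH, "--engine_dir", ENGINE_DIR], check=True)
```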

@@ -149,24 +149,16 @@ only supported for the llama model. It is recommended to enable this feature whe
The embedding parallelism feature enables the sharding of the embedding table
across multiple GPUs, so that the memory usage could be reduced and the
throughput improved. The embedding sharing feature enables the sharing of the
embedding table between `look_up` and `lm_head` layers.
embedding table between `look_up` and `lm_head` layers to reduce memory usage.

The look-up plugin implements the embedding sharing feature and is required to
enable the aforementioned features for now (until TensorRT native layers
support embedding sharing).
It is recommended to enable embedding parallelism to improve throughput with `--use_parallel_embedding` and `--embedding_sharding_dim` in `convert_checkpoint.py`.

It is recommended to enable the embedding parallelism and sharing features to
improve throughput. However, the following conditions have to be satisfied:
Embedding sharing is enabled by default if the following conditions are met:
1. `look_up` and `lm_head` layers have identical weights.
2. `--gemm_plugin` is not used when building the engine.
3. For tensor parallelism cases, `--embedding_sharding_dim 0` must be set. In other words, we must enable embedding parallelism along the vocab dimension.

1. The model shares the embedding table between `look_up` and `lm_head` layers,
2. Both look_up plugin and gemm plugin are enabled,
3. The sharding dimension of the embedding lookup table is set correctly.

To enable the features, use the `--use_parallel_embedding`, `--embedding_sharding_dim` and
`--use_embedding_sharing` arguments in `convert_checkpoint.py`, and use the
`--lookup_plugin`, `--gemm_plugin` arguments in `trtllm-build` command. See those
[Examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#embedding-parallelism-and-sharing)
for details.
See those [Examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#embedding-parallelism) for details.

### Horizontal Fusion in Gated-MLP


@@ -226,7 +226,7 @@ The performance numbers below were collected using the steps described in this d
> [!NOTE] The only models supported in this workflow are those listed in the table above.

The following tables are references for commands that are used as part of the benchmarking process. For a more detailed
description of this benchmarking workflow, see the [Benchmarking Suite README](../../../benchmarks/Suite.md).
description of this benchmarking workflow, see the [benchmarking suite documentation](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html).

### Commands


@@ -105,9 +105,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [24.07](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [24.09](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.4](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.5](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

@@ -12,27 +12,6 @@ In most occasions, these problems are caused by the workflow like: an old compil

Solution: try running build script with `--clean`, or try running `rm -r build cpp/build` before running build script again.

## cuDNN Linking Errors

Errors such as "Entry Point Not Found" (for example [#1062](https://github.com/NVIDIA/TensorRT-LLM/issues/1062)).

Solution: the issue might be a mismatch in the `cuDNN` libraries shipped from `torch` and `tensorrt`. To rectify this, please try the following steps

```bash
python -m pip uninstall -y tensorrt_llm
python -m pip install --upgrade pip
python -m pip install nvidia-cudnn-cu11==8.9.4.25 --no-cache-dir
python -m pip install --pre --extra-index-url https://pypi.nvidia.com/ tensorrt==9.2.0.post12.dev5 --no-cache-dir
python -m pip uninstall -y nvidia-cudnn-cu11
python -m pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com/ --extra-index-url https://download.pytorch.org/whl/cu121
```

## Model Debug

When debugging a TensorRT-LLM model, we usually want to print the value of the intermediate tensors.

We should mark the interested intermediate tensors as the network outputs, then print their values at runtime, since TensorRT-LLM obeys the define-and-run paradigm.

## Debug on Unit Tests

Here is an example to print the values of the MLP output tensor in a unit test ([full example](../../../tests/test_debugging_api.py)).

@@ -5,6 +5,49 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).


## TensorRT-LLM Release 0.14.0

### Key Features and Enhancements
- Enhanced the `LLM` class in the [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/index.html).
- Added support for calibration with offline dataset.
- Added support for Mamba2.
- Added support for `finish_reason` and `stop_reason`.
- Added FP8 support for CodeLlama.
- Added `__repr__` methods for class `Module`, thanks to the contribution from @1ytic in #2191.
- Added BFloat16 support for fused gated MLP.
- Updated ReDrafter beam search logic to match Apple ReDrafter v1.1.
- Improved `customAllReduce` performance.
- Draft model now can copy logits directly over MPI to the target model's process in `orchestrator` mode. This fast logits copy reduces the delay between draft token generation and the beginning of target model inference.
- NVIDIA Volta GPU support is deprecated and will be removed in a future release.

### API Changes
- [BREAKING CHANGE] The default `max_batch_size` of the `trtllm-build` command is set to `2048`.
- [BREAKING CHANGE] Remove `builder_opt` from the `BuildConfig` class and the `trtllm-build` command.
- Add logits post-processor support to the `ModelRunnerCpp` class.
- Added `isParticipant` method to the C++ `Executor` API to check if the current process is a participant in the executor instance.

### Model Updates
- Added support for NemotronNas, see `examples/nemotron_nas/README.md`.
- Added support for Deepseek-v1, see `examples/deepseek_v1/README.md`.
- Added support for Phi-3.5 models, see `examples/phi/README.md`.

### Fixed Issues
- Fixed a typo in `tensorrt_llm/models/model_weights_loader.py`, thanks to the contribution from @wangkuiyi in #2152.
- Fixed duplicated import module in `tensorrt_llm/runtime/generation.py`, thanks to the contribution from @lkm2835 in #2182.
- Enabled `share_embedding` for the models that have no `lm_head` in legacy checkpoint conversion path, thanks to the contribution from @lkm2835 in #2232.
- Fixed `kv_cache_type` issue in the Python benchmark, thanks to the contribution from @qingquansong in #2219.
- Fixed an issue with SmoothQuant calibration with custom datasets. Thanks to the contribution by @Bhuvanesh09 in #2243.
- Fixed an issue surrounding `trtllm-build --fast-build` with fake or random weights. Thanks to @ZJLi2013 for flagging it in #2135.
- Fixed missing `use_fused_mlp` when constructing `BuildConfig` from dict, thanks for the fix from @ethnzhng in #2081.
- Fixed lookahead batch layout for `numNewTokensCumSum`. (#2263)

### Infrastructure Changes
- The dependent ModelOpt version is updated to v0.17.

### Documentation
- @Sherlock113 added a [tech blog](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) to the latest news in #2169, thanks for the contribution.


## TensorRT-LLM Release 0.13.0

### Key Features and Enhancements


@@ -2,6 +2,9 @@
import asyncio
import json
import logging
import signal
from contextlib import asynccontextmanager
from http import HTTPStatus
from typing import AsyncGenerator, Optional

import click

@@ -9,6 +12,7 @@ import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse

from tensorrt_llm.executor import CppExecutorError, RequestError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams

TIMEOUT_KEEP_ALIVE = 5  # seconds.

@@ -16,11 +20,16 @@ TIMEOUT_KEEP_ALIVE = 5  # seconds.

class LlmServer:

    def __init__(self, llm: LLM, kv_cache_config: KvCacheConfig):
    def __init__(self, llm: LLM):
        self.llm = llm
        self.kv_cache_config = kv_cache_config

        self.app = FastAPI()
        @asynccontextmanager
        async def lifespan(app: FastAPI):
            # terminate rank0 worker
            yield
            self.llm._shutdown()

        self.app = FastAPI(lifespan=lifespan)
        self.register_routes()

    def register_routes(self):

@@ -50,20 +59,27 @@ class LlmServer:

        sampling_params = SamplingParams(**request_dict)

        promise = self.llm.generate_async(prompt,
                                          streaming=streaming,
                                          sampling_params=sampling_params)
        try:
            promise = self.llm.generate_async(prompt,
                                              streaming=streaming,
                                              sampling_params=sampling_params)

        async def stream_results() -> AsyncGenerator[bytes, None]:
            async for output in promise:
                yield output.outputs[0].text_diff.encode("utf-8")
            async def stream_results() -> AsyncGenerator[bytes, None]:
                async for output in promise:
                    yield output.outputs[0].text_diff.encode("utf-8")

        if streaming:
            return StreamingResponse(stream_results())
            if streaming:
                return StreamingResponse(stream_results())

        # Non-streaming case
        await promise.aresult()
        return JSONResponse({"text": promise.outputs[0].text})
            # Non-streaming case
            await promise.aresult()
            return JSONResponse({"text": promise.outputs[0].text})
        except RequestError as e:
            return JSONResponse(content=str(e),
                                status_code=HTTPStatus.BAD_REQUEST)
        except CppExecutorError:
            # If internal executor error is raised, shutdown the server
            signal.raise_signal(signal.SIGINT)

    async def __call__(self, host, port):
        config = uvicorn.Config(self.app,

@@ -82,28 +98,32 @@ class LlmServer:
@click.option("--max_beam_width", type=int, default=1)
@click.option("--tp_size", type=int, default=1)
@click.option("--pp_size", type=int, default=1)
@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8)
def entrypoint(model_dir: str,
               tokenizer: Optional[str] = None,
               host: Optional[str] = None,
               port: int = 8000,
               max_beam_width: int = 1,
               tp_size: int = 1,
               pp_size: int = 1):
               pp_size: int = 1,
               kv_cache_free_gpu_memory_fraction: float = 0.8):
    host = host or "0.0.0.0"
    port = port or 8000
    logging.info(f"Starting server at {host}:{port}")

    build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width)

    kv_cache_config = KvCacheConfig(
        free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)

    llm = LLM(model_dir,
              tokenizer,
              tensor_parallel_size=tp_size,
              pipeline_parallel_size=pp_size,
              build_config=build_config)
              build_config=build_config,
              kv_cache_config=kv_cache_config)

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)

    server = LlmServer(llm=llm, kv_cache_config=kv_cache_config)
    server = LlmServer(llm=llm)

    asyncio.run(server(host, port))
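For illustration only (not part of the patch), a client for this server might look like the sketch below. The `/generate` route name and the exact request fields are assumptions, since `register_routes` is outside this hunk; the response shape `{"text": ...}` matches the non-streaming branch shown above.

```python
import requests  # assumes the `requests` package is installed

# Assumed endpoint; the actual route is defined in register_routes(), not shown in this diff.
url = "http://0.0.0.0:8000/generate"

payload = {
    "prompt": "What is the capital of France?",
    "streaming": False,
    # Remaining fields are assumed to be forwarded to SamplingParams(**request_dict).
    "max_tokens": 32,
}

resp = requests.post(url, json=payload, timeout=60)
resp.raise_for_status()
print(resp.json()["text"])  # non-streaming responses are returned as {"text": ...}
```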

@@ -1,6 +1,8 @@
#!/usr/bin/env python
import asyncio
import logging
import signal
from contextlib import asynccontextmanager
from http import HTTPStatus
from pathlib import Path
from typing import (AsyncGenerator, AsyncIterator, List, Optional, Tuple,

@@ -15,6 +17,7 @@ from openai.types.chat import ChatCompletionMessageParam
from transformers import AutoTokenizer, PreTrainedTokenizer

# yapf: disable
from tensorrt_llm.executor import CppExecutorError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.openai_protocol import (

@@ -66,10 +69,8 @@ class OpenaiServer:
    def __init__(self,
                 llm: LLM,
                 model: str,
                 kv_cache_config: KvCacheConfig,
                 hf_tokenizer: PreTrainedTokenizer = None):
        self.llm = llm
        self.kv_cache_config = kv_cache_config
        self.tokenizer = hf_tokenizer

        model_dir = Path(model)

@@ -78,7 +79,13 @@ class OpenaiServer:
        else:
            self.model = model

        self.app = FastAPI()
        @asynccontextmanager
        async def lifespan(app: FastAPI):
            # terminate rank0 worker
            yield
            self.llm._shutdown()

        self.app = FastAPI(lifespan=lifespan)

        @self.app.exception_handler(RequestValidationError)
        async def validation_exception_handler(_, exc):

@@ -326,7 +333,9 @@ class OpenaiServer:
            else:
                response = await create_chat_response(promise)
            return JSONResponse(content=response.model_dump())

        except CppExecutorError:
            # If internal executor error is raised, shutdown the server
            signal.raise_signal(signal.SIGINT)
        except Exception as e:
            return self.create_error_response(str(e))

@@ -432,6 +441,9 @@ class OpenaiServer:
            else:
                response = await create_completion_response(generator, num_choices)
            return JSONResponse(content=response.model_dump())
        except CppExecutorError:
            # If internal executor error is raised, shutdown the server
            signal.raise_signal(signal.SIGINT)
        except Exception as e:
            return self.create_error_response(str(e))

@@ -453,6 +465,7 @@ class OpenaiServer:
@click.option("--max_seq_len", type=int, default=512)
@click.option("--tp_size", type=int, default=1)
@click.option("--pp_size", type=int, default=1)
@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8)
def entrypoint(model_dir: str,
               tokenizer: Optional[str] = None,
               host: Optional[str] = None,

@@ -460,25 +473,27 @@ def entrypoint(model_dir: str,
               max_beam_width: int = 1,
               max_seq_len: int = 512,
               tp_size: int = 1,
               pp_size: int = 1):
               pp_size: int = 1,
               kv_cache_free_gpu_memory_fraction: float = 0.8):
    host = host or "0.0.0.0"
    port = port or 8000
    logging.info(f"Starting server at {host}:{port}")

    build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width, max_seq_len=max_seq_len)

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)

    llm = LLM(model_dir,
              tokenizer,
              tensor_parallel_size=tp_size,
              pipeline_parallel_size=pp_size,
              build_config=build_config)
              build_config=build_config,
              kv_cache_config=kv_cache_config)

    kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
    hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer or model_dir)

    server = OpenaiServer(llm=llm,
                          model=model_dir,
                          kv_cache_config=kv_cache_config,
                          hf_tokenizer=hf_tokenizer)

    asyncio.run(server(host, port))
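As a usage illustration (not part of the patch), an OpenAI-compatible client could talk to this server roughly as sketched below. The `/v1` base path and the model name are assumptions, since the route registration and model naming live outside this hunk.

```python
from openai import OpenAI  # assumes the `openai` package is installed

# Assumed base URL; the server's actual route prefix is defined outside this hunk.
client = OpenAI(base_url="http://0.0.0.0:8000/v1", api_key="not-used")

completion = client.chat.completions.create(
    model="model",  # placeholder; the server derives its model name from --model_dir
    messages=[{"role": "user", "content": "Say hello in one sentence."}],
    max_tokens=32,
)
print(completion.choices[0].message.content)
```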

@@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

@@ -106,7 +106,6 @@ python convert_checkpoint.py --model_dir ./bloom/176B/ \
                             --embedding_sharding_dim 0
trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
             --gemm_plugin float16 \
             --lookup_plugin float16 \
             --output_dir ./bloom/176B/trt_engines/fp16/8-gpu/ \
             --workers 2

@@ -123,7 +122,7 @@ trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
             --workers 2

# share embedding table between embedding() and lm_head() layers
# To reduce the generated engine size, we has to use gemm and lookup plugin (--gemm_plugin --lookup_plugin) and must shard the embedding table in the vocab dimension.
# To reduce the generated engine size, we can turn off gemm plugin and shard the embedding table in the vocab dimension.
python convert_checkpoint.py --model_dir ./bloom/176B/ \
                             --dtype float16 \
                             --output_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \

@@ -132,8 +131,6 @@ python convert_checkpoint.py --model_dir ./bloom/176B/ \
                             --embedding_sharding_dim 0 \
                             --use_embedding_sharing
trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
             --gemm_plugin float16 \
             --lookup_plugin float16 \
             --output_dir ./bloom/176B/trt_engines/fp16/8-gpu/ \
             --workers 2
```

@@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@@ -1,8 +1,10 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
protobuf
rouge_score~=0.1.2
sentencepiece
tiktoken
# https://github.com/THUDM/ChatGLM3/issues/1324
transformers<=4.43.0

@@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

@@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

@@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99