Update TensorRT-LLM (#2389)

* Update TensorRT-LLM

---------

Co-authored-by: Alessio Netti <netti.alessio@gmail.com>
Kaiyu Xie 2024-10-29 22:24:38 +08:00 committed by GitHub
parent 3c46c2794e
commit f14d1d433c
197 changed files with 4711 additions and 811 deletions

2
.gitmodules vendored
View File

@ -13,7 +13,7 @@
url = https://github.com/NVIDIA/NVTX.git
[submodule "3rdparty/ucxx"]
path = 3rdparty/ucxx
url = https://github.com/GuanLuo/ucxx.git
url = https://github.com/rapidsai/ucxx.git
[submodule "3rdparty/pybind11"]
path = 3rdparty/pybind11
url = https://github.com/pybind/pybind11.git

2
3rdparty/ucxx vendored

@ -1 +1 @@
Subproject commit b99181779672965c6f325a95a29eb433b6e9cbbd
Subproject commit 5c745102c26df11e68f11368bcd9649e81e981da

View File

@ -6,8 +6,8 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.5.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.4.0-green)](https://developer.nvidia.com/tensorrt)
[![cuda](https://img.shields.io/badge/cuda-12.6.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.5.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.15.0.dev-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)

View File

@ -7,5 +7,6 @@ There are currently three workflows to benchmark TensorRT-LLM:
- The recommended workflow that uses the TensorRT-LLM C++ API and can take advantage of the latest features of TensorRT-LLM.
* [Python benchmarks](./python)
- The Python benchmarking scripts can only benchmark the Python runtime, which does not support the latest features, such as in-flight batching.
* [The Python benchmarking suite](./Suite.md)
- This benchmarking suite is a current work in progress and is prone to large changes.
* [The Python benchmarking suite](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html)
- This benchmarking suite is native to TensorRT-LLM and provides a Python workflow for reproducing and testing TensorRT-LLM performance.
- _NOTE_: This benchmarking suite is a work in progress and is subject to large changes.

View File

@ -145,6 +145,7 @@ struct BenchmarkParams
{
std::optional<SizeType32> maxTokensInPagedKvCache{std::nullopt};
std::optional<float> freeGpuMemoryFraction{std::nullopt};
std::optional<float> crossKvCacheFraction{std::nullopt};
bool enableTrtOverlap{false};
bool enableBlockReuse{false};
bool enableChunkedContext{false};
@ -882,7 +883,8 @@ public:
texec::SchedulerConfig schedulerConfig(capacitySchedulerPolicy);
texec::KvCacheConfig kvCacheConfig(benchmarkParams.enableBlockReuse, benchmarkParams.maxTokensInPagedKvCache,
benchmarkParams.maxAttentionWindowVec, benchmarkParams.sinkTokenLength,
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks);
benchmarkParams.freeGpuMemoryFraction, benchmarkParams.kvHostCacheSize, benchmarkParams.kvOnboardBlocks,
benchmarkParams.crossKvCacheFraction);
texec::PeftCacheConfig peftCacheConfig(0, benchmarkParams.loraDeviceNumModLayers, 8, 64, 4, 4, 4, 24, 8,
std::nullopt, benchmarkParams.loraHostCacheSize);
texec::ExtendedRuntimePerfKnobConfig extendedRuntimePerfKnobConfig(benchmarkParams.multiBlockMode,
@ -1486,6 +1488,7 @@ texec::Request makeExecutorRequest(Sample const& sample, SizeType32 const& beamW
std::nullopt, // pTuning
loraConfig, // loraConfig
lookaheadConfig, // lookaheadConfig
std::nullopt, // kvCacheRetentionConfig
std::nullopt, // logitsPostProcessorName
encoderInputTokenIds.has_value() ? encoderInputTokenIds : std::nullopt);
}
@ -1509,6 +1512,10 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
{
optionalParams.kvCacheConfig.freeGpuMemoryFraction = benchmarkParams.freeGpuMemoryFraction;
}
if (benchmarkParams.crossKvCacheFraction)
{
optionalParams.kvCacheConfig.crossKvCacheFraction = benchmarkParams.crossKvCacheFraction;
}
if (benchmarkParams.maxAttentionWindowVec)
{
optionalParams.kvCacheConfig.maxAttentionWindowVec = benchmarkParams.maxAttentionWindowVec;
@ -1953,6 +1960,8 @@ int main(int argc, char* argv[])
"random_seed", "integer random seed for exponential time delays.", cxxopts::value<int>()->default_value("420"));
options.add_options()(
"kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value<float>());
options.add_options()(
"cross_kv_cache_fraction", "Cross K-V Cache Fraction (from 0.0 to 1.0).", cxxopts::value<float>());
options.add_options()("request_rate",
"request rate in reqs/sec. Skipping this arg or negative value will trigger offline/0-delay.",
cxxopts::value<float>());
@ -2126,6 +2135,20 @@ int main(int argc, char* argv[])
{
benchmarkParams.freeGpuMemoryFraction = result["kv_cache_free_gpu_mem_fraction"].as<float>();
}
// Argument: K-V Cache Cross Attention Fraction. Only applicable to enc-dec models.
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
{
if (result.count("cross_kv_cache_fraction"))
{
benchmarkParams.crossKvCacheFraction = result["cross_kv_cache_fraction"].as<float>();
}
else
{
benchmarkParams.crossKvCacheFraction
= 0.5f; // Default value if not set; non enc-dec models should not have this param set at all
}
}
// Argument: Enable TRT overlap
benchmarkParams.enableTrtOverlap = result["enable_trt_overlap"].as<bool>();
@ -2342,14 +2365,14 @@ int main(int argc, char* argv[])
{
texec::ModelType executorModelType;
std::optional<std::string> decoderEngineDir = std::nullopt, encoderEngineDir = std::nullopt;
if (result.count("encoder_engine_dir") && result.count("engine_dir"))
if (result.count("encoder_engine_dir") && result.count("decoder_engine_dir"))
{
TLLM_CHECK_WITH_INFO(api == "executor", "encoder-decoder only support executor api.");
TLLM_CHECK_WITH_INFO(
modelType == TrtGptModelType::InflightFusedBatching, "encoder-decoder only support inflight batching.");
executorModelType = texec::ModelType::kENCODER_DECODER;
decoderEngineDir = result["engine_dir"].as<std::string>();
encoderEngineDir = result["encoder_engine_dir"].as<std::string>();
decoderEngineDir = result["decoder_engine_dir"].as<std::string>();
}
else if (result.count("engine_dir"))
{

View File

@ -30,45 +30,47 @@ class BaseEvictionPolicy
public:
virtual ~BaseEvictionPolicy() = default;
virtual void initialize(
std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks)
// TODO(TRTLLM-1564): Don't use a separate `initialize` function. Ensure eviction policies can't be in-between a
// state of construction and initialization.
virtual void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt)
= 0;
// Get a free block from the primary memory pool
virtual BlockPtr getFreePrimaryBlock() = 0;
// Get a free block from the secondary memory pool
virtual BlockPtr getFreeSecondaryBlock() = 0;
// Release a block. Prioritize the block for eviction if toFront=true
/// @brief Get a free block from the specified cache level
/// @returns The pointer to the free block, along with whether it can be offloaded
virtual std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) = 0;
/// @brief Release a block. Prioritize the block for eviction if toFront=true
virtual void releaseBlock(BlockPtr block, bool toFront = false) = 0;
// Get the amount of free blocks in the primary memory pool
virtual SizeType32 getNumFreePrimaryBlocks() = 0;
// Get the amount of free blocks in the secondary memory pool
virtual SizeType32 getNumFreeSecondaryBlocks() = 0;
// Claim a free block. Called when the cache manager allocates or reuses a new block
virtual void claimBlock(KVCacheBlock block) = 0;
/// @brief Get the amount of free blocks in the primary memory pool
virtual SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) = 0;
/// @brief Claim a free block. Called when the cache manager allocates or reuses a new block
virtual void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt) = 0;
};
class LRUEvictionPolicy : public BaseEvictionPolicy
{
public:
void initialize(
std::vector<BlockPtr>& mAllBlocksById, SizeType32 numPrimaryBlocks, SizeType32 numSecondaryBlocks) override;
BlockPtr getFreePrimaryBlock() override;
BlockPtr getFreeSecondaryBlock() override;
void initialize(std::vector<BlockPtr>& mAllBlocksById, std::vector<SizeType32> sizes,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt) override;
std::tuple<BlockPtr, bool> getFreeBlock(SizeType32 cacheLevel) override;
void releaseBlock(BlockPtr block, bool toFront = false) override;
SizeType32 getNumFreePrimaryBlocks() override;
SizeType32 getNumFreeSecondaryBlocks() override;
void claimBlock(KVCacheBlock block);
SizeType32 getNumFreeBlocks(SizeType32 cacheLevel) override;
void claimBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt) override;
private:
FreeBlocksQueue mFreePrimaryBlocks;
FreeBlocksQueue mFreeSecondaryBlocks;
// Check if the block should be added to mFreeQueues.
bool isReleasedLeafBlock(BlockPtr block);
// Queues of available leaf blocks, split by cache level and priority level
std::vector<std::vector<FreeBlocksQueue>> mFreeQueues;
// All blocks that have been released, along with the amount of released children
std::vector<std::unordered_set<SizeType32>> mReleasedBlocks;
// Iterators to block entries in mFreeQueues
std::vector<std::optional<FreeBlocksQueue::iterator>> mFreeBlockIterators;
SizeType32 mFreePrimaryBlocksSize;
SizeType32 mFreeSecondaryBlocksSize;
// Amount of free blocks at each cache level
std::vector<SizeType32> mNumFreeBlocksPerLevel;
// Secondary offload threshold. Blocks below this priority won't be offloaded to secondary memory.
executor::RetentionPriority mSecondaryOffloadMinPriority;
};
} // namespace tensorrt_llm::batch_manager::eviction_policy
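
The refactored interface replaces the fixed primary/secondary queue pair with per-cache-level, per-priority free queues. Below is a minimal standalone sketch of the lookup this layout implies; it uses hypothetical simplified types and assumes the offload-eligibility check documented for secondaryOffloadMinPriority at the executor level, so it is an illustration, not the library's implementation.

#include <cstddef>
#include <deque>
#include <memory>
#include <tuple>
#include <vector>

// Hypothetical, simplified stand-ins for the real block/queue types; this mirrors the
// mFreeQueues[cacheLevel][priority] layout only.
struct Block
{
    int id;
};
using BlockPtr = std::shared_ptr<Block>;
using FreeQueue = std::deque<BlockPtr>;

std::tuple<BlockPtr, bool> getFreeBlockSketch(
    std::vector<std::vector<FreeQueue>>& freeQueues, int cacheLevel, int secondaryOffloadMinPriority)
{
    auto& queuesByPriority = freeQueues[cacheLevel];
    // Scan priority buckets from lowest to highest: lower-priority blocks are evicted first.
    for (std::size_t priority = 0; priority < queuesByPriority.size(); ++priority)
    {
        if (!queuesByPriority[priority].empty())
        {
            BlockPtr block = queuesByPriority[priority].front();
            queuesByPriority[priority].pop_front();
            // Assumed rule: only blocks above the threshold may be offloaded to secondary
            // memory; lower-priority blocks are simply evicted.
            bool const canOffload = static_cast<int>(priority) > secondaryOffloadMinPriority;
            return {block, canOffload};
        }
    }
    return {nullptr, false};
}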

View File

@ -41,7 +41,9 @@ public:
std::optional<std::vector<SizeType32>> maxAttentionWindowVec = std::nullopt,
std::optional<SizeType32> sinkTokenLength = std::nullopt,
std::optional<float> freeGpuMemoryFraction = std::nullopt, bool enableBlockReuse = false, bool useUvm = false,
std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true)
std::optional<size_t> hostCacheSize = std::nullopt, bool onboardBlocks = true,
std::optional<float> crossKvCacheFraction = std::nullopt,
std::optional<SizeType32> secondaryOffloadMinPriority = std::nullopt)
: maxTokens{maxTokens}
, maxAttentionWindowVec{maxAttentionWindowVec}
, sinkTokenLength{sinkTokenLength}
@ -50,6 +52,8 @@ public:
, useUvm(useUvm)
, hostCacheSize(hostCacheSize)
, onboardBlocks(onboardBlocks)
, crossKvCacheFraction{crossKvCacheFraction}
, secondaryOffloadMinPriority(secondaryOffloadMinPriority)
{
}
@ -57,7 +61,7 @@ public:
: KvCacheConfig(kvCacheConfig.getMaxTokens(), kvCacheConfig.getMaxAttentionWindowVec(),
kvCacheConfig.getSinkTokenLength(), kvCacheConfig.getFreeGpuMemoryFraction(),
kvCacheConfig.getEnableBlockReuse(), false, kvCacheConfig.getHostCacheSize(),
kvCacheConfig.getOnboardBlocks())
kvCacheConfig.getOnboardBlocks(), kvCacheConfig.getCrossKvCacheFraction())
{
}
@ -66,7 +70,8 @@ public:
return maxTokens == other.maxTokens && maxAttentionWindowVec == other.maxAttentionWindowVec
&& sinkTokenLength == other.sinkTokenLength && freeGpuMemoryFraction == other.freeGpuMemoryFraction
&& enableBlockReuse == other.enableBlockReuse && useUvm == other.useUvm
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks;
&& hostCacheSize == other.hostCacheSize && onboardBlocks == other.onboardBlocks
&& crossKvCacheFraction == other.crossKvCacheFraction;
}
friend std::ostream& operator<<(std::ostream& os, KvCacheConfig const& self);
@ -80,5 +85,9 @@ public:
bool useUvm;
std::optional<size_t> hostCacheSize;
bool onboardBlocks;
// Cross attention will use crossKvCacheFraction of the KV cache and self attention will use the rest.
std::optional<float> crossKvCacheFraction;
// The minimum priority level to allow blocks to be offloaded to secondary memory.
std::optional<SizeType32> secondaryOffloadMinPriority;
};
} // namespace tensorrt_llm::batch_manager::kv_cache_manager
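
A minimal usage sketch of the extended batch-manager KvCacheConfig, assuming an encoder-decoder setup that reserves half of the KV cache memory for cross attention; the include path and the concrete values are illustrative, not defaults introduced by this change.

#include <optional>
#include "tensorrt_llm/batch_manager/kvCacheConfig.h" // include path assumed

using tensorrt_llm::batch_manager::kv_cache_manager::KvCacheConfig;

KvCacheConfig const kvCacheConfig(
    /*maxTokens=*/std::nullopt,
    /*maxAttentionWindowVec=*/std::nullopt,
    /*sinkTokenLength=*/std::nullopt,
    /*freeGpuMemoryFraction=*/0.9f,
    /*enableBlockReuse=*/true,
    /*useUvm=*/false,
    /*hostCacheSize=*/std::nullopt,
    /*onboardBlocks=*/true,
    /*crossKvCacheFraction=*/0.5f);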

View File

@ -60,12 +60,13 @@ using LoraTaskIdType = tensorrt_llm::runtime::LoraTaskIdType;
struct BlockKey
{
bool hasLora;
LoraTaskIdType loraTaskId;
VecUniqueTokens uniqueTokens;
bool operator==(BlockKey const& other) const noexcept
{
return (loraTaskId == other.loraTaskId && uniqueTokens == other.uniqueTokens);
return (hasLora == other.hasLora && loraTaskId == other.loraTaskId && uniqueTokens == other.uniqueTokens);
}
};
@ -97,8 +98,14 @@ struct BlockKeyHasher
c = (c ^ (c >> 30)) * UINT64_C(0xbf58476d1ce4e5b9);
c = (c ^ (c >> 27)) * UINT64_C(0x94d049bb133111eb);
c = c ^ (c >> 31);
seed ^= c + 0x9e3779b9 + (seed << 6) + (seed >> 2);
uint32_t d = static_cast<uint32_t>(blockKey.hasLora);
d = ((d >> 16) ^ d) * 0x45d9f3b;
d = ((d >> 16) ^ d) * 0x45d9f3b;
d = (d >> 16) ^ d;
seed ^= d + 0x9e3779b9 + (seed << 6) + (seed >> 2);
return seed;
}
};
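
The mixing above follows the familiar boost-style hash_combine pattern after first scrambling each field. A minimal standalone sketch of the combine step, shown for clarity and not part of the TensorRT-LLM API:

#include <cstdint>

// Fold a pre-scrambled value into an accumulator seed; the golden-ratio constant and the
// two shifts spread the incoming bits across the seed (same pattern as the hasher above).
inline uint64_t combineHash(uint64_t seed, uint64_t scrambledValue)
{
    return seed ^ (scrambledValue + 0x9e3779b9ULL + (seed << 6) + (seed >> 2));
}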
@ -107,13 +114,24 @@ using NextBlockMap = std::unordered_map<BlockKey, BlockPtr, BlockKeyHasher>;
struct KvCacheStats
{
// Number of maximum available blocks in the primary memory pool. This is determined and set by available primary
// memory. See calculateMaxNumBlocks for details.
SizeType32 maxNumBlocks;
// Number of free blocks in the primary memory pool.
SizeType32 freeNumBlocks;
// Number of used blocks in the primary memory pool. usedNumBlocks = maxNumBlocks - freeNumBlocks.
SizeType32 usedNumBlocks;
SizeType32 toksPerBlock;
// Total number of blocks allocated by all requests.
SizeType32 allocTotalBlocks;
// Number of new blocks that were allocated.
SizeType32 allocNewBlocks;
// Number of blocks that were matched and reused.
SizeType32 reusedBlocks;
// Number of blocks that were not matched and not reused.
SizeType32 missedBlocks;
// Measures the KV cache reuse rate: cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
float cacheHitRate;
};
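
As a quick worked example of the new fields (numbers are illustrative): a run that reuses 30 blocks and misses 10 reports cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks) = 30 / (30 + 10) = 0.75, while a run with no reuse reports 0 by convention.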
// Basic building block of a paged KV cache - a single
@ -152,6 +170,8 @@ public:
[[nodiscard]] VecUniqueTokens const& getUniqueTokens() const;
BlockPtr getPrevBlock() const;
void setPrevBlock(BlockPtr prevBlock);
void addNextBlock(BlockKey const& blockKey, BlockPtr block);
@ -167,6 +187,10 @@ public:
[[nodiscard]] bool isShared() const;
void setPriority(executor::RetentionPriority priority);
executor::RetentionPriority getPriority() const;
private:
// Linear ID of block independent of pool
IdType mBlockId;
@ -195,6 +219,9 @@ private:
// Flag indicating if block is full
bool mIsFull;
// Priority of the block
executor::RetentionPriority mPriority;
};
class GenerationRequest
@ -203,14 +230,17 @@ public:
using SizeType32 = tensorrt_llm::runtime::SizeType32;
explicit GenerationRequest(LlmRequest::RequestIdType requestId, SizeType32 numTokens, SizeType32 beamWidth,
SizeType32 maxBlocks, SizeType32 numPools = 1)
SizeType32 maxBlocks, SizeType32 numPools = 1,
executor::RetentionPriority decodeRetentionPriority
= executor::KvCacheRetentionConfig::kDefaultRetentionPriority)
: mRequestId(requestId)
, mNumTokens(numTokens)
, mBeamWidth(beamWidth)
, mCacheBlockIds(beamWidth)
, mCacheBlockIndices{
runtime::BufferManager::cpu(runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
, mCacheBlockIndices{runtime::BufferManager::cpu(
runtime::ITensor::makeShape({numPools, beamWidth, 2, maxBlocks}),
runtime::TRTDataType<tensorrt_llm::kernels::KVCacheIndex>::value)}
, mDecodeRetentionPriority(decodeRetentionPriority)
{
auto cacheBlockIdsRange = runtime::BufferRange<tensorrt_llm::kernels::KVCacheIndex>(*mCacheBlockIndices);
std::fill(cacheBlockIdsRange.begin(), cacheBlockIdsRange.end(),
@ -286,6 +316,11 @@ public:
}
}
[[nodiscard]] executor::RetentionPriority getDecodeRetentionPriority() const
{
return mDecodeRetentionPriority;
}
private:
// Request id of the sequence
LlmRequest::RequestIdType mRequestId;
@ -297,6 +332,8 @@ private:
std::vector<std::vector<KVCacheBlock::IdType>> mCacheBlockIds;
// Tensor of block indices allocated for each beam of the sequence
runtime::ITensor::SharedPtr mCacheBlockIndices;
// The retention priority to assign to decode blocks
executor::RetentionPriority mDecodeRetentionPriority;
};
// attach metadata to a pool pointer
@ -344,7 +381,8 @@ public:
explicit BlockManager(std::vector<SizeType32> const& numKvHeadsPerLayer, SizeType32 sizePerHead,
SizeType32 tokensPerBlock, SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool,
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
CacheType cacheType = CacheType::kSELF);
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
~BlockManager();
@ -396,6 +434,11 @@ public:
return getMaxNumBlocks() - getNumFreeBlocks();
}
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
{
return mMissedBlocks;
}
[[nodiscard]] bool hasFreeBlocks(SizeType32 numRequired = 1) const noexcept
{
return getNumFreeBlocks() >= numRequired;
@ -494,20 +537,23 @@ private:
//! \brief Store blocks in cached blocks.
//! \param blockKeys Key of each block.
//! \param blockIds Id of each block.
void storeBlocks(std::list<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds);
//! \param isChunkedContext Whether these blocks are being stored for chunked context.
void storeBlocks(std::list<BlockKey> blockKeys, std::vector<KVCacheBlock::IdType> const& blockIds,
bool isChunkedContext = false);
//! \brief Try to load blocks from cache. Allocate new blocks if necessary.
//! \param blockKeys Key of each block.
//! \param sequence Sequence to which blocks are assigned.
//! \return Number of matched tokens from loaded blocks.
SizeType32 loadOrAllocateBlocks(
std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks, GenerationRequest& sequence);
SizeType32 loadOrAllocateBlocks(std::list<BlockKey> const& blockKeys, SizeType32 numContextBlocks,
GenerationRequest& sequence, std::vector<std::optional<executor::RetentionPriority>> blockPriorities);
//! \brief Find block least likely to be reused, free it if necessary and return.
[[nodiscard]] BlockPtr getFreeBlock();
[[nodiscard]] BlockPtr getFreeBlock(
executor::RetentionPriority = executor::KvCacheRetentionConfig::kDefaultRetentionPriority);
//! \brief Free block from previous block and claim it from free blocks list.
void claimLeafBlock(KVCacheBlock& block);
void claimLeafBlock(BlockPtr block, std::optional<executor::RetentionPriority> priority = std::nullopt);
//! \brief Compute pointer to raw KV block (K & V, all layers).
[[nodiscard]] runtime::ITensor::SharedPtr computeBlockPointer(
@ -546,13 +592,24 @@ private:
std::vector<BlockPtr> mAllBlocksById;
// Dummy block acting as root for BlockToken searches
BlockPtr mCachedBlocksRoot;
// Statistics for block allocations/reuse
std::size_t mAllocTotalBlocks, mAllocNewBlocks, mReusedBlocks;
// KV cache type (self or cross)
CacheType mCacheType;
// Eviction Policy
std::shared_ptr<BaseEvictionPolicy> mEvictionPolicy;
// Statistics for block allocations/reuse
// Total number of blocks allocated by all requests
SizeType32 mAllocTotalBlocks;
// Number of new blocks that were allocated
SizeType32 mAllocNewBlocks;
// Number of blocks that were reused
SizeType32 mReusedBlocks;
// Number of unique blocks that were reused
SizeType32 mReusedUniqueBlocks;
// Number of blocks that were not reused
SizeType32 mMissedBlocks;
std::set<KVCacheBlock::IdType> reusedBlockIds;
private:
friend class KVCacheManager;
};
@ -570,13 +627,15 @@ public:
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
CudaStreamPtr stream, bool enableBlockReuse = false, bool onboardBlocks = true,
CacheType cacheType = CacheType::kSELF);
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
KVCacheManager(SizeType32 numLayers, SizeType32 numKvHeads, SizeType32 sizePerHead, SizeType32 tokensPerBlock,
SizeType32 blocksInPrimaryPool, SizeType32 blocksInSecondaryPool, SizeType32 maxNumSequences,
SizeType32 maxBeamWidth, SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, bool useOneMoreBlock,
CudaStreamPtr stream, bool enableBlockReuse = true, bool onboardBlocks = true,
CacheType cacheType = CacheType::kSELF);
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
void allocatePools(nvinfer1::DataType dtype, bool useUvm = false);
@ -617,6 +676,11 @@ public:
return mBlockManager.getNumReusedBlocks();
}
[[nodiscard]] SizeType32 getNumMissedBlocks() const noexcept
{
return mBlockManager.getNumMissedBlocks();
}
[[nodiscard]] KvCacheStats getKvCacheStats() const
{
KvCacheStats kvCacheStats;
@ -627,7 +691,10 @@ public:
kvCacheStats.allocTotalBlocks = getNumAllocTotalBlocks();
kvCacheStats.allocNewBlocks = getNumAllocNewBlocks();
kvCacheStats.reusedBlocks = getNumReusedBlocks();
kvCacheStats.missedBlocks = getNumMissedBlocks();
kvCacheStats.cacheHitRate = kvCacheStats.reusedBlocks == 0 ? 0
: static_cast<float>(kvCacheStats.reusedBlocks)
/ static_cast<float>(kvCacheStats.reusedBlocks + kvCacheStats.missedBlocks);
return kvCacheStats;
}
@ -689,14 +756,14 @@ public:
runtime::ITensor& output, SizeType32 outputSlotOffset, LlmRequest::RequestIdType requestId) const;
// Sum of numLayers * 2 * numKvHeads * sizePerHead for each pool
[[nodiscard]] static SizeType32 calculateCacheSizePerToken(
tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig)
[[nodiscard]] static SizeType32 calculateCacheSizePerToken(tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, bool isCrossAttention = false)
{
// NOTE: We expect the initialization of modelConfig to have already taken the tp size into account and do not
// address it here
// consider only local layers for the calculation
return modelConfig.getSumLocalKvHeads(
worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank())
worldConfig.getPipelineParallelism(), worldConfig.getPipelineParallelRank(), isCrossAttention)
* 2 * modelConfig.getSizePerHead();
}
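
For a sense of scale (illustrative numbers): a rank that locally holds 32 attention layers with 8 KV heads each and sizePerHead = 128 gives getSumLocalKvHeads = 256, so calculateCacheSizePerToken returns 256 * 2 * 128 = 65536 cache elements per token; passing isCrossAttention = true performs the same sum over the cross attention layers instead.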

View File

@ -95,8 +95,9 @@ public:
std::optional<SizeType32> promptVocabSize = std::nullopt,
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
std::optional<TensorPtr> loraConfig = std::nullopt,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
@ -134,6 +135,7 @@ public:
, mLoraWeights(std::move(loraWeights))
, mLoraConfig(std::move(loraConfig))
, mLookaheadConfig(std::move(lookaheadConfig))
, mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig))
, mContextChunkSize{mPromptLen}
, mLogProbs(samplingConfig.beamWidth)
, mCumLogProbs(samplingConfig.beamWidth)
@ -188,6 +190,7 @@ public:
, mLoraWeights(std::nullopt)
, mLoraConfig(std::nullopt)
, mLookaheadConfig(std::nullopt)
, mKvCacheRetentionConfig(std::nullopt)
, mContextChunkSize{mPromptLen}
, mLogProbs(mSamplingConfig.beamWidth)
, mCumLogProbs(mSamplingConfig.beamWidth)
@ -827,6 +830,16 @@ public:
mLookaheadConfig = config;
}
[[nodiscard]] std::optional<executor::KvCacheRetentionConfig> getKvCacheRetentionConfig() const
{
return mKvCacheRetentionConfig;
}
void setKvCacheRetentionConfig(executor::KvCacheRetentionConfig config)
{
mKvCacheRetentionConfig = config;
}
void clearLookaheadConfig()
{
mLookaheadConfig = std::nullopt;
@ -1502,6 +1515,23 @@ public:
return mReusedBlocksPerRequest;
}
void updateMissedBlocksPerRequest(SizeType32 missedBlocksPerRequest)
{
mMissedBlocksPerRequest += missedBlocksPerRequest;
}
[[nodiscard]] SizeType32 getMissedBlocksPerRequest() const
{
return mMissedBlocksPerRequest;
}
[[nodiscard]] float getKVCacheHitRatePerRequest() const
{
return mReusedBlocksPerRequest == 0 ? 0
: static_cast<float>(mReusedBlocksPerRequest)
/ (static_cast<float>(mReusedBlocksPerRequest + mMissedBlocksPerRequest));
}
RequestIdType mRequestId;
SizeType32 mPromptLen;
SizeType32 mMaxNewTokens;
@ -1552,6 +1582,7 @@ protected:
std::optional<TensorPtr> mLoraConfig;
std::optional<executor::LookaheadDecodingConfig> mLookaheadConfig;
std::optional<executor::KvCacheRetentionConfig> mKvCacheRetentionConfig;
// To enable chunked context, the FHMA paged kv-cache also needs to be enabled. Except for the last one,
// the size of the context chunk needs to be an integer multiple of the kv-cache block size. The meaning
// of null value is that the context is not chunked.
@ -1613,6 +1644,7 @@ protected:
SizeType32 mAllocTotalBlocksPerRequest{0};
SizeType32 mAllocNewBlocksPerRequest{0};
SizeType32 mReusedBlocksPerRequest{0};
SizeType32 mMissedBlocksPerRequest{0};
private:
void initialize(VecTokens const& inputTokens, bool outputLogProbs)
@ -1757,8 +1789,9 @@ public:
std::optional<SizeType32> promptVocabSize = std::nullopt,
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
std::optional<TensorPtr> loraConfig = std::nullopt,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<std::shared_ptr<VecTokens>> draftTokens = std::nullopt,
std::optional<TensorPtr> draftLogits = std::nullopt, bool excludeInputFromOutput = false,
std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
@ -1775,11 +1808,11 @@ public:
: Base(requestId, maxNewTokens, std::move(inputTokens), samplingConfig, isStreaming, endId, padId,
std::move(embeddingBias), std::move(badWordsList), std::move(stopWordsList), std::move(positionIds),
std::move(promptEmbeddingTable), promptVocabSize, loraTaskId, std::move(loraWeights), std::move(loraConfig),
std::move(lookaheadConfig), returnLogProbs, returnContextLogits, returnGenerationLogits,
std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput, std::move(logitsPostProcessor),
applyLogitsPostProcessorBatched, std::move(encoderInputTokens), returnEncoderOutput, clientId, priority,
std::move(encoderInputFeatures), std::move(encoderOutputLength), std::move(crossAttentionMask),
llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
std::move(lookaheadConfig), std::move(kvCacheRetentionConfig), returnLogProbs, returnContextLogits,
returnGenerationLogits, std::move(draftTokens), std::move(draftLogits), excludeInputFromOutput,
std::move(logitsPostProcessor), applyLogitsPostProcessorBatched, std::move(encoderInputTokens),
returnEncoderOutput, clientId, priority, std::move(encoderInputFeatures), std::move(encoderOutputLength),
std::move(crossAttentionMask), llmRequestType, std::move(inputTokenExtraIds), numReturnSequences)
{
}
@ -1791,6 +1824,7 @@ public:
mLogitsPostProcessor = std::move(logitsPostProcessor);
mApplyLogitsPostProcessorBatched = applyLogitsPostProcessorBatched;
mLookaheadConfig = request.getLookaheadConfig();
mKvCacheRetentionConfig = request.getKvCacheRetentionConfig();
}
std::shared_ptr<LlmRequest> createChildRequest(RequestIdType requestId)

View File

@ -1,13 +1,17 @@
/*
* SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: LicenseRef-NvidiaProprietary
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* NVIDIA CORPORATION, its affiliates and licensors retain all intellectual
* property and proprietary rights in and to this material, related
* documentation and any modifications thereto. Any use, reproduction,
* disclosure or distribution of this material and related documentation
* without an express license agreement from NVIDIA CORPORATION or
* its affiliates is strictly prohibited.
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

View File

@ -28,6 +28,7 @@
#include <memory>
#include <optional>
#include <string>
#include <utility>
#include <vector>
namespace tensorrt_llm::mpi
@ -348,6 +349,71 @@ public:
bool fastLogits;
};
using RetentionPriority = SizeType32;
/// @brief Configuration for the request's retention in the KV Cache
class KvCacheRetentionConfig
{
public:
static constexpr RetentionPriority kMinRetentionPriority = 0;
static constexpr RetentionPriority kMaxRetentionPriority = 100;
static constexpr RetentionPriority kDefaultRetentionPriority = 35;
/// @brief A single entry to set block priorities over a token range. Earlier ranges always take priority over later
/// ones. For example, with a block size of 16, a range of [0, 17] would be applied to the first two blocks.
struct TokenRangeRetentionPriority
{
public:
explicit TokenRangeRetentionPriority(SizeType32 tokenStart, std::optional<SizeType32> tokenEnd = std::nullopt,
RetentionPriority priority = KvCacheRetentionConfig::kDefaultRetentionPriority)
: tokenStart{tokenStart}
, tokenEnd{tokenEnd}
, priority{priority}
{
TLLM_CHECK_WITH_INFO(priority >= KvCacheRetentionConfig::kMinRetentionPriority
&& priority <= KvCacheRetentionConfig::kMaxRetentionPriority,
"Invalid priority value. Must be between %d and %d", KvCacheRetentionConfig::kMinRetentionPriority,
KvCacheRetentionConfig::kMaxRetentionPriority);
};
/// @brief The first token of this range.
SizeType32 tokenStart;
/// @brief The final token of this range. The end is not included in the range. This can be set to std::nullopt
/// to extend the range to the end of the sequence.
std::optional<SizeType32> tokenEnd;
/// @brief The priority of this token range. Higher priorities are less likely to be evicted or offloaded.
RetentionPriority priority;
bool operator==(TokenRangeRetentionPriority const& other) const
{
return tokenStart == other.tokenStart && tokenEnd == other.tokenEnd && priority == other.priority;
}
};
explicit KvCacheRetentionConfig()
: KvCacheRetentionConfig({}, kDefaultRetentionPriority)
{
}
KvCacheRetentionConfig(std::vector<TokenRangeRetentionPriority> const& tokenRangeRetentionPriorities,
RetentionPriority decodeRetentionPriority);
[[nodiscard]] std::vector<TokenRangeRetentionPriority> getTokenRangeRetentionPriorities() const;
[[nodiscard]] RetentionPriority getDecodeRetentionPriority() const;
/// @brief Convert the token range data into an entry per kv cache block for a given seqLen
std::vector<std::optional<RetentionPriority>> getPerBlockEvictionPolicy(SizeType32 blockSize, SizeType32 seqLen);
private:
/// @brief The token ranges and priority levels to update. Ranges must be non-overlapping. For example [(0, 64),
/// (100, 128), (70, 80)] is valid, whereas
/// [(0, 64), (60, 128)] is not.
std::vector<TokenRangeRetentionPriority> mTokenRangeRetentionPriorities;
/// @brief The priority level to assign to blocks allocated in the decode phase
RetentionPriority mDecodeRetentionPriority;
};
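
A minimal usage sketch of the new retention configuration, assuming a request whose first 64 prompt tokens should be kept at high priority while the rest of the prompt and the decode-phase blocks are cheap to evict; the concrete priority values are illustrative within the documented [0, 100] range.

#include "tensorrt_llm/executor/executor.h"

using namespace tensorrt_llm::executor;

KvCacheRetentionConfig const retentionConfig(
    {KvCacheRetentionConfig::TokenRangeRetentionPriority(/*tokenStart=*/0, /*tokenEnd=*/64, /*priority=*/90),
        KvCacheRetentionConfig::TokenRangeRetentionPriority(
            /*tokenStart=*/64, /*tokenEnd=*/std::nullopt, KvCacheRetentionConfig::kMinRetentionPriority)},
    /*decodeRetentionPriority=*/10);

getPerBlockEvictionPolicy then expands these ranges into one optional priority per KV cache block, so with a block size of 16 the first four blocks of this request would carry priority 90.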
/// @brief A class that holds information about the request
class Request
{
@ -371,6 +437,7 @@ public:
/// @param pTuningConfig The prompt tuning configuration
/// @param loraConfig The LoRA configuration
/// @param kvCacheRetentionConfig The configuration used for KV cache block eviction.
/// @param logitsPostProcessorName The logits postprocessor name. Must correspond to one of the logits postprocessor
/// names provided to the ExecutorConfig.
/// @param encoderInputTokenIds The encoder input token ids for encoder-decoder models, or encoder-only models
/// @param returnAllGeneratedTokens Indicates whether to return the full beams or just the newly generated tokens
@ -394,6 +461,7 @@ public:
std::optional<PromptTuningConfig> pTuningConfig = std::nullopt,
std::optional<LoraConfig> loraConfig = std::nullopt,
std::optional<LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
std::optional<std::string> logitsPostProcessorName = std::nullopt,
std::optional<VecTokens> encoderInputTokenIds = std::nullopt, std::optional<IdType> clientId = std::nullopt,
bool returnAllGeneratedTokens = false, PriorityType priority = kDefaultPriority,
@ -428,6 +496,7 @@ public:
[[nodiscard]] std::optional<PromptTuningConfig> getPromptTuningConfig() const;
[[nodiscard]] std::optional<LoraConfig> getLoraConfig() const;
[[nodiscard]] std::optional<LookaheadDecodingConfig> getLookaheadConfig() const;
[[nodiscard]] std::optional<KvCacheRetentionConfig> getKvCacheRetentionConfig() const;
[[nodiscard]] std::optional<std::string> getLogitsPostProcessorName() const;
[[nodiscard]] std::optional<VecTokens> getEncoderInputTokenIds() const;
[[nodiscard]] std::optional<IdType> getClientId() const;
@ -453,6 +522,7 @@ public:
void setPromptTuningConfig(PromptTuningConfig const& pTuningConfig);
void setLoraConfig(LoraConfig const& loraConfig);
void setLookaheadConfig(LookaheadDecodingConfig const& lookaheadConfig);
void setKvCacheRetentionConfig(KvCacheRetentionConfig const& kvCacheRetentionConfig);
void setLogitsPostProcessorName(std::string const& logitsPostProcessorName);
void setEncoderInputTokenIds(VecTokens const& encoderInputTokenIds);
void setClientId(IdType clientId);
@ -598,23 +668,29 @@ public:
std::optional<std::vector<SizeType32>> const& maxAttentionWindowVec = std::nullopt,
std::optional<SizeType32> const& sinkTokenLength = std::nullopt,
std::optional<FloatType> const& freeGpuMemoryFraction = std::nullopt,
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true);
std::optional<size_t> const& hostCacheSize = std::nullopt, bool onboardBlocks = true,
std::optional<FloatType> const& crossKvCacheFraction = std::nullopt,
std::optional<RetentionPriority> secondaryOffloadMinPriority = std::nullopt);
[[nodiscard]] bool getEnableBlockReuse() const;
[[nodiscard]] std::optional<SizeType32> getMaxTokens() const;
[[nodiscard]] std::optional<std::vector<SizeType32>> getMaxAttentionWindowVec() const;
[[nodiscard]] std::optional<SizeType32> getSinkTokenLength() const;
[[nodiscard]] std::optional<FloatType> getFreeGpuMemoryFraction() const;
[[nodiscard]] std::optional<FloatType> getCrossKvCacheFraction() const;
[[nodiscard]] std::optional<size_t> getHostCacheSize() const;
[[nodiscard]] bool getOnboardBlocks() const;
[[nodiscard]] std::optional<RetentionPriority> getSecondaryOffloadMinPriority() const;
void setEnableBlockReuse(bool enableBlockReuse);
void setMaxTokens(SizeType32 maxTokens);
void setMaxAttentionWindowVec(std::vector<SizeType32> maxAttentionWindowVec);
void setSinkTokenLength(SizeType32 sinkTokenLength);
void setFreeGpuMemoryFraction(FloatType freeGpuMemoryFraction);
void setCrossKvCacheFraction(FloatType crossKvCacheFraction);
void setHostCacheSize(size_t hostCacheSize);
void setOnboardBlocks(bool onboardBlocks);
void setSecondaryOffloadMinPriority(std::optional<RetentionPriority> secondaryOffloadMinPriority);
private:
friend class Serialization;
@ -641,12 +717,21 @@ private:
/// allocated.
std::optional<FloatType> mFreeGpuMemoryFraction;
/// @brief The fraction of KV cache memory that should be reserved for cross attention.
/// If set to p, self attention will use 1-p of the KV cache memory and cross attention
/// will use p of the KV cache memory. Default is 50%.
/// Should only be set when using an encoder-decoder model.
std::optional<FloatType> mCrossKvCacheFraction;
/// @brief Size of secondary memory pool in bytes. Default is 0.
/// Having a secondary memory pool increases KV cache block reuse potential.
std::optional<size_t> mHostCacheSize;
/// @brief Controls whether offloaded blocks should be onboarded back into primary memory before being reused.
bool mOnboardBlocks;
/// @brief Only blocks with priority > mSecondaryOffloadMinPriority can be offloaded to secondary memory.
std::optional<RetentionPriority> mSecondaryOffloadMinPriority;
};
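
At the executor level the same knobs are exposed on KvCacheConfig; a usage sketch assuming the setter-based workflow and default construction (the sizes and priority below are illustrative):

#include "tensorrt_llm/executor/executor.h"

using namespace tensorrt_llm::executor;

KvCacheConfig kvCacheConfig;
kvCacheConfig.setFreeGpuMemoryFraction(0.9);
kvCacheConfig.setCrossKvCacheFraction(0.5);                // half of KV cache memory for cross attention
kvCacheConfig.setHostCacheSize(4ULL * 1024 * 1024 * 1024); // 4 GiB secondary (host) pool
kvCacheConfig.setSecondaryOffloadMinPriority(50);          // only blocks above priority 50 may be offloaded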
/// @brief Configuration class for the runtime perf knobs

View File

@ -154,6 +154,18 @@ public:
static void serialize(LookaheadDecodingConfig const& lookaheadDecodingConfig, std::ostream& os);
static size_t serializedSize(LookaheadDecodingConfig const& lookaheadDecodingConfig);
// KvCacheRetentionConfig
static KvCacheRetentionConfig deserializeKvCacheRetentionConfig(std::istream& is);
static void serialize(KvCacheRetentionConfig const& kvCacheRetentionConfig, std::ostream& os);
static size_t serializedSize(KvCacheRetentionConfig const& kvCacheRetentionConfig);
// TokenRangeRetentionPriority
static KvCacheRetentionConfig::TokenRangeRetentionPriority deserializeTokenRangeRetentionPriority(std::istream& is);
static void serialize(
KvCacheRetentionConfig::TokenRangeRetentionPriority const& tokenRangeRetentionPriority, std::ostream& os);
static size_t serializedSize(
KvCacheRetentionConfig::TokenRangeRetentionPriority const& tokenRangeRetentionPriority);
// DecodingConfig
static DecodingConfig deserializeDecodingConfig(std::istream& is);
static void serialize(DecodingConfig const& decodingConfig, std::ostream& os);

View File

@ -233,7 +233,8 @@ enum class CommunicationMode
// execution of the model
};
/// @brief Struct that holds the stats of a KV cache manager
/// @brief Struct that holds the stats of a KV cache manager.
// See KvCacheStats definition in kvCacheManager.h for more information about each field.
struct KvCacheStats
{
/// @brief Max number of blocks
@ -250,6 +251,10 @@ struct KvCacheStats
SizeType32 allocNewBlocks;
/// @brief Number of reused blocks
SizeType32 reusedBlocks;
/// @brief Number of blocks that were not reused
SizeType32 missedBlocks;
/// @brief Measures the KV cache reuse rate: cacheHitRate = reusedBlocks / (reusedBlocks + missedBlocks).
float cacheHitRate;
};
/// @brief Struct that holds the stats of static batching models for a single iteration
@ -372,6 +377,10 @@ struct RequestStats
SizeType32 allocNewBlocksPerRequest;
/// @brief Number of reused blocks per request
SizeType32 reusedBlocksPerRequest;
/// @brief Number of missed blocks per request
SizeType32 missedBlocksPerRequest;
/// @brief KV Cache Hit Rate per request, defined as reusedBlocks / (reusedBlocks + missedBlocks)
SizeType32 kvCacheHitRatePerRequest;
};
/// @brief Struct that holds the stats of all requests in an iteration
@ -459,6 +468,11 @@ public:
return DecodingMode{kExternalDraftTokens | kUsePenalties | kUseBanTokens | kStandardStopCriteria};
}
static auto constexpr Eagle()
{
return DecodingMode{kEagle | kStandardStopCriteria | kUseExplicitEosStop};
}
auto constexpr useTemperature(bool useTemp)
{
mState = setBitTo(kUseTemperature, useTemp);
@ -581,6 +595,11 @@ public:
return anyBitSet(kExternalDraftTokens);
}
[[nodiscard]] bool constexpr isEagle() const
{
return anyBitSet(kEagle);
}
[[nodiscard]] bool constexpr isUseTemperature() const
{
return anyBitSet(kUseTemperature);
@ -695,6 +714,7 @@ private:
static UnderlyingType constexpr kLookahead{1u << (kNumFlags + 5)};
static UnderlyingType constexpr kExplicitDraftTokens{1u << (kNumFlags + 6)};
static UnderlyingType constexpr kExternalDraftTokens{1u << (kNumFlags + 7)};
static UnderlyingType constexpr kEagle{1u << (kNumFlags + 8)};
static UnderlyingType constexpr kTopKTopP{kTopK | kTopP};
[[nodiscard]] bool constexpr anyBitSet(UnderlyingType bits) const
@ -726,6 +746,7 @@ static_assert(!DecodingMode::Auto().isMedusa());
static_assert(!DecodingMode::Auto().isLookahead());
static_assert(!DecodingMode::Auto().isExplicitDraftTokens());
static_assert(!DecodingMode::Auto().isExternalDraftTokens());
static_assert(!DecodingMode::Auto().isEagle());
static_assert(DecodingMode::TopK().isTopK());
static_assert(DecodingMode::TopK().isTopKorTopP());
@ -747,6 +768,7 @@ static_assert(!DecodingMode::TopK().isMedusa());
static_assert(!DecodingMode::TopK().isLookahead());
static_assert(!DecodingMode::TopK().isExplicitDraftTokens());
static_assert(!DecodingMode::TopK().isExternalDraftTokens());
static_assert(!DecodingMode::TopK().isEagle());
static_assert(DecodingMode::TopP().isTopP());
static_assert(DecodingMode::TopP().isTopKorTopP());
@ -760,7 +782,7 @@ static_assert(!DecodingMode::TopP().isBeamSearch());
static_assert(!DecodingMode::TopP().isMedusa());
static_assert(!DecodingMode::TopP().isLookahead());
static_assert(!DecodingMode::TopP().isExplicitDraftTokens());
static_assert(!DecodingMode::TopP().isExternalDraftTokens());
static_assert(!DecodingMode::TopP().isEagle());
static_assert(DecodingMode::TopKTopP().isTopK());
static_assert(DecodingMode::TopKTopP().isTopP());
@ -775,6 +797,7 @@ static_assert(!DecodingMode::TopKTopP().isMedusa());
static_assert(!DecodingMode::TopKTopP().isLookahead());
static_assert(!DecodingMode::TopKTopP().isExplicitDraftTokens());
static_assert(!DecodingMode::TopKTopP().isExternalDraftTokens());
static_assert(!DecodingMode::TopKTopP().isEagle());
static_assert(DecodingMode::BeamSearch().isBeamSearch());
static_assert(DecodingMode::BeamSearch().isUseStopCriteria());
@ -784,6 +807,7 @@ static_assert(!DecodingMode::BeamSearch().isMedusa());
static_assert(!DecodingMode::BeamSearch().isLookahead());
static_assert(!DecodingMode::BeamSearch().isExplicitDraftTokens());
static_assert(!DecodingMode::BeamSearch().isExternalDraftTokens());
static_assert(!DecodingMode::BeamSearch().isEagle());
static_assert(!DecodingMode::Medusa().isAuto());
static_assert(!DecodingMode::Medusa().isTopK());
@ -800,6 +824,7 @@ static_assert(DecodingMode::Medusa().isUsePenalty());
static_assert(DecodingMode::Medusa().isUseMinLength());
static_assert(DecodingMode::Medusa().isMedusa());
static_assert(!DecodingMode::Medusa().isExternalDraftTokens());
static_assert(!DecodingMode::Medusa().isEagle());
static_assert(!DecodingMode::Lookahead().isAuto());
static_assert(!DecodingMode::Lookahead().isTopK());
@ -814,6 +839,7 @@ static_assert(DecodingMode::Lookahead().isUseStopWords());
static_assert(DecodingMode::Lookahead().isUseExplicitEosStop());
static_assert(DecodingMode::Lookahead().isLookahead());
static_assert(!DecodingMode::Lookahead().isExternalDraftTokens());
static_assert(!DecodingMode::Lookahead().isEagle());
static_assert(!DecodingMode::ExplicitDraftTokens().isAuto());
static_assert(!DecodingMode::ExplicitDraftTokens().isTopK());
@ -828,6 +854,7 @@ static_assert(DecodingMode::ExplicitDraftTokens().isUseStopCriteria());
static_assert(!DecodingMode::ExplicitDraftTokens().isUseBanWords());
static_assert(DecodingMode::ExplicitDraftTokens().isExplicitDraftTokens());
static_assert(!DecodingMode::ExplicitDraftTokens().isExternalDraftTokens());
static_assert(!DecodingMode::ExplicitDraftTokens().isEagle());
static_assert(!DecodingMode::ExternalDraftTokens().isTopK());
static_assert(!DecodingMode::ExternalDraftTokens().isTopP());
@ -841,5 +868,21 @@ static_assert(!DecodingMode::ExternalDraftTokens().isBeamSearch());
static_assert(!DecodingMode::ExternalDraftTokens().isMedusa());
static_assert(!DecodingMode::ExternalDraftTokens().isLookahead());
static_assert(!DecodingMode::ExternalDraftTokens().isExplicitDraftTokens());
static_assert(!DecodingMode::ExternalDraftTokens().isEagle());
static_assert(DecodingMode::ExternalDraftTokens().isExternalDraftTokens());
static_assert(!DecodingMode::Eagle().isTopK());
static_assert(!DecodingMode::Eagle().isTopP());
static_assert(!DecodingMode::Eagle().isTopKorTopP());
static_assert(!DecodingMode::Eagle().isTopKandTopP());
static_assert(!DecodingMode::Eagle().isUseBanWords());
static_assert(!DecodingMode::Eagle().isUseOccurrencePenalty());
static_assert(DecodingMode::Eagle().isUseStopCriteria());
static_assert(!DecodingMode::Eagle().isAuto());
static_assert(!DecodingMode::Eagle().isBeamSearch());
static_assert(!DecodingMode::Eagle().isMedusa());
static_assert(!DecodingMode::Eagle().isLookahead());
static_assert(!DecodingMode::Eagle().isExplicitDraftTokens());
static_assert(!DecodingMode::Eagle().isExternalDraftTokens());
static_assert(DecodingMode::Eagle().isEagle());
} // namespace tensorrt_llm::executor
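
An illustrative sketch (not library code) of branching on the new mode with the accessors asserted above; the include path is assumed.

#include "tensorrt_llm/executor/types.h" // include path assumed

void selectDecodingPath(tensorrt_llm::executor::DecodingMode const& mode)
{
    if (mode.isEagle())
    {
        // set up EagleBuffers and dispatch the Eagle speculative decoding path
    }
    else if (mode.isExplicitDraftTokens())
    {
        // existing explicit-draft-tokens path
    }
    else
    {
        // standard sampling / beam search paths
    }
}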

View File

@ -18,6 +18,7 @@
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/lookaheadBuffers.h"
@ -121,6 +122,8 @@ public:
std::optional<ExplicitDraftTokensBuffers::Inputs> explicitDraftTokensBuffers;
std::optional<LookaheadDecodingBuffers> lookaheadOutputs;
std::optional<EagleBuffers::Inputs> eagleBuffers;
};
} // namespace tensorrt_llm::runtime

View File

@ -0,0 +1,134 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/executor/executor.h"
#include "tensorrt_llm/runtime/eagleModule.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/modelConfig.h"
#include "tensorrt_llm/runtime/tllmRuntime.h"
#include "tensorrt_llm/runtime/worldConfig.h"
#include <cstddef>
namespace tensorrt_llm::runtime
{
class EagleBuffers
{
public:
using SizeType32 = runtime::SizeType32;
using ITensor = runtime::ITensor;
using BufferPtr = runtime::IBuffer::SharedPtr;
using TensorPtr = runtime::ITensor::SharedPtr;
using TensorMap = runtime::StringPtrMap<runtime::ITensor>;
// This data structure is used both for runtime buffers that hold per-request runtime state (shape starts with
// maxBatchSize) and for engine inputs (shape starts with numSequences).
class Inputs
{
public:
//! [maxBatchSize] or [numSequences]
TensorPtr temperatures;
//! [maxBatchSize] or [numSequences]
TensorPtr randomDataSample;
//! [maxBatchSize, maxNumPaths, maxPathDraftLen] or [numSequences, maxNumPaths, maxPathDraftLen]
TensorPtr randomDataValidation;
//! [maxBatchSize, maxDecodingDraftTokens] or [numSequences, maxDecodingDraftTokens]
TensorPtr draftTokens;
//! [maxBatchSize] or [numSequences]
TensorPtr draftLens;
//! [maxBatchSize, maxNumPaths, maxPathLen]
//! or [numSequences, maxNumPaths, maxPathLen]
TensorPtr draftPaths;
//! [maxBatchSize] or [numGenSequences]
TensorPtr specDecodingGenerationLengths;
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
//! or [numGenSequences, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
TensorPtr specDecodingPackedMasks;
//! [maxBatchSize] or [numGenSequences]
TensorPtr specDecodingPositionOffsets;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetCtxRequestTypesHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetCtxContextLengthsHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetCtxPastKeyValueLengthsHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetGenRequestTypesHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetGenContextLengthsHost;
//! [maxBatchSize] or [numSequences]
TensorPtr eagleNetGenPastKeyValueLengthsHost;
void create(SizeType32 maxNumSequences, runtime::TllmRuntime const& runtime,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig);
};
Inputs engineInputs;
class EngineOutputs
{
public:
//! [batchSize, maxDecodingDraftTokens]
TensorPtr nextDraftTokens;
//! [batchSize]
TensorPtr nextDraftLens;
//! [batchSize, maxNumPaths, maxPathLen]
TensorPtr nextDraftPaths;
//! [batchSize, maxPathLen]
TensorPtr acceptedTokens;
//! [batchSize]
TensorPtr acceptedLens;
//! [batchSize]
TensorPtr acceptedPaths;
} engineOutputs;
public:
EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime);
void reshape(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig);
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ITensor const& requestTypes,
ITensor const& seqSlots, EagleBuffers::Inputs const& decoderBuffers, ITensor const& contextPositionIds,
runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
runtime::WorldConfig const& worldConfig) const;
void insertInputTensors(
TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& worldConfig) const;
private:
template <typename T>
void setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded,
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
runtime::EagleModule const& eagleModule, runtime::CudaStream const& stream) const;
private:
// helper tensors
std::size_t scanTempStorageBytes{0};
std::size_t reduceTempStorageBytes{0};
BufferPtr scanReduceTempStorage;
TensorPtr cumSumGenerationLengths;
TensorPtr maxGenerationLength;
};
} // namespace tensorrt_llm::runtime

View File

@ -54,6 +54,8 @@ public:
void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) override;
void setupEagle(EagleBuffers::Inputs eagleBuffers) override;
void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) override;
void newBatch(
@ -270,12 +272,18 @@ private:
//! @brief Setups decoder internal tensors for new Explicit draft tokens request
void newRequestExplicitDraftTokens(SizeType32 batchIdx, decoder_batch::Request const& request);
//! @brief Setups decoder internal tensors for new Eagle request
void newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const& request);
//! @brief Updates finished state on host for all active requests
void updateFinished(decoder_batch::DecoderFinishedEvent const& decoderFinishEvent);
//! @brief Sets inputs for explicit draft tokens.
void setExplicitDraftTokensInputs(decoder_batch::Input const& input);
//! @brief Sets inputs for eagle decoding.
void setEagleInputs(decoder_batch::Input const& input);
//! @brief Calls decoders for tokens per engine step
void forwardDispatch(decoder_batch::Output& output, decoder_batch::Input const& input, ForwardType forwardType);

View File

@ -18,6 +18,7 @@
#include "tensorrt_llm/runtime/cudaEvent.h"
#include "tensorrt_llm/runtime/cudaStream.h"
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/runtime/explicitDraftTokensBuffers.h"
#include "tensorrt_llm/runtime/iStatefulGptDecoder.h"
#include "tensorrt_llm/runtime/iTensor.h"
@ -101,6 +102,9 @@ public:
//! @brief Setup buffers for ExplicitDraftTokens decoding.
virtual void setupExplicitDraftTokens(ExplicitDraftTokensBuffers::Inputs explicitDraftTokensBuffers) = 0;
//! @brief Setup buffers for Eagle decoding.
virtual void setupEagle(EagleBuffers::Inputs eagleBuffers) = 0;
//! @brief Setup buffers for Lookahead decoding.
virtual void setupLookahead(LookaheadDecodingBuffers lookaheadDecodingBuffers) = 0;

View File

@ -227,6 +227,12 @@ public:
mNumKvHeadsPerAttentionLayer = std::vector<SizeType32>(mNbAttentionLayers, nbKvHeads);
}
// set the number of kv heads for all cross attention layers
void setNbCrossKvHeads(SizeType32 nbKvHeads)
{
mNumKvHeadsPerCrossAttentionLayer = std::vector<SizeType32>(mNbAttentionLayers, nbKvHeads);
}
[[nodiscard]] SizeType32 constexpr getHiddenSize() const noexcept
{
return mHiddenSize;
@ -712,13 +718,18 @@ public:
}
[[nodiscard]] std::pair<std::vector<SizeType32>::const_iterator, std::vector<SizeType32>::const_iterator>
getNumKvHeadsPerLayerLocalRange(SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
getNumKvHeadsPerLayerLocalRange(
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
{
TLLM_LOG_TRACE("%s start: %d", __PRETTY_FUNCTION__);
TLLM_CHECK_WITH_INFO(pipelineParallelism > 0, "Invalid pipelineParallelism: %d", pipelineParallelism);
// count number of previous non-local attention layers
auto const numPrevAttnLayers
= countLowerRankLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank);
auto const firstLocalAttentionLayerIt = mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers;
auto const firstLocalAttentionLayerIt = isCrossAttention
? mNumKvHeadsPerCrossAttentionLayer.cbegin()
: mNumKvHeadsPerAttentionLayer.cbegin() + numPrevAttnLayers;
auto const numLocalAttentionLayers
= countLocalLayers(LayerType::kATTENTION, pipelineParallelism, pipelineParallelismRank);
return std::make_pair(firstLocalAttentionLayerIt, firstLocalAttentionLayerIt + numLocalAttentionLayers);
@ -732,10 +743,19 @@ public:
mNumKvHeadsPerAttentionLayer = headsPerLayer;
}
[[nodiscard]] SizeType32 getSumLocalKvHeads(
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0) const
void setNumKvHeadsPerCrossLayer(std::vector<SizeType32> const& headsPerLayer)
{
auto [cbegin, cend] = getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank);
auto const numElems = static_cast<SizeType32>(headsPerLayer.size());
TLLM_CHECK_WITH_INFO(numElems == mNbAttentionLayers,
"Length of head_per_layer (%d) must match number of attention layers (%d)", numElems, mNbAttentionLayers);
mNumKvHeadsPerCrossAttentionLayer = headsPerLayer;
}
[[nodiscard]] SizeType32 getSumLocalKvHeads(
SizeType32 pipelineParallelism = 1, SizeType32 pipelineParallelismRank = 0, bool isCrossAttention = false) const
{
auto [cbegin, cend]
= getNumKvHeadsPerLayerLocalRange(pipelineParallelism, pipelineParallelismRank, isCrossAttention);
auto const sumLocalHeads = std::reduce(cbegin, cend);
return sumLocalHeads;
}
@ -800,6 +820,7 @@ private:
ManageWeightsType mManageWeightsType;
std::string mModelName;
std::vector<SizeType32> mNumKvHeadsPerAttentionLayer;
std::vector<SizeType32> mNumKvHeadsPerCrossAttentionLayer;
};
} // namespace tensorrt_llm::runtime

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9cbefbfec3c95fba9ffae469c63e29594488a2419bfd5dfd3abbd234da7e7eed
size 5203208
oid sha256:e7cbfb1f4f7a74411bb349b54add6c2f0b645674835dacce89b39ea9fcff53b7
size 5307422

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2a5430463d420db41a6024a3d7a16d06a607f0f48c991066fbaa056b756f1f06
size 5318774
oid sha256:e52f067f1a5127c512eb0fc298d6608644f3cde5079dd78b6fc6258be353afbe
size 5422318

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:569c6009be0bf4b404d752468c771a551480d3785ac73678f9e393df2a05ba93
size 1982856
oid sha256:941a0dcdf53fe2952baf73720ae3afe8556e629f1873ecf5e0a4235ef074a0ed
size 1993520

View File

@ -1,2 +1,2 @@
68b4fbc2bb6ed68a051b11a006b8984f libtensorrt_llm_ucx_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
93eb7c6098fe8327cd90ce1cd564e9a9 libtensorrt_llm_ucx_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b382d9cbfc7b4a4dd55b44c87d8dc196726e94d0a0e74c11aa9e2673e7445d3b
size 5053722
oid sha256:f16aa394aaeb2fd6e7f61cf1e171fac868ee1857b9df582d640c1bb03574c942
size 5152714

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:385cd17e249762358be090d392087f299462ab478e0581b3a5d29d1f7d12d769
size 5021342
oid sha256:7962f37b09d99eec224b5c7a96c4fb895c983b35cb2abb00cf498fb671c183c4
size 5121480

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4b143227c742854cd898d1460b5fe6bc1088d38532947c775dfaa27361040909
oid sha256:5446aea5178d0d0a7b4f8be4a750065b3bfcdd35f91730c5bcb2dff079fbe287
size 11648

View File

@ -1,2 +1,2 @@
9f28106626687e336dba9e9ce51b6353 libtensorrt_llm_ucx_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
1fe3fbf1da6fb9689237743d902f59e7 libtensorrt_llm_ucx_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:fefbf87e36ee6b652bb52df28f06b47edbb64f16af5ac4497318d3af23dc6e84
size 32610670
oid sha256:d3fc5de5061f30e7875f0d86bfecfe1c42f98319f7a6a4339b3027bfe39e9eb5
size 33892906

View File

@ -1,2 +1,2 @@
e82c837214de4ebf6941beced3d528e1 tensorrt_llm_batch_manager_static.lib
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
19f7f28149bccf535e56ba0bc96a5954 tensorrt_llm_batch_manager_static.lib
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -157,4 +157,42 @@ bool getEnvEnablePDL()
return enablePDL;
}
bool getEnvUseUCXKvCache()
{
static bool init = false;
static bool useUCXKVCache = false;
if (!init)
{
init = true;
{
char const* use_ucx_kv_cache = std::getenv("TRTLLM_USE_UCX_KVCACHE");
if (use_ucx_kv_cache)
{
if (use_ucx_kv_cache[0] == '1' && use_ucx_kv_cache[1] == '\0')
{
useUCXKVCache = true;
}
}
}
}
return useUCXKVCache;
}
std::string getEnvUCXInterface()
{
static bool init = false;
static std::string ucxInterface;
if (!init)
{
init = true;
{
char const* ucx_interface = std::getenv("TRTLLM_UCX_INTERFACE");
if (ucx_interface)
{
ucxInterface = ucx_interface;
}
}
}
return ucxInterface;
}
} // namespace tensorrt_llm::common
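Both helpers read their environment variable once and cache the result, and only the exact value "1" enables the UCX KV cache. A minimal standalone sketch of the same lookups (the variable names TRTLLM_USE_UCX_KVCACHE and TRTLLM_UCX_INTERFACE come from the code above; everything else is illustrative):
#include <cstdio>
#include <cstdlib>
#include <string>
int main()
{
    // Mirrors getEnvUseUCXKvCache: only the exact value "1" turns the feature on.
    char const* useUcx = std::getenv("TRTLLM_USE_UCX_KVCACHE");
    bool const useUcxKvCache = useUcx != nullptr && std::string(useUcx) == "1";
    // Mirrors getEnvUCXInterface: empty string when the variable is unset.
    char const* iface = std::getenv("TRTLLM_UCX_INTERFACE");
    std::string const ucxInterface = iface != nullptr ? iface : "";
    std::printf("UCX KV cache: %s, interface: '%s'\n", useUcxKvCache ? "on" : "off", ucxInterface.c_str());
    return 0;
}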

View File

@ -18,6 +18,7 @@
#pragma once
#include <cstdint>
#include <optional>
#include <string>
namespace tensorrt_llm::common
{
@ -40,4 +41,8 @@ int getEnvMmhaKernelBlockSize();
// Whether PDL is enabled.
bool getEnvEnablePDL();
bool getEnvUseUCXKvCache();
std::string getEnvUCXInterface();
} // namespace tensorrt_llm::common

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a88c97950343f3bb6e630c032858099c118a24b086aa97b4d0aefd3f50439497
size 3382878
oid sha256:7c39ae574064fae76e804f9ca65321c00a3ee1400ed84ed5e5ab4098c335acfa
size 2277310

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aefeea1ece28c2a9e5e87150b2c0dbbfa88333470ebaf424dc8ea1f5ede69a07
size 3408900
oid sha256:45b2680b72e076323e36b36fa4946d9d043136cf05818afe626d3645b8bf1896
size 2308950

View File

@ -1,3 +1,3 @@
e69c2223ff214d2339b7466115346146 libtensorrt_llm_executor_static.a
65deeb71a548a81e57153fa3ad21669c libtensorrt_llm_executor_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
ce1a318e5b898bddb293de47729b4be4 libtensorrt_llm_executor_static.a
9370519464da3b7dfc95c5fcb3c40d13 libtensorrt_llm_executor_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:22b35d0a4dc2de4c25873d162a5b1edf2d3d16fcbef413aee5a80bbe69db2e37
size 7880014
oid sha256:9fcdfc9bc55d856e532cf351145ca98e4b351c41452fdeec8b4d880a592316e2
size 3356696

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ebf47f01dd8235116ac28beea016600bffb2fa36cf3b568ebfb2da2b91469379
size 7796554
oid sha256:8a8d6e3c6d24decec3b9b352a36d7d898d32bfadf3a0645424bd8ae426b04c4f
size 3278590

View File

@ -1,3 +1,3 @@
4c2d8fce7081ac6148b13717636be5ff libtensorrt_llm_executor_static.a
3341fe285c45cd3b86ea512099bda6a0 libtensorrt_llm_executor_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
b56fd65be76f98bca18ef4e6e1a2744a libtensorrt_llm_executor_static.a
9f7fc03c3b2a9e7690663adb23a4f036 libtensorrt_llm_executor_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:69b4f3070af978cb53036e7370f59e0a6902f5aca782a8a0a2bc34246cae0637
size 24707382
oid sha256:bd6c3e401f6fd0636f43337ddcfb2bd5c7d5d21c1ea41d421152b9c2e8fec810
size 21794946

View File

@ -1,2 +1,2 @@
520287b802be61e0d36ba8b1df8f06a0 tensorrt_llm_executor_static.lib
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
054197bdbdeace4f818ca8c1bc1f7308 tensorrt_llm_executor_static.lib
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1565,10 +1565,17 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
if constexpr (DO_CROSS_ATTENTION)
{
auto const k_idx = QK_VEC_SIZE * tidx;
int const inBlockIdx = kvCacheBuffer.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
Tcache* k_cache = reinterpret_cast<Tcache*>(kvCacheBuffer.getKBlockPtr(batch_beam_idx, cyclic_tlength));
int const inBlockIdx = pastKCache.getKVLocalIdx(cyclic_tlength, hi_kv, Dh, k_idx);
Tcache* k_cache = reinterpret_cast<Tcache*>(pastKCache.getKBlockPtr(batch_beam_idx, cyclic_tlength));
k = vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<Qk_vec_m const*>(&k_cache[inBlockIdx]));
if constexpr (ENABLE_8BITS_K_CACHE)
{
load_8bits_kv_cache_vec(&k, k_cache, inBlockIdx, k_scale_quant_orig_f);
}
else
{
k = vec_conversion<Qk_vec_k, Qk_vec_m>(*reinterpret_cast<Qk_vec_m const*>(&k_cache[inBlockIdx]));
}
}
else
{
@ -2359,7 +2366,14 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
V_vec_k v;
if (DO_CROSS_ATTENTION)
{
v = vec_conversion<V_vec_k, V_vec_k>(*reinterpret_cast<V_vec_k const*>(&v_cache_base[inBlockIdx]));
if constexpr (ENABLE_8BITS_KV_CACHE)
{
load_8bits_kv_cache_vec(&v, v_cache_base, inBlockIdx, kv_scale_quant_orig_f);
}
else
{
v = vec_conversion<V_vec_k, V_vec_k>(*reinterpret_cast<V_vec_k const*>(&v_cache_base[inBlockIdx]));
}
}
else
{
@ -2401,7 +2415,7 @@ __global__ void __launch_bounds__(MAX_THEADS_PER_BLOCK, MIN_BLOCKS_PER_SM) maske
// Store the values with bias back to global memory in the cache for V.
//*reinterpret_cast<V_vec_k*>(&v_cache[params.timestep*Dh]) = v;
// For MQA/GQA mode, write only with the first Q head of each group per KV head.
if (hi == (hi_kv * qhead_per_kv))
if (hi == (hi_kv * qhead_per_kv) && !DO_CROSS_ATTENTION)
{
if (ENABLE_8BITS_KV_CACHE)
{

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e73dd3a8859cd67c62ab89a98381028bd20ac9e756f0346bbbaab0fb6c566eb7
size 81578760
oid sha256:1397678ac1cab957f7d272750035ffd88b9e2b3b9d4f132073119d21c288b5da
size 82262624

View File

@ -1,2 +1,2 @@
88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
5ea3eabf1c58887230ba5ebc583e0d3c libtensorrt_llm_nvrtc_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c11e0550552f4cc3568ac11de47079d5c6bd88aeb34ebbd52b39f4f732afbd7d
size 84839528
oid sha256:17481ff01045ac335223451c83943a7f97b5c63ca2ab5da3e71d0909c8f4e68b
size 84578328

View File

@ -1,2 +1,2 @@
95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
270f246f5eccb170a759cda4787216f4 libtensorrt_llm_nvrtc_wrapper.so
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f703220b82bac0368893a9df301a8e756e92546248bf0212e804b92fc6cd593e
oid sha256:c641f66a5f508d75ca8ac1d74b99fd61a8f9b6f96fe9820620520bb0a0beda23
size 1128448

View File

@ -1,3 +1,3 @@
c5f36e093e875c8ea84523fb1566d986 tensorrt_llm_nvrtc_wrapper.lib
54a96b4243b8fbcb0a331986b3bf70b9 tensorrt_llm_nvrtc_wrapper.dll
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
f7224da82447d5396ce952c4edb0b044 tensorrt_llm_nvrtc_wrapper.dll
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -624,18 +624,19 @@ void invokeCopyBeamHypotheses(DecodingOutput::BeamHypotheses const& src, Decodin
copyBeamHypotheses<<<numSMs, 256, 0, stream.get()>>>(copyStruct);
}
__global__ void initializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const nMaxSeqLen)
__global__ void initializeOutput(
TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const beam, SizeType32 const nMaxSeqLen)
{
for (int i = threadIdx.x; i < nMaxSeqLen; i += blockDim.x)
{
finalOutputIds[blockIdx.x * nMaxSeqLen + i] = endIds[blockIdx.x];
finalOutputIds[blockIdx.x * nMaxSeqLen + i] = endIds[blockIdx.x / beam];
}
}
void invokeInitializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const batchBeam,
SizeType32 const nMaxSeqLen, cudaStream_t stream)
void invokeInitializeOutput(TokenIdType* finalOutputIds, TokenIdType const* endIds, SizeType32 const batch,
SizeType32 const beam, SizeType32 const nMaxSeqLen, cudaStream_t stream)
{
initializeOutput<<<batchBeam, 256, 0, stream>>>(finalOutputIds, endIds, nMaxSeqLen);
initializeOutput<<<batch * beam, 256, 0, stream>>>(finalOutputIds, endIds, beam, nMaxSeqLen);
}
__global__ void copyNextStepIds(TokenIdType* nextStepIds, TokenIdType const* const* outputIdsPtr,
@ -742,7 +743,7 @@ void gatherTree(DecodingOutput const& decodingOutput, DecodingInput const& decod
// prefill finalOutputIds with the EOS tokens from decodingInput.endIds
tensorrt_llm::kernels::invokeInitializeOutput(bufferCast<TokenIdType>(finalOutputIds),
bufferCast<TokenIdType>(*decodingInput.endIds), batchSize * beamWidth, maxSeqLength, stream);
bufferCast<TokenIdType>(*decodingInput.endIds), batchSize, beamWidth, maxSeqLength, stream);
sync_check_cuda_error();
std::vector<float> lengthPenaltyVec;
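With the grid now launched as batch * beam blocks, each block fills one (batch, beam) row of finalOutputIds, while endIds holds one entry per request, hence the blockIdx.x / beam lookup. A host-side sketch of the same prefill with hypothetical sizes:
#include <cstdio>
#include <vector>
int main()
{
    int const batch = 2, beam = 3, maxSeqLen = 4;
    std::vector<int> endIds{100, 200}; // one end id per request
    std::vector<int> finalOutputIds(batch * beam * maxSeqLen);
    // Same indexing as initializeOutput: row plays the role of blockIdx.x in [0, batch * beam),
    // and endIds is indexed by row / beam, i.e. one value shared by all beams of a request.
    for (int row = 0; row < batch * beam; ++row)
        for (int i = 0; i < maxSeqLen; ++i)
            finalOutputIds[row * maxSeqLen + i] = endIds[row / beam];
    std::printf("row 4 is prefilled with %d\n", finalOutputIds[4 * maxSeqLen]); // 200
    return 0;
}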

View File

@ -72,7 +72,7 @@ void invokeFinalize(BeamHypotheses& bh, cudaStream_t stream);
//! \param maxSeqLen The maximum sequence length, inferred from the finalOutputIds.shape[3]
//! \param stream The CUDA stream on which to perform the operation.
void invokeInitializeOutput(runtime::TokenIdType* finalOutputIds, runtime::TokenIdType const* endIds,
runtime::SizeType32 batchBeam, runtime::SizeType32 maxSeqLen, cudaStream_t stream);
runtime::SizeType32 batch, runtime::SizeType32 beam, runtime::SizeType32 maxSeqLen, cudaStream_t stream);
//! \brief Copies the data from the buffers in src to dst to reduce the kernel launch overhead of individual memcpy.
//! for streaming + beam search, where we need to avoid overwriting the beam search buffers.

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5081f260dc93b1caf996803253ab27fb290229e914feebfb8bd76b57e4e0a33e
size 25294842
oid sha256:60f54be9c7f7517b76112dcf9185cb8f4b2c69c11aec3b7afaaf86f7d0cc9b70
size 25257850

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2d4225f5c76786ee808df56277cbe9328cd4da1a6265a15fc9c730353cdcdbf
size 25699742
oid sha256:2e2f1a38e7df380dd6048158565ffdc269dea2251078b3c8f2265f8e533337a4
size 25663182

View File

@ -1,3 +1,3 @@
87ec6b3278d52f9e2d8a23a20c38560b libtensorrt_llm_internal_cutlass_kernels_static.a
d8d2c5cb915dcac97f2279166f7eae58 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
74f70e14a884d8e29144595686634296 libtensorrt_llm_internal_cutlass_kernels_static.a
2de60e6d3211395ce4481ffede35c2f4 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ed0568d70c32b592a1a9fbd5c129b7c19ae1eca5fba2b68375f29b75bca32499
size 44104384
oid sha256:68ff5e652a2ec0d295583f4e3a8696df5d3606c2ffde50e4f205b87aea2f6c91
size 44055616

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:135a32f33cec8c10c7d780df3d3f55ea8b9d2b1fe0a553ad6834cff81d42cbaf
size 43491958
oid sha256:b598263e5df63ad110ab82bd2c3ea86cf4760350488122c8d14d908c1b84c045
size 43443190

View File

@ -1,3 +1,3 @@
2bb339277f872a112fc9c373128a7685 libtensorrt_llm_internal_cutlass_kernels_static.a
d212bad6fa0f17d00ee90376a2a250c0 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
6c4a716c6aefe9a68501d4bd6901cc95 libtensorrt_llm_internal_cutlass_kernels_static.a
0134143d17d8ff5b7326235f85f664f5 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2de729c93894e86ad9cae674a567c02677acb90bb6c970f2ce5dfcd04c489816
size 88140802
oid sha256:e72bd01eced686736f3e96d445e80964b83da77067c0c622f512630f625c2f40
size 88140804

View File

@ -1,2 +1,2 @@
8bf62183399a59548962a18ab2dec352 tensorrt_llm_internal_cutlass_kernels_static.lib
d6d4b53bc917674b8da11efd8ff00d12b7e3fcc4 commit
d908f9e8ed1928239467a78b30ae7b35 tensorrt_llm_internal_cutlass_kernels_static.lib
082eab3f3ff083d7d2fcc886fa9854580de4eee3 commit

View File

@ -127,12 +127,10 @@ size_t LoraImpl::getWorkspaceSize(
{
TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
auto const typeSize = tensorrt_llm::common::getDTypeSize(type);
TLLM_CHECK_WITH_INFO(
numTokens >= numReqs, fmtstr("num tokens %ld should be greater than num reqs %ld", numTokens, numReqs));
return (size_t) getGemmWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, typeSize)
+ getGemmParamsWorkSpaceSize(numReqs * mNumLoraModules);
+ getGemmParamsWorkSpaceSize(std::min(numReqs, numTokens) * mNumLoraModules);
}
void LoraImpl::setBestTactic(std::optional<Config> config)
@ -163,7 +161,7 @@ int LoraImpl::run(int64_t numTokens, int64_t numReqs, void const* input, int32_t
setGemmConfig();
int64_t GemmWorkSpaceSize = getGemmWorkSpaceSize(numTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices);
int64_t groupGemmParamsWorkSpaceSize = getGemmParamsWorkSpaceSize(numReqs * mNumLoraModules);
int64_t groupGemmParamsWorkSpaceSize = getGemmParamsWorkSpaceSize(std::min(numReqs, numTokens) * mNumLoraModules);
void* gemmWorkSpace = workspace; // [gemmWorkSpace, lowrankWorkSpace, groupGemmParamsWorkSpace]
void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + GemmWorkSpaceSize;
void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace)
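The workspace passed to run() is carved into three consecutive regions in the order given by the comment above, with the grouped-GEMM parameter region now sized by min(numReqs, numTokens). A standalone sketch of that partitioning with hypothetical byte sizes (the 256-byte per-entry figure is illustrative only):
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>
int main()
{
    int64_t const numTokens = 64, numReqs = 96; // hypothetical counts
    size_t const gemmWorkSpaceSize = 4096;      // hypothetical region sizes in bytes
    size_t const lowRankWorkSpaceSize = 2048;
    size_t const gemmParamsWorkSpaceSize = 256 * static_cast<size_t>(std::min(numReqs, numTokens));
    std::vector<char> workspace(gemmWorkSpaceSize + lowRankWorkSpaceSize + gemmParamsWorkSpaceSize);
    // Layout: [gemmWorkSpace | lowRankWorkSpace | groupGemmParamsWorkSpace]
    void* gemmWorkSpace = workspace.data();
    void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + gemmWorkSpaceSize;
    void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace) + lowRankWorkSpaceSize;
    std::printf("total: %zu bytes, params region at offset %td\n", workspace.size(),
        static_cast<char*>(groupGemmParamsWorkSpace) - workspace.data());
    return 0;
}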

View File

@ -1033,4 +1033,104 @@ void invokeCopyOutputTokensIds(runtime::TokenIdType** tmpOutputIdsPtrs, runtime:
batchSize, numInputLogits, maxDecodingDraftTokens);
}
namespace
{
__global__ void packEagleGenerationLengths(PackEagleParams params)
{
auto const batchIdx = static_cast<SizeType32>(blockIdx.x);
auto const batchSlot = params.batchSlots[batchIdx];
auto const isGenerationRequest = batchIdx >= params.numContextRequests;
auto const genIdx = batchIdx - params.numContextRequests;
if (threadIdx.x == 0 && isGenerationRequest)
{
params.outputSpecDecodingGenerationLengths[genIdx] = params.inputNextDraftLens[batchSlot];
}
}
__global__ void packEagleTensors(PackEagleParams params)
{
auto const batchIdx = static_cast<SizeType32>(blockIdx.x);
auto const batchSlot = params.batchSlots[batchIdx];
auto const isGenerationRequest = batchIdx >= params.numContextRequests;
auto const genIdx = batchIdx - params.numContextRequests;
// Copy data that is 1 elem per request
if (threadIdx.x == 0)
{
params.outputRandomDataSample[batchIdx] = params.inputRandomDataSample[batchSlot];
params.outputTemperatures[batchIdx] = params.inputTemperatures[batchSlot];
// FIXME we need 1 value per draft token
params.outputRandomDataValidation[batchIdx] = params.inputRandomDataValidation[batchSlot];
// 0 for ctx request and actual draft len for gen requests.
params.outputNextDraftLens[batchIdx] = isGenerationRequest ? params.inputNextDraftLens[batchSlot] : 0;
}
// Copy draft paths
auto const numPathElts = params.maxNumPaths * params.maxPathLength;
auto outputNextDraftPaths = params.outputNextDraftPaths + batchIdx * numPathElts;
auto const inputNextDraftPaths = params.inputNextDraftPaths + batchSlot * numPathElts;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < numPathElts; ti += static_cast<SizeType32>(blockDim.x))
{
outputNextDraftPaths[ti] = inputNextDraftPaths[ti];
}
if (isGenerationRequest)
{
// Copy draft tokens. We do it only for gen requests as for ctx requests outputNextDraftLens is 0.
auto const maxDecodingDraftTokens = params.maxDecodingTokens - 1;
auto outputNextDraftTokens = params.outputNextDraftTokens + batchIdx * maxDecodingDraftTokens;
auto const inputNextDraftTokens = params.inputNextDraftTokens + batchSlot * maxDecodingDraftTokens;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < maxDecodingDraftTokens;
ti += static_cast<SizeType32>(blockDim.x))
{
outputNextDraftTokens[ti] = inputNextDraftTokens[ti];
}
auto const maxGenerationLength = params.maxGenerationLength[0];
auto const numPackedMasks = divUp(params.maxDecodingTokens, 32);
auto const outputStartId = (genIdx == 0) ? 0 : params.cumSumGenerationLengths[genIdx - 1];
auto const numTokens = (genIdx == 0)
? params.cumSumGenerationLengths[0]
: params.cumSumGenerationLengths[genIdx] - params.cumSumGenerationLengths[genIdx - 1];
// Copy packed masks.
// Masks are placed next to each other with offsets of cumSumGenerationLengths[bi-1]
auto const inputPackedMask
= params.inputSpecDecodingPackedMasks + batchSlot * numPackedMasks * params.maxDecodingTokens;
auto outputPackedMask = params.outputSpecDecodingPackedMasks + outputStartId * numPackedMasks;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < numTokens * numPackedMasks;
ti += static_cast<SizeType32>(blockDim.x))
{
outputPackedMask[ti] = inputPackedMask[ti];
}
// Copy position offsets. Only the first maxGenerationLength entries are copied.
auto const inputPositionOffsets
= params.inputSpecDecodingPositionOffsets + batchSlot * params.maxDecodingTokens;
auto outputPositionOffsets = params.outputSpecDecodingPositionOffsets + genIdx * maxGenerationLength;
for (auto ti = static_cast<SizeType32>(threadIdx.x); ti < maxGenerationLength;
ti += static_cast<SizeType32>(blockDim.x))
{
outputPositionOffsets[ti] = inputPositionOffsets[ti];
}
}
}
} // namespace
void invokePackEagleGenerationLengths(PackEagleParams const& params, cudaStream_t stream)
{
SizeType32 constexpr BLOCK_SIZE = 32;
packEagleGenerationLengths<<<params.batchSize, BLOCK_SIZE, 0, stream>>>(params);
}
void invokePackEagle(PackEagleParams const& params, cudaStream_t stream)
{
SizeType32 constexpr BLOCK_SIZE = 128;
packEagleTensors<<<params.batchSize, BLOCK_SIZE, 0, stream>>>(params);
}
} // namespace tensorrt_llm::kernels::speculative_decoding
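Both kernels perform the same gather: output position i is filled from input position batchSlots[i], so per-request tensors stored at scattered batch-slot indices become contiguous for the engine. A host-side sketch of that gather with hypothetical slots and temperatures:
#include <cstddef>
#include <cstdio>
#include <vector>
int main()
{
    // Hypothetical mapping: request i of this batch lives at slot batchSlots[i].
    std::vector<int> batchSlots{5, 0, 3};
    std::vector<float> inputTemperatures{0.7f, 0.f, 0.f, 1.0f, 0.f, 0.9f}; // indexed by slot
    std::vector<float> outputTemperatures(batchSlots.size());
    // Same gather as packEagleTensors: out[i] = in[batchSlots[i]].
    for (std::size_t i = 0; i < batchSlots.size(); ++i)
        outputTemperatures[i] = inputTemperatures[batchSlots[i]];
    for (float t : outputTemperatures)
        std::printf("%.1f ", t); // prints 0.9 0.7 1.0
    std::printf("\n");
    return 0;
}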

View File

@ -295,4 +295,103 @@ struct PrepareGenEagleNetInputsParams
//! \brief Prepares inputs for the gen stage EagleNet iteration (layerIdx > 0).
//! For input/output examples visit test/model/eagle/test_prepare_drafter_inputs_plugin.py (gen Eagle Net examples)
void invokePrepareGenEagleNetInputs(PrepareGenEagleNetInputsParams const& params);
struct PackEagleParams
{
runtime::SizeType32 batchSize{0};
runtime::SizeType32 maxNumPaths{0};
runtime::SizeType32 maxDecodingTokens{0};
runtime::SizeType32 maxPathLength{0};
runtime::SizeType32 numContextRequests{0};
runtime::SizeType32 numGenerationRequests{0};
//! inputs
//! [batchSize]
runtime::SizeType32 const* batchSlots{nullptr};
//! [maxBatchSize]
float const* inputTemperatures{nullptr};
//! [maxBatchSize]
float const* inputRandomDataSample{nullptr};
//! [maxBatchSize]
float const* inputRandomDataValidation{nullptr};
//! [maxBatchSize, maxDecodingDraftTokens]
runtime::TokenIdType const* inputNextDraftTokens{nullptr};
//! [maxBatchSize]
runtime::SizeType32 const* inputNextDraftLens{nullptr};
//! [maxBatchSize, maxDecodingTokens, maxPathLen]
runtime::SizeType32 const* inputNextDraftPaths{nullptr};
//! [maxBatchSize]
runtime::SizeType32 const* inputSpecDecodingGenerationLengths{nullptr};
//! [maxBatchSize]
runtime::SizeType32 const* inputSpecDecodingPositionOffsets{nullptr};
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
int32_t const* inputSpecDecodingPackedMasks{nullptr};
//! outputs
//! [batchSize]
float* outputTemperatures{nullptr};
//! [batchSize]
float* outputRandomDataSample{nullptr};
//! [batchSize]
float* outputRandomDataValidation{nullptr};
//! [batchSize, maxDecodingDraftTokens]
runtime::TokenIdType* outputNextDraftTokens{nullptr};
//! [batchSize]
runtime::SizeType32* outputNextDraftLens{nullptr};
//! [batchSize, maxDecodingTokens, maxPathLen]
runtime::SizeType32* outputNextDraftPaths{nullptr};
//! [batchSize]
runtime::SizeType32* outputSpecDecodingGenerationLengths{nullptr};
//! [batchSize]
runtime::SizeType32* outputSpecDecodingPositionOffsets{nullptr};
//! [maxBatchSize, maxDecodingTokens, ceil(maxDecodingTokens / 32)]
int32_t* outputSpecDecodingPackedMasks{nullptr};
// workspace
//! [1]
runtime::SizeType32* maxGenerationLength{nullptr};
//! [batchSize + 1]
runtime::SizeType32* cumSumGenerationLengths{nullptr};
void checkParams()
{
TLLM_CHECK(batchSlots);
TLLM_CHECK(inputTemperatures);
TLLM_CHECK(inputRandomDataSample);
TLLM_CHECK(inputRandomDataValidation);
TLLM_CHECK(inputNextDraftTokens);
TLLM_CHECK(inputNextDraftLens);
TLLM_CHECK(inputNextDraftPaths);
TLLM_CHECK(inputSpecDecodingGenerationLengths);
TLLM_CHECK(inputSpecDecodingPositionOffsets);
TLLM_CHECK(inputSpecDecodingPackedMasks);
TLLM_CHECK(outputTemperatures);
TLLM_CHECK(outputRandomDataSample);
TLLM_CHECK(outputRandomDataValidation);
TLLM_CHECK(outputNextDraftTokens);
TLLM_CHECK(outputNextDraftLens);
TLLM_CHECK(outputNextDraftPaths);
TLLM_CHECK(outputSpecDecodingGenerationLengths);
TLLM_CHECK(outputSpecDecodingPositionOffsets);
TLLM_CHECK(outputSpecDecodingPackedMasks);
TLLM_CHECK(maxGenerationLength);
TLLM_CHECK(cumSumGenerationLengths);
TLLM_CHECK(batchSize > 0);
TLLM_CHECK(batchSize == numContextRequests + numGenerationRequests);
TLLM_CHECK(maxDecodingTokens > 0);
TLLM_CHECK(maxPathLength > 0);
TLLM_CHECK(maxNumPaths > 0);
}
};
//! \brief Packs outputSpecDecodingGenerationLengths from batch-slot positions into contiguous memory.
void invokePackEagleGenerationLengths(PackEagleParams const& params, cudaStream_t stream);
//! \brief Packs the rest of the output tensors from batch-slot positions into contiguous memory.
void invokePackEagle(PackEagleParams const& params, cudaStream_t stream);
} // namespace tensorrt_llm::kernels::speculative_decoding

View File

@ -1063,7 +1063,7 @@ void kernelV2DispatchHeadSize(QKVPreprocessingParams<T, KVCacheBuffer> params, c
///////////////////////////////////////////////////////////////////////////////////////////////////
template <typename T, typename TCache, typename KVCacheBuffer, int BLOCK_SIZE, int Dh>
template <typename T, typename TCache, typename KVCacheBuffer, int BLOCK_SIZE, int Dh, bool FP8_OUTPUT>
__global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCacheBuffer> params)
{
// For cross-attention,
@ -1121,6 +1121,14 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
int const src_k_offset = params.q_hidden_size;
int const src_v_offset = src_k_offset + params.kv_hidden_size;
// Cast float scale to dst data type.
using TScale = typename mmha::kv_cache_scale_type_t<T, TCache>::Type;
[[maybe_unused]] TScale scale_orig_quant;
if constexpr (sizeof(TCache) == 1 || FP8_OUTPUT)
{
mmha::convert_from_float(&scale_orig_quant, params.kvScaleOrigQuant ? params.kvScaleOrigQuant[0] : 1.0f);
}
// For loop in the sequence length dimension.
// There might be multiple blocks (blockIdx.x) that process the same sequence in order to fully utilize
for (int token_idx = blockIdx.x * TOKENS_PER_BLOCK + (threadIdx.x / VECS_PER_HEAD); token_idx < max_seq_len;
@ -1139,8 +1147,18 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
// Only load Q tokens from decoder qkv input.
auto q = *reinterpret_cast<VecT const*>(params.qkv_input + src_q_idx);
// Store it to a separate q output.
*reinterpret_cast<VecT*>(params.q_output + dst_q_idx) = q;
// Quantize the output to fp8.
if constexpr (FP8_OUTPUT)
{
using OutputType = __nv_fp8_e4m3;
OutputType* quantized_q_ptr = reinterpret_cast<OutputType*>(params.q_output) + dst_q_idx;
mmha::store_8bits_vec(quantized_q_ptr, q, 0, scale_orig_quant);
}
else
{
// Store it to a separate q output.
*reinterpret_cast<VecT*>(params.q_output + dst_q_idx) = q;
}
}
// Encoder tokens (i.e. KV tokens).
@ -1175,10 +1193,6 @@ __global__ void updateKVCacheForCrossAttention(QKVPreprocessingParams<T, KVCache
{
// The element index inside the block.
auto block_elt_idx = block_vec_idx * ELTS_PER_VEC;
// Cast float scale to dst data type.
using TScale = typename mmha::kv_cache_scale_type_t<T, TCache>::Type;
TScale scale_orig_quant;
mmha::convert_from_float(&scale_orig_quant, params.kvScaleOrigQuant[0]);
// Store 8bits kv cache.
mmha::store_8bits_vec(k_cache_block_ptr, k, block_elt_idx, scale_orig_quant);
mmha::store_8bits_vec(v_cache_block_ptr, v, block_elt_idx, scale_orig_quant);
@ -1217,7 +1231,16 @@ void invokeUpdateKvCacheForCrossAttention(QKVPreprocessingParams<T, KVCacheBuffe
dim3 grid(num_seq_blocks, params.head_num, params.batch_size);
// Launch the kernel.
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh><<<grid, block, 0, stream>>>(params);
if (params.quantized_fp8_output)
{
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh, true>
<<<grid, block, 0, stream>>>(params);
}
else
{
updateKVCacheForCrossAttention<T, TCache, KVCacheBuffer, BLOCK_SIZE, Dh, false>
<<<grid, block, 0, stream>>>(params);
}
}
///////////////////////////////////////////////////////////////////////////////////////////////////
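The launcher turns the runtime flag params.quantized_fp8_output into the compile-time FP8_OUTPUT template argument with a plain if/else, so the quantization branch inside the kernel is resolved at compile time. A minimal standalone sketch of that dispatch pattern (the function bodies here are purely illustrative):
#include <cstdio>
// Illustrative stand-in for the kernel: the quantization choice is a compile-time constant.
template <bool FP8_OUTPUT>
void updateCache(float value)
{
    if constexpr (FP8_OUTPUT)
        std::printf("store quantized %.2f\n", value);
    else
        std::printf("store raw %.2f\n", value);
}
// Runtime flag -> compile-time template argument, mirroring the launcher above.
void dispatch(bool quantizedFp8Output, float value)
{
    if (quantizedFp8Output)
        updateCache<true>(value);
    else
        updateCache<false>(value);
}
int main()
{
    dispatch(true, 1.5f);
    dispatch(false, 1.5f);
    return 0;
}
This keeps the per-token inner loop free of runtime branches at the cost of compiling one kernel instantiation per flag value.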

View File

@ -785,7 +785,7 @@ int GPTAttentionPluginCommon::enqueueContext(EnqueueContextParams<T> const& para
= mEnableContextFMHA ? 0 : sizeof(T) * params.batch_size * params.input_seq_length * kv_seq_length;
size_t const cu_seqlens_size = sizeof(int) * (params.batch_size + 1);
size_t const rotary_inv_freq_size = sizeof(float) * params.batch_size * mRotaryEmbeddingDim / 2;
size_t const q_buf_2_size = mFMHARunner->isSeparateQAndKvInput() || !mEnableContextFMHA
size_t const q_buf_2_size = !mEnableContextFMHA || mFMHARunner->isSeparateQAndKvInput()
? sizeof(T) * params.batch_size * params.input_seq_length * local_hidden_units_qo
: 0;
size_t const k_buf_2_size

View File

@ -86,10 +86,10 @@ std::shared_ptr<tb::LlmRequest> LlmRequest::toTrtLlm() const
return std::make_shared<tb::LlmRequest>(mRequestId, mMaxNewTokens,
std::make_shared<std::vector<TokenIdType>>(mTokens.at(0)), mSamplingConfig, mIsStreaming, mEndId, mPadId,
embeddingBias, badWordsList, stopWordsList, mPositionIds, promptEmbeddingTable, mPromptVocabSize, mLoraTaskId,
loraWeights, loraConfig, mLookaheadConfig, returnLogProbs(), mReturnContextLogits, mReturnGenerationLogits,
mDraftTokens, draftLogits, mExcludeInputFromOutput, callbackAdapter(mLogitsPostProcessor),
mApplyLogitsPostProcessorBatched, mEncoderTokens, mReturnEncoderOutput, mClientId, mPriority,
encoderInputFeatures, mEncoderOutputLength, crossAttentionMask,
loraWeights, loraConfig, mLookaheadConfig, mKvCacheRetentionConfig, returnLogProbs(), mReturnContextLogits,
mReturnGenerationLogits, mDraftTokens, draftLogits, mExcludeInputFromOutput,
callbackAdapter(mLogitsPostProcessor), mApplyLogitsPostProcessorBatched, mEncoderTokens, mReturnEncoderOutput,
mClientId, mPriority, encoderInputFeatures, mEncoderOutputLength, crossAttentionMask,
tb::LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, mInputTokenExtraIds, mNumReturnSequences);
}
@ -102,8 +102,8 @@ void LlmRequest::initBindings(py::module_& m)
std::optional<LlmRequest::TensorPtr>, std::optional<std::vector<LlmRequest::SizeType32>>,
std::optional<LlmRequest::TensorPtr>, std::optional<LlmRequest::SizeType32>, std::optional<uint64_t>,
std::optional<LlmRequest::TensorPtr>, std::optional<LlmRequest::TensorPtr>,
std::optional<executor::LookaheadDecodingConfig>, bool, bool, bool,
std::optional<LlmRequest::VecTokens>, std::optional<LlmRequest::TensorPtr>, bool,
std::optional<executor::LookaheadDecodingConfig>, std::optional<executor::KvCacheRetentionConfig>,
bool, bool, bool, std::optional<LlmRequest::VecTokens>, std::optional<LlmRequest::TensorPtr>, bool,
std::optional<LlmRequest::LogitsPostProcessor>, bool, std::optional<LlmRequest::VecTokens>, bool,
std::optional<RequestIdType>, executor::PriorityType, std::optional<LlmRequest::TensorPtr>,
std::optional<LlmRequest::SizeType32>, std::optional<LlmRequest::TensorPtr>,
@ -115,15 +115,15 @@ void LlmRequest::initBindings(py::module_& m)
py::arg("prompt_embedding_table") = std::nullopt, py::arg("prompt_vocab_size") = std::nullopt,
py::arg("lora_task_id") = std::nullopt, py::arg("lora_weights") = std::nullopt,
py::arg("lora_config") = std::nullopt, py::arg("lookahead_config") = std::nullopt,
py::arg("return_log_probs") = false, py::arg("return_context_logits") = false,
py::arg("return_generation_logits") = false, py::arg("draft_tokens") = std::nullopt,
py::arg("draft_logits") = std::nullopt, py::arg("exclude_input_from_output") = false,
py::arg("logits_post_processor") = std::nullopt, py::arg("apply_logits_post_processor_batched") = false,
py::arg("encoder_input_tokens") = std::nullopt, py::arg("return_encoder_output") = false,
py::arg("client_id") = std::nullopt, py::arg("priority") = executor::Request::kDefaultPriority,
py::arg("encoder_input_features") = std::nullopt, py::arg("encoder_output_length") = std::nullopt,
py::arg("cross_attention_mask") = std::nullopt, py::arg("input_token_extra_ids") = std::nullopt,
py::arg("num_return_sequences") = 1)
py::arg("kv_cache_retention_config") = std::nullopt, py::arg("return_log_probs") = false,
py::arg("return_context_logits") = false, py::arg("return_generation_logits") = false,
py::arg("draft_tokens") = std::nullopt, py::arg("draft_logits") = std::nullopt,
py::arg("exclude_input_from_output") = false, py::arg("logits_post_processor") = std::nullopt,
py::arg("apply_logits_post_processor_batched") = false, py::arg("encoder_input_tokens") = std::nullopt,
py::arg("return_encoder_output") = false, py::arg("client_id") = std::nullopt,
py::arg("priority") = executor::Request::kDefaultPriority, py::arg("encoder_input_features") = std::nullopt,
py::arg("encoder_output_length") = std::nullopt, py::arg("cross_attention_mask") = std::nullopt,
py::arg("input_token_extra_ids") = std::nullopt, py::arg("num_return_sequences") = 1)
.def("get_num_tokens", &LlmRequest::getNumTokens, py::arg("beam"))
.def_property_readonly("max_beam_num_tokens", &LlmRequest::getMaxBeamNumTokens)
.def("get_token", &LlmRequest::getToken, py::arg("beam"), py::arg("pos"))

View File

@ -68,8 +68,9 @@ public:
std::optional<SizeType32> promptVocabSize = std::nullopt,
std::optional<LoraTaskIdType> loraTaskId = std::nullopt, std::optional<TensorPtr> loraWeights = std::nullopt,
std::optional<TensorPtr> loraConfig = std::nullopt,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt, bool returnLogProbs = false,
bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<executor::LookaheadDecodingConfig> lookaheadConfig = std::nullopt,
std::optional<executor::KvCacheRetentionConfig> kvCacheRetentionConfig = std::nullopt,
bool returnLogProbs = false, bool returnContextLogits = false, bool returnGenerationLogits = false,
std::optional<VecTokens> draftTokens = std::nullopt, std::optional<TensorPtr> draftLogits = std::nullopt,
bool excludeInputFromOutput = false, std::optional<LogitsPostProcessor> logitsPostProcessor = std::nullopt,
bool applyLogitsPostProcessorBatched = false, std::optional<VecTokens> encoderInputTokens = std::nullopt,
@ -83,8 +84,8 @@ public:
samplingConfig, isStreaming, endId, padId, embeddingBias, badWordsList, stopWordsList,
positionIds.has_value() ? std::make_shared<std::vector<SizeType32>>(std::move(positionIds.value()))
: std::optional<std::shared_ptr<std::vector<SizeType32>>>(std::nullopt),
promptEmbeddingTable, promptVocabSize, loraTaskId, loraWeights, loraConfig, lookaheadConfig, returnLogProbs,
returnContextLogits, returnGenerationLogits,
promptEmbeddingTable, promptVocabSize, loraTaskId, loraWeights, loraConfig, lookaheadConfig,
kvCacheRetentionConfig, returnLogProbs, returnContextLogits, returnGenerationLogits,
draftTokens.has_value() ? std::make_shared<VecTokens>(std::move(draftTokens.value()))
: std::make_shared<VecTokens>(),
draftLogits, excludeInputFromOutput, logitsPostProcessor, applyLogitsPostProcessorBatched,

View File

@ -224,7 +224,9 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
.def_property("compute_generation_logits",
py::overload_cast<>(&tr::ModelConfig::computeGenerationLogits, py::const_),
py::overload_cast<bool>(&tr::ModelConfig::computeGenerationLogits))
.def_property("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant);
.def_property("model_variant", &tr::ModelConfig::getModelVariant, &tr::ModelConfig::setModelVariant)
.def_property(
"use_cross_attention", &tr::ModelConfig::useCrossAttention, &tr::ModelConfig::setUseCrossAttention);
py::class_<tr::WorldConfig>(m, "WorldConfig")
.def(py::init<SizeType32, SizeType32, SizeType32, SizeType32, std::optional<std::vector<SizeType32>> const&>(),

View File

@ -114,7 +114,9 @@ void InitBindings(pybind11::module_& m)
.def_readwrite("tokens_per_block", &tle::KvCacheStats::tokensPerBlock)
.def_readwrite("alloc_total_blocks", &tle::KvCacheStats::allocTotalBlocks)
.def_readwrite("alloc_new_blocks", &tle::KvCacheStats::allocNewBlocks)
.def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks);
.def_readwrite("reused_blocks", &tle::KvCacheStats::reusedBlocks)
.def_readwrite("missed_blocks", &tle::KvCacheStats::missedBlocks)
.def_readwrite("cache_hit_rate", &tle::KvCacheStats::cacheHitRate);
py::class_<tle::StaticBatchingStats>(m, "StaticBatchingStats")
.def(py::init<>())
@ -184,6 +186,8 @@ void InitBindings(pybind11::module_& m)
.def_readwrite("alloc_total_blocks_per_request", &tle::RequestStats::allocTotalBlocksPerRequest)
.def_readwrite("alloc_new_blocks_per_request", &tle::RequestStats::allocNewBlocksPerRequest)
.def_readwrite("reused_blocks_per_request", &tle::RequestStats::reusedBlocksPerRequest)
.def_readwrite("missed_blocks_per_request", &tle::RequestStats::missedBlocksPerRequest)
.def_readwrite("kv_cache_hit_rate_per_request", &tle::RequestStats::kvCacheHitRatePerRequest)
.def("to_json_str",
[](tle::RequestStats const& iterationStats) { return tle::JsonSerialization::toJsonStr(iterationStats); });
@ -307,6 +311,24 @@ void InitBindings(pybind11::module_& m)
.def_property_readonly("max_ngram_size", &tle::LookaheadDecodingConfig::getNgramSize)
.def_property_readonly("max_verification_set_size", &tle::LookaheadDecodingConfig::getVerificationSetSize);
auto kvCacheRetentionConfig
= py::class_<tle::KvCacheRetentionConfig>(m, "KvCacheRetentionConfig")
.def(py::init<std::vector<tle::KvCacheRetentionConfig::TokenRangeRetentionPriority>,
tle::RetentionPriority>(),
py::arg("token_range_retention_priorities"), py::arg("decode_retention_priority"))
.def_property_readonly(
"token_range_retention_priorities", &tle::KvCacheRetentionConfig::getTokenRangeRetentionPriorities)
.def_property_readonly(
"decode_retention_priority", &tle::KvCacheRetentionConfig::getDecodeRetentionPriority);
py::class_<tle::KvCacheRetentionConfig::TokenRangeRetentionPriority>(
kvCacheRetentionConfig, "TokenRangeRetentionPriority")
.def(py::init<SizeType32, std::optional<SizeType32>, tle::RetentionPriority>(), py::arg("token_start"),
py::arg("token_end"), py::arg("priority"))
.def_readwrite("token_start", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::tokenStart)
.def_readwrite("token_end", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::tokenEnd)
.def_readwrite("priority", &tle::KvCacheRetentionConfig::TokenRangeRetentionPriority::priority);
py::class_<tle::ContextPhaseParams>(m, "ContextPhaseParams")
.def(py::init<VecTokens, tle::ContextPhaseParams::RequestIdType>(), py::arg("first_gen_tokens"),
py::arg("req_id"));
@ -327,6 +349,7 @@ void InitBindings(pybind11::module_& m)
std::optional<tle::ExternalDraftTokensConfig> externalDraftTokensConfig,
std::optional<tle::PromptTuningConfig> pTuningConfig, std::optional<tle::LoraConfig> loraConfig,
std::optional<tle::LookaheadDecodingConfig> lookaheadConfig,
std::optional<tle::KvCacheRetentionConfig> kvCacheRetentionConfig,
std::optional<std::string> logitsPostProcessorName,
std::optional<tle::VecTokens> encoderInputTokenIds, std::optional<tle::IdType> clientId,
bool returnAllGeneratedTokens, tle::PriorityType priority, tle::RequestType type,
@ -343,12 +366,12 @@ void InitBindings(pybind11::module_& m)
}
}
TLLM_CHECK_WITH_INFO(maxTokens.has_value(), "missing required argument max_tokens");
return std::make_unique<tle::Request>(inputTokenIds, maxTokens.value(), streaming, samplingConfig,
outputConfig, endId, padId, positionIds, badWords, stopWords, embeddingBias,
externalDraftTokensConfig, pTuningConfig, loraConfig, lookaheadConfig, logitsPostProcessorName,
encoderInputTokenIds, clientId, returnAllGeneratedTokens, priority, type, contextPhaseParams,
encoderInputFeatures, encoderOutputLength, crossAttentionMask, numReturnSequences);
externalDraftTokensConfig, pTuningConfig, loraConfig, lookaheadConfig, kvCacheRetentionConfig,
logitsPostProcessorName, encoderInputTokenIds, clientId, returnAllGeneratedTokens, priority,
type, contextPhaseParams, encoderInputFeatures, encoderOutputLength, crossAttentionMask,
numReturnSequences);
}),
py::arg("input_token_ids"), py::kw_only(), py::arg("max_tokens") = py::none(),
py::arg("max_new_tokens") = py::none(), py::arg("streaming") = false,
@ -358,9 +381,9 @@ void InitBindings(pybind11::module_& m)
py::arg("stop_words") = py::none(), py::arg("embedding_bias") = py::none(),
py::arg("external_draft_tokens_config") = py::none(), py::arg("prompt_tuning_config") = py::none(),
py::arg("lora_config") = py::none(), py::arg("lookahead_config") = py::none(),
py::arg("logits_post_processor_name") = py::none(), py::arg("encoder_input_token_ids") = py::none(),
py::arg("client_id") = py::none(), py::arg("return_all_generated_tokens") = false,
py::arg("priority") = tle::Request::kDefaultPriority,
py::arg("kv_cache_retention_config") = py::none(), py::arg("logits_post_processor_name") = py::none(),
py::arg("encoder_input_token_ids") = py::none(), py::arg("client_id") = py::none(),
py::arg("return_all_generated_tokens") = false, py::arg("priority") = tle::Request::kDefaultPriority,
py::arg_v("type", tle::RequestType::REQUEST_TYPE_CONTEXT_AND_GENERATION,
"RequestType.REQUEST_TYPE_CONTEXT_AND_GENERATION"),
py::arg("context_phase_params") = py::none(), py::arg("encoder_input_features") = py::none(),
@ -384,6 +407,8 @@ void InitBindings(pybind11::module_& m)
"prompt_tuning_config", &tle::Request::getPromptTuningConfig, &tle::Request::setPromptTuningConfig)
.def_property("lora_config", &tle::Request::getLoraConfig, &tle::Request::setLoraConfig)
.def_property("lookahead_config", &tle::Request::getLookaheadConfig, &tle::Request::setLookaheadConfig)
.def_property("kv_cache_retention_config", &tle::Request::getKvCacheRetentionConfig,
&tle::Request::setKvCacheRetentionConfig)
.def_property("logits_post_processor_name", &tle::Request::getLogitsPostProcessorName,
&tle::Request::setLogitsPostProcessorName)
.def_property(
@ -417,7 +442,9 @@ void InitBindings(pybind11::module_& m)
.def_readwrite("encoder_output", &tle::Result::encoderOutput)
.def_readwrite("finish_reasons", &tle::Result::finishReasons)
.def_readwrite("sequence_index", &tle::Result::sequenceIndex)
.def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal);
.def_readwrite("is_sequence_final", &tle::Result::isSequenceFinal)
.def_readwrite("decoding_iter", &tle::Result::decodingIter)
.def_readwrite("context_phase_params", &tle::Result::contextPhaseParams);
py::class_<tle::Response>(m, "Response")
.def(py::init<IdType, std::string, std::optional<IdType>>(), py::arg("request_id"), py::arg("error_msg"),
@ -457,25 +484,28 @@ void InitBindings(pybind11::module_& m)
{
return py::make_tuple(self.getEnableBlockReuse(), self.getMaxTokens(), self.getMaxAttentionWindowVec(),
self.getSinkTokenLength(), self.getFreeGpuMemoryFraction(), self.getHostCacheSize(),
self.getOnboardBlocks());
self.getOnboardBlocks(), self.getCrossKvCacheFraction(), self.getSecondaryOffloadMinPriority());
};
auto kvCacheConfigSetstate = [](py::tuple state)
{
if (state.size() != 7)
if (state.size() != 9)
{
throw std::runtime_error("Invalid state!");
}
return tle::KvCacheConfig(state[0].cast<bool>(), state[1].cast<std::optional<SizeType32>>(),
state[2].cast<std::optional<std::vector<SizeType32>>>(), state[3].cast<std::optional<SizeType32>>(),
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>());
state[4].cast<std::optional<float>>(), state[5].cast<std::optional<size_t>>(), state[6].cast<bool>(),
state[7].cast<std::optional<float>>(), state[8].cast<std::optional<tle::RetentionPriority>>());
};
py::class_<tle::KvCacheConfig>(m, "KvCacheConfig")
.def(py::init<bool, std::optional<SizeType32> const&, std::optional<std::vector<SizeType32>> const&,
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool>(),
std::optional<SizeType32> const&, std::optional<float> const&, std::optional<size_t> const&, bool,
std::optional<float> const&, std::optional<tle::RetentionPriority>>(),
py::arg("enable_block_reuse") = false, py::arg("max_tokens") = py::none(),
py::arg("max_attention_window") = py::none(), py::arg("sink_token_length") = py::none(),
py::arg("free_gpu_memory_fraction") = py::none(), py::arg("host_cache_size") = py::none(),
py::arg("onboard_blocks") = true)
py::arg("onboard_blocks") = true, py::arg("cross_kv_cache_fraction") = py::none(),
py::arg("secondary_offload_min_priority") = py::none())
.def_property(
"enable_block_reuse", &tle::KvCacheConfig::getEnableBlockReuse, &tle::KvCacheConfig::setEnableBlockReuse)
.def_property("max_tokens", &tle::KvCacheConfig::getMaxTokens, &tle::KvCacheConfig::setMaxTokens)
@ -487,6 +517,10 @@ void InitBindings(pybind11::module_& m)
&tle::KvCacheConfig::setFreeGpuMemoryFraction)
.def_property("host_cache_size", &tle::KvCacheConfig::getHostCacheSize, &tle::KvCacheConfig::setHostCacheSize)
.def_property("onboard_blocks", &tle::KvCacheConfig::getOnboardBlocks, &tle::KvCacheConfig::setOnboardBlocks)
.def_property("cross_kv_cache_fraction", &tle::KvCacheConfig::getCrossKvCacheFraction,
&tle::KvCacheConfig::setCrossKvCacheFraction)
.def_property("secondary_offload_min_priority", &tle::KvCacheConfig::getSecondaryOffloadMinPriority,
&tle::KvCacheConfig::setSecondaryOffloadMinPriority)
.def(py::pickle(kvCacheConfigGetstate, kvCacheConfigSetstate));
py::class_<tle::OrchestratorConfig>(m, "OrchestratorConfig")
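On the C++ side the same options map directly onto tle::KvCacheConfig; a sketch of constructing it with the new cross-KV-cache fraction, following the argument order exposed by the binding above (the executor header path is an assumption, and all values shown are illustrative):
#include "tensorrt_llm/executor/executor.h" // assumed header for the executor API
#include <optional>
namespace tle = tensorrt_llm::executor;
int main()
{
    // Argument order mirrors the binding above: enable_block_reuse, max_tokens,
    // max_attention_window, sink_token_length, free_gpu_memory_fraction,
    // host_cache_size, onboard_blocks, cross_kv_cache_fraction, secondary_offload_min_priority.
    tle::KvCacheConfig kvCacheConfig(
        /*enableBlockReuse=*/true,
        /*maxTokens=*/std::nullopt,
        /*maxAttentionWindowVec=*/std::nullopt,
        /*sinkTokenLength=*/std::nullopt,
        /*freeGpuMemoryFraction=*/0.9f,
        /*hostCacheSize=*/std::nullopt,
        /*onboardBlocks=*/true,
        /*crossKvCacheFraction=*/0.5f, // share of the KV pool reserved for cross attention
        /*secondaryOffloadMinPriority=*/std::nullopt);
    (void) kvCacheConfig;
    return 0;
}
The cross fraction is presumably only relevant for engines with cross attention; self-attention-only models can leave it unset.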

View File

@ -21,6 +21,7 @@ set(SRCS
bufferManager.cpp
cudaMemPool.cpp
decodingLayerWorkspace.cpp
eagleBuffers.cpp
explicitDraftTokensBuffers.cpp
lookaheadBuffers.cpp
layerProfiler.cpp

View File

@ -0,0 +1,351 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "tensorrt_llm/runtime/eagleBuffers.h"
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/common/cudaUtils.h"
#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"
#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
namespace tksd = tensorrt_llm::kernels::speculative_decoding;
namespace tensorrt_llm::runtime
{
void EagleBuffers::Inputs::create(SizeType32 maxNumSequences, TllmRuntime const& runtime,
ModelConfig const& modelConfig, WorldConfig const& worldConfig)
{
auto const& manager = runtime.getBufferManager();
auto const& speculativeDecodingModule = modelConfig.getSpeculativeDecodingModule();
auto const maxNumPaths = speculativeDecodingModule.getMaxNumPaths();
auto const maxDraftPathLen = speculativeDecodingModule.getMaxDraftPathLen();
auto const maxPathLen = speculativeDecodingModule.getMaxPathLen();
auto const maxDecodingTokens = speculativeDecodingModule.getMaxDecodingTokens();
auto const maxDecodingDraftTokens = speculativeDecodingModule.getMaxDecodingDraftTokens();
auto constexpr TRTTokenIdType = runtime::TRTDataType<runtime::TokenIdType>::value;
temperatures = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kFLOAT);
randomDataSample = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kFLOAT);
randomDataValidation
= manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxDraftPathLen}), nvinfer1::DataType::kFLOAT);
draftTokens = manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingDraftTokens}), TRTTokenIdType);
draftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
draftPaths
= manager.gpu(ITensor::makeShape({maxNumSequences, maxNumPaths, maxPathLen}), nvinfer1::DataType::kINT32);
specDecodingGenerationLengths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
specDecodingPackedMasks
= manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}),
nvinfer1::DataType::kINT32);
specDecodingPositionOffsets
= manager.gpu(ITensor::makeShape({maxNumSequences * maxDecodingTokens}), nvinfer1::DataType::kINT32);
eagleNetCtxRequestTypesHost = manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetCtxContextLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetCtxPastKeyValueLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetGenRequestTypesHost = manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetGenContextLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
eagleNetGenPastKeyValueLengthsHost
= manager.pinnedPool(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
}
EagleBuffers::EagleBuffers(SizeType32 maxBatchSize, SizeType32 maxBeamWidth, runtime::BufferManager const& manager,
runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig,
executor::DecodingConfig const& decodingConfig, runtime::TllmRuntime const& runtime)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_CHECK_WITH_INFO(maxBeamWidth == 1, "EAGLE does not support beam search");
auto const maxNumSequences = maxBatchSize;
auto const eagleModule = std::dynamic_pointer_cast<tensorrt_llm::runtime::EagleModule const>(
modelConfig.getSpeculativeDecodingModulePtr());
auto const numPaths = eagleModule->getMaxNumPaths();
auto const pathLen = eagleModule->getMaxPathLen();
auto const maxDecodingDraftTokens = eagleModule->getMaxDecodingDraftTokens();
auto constexpr TRTTokenIdType = runtime::TRTDataType<runtime::TokenIdType>::value;
// input tensors
engineInputs.temperatures = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
engineInputs.draftTokens
= manager.gpu(ITensor::makeShape({maxNumSequences, maxDecodingDraftTokens}), TRTTokenIdType);
engineInputs.draftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
engineInputs.draftPaths
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), nvinfer1::DataType::kINT32);
engineInputs.specDecodingGenerationLengths
= manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
engineInputs.specDecodingPositionOffsets
= manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
engineInputs.specDecodingPackedMasks = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
engineInputs.randomDataSample = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
engineInputs.randomDataValidation = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kFLOAT);
engineInputs.eagleNetCtxRequestTypesHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetCtxContextLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetCtxPastKeyValueLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetGenRequestTypesHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetGenContextLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
engineInputs.eagleNetGenPastKeyValueLengthsHost
= manager.emptyTensor(runtime::MemoryType::kPINNEDPOOL, nvinfer1::DataType::kINT32);
// output tensors
engineOutputs.nextDraftTokens
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), TRTTokenIdType);
engineOutputs.nextDraftLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
engineOutputs.nextDraftPaths
= manager.gpu(ITensor::makeShape({maxNumSequences, numPaths, pathLen}), nvinfer1::DataType::kINT32);
engineOutputs.acceptedTokens
= manager.gpu(ITensor::makeShape({maxNumSequences, pathLen}), nvinfer1::DataType::kINT32);
engineOutputs.acceptedLens = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
engineOutputs.acceptedPaths = manager.gpu(ITensor::makeShape({maxNumSequences}), nvinfer1::DataType::kINT32);
// helper tensors
auto const& stream = manager.getStream();
scanTempStorageBytes
= tksd::invokeScanGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get());
reduceTempStorageBytes
= tksd::invokeReduceMaxGenerationLengths(nullptr, 0, nullptr, nullptr, maxNumSequences, stream.get());
scanReduceTempStorage = manager.gpu(std::max(reduceTempStorageBytes, scanTempStorageBytes));
cumSumGenerationLengths = manager.emptyTensor(runtime::MemoryType::kGPU, nvinfer1::DataType::kINT32);
maxGenerationLength = manager.gpu(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
// pre-allocate empty tensors
reshape(0, maxNumSequences, modelConfig);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void EagleBuffers::reshape(
SizeType32 numCtxSequences, SizeType32 numGenSequences, runtime::ModelConfig const& modelConfig)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto const numSequences = numCtxSequences + numGenSequences;
auto const eagleModule = std::dynamic_pointer_cast<tensorrt_llm::runtime::EagleModule const>(
modelConfig.getSpeculativeDecodingModulePtr());
auto const maxDecodingTokens = eagleModule->getMaxDecodingTokens();
// input tensors
engineInputs.temperatures->reshape(ITensor::makeShape({numSequences}));
auto draftTokensShape = engineInputs.draftTokens->getShape();
draftTokensShape.d[0] = numSequences;
engineInputs.draftTokens->reshape(draftTokensShape);
auto draftLensShape = engineInputs.draftLens->getShape();
draftLensShape.d[0] = numSequences;
engineInputs.draftLens->reshape(draftLensShape);
auto draftPathsShape = engineInputs.draftPaths->getShape();
draftPathsShape.d[0] = numSequences;
engineInputs.draftPaths->reshape(draftPathsShape);
engineInputs.specDecodingGenerationLengths->reshape(ITensor::makeShape({numGenSequences}));
engineInputs.specDecodingPositionOffsets->reshape(ITensor::makeShape({numGenSequences, maxDecodingTokens}));
engineInputs.specDecodingPackedMasks->reshape(
ITensor::makeShape({numGenSequences * maxDecodingTokens, common::ceilDiv(maxDecodingTokens, 32)}));
engineInputs.randomDataSample->reshape(ITensor::makeShape({numSequences}));
engineInputs.randomDataValidation->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetCtxRequestTypesHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetCtxContextLengthsHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetCtxPastKeyValueLengthsHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetGenRequestTypesHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetGenContextLengthsHost->reshape(ITensor::makeShape({numSequences}));
engineInputs.eagleNetGenPastKeyValueLengthsHost->reshape(ITensor::makeShape({numSequences}));
cumSumGenerationLengths->reshape(ITensor::makeShape({numSequences}));
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
template <typename T>
void EagleBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, SizeType32 vocabSizePadded,
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
runtime::EagleModule const& eagleModule, runtime::CudaStream const& stream) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
using runtime::bufferCast;
tksd::PackEagleParams params;
params.batchSize = numCtxSequences + numGenSequences;
params.maxNumPaths = eagleModule.getMaxNumPaths();
params.maxDecodingTokens = eagleModule.getMaxDecodingTokens();
params.maxPathLength = eagleModule.getMaxPathLen();
params.numContextRequests = numCtxSequences;
params.numGenerationRequests = numGenSequences;
params.batchSlots = bufferCast<SizeType32>(seqSlots);
// Outputs from decoder -- inputs to the packing kernel
params.inputTemperatures = bufferCast<float>(*draftBuffers.temperatures);
params.inputRandomDataSample = bufferCast<float>(*draftBuffers.randomDataSample);
params.inputRandomDataValidation = bufferCast<float>(*draftBuffers.randomDataValidation);
params.inputNextDraftTokens = bufferCast<runtime::TokenIdType>(*draftBuffers.draftTokens);
params.inputNextDraftLens = bufferCast<SizeType32>(*draftBuffers.draftLens);
params.inputNextDraftPaths = bufferCast<SizeType32>(*draftBuffers.draftPaths);
params.inputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*draftBuffers.specDecodingGenerationLengths);
params.inputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*draftBuffers.specDecodingPositionOffsets);
params.inputSpecDecodingPackedMasks = bufferCast<int32_t>(*draftBuffers.specDecodingPackedMasks);
// Outputs of the packing kernel -- inputs to the engine
params.outputTemperatures = bufferCast<float>(*engineInputs.temperatures);
params.outputRandomDataSample = bufferCast<float>(*engineInputs.randomDataSample);
params.outputRandomDataValidation = bufferCast<float>(*engineInputs.randomDataValidation);
params.outputNextDraftTokens = bufferCast<runtime::TokenIdType>(*engineInputs.draftTokens);
params.outputNextDraftLens = bufferCast<SizeType32>(*engineInputs.draftLens);
params.outputNextDraftPaths = bufferCast<SizeType32>(*engineInputs.draftPaths);
params.outputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*engineInputs.specDecodingGenerationLengths);
params.outputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*engineInputs.specDecodingPositionOffsets);
params.outputSpecDecodingPackedMasks = bufferCast<int32_t>(*engineInputs.specDecodingPackedMasks);
params.maxGenerationLength = bufferCast<SizeType32>(*maxGenerationLength);
params.cumSumGenerationLengths = bufferCast<SizeType32>(*cumSumGenerationLengths);
params.checkParams();
// Pack tensors from batch-slot positions into a contiguous array
tksd::invokePackEagleGenerationLengths(params, stream.get());
if (numGenSequences)
{
// Compute inclusive sum and max
tksd::invokeScanReduceGenerationLengths(numGenSequences,
bufferCast<SizeType32>(*engineInputs.specDecodingGenerationLengths),
bufferCast<uint8_t>(*scanReduceTempStorage), scanTempStorageBytes,
bufferCast<SizeType32>(*cumSumGenerationLengths), bufferCast<uint8_t>(*scanReduceTempStorage),
reduceTempStorageBytes, bufferCast<SizeType32>(*maxGenerationLength), stream.get());
}
// Pack tensors from batch-slot positions into a contiguous array
tksd::invokePackEagle(params, stream.get());
// Pack host data.
for (SizeType32 bi = 0; bi < params.batchSize; ++bi)
{
auto const batchSlot = params.batchSlots[bi];
bufferCast<SizeType32>(*engineInputs.eagleNetCtxRequestTypesHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxRequestTypesHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetCtxContextLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxContextLengthsHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetCtxPastKeyValueLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetCtxPastKeyValueLengthsHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetGenRequestTypesHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenRequestTypesHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetGenContextLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenContextLengthsHost)[batchSlot];
bufferCast<SizeType32>(*engineInputs.eagleNetGenPastKeyValueLengthsHost)[bi]
= bufferCast<SizeType32>(*draftBuffers.eagleNetGenPastKeyValueLengthsHost)[batchSlot];
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void EagleBuffers::setFromInputs(SizeType32 numCtxSequences, SizeType32 numGenSequences, ITensor const& requestTypes,
ITensor const& seqSlots, EagleBuffers::Inputs const& draftBuffers, ITensor const& contextPositionIds,
runtime::TllmRuntime const& runtime, runtime::ModelConfig const& modelConfig,
runtime::WorldConfig const& worldConfig) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
auto const& stream = runtime.getStream();
auto const eagleModule
= std::dynamic_pointer_cast<runtime::EagleModule const>(modelConfig.getSpeculativeDecodingModulePtr());
auto const vocabSizePadded = modelConfig.getVocabSizePadded(worldConfig.getSize());
auto const dtype = modelConfig.getDataType();
switch (dtype)
{
case nvinfer1::DataType::kFLOAT:
setFromInputs<float>(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers,
contextPositionIds, *eagleModule, stream);
break;
case nvinfer1::DataType::kHALF:
setFromInputs<half>(numCtxSequences, numGenSequences, vocabSizePadded, seqSlots, draftBuffers,
contextPositionIds, *eagleModule, stream);
break;
default: TLLM_THROW("DataType %d not supported in EagleBuffers", static_cast<SizeType32>(dtype)); break;
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void EagleBuffers::insertInputTensors(
TensorMap& inputBuffers, TensorMap& outputBuffers, runtime::WorldConfig const& /* worldConfig */) const
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
// inputs
inputBuffers.insert_or_assign("eagle_temperature", engineInputs.temperatures);
inputBuffers.insert_or_assign("spec_decoding_generation_lengths", engineInputs.specDecodingGenerationLengths);
inputBuffers.insert_or_assign("spec_decoding_position_offsets", engineInputs.specDecodingPositionOffsets);
inputBuffers.insert_or_assign("spec_decoding_packed_mask", engineInputs.specDecodingPackedMasks);
inputBuffers.insert_or_assign("rand_data_sample", engineInputs.randomDataSample);
inputBuffers.insert_or_assign("rand_data_validation", engineInputs.randomDataValidation);
inputBuffers.insert_or_assign("draft_tokens", engineInputs.draftTokens);
inputBuffers.insert_or_assign("draft_lens", engineInputs.draftLens);
inputBuffers.insert_or_assign("draft_paths", engineInputs.draftPaths);
inputBuffers.insert_or_assign("host_ctx_eagle_net_request_types", engineInputs.eagleNetCtxRequestTypesHost);
inputBuffers.insert_or_assign("host_ctx_eagle_net_context_lengths", engineInputs.eagleNetCtxContextLengthsHost);
inputBuffers.insert_or_assign(
"host_ctx_eagle_net_past_key_value_lengths", engineInputs.eagleNetCtxPastKeyValueLengthsHost);
inputBuffers.insert_or_assign("host_gen_eagle_net_request_types", engineInputs.eagleNetGenRequestTypesHost);
inputBuffers.insert_or_assign("host_gen_eagle_net_context_lengths", engineInputs.eagleNetGenContextLengthsHost);
inputBuffers.insert_or_assign(
"host_gen_eagle_net_past_key_value_lengths", engineInputs.eagleNetGenPastKeyValueLengthsHost);
// outputs
outputBuffers.insert_or_assign("next_draft_tokens", engineOutputs.nextDraftTokens);
outputBuffers.insert_or_assign("next_draft_lens", engineOutputs.nextDraftLens);
outputBuffers.insert_or_assign("accepted_tokens", engineOutputs.acceptedTokens);
outputBuffers.insert_or_assign("num_accepted_tokens", engineOutputs.acceptedLens);
outputBuffers.insert_or_assign("accepted_paths", engineOutputs.acceptedPaths);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
} // namespace tensorrt_llm::runtime

View File

@ -0,0 +1,39 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once
#include "tensorrt_llm/common/assert.h"
#include "tensorrt_llm/runtime/speculativeDecodingModule.h"
namespace tensorrt_llm::runtime
{
class EagleModule : public SpeculativeDecodingModule
{
public:
explicit EagleModule(SizeType32 maxDraftPathLen, SizeType32 maxDecodingDraftTokens, SizeType32 maxNumPaths) noexcept
: SpeculativeDecodingModule(maxDraftPathLen, maxDecodingDraftTokens, maxNumPaths)
{
TLLM_CHECK(maxNumPaths * maxDraftPathLen == maxDecodingDraftTokens);
}
explicit EagleModule() noexcept
: EagleModule(0, 0, 0)
{
}
};
} // namespace tensorrt_llm::runtime

View File

@ -233,6 +233,16 @@ void GptDecoderBatched::setupLookahead(LookaheadDecodingBuffers lookaheadDecodin
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setupEagle(EagleBuffers::Inputs eagleBuffers)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_CHECK(mSpeculativeDecodingMode.isEagle());
mJointDecodingOutput->eagleBuffers = std::move(eagleBuffers);
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setup(executor::DecodingMode const& mode, SizeType32 maxBatchSize, SizeType32 maxBeamWidth,
SizeType32 maxAttentionWindow, SizeType32 sinkTokenLength, SizeType32 maxSequenceLength,
SizeType32 maxTokensPerEngineStep, nvinfer1::DataType dtype, ModelConfig const& modelConfig)
@ -628,6 +638,10 @@ void GptDecoderBatched::newRequestSpeculativeDecoding(
{
newRequestExplicitDraftTokens(batchIdx, request);
}
else if (mSpeculativeDecodingMode.isEagle())
{
newRequestEagle(batchIdx, request);
}
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
@ -739,6 +753,17 @@ void GptDecoderBatched::newRequestExplicitDraftTokens(SizeType32 batchIdx, decod
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::newRequestEagle(SizeType32 batchIdx, decoder_batch::Request const& request)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
TLLM_CHECK(mJointDecodingOutput->eagleBuffers);
// TODO fill me
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setExplicitDraftTokensInputs(decoder_batch::Input const& input)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
@ -767,6 +792,15 @@ void GptDecoderBatched::setExplicitDraftTokensInputs(decoder_batch::Input const&
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::setEagleInputs(decoder_batch::Input const& input)
{
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
// TODO fill me
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
void GptDecoderBatched::newRequests(std::vector<SizeType32> const& seqSlots,
std::vector<decoder_batch::Request> const& requests, std::vector<SamplingConfig> const& samplingConfigs)
{
@ -854,6 +888,10 @@ void GptDecoderBatched::forwardDecoder(
{
setExplicitDraftTokensInputs(input);
}
else if (mSpeculativeDecodingMode.isEagle())
{
setEagleInputs(input);
}
bool const async = forwardType == ForwardType::kASYNC;

View File

@ -160,6 +160,8 @@ ModelConfig createModelConfig(
auto numKvHeadsPerAttentionLayer
= parseJsonFieldOr<std::vector<SizeType32>>(config, "num_kv_heads_per_layer", std::vector<SizeType32>());
auto numKvHeadsPerCrossAttentionLayer = parseJsonFieldOr<std::vector<SizeType32>>(
config, "num_kv_heads_per_cross_attn_layer", std::vector<SizeType32>());
auto modelConfig
= ModelConfig{vocabSize, numLayers, numAttentionLayers, numRnnLayers, numHeads, hiddenSize, dataType};
@ -167,7 +169,8 @@ ModelConfig createModelConfig(
{
std::transform(numKvHeadsPerAttentionLayer.cbegin(), numKvHeadsPerAttentionLayer.cend(),
numKvHeadsPerAttentionLayer.begin(),
[tensorParallelism](SizeType32 const numKvHeads) { return std::max(numKvHeads / tensorParallelism, 1); });
[tensorParallelism](SizeType32 const numKvHeads)
{ return ((numKvHeads + tensorParallelism - 1) / tensorParallelism); });
modelConfig.setNumKvHeadsPerLayer(numKvHeadsPerAttentionLayer);
}
else
@ -175,6 +178,19 @@ ModelConfig createModelConfig(
modelConfig.setNbKvHeads(numKvHeads);
}
if (!numKvHeadsPerCrossAttentionLayer.empty())
{
std::transform(numKvHeadsPerCrossAttentionLayer.cbegin(), numKvHeadsPerCrossAttentionLayer.cend(),
numKvHeadsPerCrossAttentionLayer.begin(),
[tensorParallelism](SizeType32 const numKvHeads)
{ return ((numKvHeads + tensorParallelism - 1) / tensorParallelism); });
modelConfig.setNumKvHeadsPerCrossLayer(numKvHeadsPerCrossAttentionLayer);
}
else
{
modelConfig.setNbCrossKvHeads(numKvHeads);
}
modelConfig.setSizePerHead(sizePerHead);
modelConfig.setLayerTypes(layerTypes);

View File

@ -60,7 +60,7 @@ th::Tensor gatherTree( // BS: batch_size, BM:
{
int32_t* final_output_ids_ptr = get_ptr<int32_t>(final_output_ids);
tk::invokeInitializeOutput(
final_output_ids_ptr, get_ptr<int32_t>(end_ids), batch_size * beam_width, max_seq_len, stream);
final_output_ids_ptr, get_ptr<int32_t>(end_ids), batch_size, beam_width, max_seq_len, stream);
tk::BeamHypotheses bh;
bh.nBatchSize = batch_size;

View File

@ -130,6 +130,7 @@ if(NOT ENABLE_MULTI_DEVICE EQUAL 0)
add_gtest(allReduceKernelTest kernels/allReduce/allReduceKernelTest.cu)
endif()
add_gtest(decodingKernelsTest kernels/decodingKernelTest.cpp)
add_gtest(eaglePackDataTest kernels/eaglePackDataTest.cpp)
add_gtest(banRepeatNGramsKernelsTest kernels/banRepeatNGramsKernelsTest.cpp)
add_gtest(stopCriteriaKernelsTest kernels/stopCriteriaKernelsTest.cpp)
add_gtest(shiftKCacheKernelTest kernels/shiftKCacheKernelTest.cu)

View File

@ -426,6 +426,10 @@ public:
TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)};
mBufferManager->copy(len.data(),*inputLengths);
std::vector<SizeType32> eid = {0};
TensorPtr endIds{ITensor::slice(constPointerCast(decodingInput->endIds), 0, 1)};
mBufferManager->copy(eid.data(),*endIds);
std::vector<std::vector<float>> logProbs =
{
{-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524, -0.696636, -2.41985},
@ -555,6 +559,10 @@ public:
TensorPtr inputLengths{ITensor::slice(constPointerCast(decodingInput->lengths), 0, 1)};
mBufferManager->copy(len.data(),*inputLengths);
std::vector<SizeType32> eid = {0};
TensorPtr endIds{ITensor::slice(constPointerCast(decodingInput->endIds), 0, 1)};
mBufferManager->copy(eid.data(),*endIds);
std::vector<std::vector<float>> logProbs =
{
{-2.96689, -1.63675, -2.31329, -0.0377979, -2.2442, -1.57552, -0.310524},

View File

@ -0,0 +1,521 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include <gtest/gtest.h>
#include "tensorrt_llm/common/memoryUtils.h"
#include "tensorrt_llm/kernels/decodingCommon.h"
#include "tensorrt_llm/kernels/speculativeDecoding/eagleDecodingKernels.h"
#include "tensorrt_llm/kernels/speculativeDecoding/explicitDraftTokensKernels.h"
#include "tensorrt_llm/runtime/bufferManager.h"
#include "tensorrt_llm/runtime/common.h"
#include "tensorrt_llm/runtime/iBuffer.h"
#include "tensorrt_llm/runtime/iTensor.h"
#include "tensorrt_llm/runtime/runtimeKernels.h"
#include "tensorrt_llm/runtime/tllmLogger.h"
#include <NvInferRuntimeBase.h>
#include <algorithm>
#include <cstdint>
#include <random>
namespace
{
using namespace tensorrt_llm::runtime;
using namespace tensorrt_llm::common;
namespace tk = tensorrt_llm::kernels;
namespace trk = tensorrt_llm::runtime::kernels;
namespace tksd = tensorrt_llm::kernels::speculative_decoding;
class SamplingParams
{
public:
SamplingParams() {}
inline void setNumCtxRequests(SizeType32 numCtxRequests)
{
mNumCtxRequests = numCtxRequests;
}
inline void setNumGenRequests(SizeType32 numGenRequests)
{
mNumGenRequests = numGenRequests;
}
inline void setMaxPathLen(SizeType32 maxPathLen)
{
mMaxPathLen = maxPathLen;
}
[[nodiscard]] inline SizeType32 getNumCtxRequests() const
{
return mNumCtxRequests;
}
[[nodiscard]] inline SizeType32 getNumGenRequests() const
{
return mNumGenRequests;
}
[[nodiscard]] inline SizeType32 getBatchSize() const
{
return getNumCtxRequests() + getNumGenRequests();
}
[[nodiscard]] inline SizeType32 getVocabSize() const
{
return mVocabSize;
}
[[nodiscard]] inline SizeType32 getMaxBatchSize() const
{
return 2 * getBatchSize();
}
[[nodiscard]] inline SizeType32 getMaxPathLen() const
{
return mMaxPathLen;
}
[[nodiscard]] inline SizeType32 getMaxDecodingTokens() const
{
return mMaxDecodingTokens;
}
[[nodiscard]] inline SizeType32 getMaxDecodingDraftTokens() const
{
return getMaxDecodingTokens() - 1;
}
[[nodiscard]] inline SizeType32 getMaxSeqLen() const
{
return getMaxDecodingTokens() * 2;
}
private:
SizeType32 mNumCtxRequests{6};
SizeType32 mNumGenRequests{6};
SizeType32 mMaxPathLen{4};
SizeType32 mMaxDecodingTokens{32};
SizeType32 mVocabSize{256};
};
class EaglePackDataTest : public ::testing::Test
{
public:
using BufferPtr = IBuffer::SharedPtr;
using TensorPtr = ITensor::SharedPtr;
void SetUp() override
{
mStream = std::make_shared<CudaStream>();
mBufferManager = std::make_shared<BufferManager>(mStream);
}
void allocateBuffers()
{
// inputs
mBatchSlots = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
mInputTemperatures = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
mInputRandomDataSample = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
mInputRandomDataValidation = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kFLOAT);
mInputNextDraftTokens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingDraftTokens()}),
nvinfer1::DataType::kINT32);
mInputNextDraftLens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32);
mInputNextDraftPaths
= BufferManager::pinnedPool(ITensor::makeShape({mSamplingParams.getMaxBatchSize(),
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen()}),
nvinfer1::DataType::kINT32);
mInputSpecDecodingGenerationLengths = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize()}), nvinfer1::DataType::kINT32);
mInputSpecDecodingPositionOffsets = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingTokens()}),
nvinfer1::DataType::kINT32);
auto const numPackedMasks
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
mInputSpecDecodingPackedMasks = BufferManager::pinnedPool(
ITensor::makeShape(
{mSamplingParams.getMaxBatchSize(), mSamplingParams.getMaxDecodingTokens(), numPackedMasks}),
nvinfer1::DataType::kINT32);
// outputs
mOutputTemperatures = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
mOutputRandomDataSample = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
mOutputRandomDataValidation = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kFLOAT);
mOutputNextDraftTokens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingDraftTokens()}),
nvinfer1::DataType::kINT32);
mOutputNextDraftLens = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
mOutputNextDraftPaths
= BufferManager::pinnedPool(ITensor::makeShape({mSamplingParams.getBatchSize(),
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen()}),
nvinfer1::DataType::kINT32);
mOutputSpecDecodingGenerationLengths = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize()}), nvinfer1::DataType::kINT32);
mOutputSpecDecodingPositionOffsets = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens()}),
nvinfer1::DataType::kINT32);
mOutputSpecDecodingPackedMasks = BufferManager::pinnedPool(
ITensor::makeShape(
{mSamplingParams.getBatchSize(), mSamplingParams.getMaxDecodingTokens(), numPackedMasks}),
nvinfer1::DataType::kINT32);
// workspace
mMaxGenerationLength = BufferManager::pinnedPool(ITensor::makeShape({1}), nvinfer1::DataType::kINT32);
mCumSumGenerationLengths = BufferManager::pinnedPool(
ITensor::makeShape({mSamplingParams.getBatchSize() + 1}), nvinfer1::DataType::kINT32);
mScanTempStorageBytes = tksd::invokeScanGenerationLengths(
nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get());
mReduceTempStorageBytes = tksd::invokeReduceMaxGenerationLengths(
nullptr, 0, nullptr, nullptr, mSamplingParams.getBatchSize(), mStream->get());
mScanReduceTempStorage = mBufferManager->gpu(std::max(mReduceTempStorageBytes, mScanTempStorageBytes));
}
void initBuffers()
{
trk::invokeFill(*mOutputTemperatures, float{0}, *mStream);
trk::invokeFill(*mOutputRandomDataSample, float{0}, *mStream);
trk::invokeFill(*mOutputRandomDataValidation, float{0}, *mStream);
trk::invokeFill(*mOutputNextDraftTokens, TokenIdType{-1}, *mStream);
trk::invokeFill(*mOutputNextDraftLens, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputNextDraftPaths, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputSpecDecodingGenerationLengths, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputSpecDecodingPositionOffsets, SizeType32{0}, *mStream);
trk::invokeFill(*mOutputSpecDecodingPackedMasks, SizeType32{0}, *mStream);
auto batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
batchSlotsPtr[bi] = 2 * bi;
}
std::mt19937 gen(42);
std::uniform_real_distribution<float> distr(0.0, 1.0);
std::uniform_int_distribution<SizeType32> intDistr(0, 1000);
std::uniform_int_distribution<SizeType32> lenDistr(0, 32);
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
bufferCast<float>(*mInputTemperatures)[batchSlotsPtr[bi]] = distr(gen);
bufferCast<float>(*mInputRandomDataSample)[batchSlotsPtr[bi]] = distr(gen);
bufferCast<float>(*mInputRandomDataValidation)[batchSlotsPtr[bi]] = distr(gen);
}
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingDraftTokens(); ++ti)
{
bufferCast<SizeType32>(*mInputNextDraftTokens)[flat_index2(
batchSlotsPtr[bi], ti, mSamplingParams.getMaxDecodingDraftTokens())]
= intDistr(gen);
}
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingTokens(); ++ti)
{
for (SizeType32 pi = 0; pi < mSamplingParams.getMaxPathLen(); ++pi)
{
bufferCast<SizeType32>(*mInputNextDraftPaths)[flat_index3(batchSlotsPtr[bi], ti, pi,
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())]
= intDistr(gen);
}
auto const numPackedMasks
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
for (SizeType32 mi = 0; mi < numPackedMasks; ++mi)
{
bufferCast<SizeType32>(*mInputSpecDecodingPackedMasks)[flat_index3(
batchSlotsPtr[bi], ti, mi, mSamplingParams.getMaxDecodingTokens(), numPackedMasks)]
= intDistr(gen);
}
bufferCast<SizeType32>(*mInputSpecDecodingPositionOffsets)[flat_index2(
batchSlotsPtr[bi], ti, mSamplingParams.getMaxDecodingTokens())]
= intDistr(gen);
}
bufferCast<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]] = lenDistr(gen);
bufferCast<SizeType32>(*mInputSpecDecodingGenerationLengths)[batchSlotsPtr[bi]]
= bufferCast<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]];
}
}
void callPackData()
{
tksd::PackEagleParams params;
params.batchSize = mSamplingParams.getBatchSize();
params.maxNumPaths = mSamplingParams.getMaxDecodingTokens();
params.maxDecodingTokens = mSamplingParams.getMaxDecodingTokens();
params.maxPathLength = mSamplingParams.getMaxPathLen();
params.numContextRequests = mSamplingParams.getNumCtxRequests();
params.numGenerationRequests = mSamplingParams.getNumGenRequests();
params.batchSlots = bufferCast<SizeType32>(*mBatchSlots);
// Outputs from decoder -- inputs to the packing kernel
params.inputTemperatures = bufferCast<float>(*mInputTemperatures);
params.inputRandomDataSample = bufferCast<float>(*mInputRandomDataSample);
params.inputRandomDataValidation = bufferCast<float>(*mInputRandomDataValidation);
params.inputNextDraftTokens = bufferCast<TokenIdType>(*mInputNextDraftTokens);
params.inputNextDraftLens = bufferCast<SizeType32>(*mInputNextDraftLens);
params.inputNextDraftPaths = bufferCast<SizeType32>(*mInputNextDraftPaths);
params.inputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*mInputSpecDecodingGenerationLengths);
params.inputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*mInputSpecDecodingPositionOffsets);
params.inputSpecDecodingPackedMasks = bufferCast<int32_t>(*mInputSpecDecodingPackedMasks);
// Outputs of the packing kernel -- inputs to the engine
params.outputTemperatures = bufferCast<float>(*mOutputTemperatures);
params.outputRandomDataSample = bufferCast<float>(*mOutputRandomDataSample);
params.outputRandomDataValidation = bufferCast<float>(*mOutputRandomDataValidation);
params.outputNextDraftTokens = bufferCast<TokenIdType>(*mOutputNextDraftTokens);
params.outputNextDraftLens = bufferCast<SizeType32>(*mOutputNextDraftLens);
params.outputNextDraftPaths = bufferCast<SizeType32>(*mOutputNextDraftPaths);
params.outputSpecDecodingGenerationLengths = bufferCast<SizeType32>(*mOutputSpecDecodingGenerationLengths);
params.outputSpecDecodingPositionOffsets = bufferCast<SizeType32>(*mOutputSpecDecodingPositionOffsets);
params.outputSpecDecodingPackedMasks = bufferCast<int32_t>(*mOutputSpecDecodingPackedMasks);
params.maxGenerationLength = bufferCast<SizeType32>(*mMaxGenerationLength);
params.cumSumGenerationLengths = bufferCast<SizeType32>(*mCumSumGenerationLengths);
params.checkParams();
if (mSamplingParams.getNumGenRequests())
{
// Pack tensors from batch slot position to continuous array
tksd::invokePackEagleGenerationLengths(params, mStream->get());
sync_check_cuda_error();
// Compute inclusive sum and max
tksd::invokeScanReduceGenerationLengths(mSamplingParams.getNumGenRequests(),
bufferCast<SizeType32>(*mOutputSpecDecodingGenerationLengths),
bufferCast<uint8_t>(*mScanReduceTempStorage), mScanTempStorageBytes,
bufferCast<SizeType32>(*mCumSumGenerationLengths), bufferCast<uint8_t>(*mScanReduceTempStorage),
mReduceTempStorageBytes, bufferCast<SizeType32>(*mMaxGenerationLength), mStream->get());
sync_check_cuda_error();
}
mStream->synchronize();
// Pack tensors from batch slot position to continuous array
tksd::invokePackEagle(params, mStream->get());
sync_check_cuda_error();
}
void verifyResults()
{
auto batchSlotsPtr = bufferCast<SizeType32>(*mBatchSlots);
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
EXPECT_EQ(BufferRange<float>(*mInputTemperatures)[batchSlotsPtr[bi]],
BufferRange<float>(*mOutputTemperatures)[bi]);
EXPECT_EQ(BufferRange<float>(*mInputRandomDataSample)[batchSlotsPtr[bi]],
BufferRange<float>(*mOutputRandomDataSample)[bi]);
EXPECT_EQ(BufferRange<float>(*mInputRandomDataValidation)[batchSlotsPtr[bi]],
BufferRange<float>(*mOutputRandomDataValidation)[bi]);
}
auto const numCtxRequests = mSamplingParams.getNumCtxRequests();
for (SizeType32 bi = 0; bi < mSamplingParams.getBatchSize(); ++bi)
{
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingTokens(); ++ti)
{
for (SizeType32 pi = 0; pi < mSamplingParams.getMaxPathLen(); ++pi)
{
EXPECT_EQ(BufferRange<SizeType32>(*mInputNextDraftPaths)[flat_index3(batchSlotsPtr[bi], ti, pi,
mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())],
BufferRange<SizeType32>(*mOutputNextDraftPaths)[flat_index3(
bi, ti, pi, mSamplingParams.getMaxDecodingTokens(), mSamplingParams.getMaxPathLen())]);
}
}
EXPECT_EQ(BufferRange<SizeType32>(*mOutputNextDraftLens)[bi],
bi < numCtxRequests ? 0 : BufferRange<SizeType32>(*mInputNextDraftLens)[batchSlotsPtr[bi]]);
}
auto const maxGenerationLength = bufferCast<SizeType32>(*mMaxGenerationLength)[0];
for (SizeType32 bi = 0; bi < mSamplingParams.getNumGenRequests(); ++bi)
{
for (SizeType32 ti = 0; ti < mSamplingParams.getMaxDecodingDraftTokens(); ++ti)
{
EXPECT_EQ(BufferRange<SizeType32>(*mInputNextDraftTokens)[flat_index2(
batchSlotsPtr[numCtxRequests + bi], ti, mSamplingParams.getMaxDecodingDraftTokens())],
BufferRange<SizeType32>(*mOutputNextDraftTokens)[flat_index2(
numCtxRequests + bi, ti, mSamplingParams.getMaxDecodingDraftTokens())]);
}
EXPECT_EQ(BufferRange<SizeType32>(*mInputSpecDecodingGenerationLengths)[batchSlotsPtr[numCtxRequests + bi]],
BufferRange<SizeType32>(*mOutputSpecDecodingGenerationLengths)[bi]);
for (SizeType32 ti = 0; ti < maxGenerationLength; ++ti)
{
EXPECT_EQ(BufferRange<SizeType32>(*mInputSpecDecodingPositionOffsets)[flat_index2(
batchSlotsPtr[numCtxRequests + bi], ti, mSamplingParams.getMaxDecodingTokens())],
BufferRange<SizeType32>(
*mOutputSpecDecodingPositionOffsets)[flat_index2(bi, ti, maxGenerationLength)])
<< "bi: " << bi << " ti: " << ti;
}
auto const numTokens = (bi == 0) ? bufferCast<SizeType32>(*mCumSumGenerationLengths)[0]
: bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi]
- bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi - 1];
auto const outputStartId = (bi == 0) ? 0 : bufferCast<SizeType32>(*mCumSumGenerationLengths)[bi - 1];
auto const numPackedMasks
= static_cast<SizeType32>(tensorrt_llm::common::divUp(mSamplingParams.getMaxDecodingTokens(), 32));
for (SizeType32 ti = 0; ti < numTokens * numPackedMasks; ++ti)
{
EXPECT_EQ(BufferRange<SizeType32>(
*mInputSpecDecodingPackedMasks)[flat_index2(batchSlotsPtr[numCtxRequests + bi], ti,
mSamplingParams.getMaxDecodingTokens() * numPackedMasks)],
BufferRange<SizeType32>(
*mOutputSpecDecodingPackedMasks)[flat_index2(outputStartId, ti, numPackedMasks)])
<< "bi: " << bi << " ti: " << ti;
}
}
}
void run(SamplingParams samplingParams)
{
mSamplingParams = samplingParams;
allocateBuffers();
initBuffers();
callPackData();
mStream->synchronize();
verifyResults();
}
private:
std::shared_ptr<tensorrt_llm::runtime::CudaStream> mStream;
std::shared_ptr<tensorrt_llm::runtime::BufferManager> mBufferManager;
// input
TensorPtr mBatchSlots;
TensorPtr mInputTemperatures;
TensorPtr mInputRandomDataSample;
TensorPtr mInputRandomDataValidation;
TensorPtr mInputNextDraftTokens;
TensorPtr mInputNextDraftLens;
TensorPtr mInputNextDraftPaths;
TensorPtr mInputSpecDecodingGenerationLengths;
TensorPtr mInputSpecDecodingPositionOffsets;
TensorPtr mInputSpecDecodingPackedMasks;
// output
TensorPtr mOutputTemperatures;
TensorPtr mOutputRandomDataSample;
TensorPtr mOutputRandomDataValidation;
TensorPtr mOutputNextDraftTokens;
TensorPtr mOutputNextDraftLens;
TensorPtr mOutputNextDraftPaths;
TensorPtr mOutputSpecDecodingGenerationLengths;
TensorPtr mOutputSpecDecodingPositionOffsets;
TensorPtr mOutputSpecDecodingPackedMasks;
// workspace
TensorPtr mMaxGenerationLength;
TensorPtr mCumSumGenerationLengths;
BufferPtr mScanReduceTempStorage;
SizeType32 mScanTempStorageBytes;
SizeType32 mReduceTempStorageBytes;
SamplingParams mSamplingParams;
};
TEST_F(EaglePackDataTest, Ctx1Gen0)
{
SamplingParams params;
params.setNumCtxRequests(1);
params.setNumGenRequests(0);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx0Gen1)
{
SamplingParams params;
params.setNumCtxRequests(0);
params.setNumGenRequests(1);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx100Gen0)
{
SamplingParams params;
params.setNumCtxRequests(100);
params.setNumGenRequests(0);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx0Gen100)
{
SamplingParams params;
params.setNumCtxRequests(0);
params.setNumGenRequests(100);
this->run(params);
}
TEST_F(EaglePackDataTest, Ctx100Gen100)
{
SamplingParams params;
params.setNumCtxRequests(100);
params.setNumGenRequests(100);
this->run(params);
}
} // namespace

View File

@ -137,7 +137,6 @@
"layernorm_quantization_plugin": null,
"rmsnorm_quantization_plugin": null,
"nccl_plugin": null,
"lookup_plugin": null,
"lora_plugin": "auto",
"weight_only_groupwise_quant_matmul_plugin": null,
"weight_only_quant_matmul_plugin": null,

View File

@ -139,7 +139,8 @@ def parallel_run_ctest(
# Some catastrophic fail happened that there's no report generated
raise
parallel_report = 'parallel-' + report
# Avoid .xml extension to prevent CI from reading failures from it
parallel_report = 'parallel-' + report + ".intermediate"
_os.rename(cwd / report, cwd / parallel_report)
try:
@ -153,7 +154,7 @@ def parallel_run_ctest(
# Use parallel result as final report
_os.rename(cwd / parallel_report, cwd / report)
else:
retry_report = 'retry-' + report
retry_report = 'retry-' + report + ".intermediate"
_os.rename(cwd / report, cwd / retry_report)
merge_report(cwd / parallel_report, cwd / retry_report,
cwd / report)
@ -472,6 +473,12 @@ def prepare_multi_gpu_model_tests(python_exe: str,
model_cache_arg=model_cache_arg,
only_multi_gpu_arg=only_multi_gpu_arg)
prepare_model_tests(model_name="llama",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)
prepare_model_tests(model_name="t5",
python_exe=python_exe,
root_dir=root_dir,
@ -704,6 +711,19 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
]
run_command(cache_trans_test, cwd=tests_dir, env=cpp_env, timeout=300)
# Cache transceiver tests
cache_trans_test_8_proc = [
"mpirun",
"-n",
"8",
"--allow-run-as-root",
"batch_manager/cacheTransceiverTest",
]
run_command(cache_trans_test_8_proc,
cwd=tests_dir,
env=cpp_env,
timeout=600)
# UCX transceiver tests, the test may not be built if ENABLE_UCX is 0
if _os.path.exists(
_os.path.join(tests_dir, "batch_manager/ucxDataTransceiverTest")):
@ -843,6 +863,19 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
new_env = copy.copy(cpp_env)
new_env["RUN_LLAMA_MULTI_GPU"] = "true"
xml_output_file = build_dir / "results-multi-gpu-disagg-asymmetric-executor-8-process.xml"
trt_model_test = produce_mpirun_command(
global_commands=["mpirun", "--allow-run-as-root"],
nranks=8,
local_commands=[
"executor/executorTest",
"--gtest_filter=*DisaggAsymmetricExecutorTest*"
],
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=new_env, timeout=1500)
def run_benchmarks(model_name: str, python_exe: str, root_dir: _pl.Path,
build_dir: _pl.Path, resources_dir: _pl.Path,

View File

@ -1,6 +1,6 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=24.07-py3
ARG BASE_TAG=24.09-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} as base

View File

@ -155,7 +155,7 @@ centos7_%: BASE_TAG = 12.4.0-devel-centos7
# For x86_64 and aarch64
ubuntu22_%: STAGE = devel
ubuntu22_%: BASE_IMAGE = nvidia/cuda
ubuntu22_%: BASE_TAG = 12.5.1-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 12.6.1-devel-ubuntu22.04
trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0

View File

@ -4,7 +4,7 @@ set -ex
# This script is used for reinstalling CUDA on CentOS 7 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
CUDA_VER="12.5.1_555.42.06"
CUDA_VER="12.6.1_560.35.03"
CUDA_VER_SHORT="${CUDA_VER%_*}"
NVCC_VERSION_OUTPUT=$(nvcc --version)

View File

@ -4,12 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07
TORCH_VERSION="2.4.0"
# Check the compatible torchvision from
# https://github.com/pytorch/vision/tree/main?tab=readme-ov-file#installation
# and also confirm with https://pypi.org/pypi/torchvision/0.19.0/json
TORCHVISION_VERSION="0.19.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-09.html#rel-24-09
TORCH_VERSION="2.4.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {
@ -61,6 +57,9 @@ install_from_source() {
python3 setup.py install
cd /tmp && rm -rf /tmp/pytorch
# Get torchvision version by dry run
TORCHVISION_VERSION=$(pip3 install --dry-run torch==${TORCH_VERSION} torchvision | grep "Would install" | tr ' ' '\n' | grep torchvision | cut -d "-" -f 2)
export PYTORCH_VERSION=${PYTORCH_BUILD_VERSION}
export FORCE_CUDA=1
export BUILD_VERSION=${TORCHVISION_VERSION}
@ -76,7 +75,7 @@ install_from_source() {
install_from_pypi() {
pip3 uninstall -y torch torchvision
pip3 install torch==${TORCH_VERSION} torchvision==${TORCHVISION_VERSION}
pip3 install torch==${TORCH_VERSION} torchvision
}
case "$1" in

View File

@ -2,19 +2,20 @@
set -ex
TRT_VER="10.4.0.26"
TRT_VER="10.5.0.18"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-07.html#rel-24-07
CUDA_VER="12.5" # 12.5.1
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-09.html#rel-24-09
CUDA_VER="12.6" # 12.6.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.2.1.18-1"
NCCL_VER="2.22.3-1+cuda12.5"
CUBLAS_VER="12.5.3.2-1"
CUDA_DRIVER_VERSION="560.35.03-1"
CUDNN_VER="9.4.0.58-1"
NCCL_VER="2.22.3-1+cuda12.6"
CUBLAS_VER="12.6.1.4-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/archive/12.5.1/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.5.82-1"
CUDA_RUNTIME="12.5.82-1"
# https://docs.nvidia.com/cuda/archive/12.6.1/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.6.68-1"
CUDA_RUNTIME="12.6.68-1"
for i in "$@"; do
case $i in
@ -76,7 +77,9 @@ install_centos_requirements() {
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.x86_64.rpm
yum remove -y "cuda-toolkit*" && yum -y localinstall cuda-toolkit-${CUBLAS_CUDA_VERSION}-config-common-${CUDA_RUNTIME}.noarch.rpm cuda-toolkit-12-config-common-${CUDA_RUNTIME}.noarch.rpm cuda-toolkit-config-common-${CUDA_RUNTIME}.noarch.rpm
yum remove -y "cuda-compat*" && yum -y localinstall cuda-compat-${CUBLAS_CUDA_VERSION}-${CUDA_DRIVER_VERSION}.x86_64.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
wget -q https://developer.download.nvidia.cn/compute/cuda/repos/rhel8/x86_64/libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
yum remove -y "libcublas*" && yum -y localinstall libcublas-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm libcublas-devel-${CUBLAS_CUDA_VERSION}-${CUBLAS_VER}.x86_64.rpm
@ -96,7 +99,7 @@ install_tensorrt() {
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu24_04" && OS2="Ubuntu-24.04" && OS="ubuntu-24.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.4.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.5.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

View File

@ -46,6 +46,12 @@ myst_url_schemes = {
"https://github.com/NVIDIA/TensorRT-LLM/tree/" + branch_name + "/{{path}}",
}
myst_heading_anchors = 4
myst_enable_extensions = [
"deflist",
]
autosummary_generate = True
copybutton_exclude = '.linenos, .gp, .go'
copybutton_prompt_text = ">>> |$ |# "

View File

@ -111,7 +111,8 @@ Welcome to TensorRT-LLM's Documentation!
:name: Performance
performance/perf-overview.md
performance/perf-best-practices.md
Benchmarking <performance/perf-benchmarking.md>
Best Practices <performance/perf-best-practices.md>
performance/perf-analysis.md

View File

@ -185,12 +185,15 @@ Building from source produces the following library files.
- `th_common.exp`
- `th_common.lib`
The locations of the DLLs, in addition to some `torch` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
```bash
%USERPROFILE%\inference\TensorRT\lib
%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib
```
Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder.
Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.

View File

@ -4,7 +4,7 @@
```{note}
The Windows release of TensorRT-LLM is currently in beta.
We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.13.0) for the most stable experience.
We recommend checking out the [v0.14.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.14.0) for the most stable experience.
```
**Prerequisites**
@ -52,7 +52,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L
before installing TensorRT-LLM with the following command.
```bash
pip install tensorrt_llm==0.13.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/
pip install tensorrt_llm==0.14.0 --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/
```
Run the following command to verify that your TensorRT-LLM installation is working properly.
@ -70,7 +70,7 @@ We recommend checking out the [v0.13.0 tag](https://github.com/NVIDIA/TensorRT-L
This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install
[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version)
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSCV DLL).
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL).
2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies.

View File

@ -1,26 +1,36 @@
(perf-benchmarking)=
# TensorRT-LLM Benchmarking
> [!WARNING] Work in Progress
> This benchmarking suite is a current work in progress and is prone to large changes.
```{important}
This benchmarking suite is a work in progress.
Expect breaking API changes.
```
TensorRT-LLM provides a packaged benchmarking utility that is accessible via the `trtllm-bench` CLI tool.
TensorRT-LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility.
#### Supported Networks for Benchmarking
- [`tiiuae/falcon-180B`](https://huggingface.co/tiiuae/falcon-180B)
- [`meta-llama/Llama-2-7b-hf`](https://huggingface.co/meta-llama/Llama-2-7b-hf)
- [`meta-llama/Llama-2-70b-hf`](https://huggingface.co/meta-llama/Llama-2-70b-hf)
- [`meta-llama/Meta-Llama-3-8B`](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
- [`meta-llama/Meta-Llama-3-70B`](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
- [`EleutherAI/gpt-j-6b`](https://huggingface.co/EleutherAI/gpt-j-6b)
- [`mistralai/Mistral-7B-v0.1`](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [`mistralai/Mixtral-8x7B-v0.1`](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
- [meta-llama/Llama-2-7b-hf](https://huggingface.co/meta-llama/Llama-2-7b-hf)
- [meta-llama/Llama-2-70b-hf](https://huggingface.co/meta-llama/Llama-2-70b-hf)
- [tiiuae/falcon-180B](https://huggingface.co/tiiuae/falcon-180B)
- [EleutherAI/gpt-j-6b](https://huggingface.co/EleutherAI/gpt-j-6b)
- [meta-llama/Meta-Llama-3-8B](https://huggingface.co/meta-llama/Meta-Llama-3-8B)
- [meta-llama/Llama-3.1-8B](https://huggingface.co/meta-llama/Llama-3.1-8B)
- [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B)
- [meta-llama/Llama-3.1-70B](https://huggingface.co/meta-llama/Llama-3.1-70B)
- [meta-llama/Llama-3.1-405B](https://huggingface.co/meta-llama/Llama-3.1-405B)
- [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1)
- [mistralai/Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
- [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct)
- [meta-llama/Llama-3.1-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-70B-Instruct)
- [meta-llama/Llama-3.1-405B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-405B-Instruct)
- [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1)
#### Supported Quantization Modes
TensorRT-LLM supports a number of quanization modes. For more information about quantization, see the
[documentation](https://nvidia.github.io/TensorRT-LLM/reference/precision.html).
TensorRT-LLM supports a number of quantization modes:
- None (no quantization applied)
- W8A16
@ -31,7 +41,8 @@ TensorRT-LLM supports a number of quanization modes. For more information about
- FP8
- INT8
> [!NOTE] Please see the supported quantization methods for each network [here](https://nvidia.github.io/TensorRT-LLM/reference/precision.html#support-matrix)
For more information about quantization, refer to [](../reference/precision.md) and
the [support matrix](../reference/precision.md#support-matrix), which lists the supported quantization methods for each network.
## Inflight Benchmarking with a Dataset
@ -41,9 +52,10 @@ This section covers how to benchmark TensorRT-LLM using inflight batching.
### Quickstart
For this quick start guide, we will focus on running a short max throughput benchmark on
This quick start focuses on running a short max throughput benchmark on
`meta-llama/Llama-2-7b-hf` on a synthetic dataset with a uniform distribution of prompts with ISL:OSL
of 128:128. In order to run the benchmark from start to finish simply run the following commands:
of 128:128.
To run the benchmark from start to finish, run the following commands:
```shell
python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer meta-llama/Llama-2-7b-hf token-norm-dist --input-mean 128 --output-mean 128 --input-stdev 0 --output-stdev 0 --num-requests 3000 > /tmp/synthetic_128_128.txt
@ -51,7 +63,8 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --dataset /tmp/synthetic_128
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
```
And that's it! Once the benchmark completes, a summary will be printed with summary metrics.
And that's it!
After the benchmark completes, `trtllm-bench` prints a summary of the collected metrics.
```shell
===========================================================
@ -108,28 +121,31 @@ straightforward to specify requests. The schema is defined as follows:
| `logits` | N* | List[Integer] | List of logits that make up the request prompt. |
| `output_tokens` | Y | Integer | Number of generated tokens for this request. |
> [!NOTE] Prompt and logits are mutually exclusive*
> While having both `prompt` and `logits` is not required, at least one is required.
> If `logits` are specified, the `prompt` entry is ignored for request generation.
Prompt and logits are mutually exclusive, but one of `prompt` or `logits` is required.
If you specify `logits`, the `prompt` entry is ignored for request generation.
Examples of valid entries for the inflight benchmark are:
Refer to the following examples of valid entries for the inflight benchmark:
- Entries with a human-readable prompt and no logits.
```json
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
```
```json
{"task_id": 1, "prompt": "Generate an infinite response to the following: This is the song that never ends, it goes on and on my friend.", "output_tokens": 1000}
{"task_id": 2, "prompt": "Generate an infinite response to the following: Na, na, na, na", "output_tokens": 1000}
```
- Entries which contain logits.
```json
{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
```
> [!INFO] A whole entry is on a line!
> To make the passing of data simpler, a complete JSON entry is on each line so that the benchmarker
> can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
> JSON entry is on every line.
```json
{"task_id":0,"logits":[863,22056,25603,11943,8932,13195,3132,25032,21747,22213],"output_tokens":128}
{"task_id":1,"logits":[14480,13598,15585,6591,1252,8259,30990,26778,7063,30065,21764,11023,1418],"output_tokens":128}
```
```{tip}
Specify each entry on one line.
To simplify passing the data, a complete JSON entry is on each line so that the benchmarker
can simply read a line and assume a complete entry. When creating a dataset, be sure that a complete
JSON entry is on every line.
```
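For illustration only, the following minimal Python sketch writes a dataset in this line-delimited format. It is not part of TensorRT-LLM; the file path, prompts, and token IDs are arbitrary placeholders.
```python
import json

# Placeholder entries that follow the request schema described above.
entries = [
    {"task_id": 0, "prompt": "Write a short story about a robot.", "output_tokens": 128},
    {"task_id": 1, "logits": [101, 2023, 2003, 1037, 3231], "output_tokens": 128},
]

# Write one complete JSON entry per line so the benchmarker can read the file line by line.
with open("/tmp/my_dataset.txt", "w") as f:
    for entry in entries:
        f.write(json.dumps(entry) + "\n")
```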
#### Using `prepare_dataset` to Create Synthetic Datasets
@ -162,12 +178,12 @@ trtllm-bench --model meta-llama/Llama-2-7b-hf build --max_seq_len 256 --quantiza
> [!NOTE] `trtllm-bench build` reproduces benchmark engines for performance study. These engine
configurations are not guaranteed to be optimal for all cases and should be viewed as reproducers
for the benchmark data we provide on our [Performance Overview](../docs/source/performance/perf-overview.md).
for the benchmark data we provide on our [Performance Overview](./perf-overview.md).
Looking a little closer, the `build` sub-command
will perform a lookup and build an engine using those reference settings. The
lookup table directly corresponds to the performance table found in our
[Performance Overview](../docs/source/performance/perf-overview.md#throughput-measurements). The
[Performance Overview](./perf-overview.md#throughput-measurements). The
output of the `build` sub-command looks similar to the snippet below (for `meta-llama/Llama-2-7b-hf`):
```shell
@ -236,16 +252,17 @@ upper bound throughput number.
#### How the Benchmarker Works
The benchmarker will read in a data file or standard input (stdin) as a stream where a single line contains
a complete JSON request entry. The process that the benchmarker is as follows:
The benchmarker reads a data file where a single line contains
a complete JSON request entry as specified in [](#preparing-a-dataset).
The benchmarker proceeds as follows (a brief sketch of this flow appears after the list):
1. Iterate over all input requests. If `logits` is specified, construct the request using the specified
list of logits. Otherwise, tokenize the `prompt` with the tokenizer specified by `--model $HF_MODEL_NAME`.
3. Submit the dataset to the TensorRT-LLM `Executor` API at as fast of a rate as possible (offline mode).
4. Wait for all requests to return, compute statistics, then report out results.
1. Submit the dataset to the TensorRT-LLM `Executor` API as fast as possible (offline mode).
1. Wait for all requests to return, compute statistics, and then report results.
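The following Python sketch illustrates this flow at a high level. It is not the actual `trtllm-bench` implementation; `tokenize`, `submit_request`, and `wait_for_all` are hypothetical callables that stand in for the tokenizer and the `Executor` bindings.
```python
import json

def run_benchmark(dataset_path, tokenize, submit_request, wait_for_all):
    """Minimal sketch of the benchmarking flow; all three callables are hypothetical."""
    requests = []
    with open(dataset_path) as f:
        for line in f:  # one complete JSON entry per line
            entry = json.loads(line)
            # If logits are given, use them directly; otherwise tokenize the prompt.
            input_ids = entry["logits"] if "logits" in entry else tokenize(entry["prompt"])
            requests.append((input_ids, entry["output_tokens"]))

    # Offline mode: submit every request as fast as possible, then wait for all of them.
    handles = [submit_request(ids, max_new_tokens) for ids, max_new_tokens in requests]
    results = wait_for_all(handles)
    return results  # statistics are computed from these results and reported
```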
To run the benchmarker, run the following with the [engine](#building-a-benchmark-engine) and
[dataset](#preparing-a-dataset) generated above:
To run the benchmarker, run the following commands with the [engine](#building-a-benchmark-engine) and
[dataset](#preparing-a-dataset) generated from previous steps:
```shell
trtllm-bench --model meta-llama/Llama-2-7b-hf throughput --dataset /tmp/synthetic_128_128.txt --engine_dir /tmp/meta-llama/Llama-2-7b-hf/tp_1_pp_1
@ -316,16 +333,157 @@ Total Latency (seconds): 20.331645167
[TensorRT-LLM][INFO] Refreshed the MPI local session
```
## Low Latency Benchmark
The low latency benchmark follows a similar workflow to the [throughput benchmark](#running-a-max-throughput-benchmark)
but requires building the engine separately from `trtllm-bench`. The low latency benchmark has the following modes:
- A single-request low-latency engine
- A Medusa-enabled speculative-decoding engine
### Low Latency TensorRT-LLM Engine for Llama-3 70B
To build a low-latency engine for the latency benchmark, run the following quantize and build commands.
The `$checkpoint_dir` is the path to the [meta-llama/Meta-Llama-3-70B](https://huggingface.co/meta-llama/Meta-Llama-3-70B) Hugging Face checkpoint in your cache or downloaded to a specific location with the [huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
To prepare a dataset, follow the same process as specified in [](#preparing-a-dataset).
#### Benchmarking a non-Medusa Low Latency Engine
To quantize the checkpoint:
```shell
cd tensorrt_llm/examples/llama
python ../quantization/quantize.py \
--model_dir $checkpoint_dir \
--dtype bfloat16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
--calib_size 512 \
--tp_size $tp_size
```
Then build the engine:
```shell
trtllm-build \
--checkpoint_dir /tmp/meta-llama/Meta-Llama-3-70B/checkpoint \
--use_fused_mlp enable \
--gpt_attention_plugin bfloat16 \
--output_dir /tmp/meta-llama/Meta-Llama-3-70B/engine \
--max_batch_size 1 \
--max_seq_len $(($isl+$osl)) \
--reduce_fusion enable \
--gemm_plugin fp8 \
--workers $tp_size \
--use_fp8_context_fmha enable \
--max_num_tokens $isl \
--use_paged_context_fmha disable \
--multiple_profiles enable
```
After the engine is built, run the low-latency benchmark:
```shell
env TRTLLM_ENABLE_MMHA_MULTI_BLOCK_DEBUG=1 \
TRTLLM_MMHA_KERNEL_BLOCK_SIZE=256 \
TRTLLM_MMHA_BLOCKS_PER_SEQUENCE=32 \
FORCE_MULTI_BLOCK_MODE=ON \
TRTLLM_ENABLE_FDL=1 \
trtllm-bench --model meta-llama/Meta-Llama-3-70B \
latency \
--dataset $DATASET_PATH \
--engine_dir /tmp/meta-llama/Meta-Llama-3-70B/engine
```
#### Building a Medusa Low-Latency Engine
Building a Medusa-enabled engine requires checkpoints that contain Medusa heads.
NVIDIA provides TensorRT-LLM checkpoints on the [NVIDIA](https://huggingface.co/nvidia) page on Hugging Face.
The checkpoints are pre-quantized and can be directly built after downloading them with the
[huggingface-cli](https://huggingface.co/docs/huggingface_hub/en/guides/cli).
After you download the checkpoints, run the following command and specify the `$tp_size` supported by your Medusa checkpoint:
```shell
trtllm-build --checkpoint_dir $checkpoint_dir \
--speculative_decoding_mode medusa \
--max_batch_size 1 \
--gpt_attention_plugin bfloat16 \
--max_seq_len $(($isl+$osl)) \
--output_dir /tmp/meta-llama/Meta-Llama-3-70B/medusa/engine \
--use_fused_mlp enable \
--paged_kv_cache enable \
--use_paged_context_fmha disable \
--multiple_profiles enable \
--reduce_fusion enable \
--use_fp8_context_fmha enable \
--workers $tp_size \
--low_latency_gemm_plugin fp8
```
After the engine is built, you need to define the Medusa choices.
The choices are specified in a YAML file like the following example (`medusa.yaml`):
```yaml
- [0]
- [0, 0]
- [1]
- [0, 1]
- [2]
- [0, 0, 0]
- [1, 0]
- [0, 2]
- [3]
- [0, 3]
- [4]
- [0, 4]
- [2, 0]
- [0, 5]
- [0, 0, 1]
```
To run the Medusa-enabled engine, run the following command:
```shell
env TRTLLM_ENABLE_PDL=1 \
UB_ONESHOT=1 \
UB_TP_SIZE=$tp_size \
TRTLLM_PDL_OVERLAP_RATIO=0.15 \
TRTLLM_PREFETCH_RATIO=-1 \
trtllm-bench --model meta-llama/Meta-Llama-3-70B \
latency \
--dataset $DATASET_PATH \
--engine_dir /tmp/meta-llama/Meta-Llama-3-70B/medusa/engine \
--medusa_choices medusa.yaml
```
## Summary
In summary, the general process for reproducing a benchmark point is as follows:
The following table summarizes the commands needed for running benchmarks:
- Prepare a dataset: `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH`
- Build engine: `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH`
- Benchmark engine: trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR`
| Scenario | Phase | Command |
| - | - | - |
| Dataset | Preparation | `python benchmarks/cpp/prepare_dataset.py --stdout --tokenizer $HF_MODEL token-norm-dist --input-mean $ISL --output-mean $OSL --input-stdev 0 --output-stdev 0 --num-requests $NUM_REQUESTS > $DATASET_PATH` |
| Throughput | Build | `trtllm-bench --model $HF_MODEL build --dataset $DATASET_PATH` |
| Throughput | Benchmark | `trtllm-bench --model $HF_MODEL throughput --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
| Latency | Build | See [section about building low latency engines](#low-latency-tensorrt-llm-engine-for-llama-3-70b) |
| Non-Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR` |
| Medusa Latency | Benchmark | `trtllm-bench --model $HF_MODEL latency --dataset $DATASET_PATH --engine_dir $ENGINE_DIR --medusa_choices $MEDUSA_CHOICES` |
where,
- `$HF_MODEL` is the Huggingface name of a model.
- `$NUM_REQUESTS` is the number of requests to generate.
- `$DATASET_PATH` is the path where the dataset was written when preparing the dataset.
- `$ENGINE_DIR` the engine directory as printed by `trtllm-bench build`.
`$HF_MODEL`
: The Hugging Face name of a model.
`$NUM_REQUESTS`
: The number of requests to generate.
`$DATASET_PATH`
: The path where the dataset was written when preparing the dataset.
`$ENGINE_DIR`
: The engine directory as printed by `trtllm-bench build`.
`$MEDUSA_CHOICES`
: A YAML config representing the Medusa tree for the benchmark.

View File

@ -149,24 +149,16 @@ only supported for the llama model. It is recommended to enable this feature whe
The embedding parallelism feature enables the sharding of the embedding table
across multiple GPUs, so that the memory usage could be reduced and the
throughput improved. The embedding sharing feature enables the sharing of the
embedding table between `look_up` and `lm_head` layers.
embedding table between `look_up` and `lm_head` layers to reduce memory usage.
The look-up plugin implements the embedding sharing feature and is required to
enable the aforementioned features for now (until TensorRT native layers
support embedding sharing).
It is recommended to enable embedding parallelism to improve throughput with `--use_parallel_embedding` and `--embedding_sharding_dim` in `convert_checkpoint.py`.
It is recommended to enable the embedding parallelism and sharing features to
improve throughput. However, the following conditions have to be satisfied:
Embedding sharing is enabled by default if the following conditions are met:
1. `look_up` and `lm_head` layers have identical weights.
2. `--gemm_plugin` is not used when building the engine.
3. For tensor parallelism cases, `--embedding_sharding_dim 0` must be set. In other words, embedding parallelism must be enabled along the vocab dimension.
1. The model shares the embedding table between `look_up` and `lm_head` layers,
2. Both look_up plugin and gemm plugin are enabled,
3. The sharding dimension of the embedding lookup table is set correctly.
To enable the features, use the `--use_parallel_embedding`, `--embedding_sharding_dim` and
`--use_embedding_sharing` arguments in `convert_checkpoint.py`, and use the
`--lookup_plugin`, `--gemm_plugin` arguments in `trtllm-build` command. See those
[Examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#embedding-parallelism-and-sharing)
for details.
See the [examples](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/gpt#embedding-parallelism) for details.
### Horizontal Fusion in Gated-MLP
@ -226,7 +226,7 @@ The performance numbers below were collected using the steps described in this d
> [!NOTE] The only models supported in this workflow are those listed in the table above.
The following tables are references for commands that are used as part of the benchmarking process. For a more detailed
description of this benchmarking workflow, see the [Benchmarking Suite README](../../../benchmarks/Suite.md).
description of this benchmarking workflow, see the [benchmarking suite documentation](https://nvidia.github.io/TensorRT-LLM/performance/perf-benchmarking.html).
### Commands
@ -105,9 +105,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [24.07](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [24.09](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.4](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.5](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4
@ -12,27 +12,6 @@ In most occasions, these problems are caused by the workflow like: an old compil
Solution: try running the build script with `--clean`, or run `rm -r build cpp/build` before running the build script again.
## cuDNN Linking Errors
Errors such as "Entry Point Not Found" (for example [#1062](https://github.com/NVIDIA/TensorRT-LLM/issues/1062)).
Solution: the issue might be a mismatch in the `cuDNN` libraries shipped from `torch` and `tensorrt`. To rectify this, please try the following steps
```bash
python -m pip uninstall -y tensorrt_llm
python -m pip install --upgrade pip
python -m pip install nvidia-cudnn-cu11==8.9.4.25 --no-cache-dir
python -m pip install --pre --extra-index-url https://pypi.nvidia.com/ tensorrt==9.2.0.post12.dev5 --no-cache-dir
python -m pip uninstall -y nvidia-cudnn-cu11
python -m pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com/ --extra-index-url https://download.pytorch.org/whl/cu121
```
## Model Debug
When debugging a TensorRT-LLM model, we usually want to print the values of intermediate tensors.
We should mark the intermediate tensors of interest as network outputs and then print their values at runtime, since TensorRT-LLM follows the define-and-run paradigm.
## Debug on Unit Tests
Here is an example of printing the values of the MLP output tensor in a unit test ([full example](../../../tests/test_debugging_api.py)).
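A minimal sketch of that pattern is shown below; it assumes the `register_network_output` helper and the `MLP` layer exposed by TensorRT-LLM, and exact names or signatures may differ between versions:
```python
# Sketch only: expose an MLP's output as a network output so its values can be
# inspected at runtime. Class and method names follow the usual TensorRT-LLM
# debugging pattern but are assumptions and may vary across versions.
from tensorrt_llm.functional import Tensor
from tensorrt_llm.layers import MLP


class DebuggableMLP(MLP):

    def forward(self, hidden_states: Tensor) -> Tensor:
        out = super().forward(hidden_states)
        # Because TensorRT-LLM follows a define-and-run paradigm, the tensor has
        # to be registered as a network output at definition time; it can then be
        # read back (and printed) when the engine runs with debug output enabled.
        self.register_network_output('mlp_output', out)
        return out
```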
@ -5,6 +5,49 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
## TensorRT-LLM Release 0.14.0
### Key Features and Enhancements
- Enhanced the `LLM` class in the [LLM API](https://nvidia.github.io/TensorRT-LLM/llm-api/index.html).
- Added support for calibration with offline dataset.
- Added support for Mamba2.
- Added support for `finish_reason` and `stop_reason`.
- Added FP8 support for CodeLlama.
- Added `__repr__` methods for class `Module`, thanks to the contribution from @1ytic in #2191.
- Added BFloat16 support for fused gated MLP.
- Updated ReDrafter beam search logic to match Apple ReDrafter v1.1.
- Improved `customAllReduce` performance.
- Draft model now can copy logits directly over MPI to the target model's process in `orchestrator` mode. This fast logits copy reduces the delay between draft token generation and the beginning of target model inference.
- NVIDIA Volta GPU support is deprecated and will be removed in a future release.
### API Changes
- [BREAKING CHANGE] The default `max_batch_size` of the `trtllm-build` command is set to `2048`.
- [BREAKING CHANGE] Removed `builder_opt` from the `BuildConfig` class and the `trtllm-build` command.
- Added logits post-processor support to the `ModelRunnerCpp` class.
- Added `isParticipant` method to the C++ `Executor` API to check if the current process is a participant in the executor instance.
### Model Updates
- Added support for NemotronNas, see `examples/nemotron_nas/README.md`.
- Added support for Deepseek-v1, see `examples/deepseek_v1/README.md`.
- Added support for Phi-3.5 models, see `examples/phi/README.md`.
### Fixed Issues
- Fixed a typo in `tensorrt_llm/models/model_weights_loader.py`, thanks to the contribution from @wangkuiyi in #2152.
- Fixed duplicated import module in `tensorrt_llm/runtime/generation.py`, thanks to the contribution from @lkm2835 in #2182.
- Enabled `share_embedding` for the models that have no `lm_head` in legacy checkpoint conversion path, thanks to the contribution from @lkm2835 in #2232.
- Fixed `kv_cache_type` issue in the Python benchmark, thanks to the contribution from @qingquansong in #2219.
- Fixed an issue with SmoothQuant calibration with custom datasets. Thanks to the contribution by @Bhuvanesh09 in #2243.
- Fixed an issue surrounding `trtllm-build --fast-build` with fake or random weights. Thanks to @ZJLi2013 for flagging it in #2135.
- Fixed missing `use_fused_mlp` when constructing `BuildConfig` from dict, thanks for the fix from @ethnzhng in #2081.
- Fixed lookahead batch layout for `numNewTokensCumSum`. (#2263)
### Infrastructure Changes
- The dependent ModelOpt version is updated to v0.17.
### Documentation
- @Sherlock113 added a [tech blog](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml) to the latest news in #2169, thanks for the contribution.
## TensorRT-LLM Release 0.13.0
### Key Features and Enhancements
@ -2,6 +2,9 @@
import asyncio
import json
import logging
import signal
from contextlib import asynccontextmanager
from http import HTTPStatus
from typing import AsyncGenerator, Optional
import click
@ -9,6 +12,7 @@ import uvicorn
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from tensorrt_llm.executor import CppExecutorError, RequestError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig, SamplingParams
TIMEOUT_KEEP_ALIVE = 5 # seconds.
@ -16,11 +20,16 @@ TIMEOUT_KEEP_ALIVE = 5 # seconds.
class LlmServer:
def __init__(self, llm: LLM, kv_cache_config: KvCacheConfig):
def __init__(self, llm: LLM):
self.llm = llm
self.kv_cache_config = kv_cache_config
self.app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
# terminate rank0 worker
yield
self.llm._shutdown()
self.app = FastAPI(lifespan=lifespan)
self.register_routes()
def register_routes(self):
@ -50,20 +59,27 @@ class LlmServer:
sampling_params = SamplingParams(**request_dict)
promise = self.llm.generate_async(prompt,
streaming=streaming,
sampling_params=sampling_params)
try:
promise = self.llm.generate_async(prompt,
streaming=streaming,
sampling_params=sampling_params)
async def stream_results() -> AsyncGenerator[bytes, None]:
async for output in promise:
yield output.outputs[0].text_diff.encode("utf-8")
async def stream_results() -> AsyncGenerator[bytes, None]:
async for output in promise:
yield output.outputs[0].text_diff.encode("utf-8")
if streaming:
return StreamingResponse(stream_results())
if streaming:
return StreamingResponse(stream_results())
# Non-streaming case
await promise.aresult()
return JSONResponse({"text": promise.outputs[0].text})
# Non-streaming case
await promise.aresult()
return JSONResponse({"text": promise.outputs[0].text})
except RequestError as e:
return JSONResponse(content=str(e),
status_code=HTTPStatus.BAD_REQUEST)
except CppExecutorError:
# If internal executor error is raised, shutdown the server
signal.raise_signal(signal.SIGINT)
async def __call__(self, host, port):
config = uvicorn.Config(self.app,
@ -82,28 +98,32 @@ class LlmServer:
@click.option("--max_beam_width", type=int, default=1)
@click.option("--tp_size", type=int, default=1)
@click.option("--pp_size", type=int, default=1)
@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8)
def entrypoint(model_dir: str,
tokenizer: Optional[str] = None,
host: Optional[str] = None,
port: int = 8000,
max_beam_width: int = 1,
tp_size: int = 1,
pp_size: int = 1):
pp_size: int = 1,
kv_cache_free_gpu_memory_fraction: float = 0.8):
host = host or "0.0.0.0"
port = port or 8000
logging.info(f"Starting server at {host}:{port}")
build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width)
kv_cache_config = KvCacheConfig(
free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)
llm = LLM(model_dir,
tokenizer,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
build_config=build_config)
build_config=build_config,
kv_cache_config=kv_cache_config)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
server = LlmServer(llm=llm, kv_cache_config=kv_cache_config)
server = LlmServer(llm=llm)
asyncio.run(server(host, port))
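For reference, a request against this server could look like the sketch below; the `/generate` route name and the payload fields (`prompt`, `streaming`, plus `SamplingParams` keywords such as `max_tokens`) are assumptions inferred from the handler above rather than a documented API:
```python
# Hypothetical client sketch; route path and payload shape are assumptions based
# on the generate handler shown in the diff above.
import requests

resp = requests.post(
    "http://localhost:8000/generate",
    json={"prompt": "Where is New York?", "max_tokens": 32},
    timeout=60,
)
resp.raise_for_status()
print(resp.json()["text"])
```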
@ -1,6 +1,8 @@
#!/usr/bin/env python
import asyncio
import logging
import signal
from contextlib import asynccontextmanager
from http import HTTPStatus
from pathlib import Path
from typing import (AsyncGenerator, AsyncIterator, List, Optional, Tuple,
@ -15,6 +17,7 @@ from openai.types.chat import ChatCompletionMessageParam
from transformers import AutoTokenizer, PreTrainedTokenizer
# yapf: disable
from tensorrt_llm.executor import CppExecutorError
from tensorrt_llm.llmapi import LLM, BuildConfig, KvCacheConfig
from tensorrt_llm.llmapi.llm import RequestOutput
from tensorrt_llm.llmapi.openai_protocol import (
@ -66,10 +69,8 @@ class OpenaiServer:
def __init__(self,
llm: LLM,
model: str,
kv_cache_config: KvCacheConfig,
hf_tokenizer: PreTrainedTokenizer = None):
self.llm = llm
self.kv_cache_config = kv_cache_config
self.tokenizer = hf_tokenizer
model_dir = Path(model)
@ -78,7 +79,13 @@ class OpenaiServer:
else:
self.model = model
self.app = FastAPI()
@asynccontextmanager
async def lifespan(app: FastAPI):
# terminate rank0 worker
yield
self.llm._shutdown()
self.app = FastAPI(lifespan=lifespan)
@self.app.exception_handler(RequestValidationError)
async def validation_exception_handler(_, exc):
@ -326,7 +333,9 @@ class OpenaiServer:
else:
response = await create_chat_response(promise)
return JSONResponse(content=response.model_dump())
except CppExecutorError:
# If internal executor error is raised, shutdown the server
signal.raise_signal(signal.SIGINT)
except Exception as e:
return self.create_error_response(str(e))
@ -432,6 +441,9 @@ class OpenaiServer:
else:
response = await create_completion_response(generator, num_choices)
return JSONResponse(content=response.model_dump())
except CppExecutorError:
# If internal executor error is raised, shutdown the server
signal.raise_signal(signal.SIGINT)
except Exception as e:
return self.create_error_response(str(e))
@ -453,6 +465,7 @@ class OpenaiServer:
@click.option("--max_seq_len", type=int, default=512)
@click.option("--tp_size", type=int, default=1)
@click.option("--pp_size", type=int, default=1)
@click.option("--kv_cache_free_gpu_memory_fraction", type=float, default=0.8)
def entrypoint(model_dir: str,
tokenizer: Optional[str] = None,
host: Optional[str] = None,
@ -460,25 +473,27 @@ def entrypoint(model_dir: str,
max_beam_width: int = 1,
max_seq_len: int = 512,
tp_size: int = 1,
pp_size: int = 1):
pp_size: int = 1,
kv_cache_free_gpu_memory_fraction: float = 0.8):
host = host or "0.0.0.0"
port = port or 8000
logging.info(f"Starting server at {host}:{port}")
build_config = BuildConfig(max_batch_size=10, max_beam_width=max_beam_width, max_seq_len=max_seq_len)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=kv_cache_free_gpu_memory_fraction)
llm = LLM(model_dir,
tokenizer,
tensor_parallel_size=tp_size,
pipeline_parallel_size=pp_size,
build_config=build_config)
build_config=build_config,
kv_cache_config=kv_cache_config)
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.8)
hf_tokenizer = AutoTokenizer.from_pretrained(tokenizer or model_dir)
server = OpenaiServer(llm=llm,
model=model_dir,
kv_cache_config=kv_cache_config,
hf_tokenizer=hf_tokenizer)
asyncio.run(server(host, port))
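Since this server builds on the OpenAI protocol types, a client call could look like the sketch below; the `/v1` route prefix, port, placeholder model name, and the unused API key are assumptions rather than documented behavior:
```python
# Hypothetical usage sketch for the OpenAI-compatible server above; base URL,
# model name, and API key handling are assumptions.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="not-used")

completion = client.chat.completions.create(
    model="meta-llama/Meta-Llama-3-70B",  # placeholder model name
    messages=[{"role": "user", "content": "Where is New York?"}],
    max_tokens=32,
)
print(completion.choices[0].message.content)
```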
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2
@ -106,7 +106,6 @@ python convert_checkpoint.py --model_dir ./bloom/176B/ \
--embedding_sharding_dim 0
trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
--gemm_plugin float16 \
--lookup_plugin float16 \
--output_dir ./bloom/176B/trt_engines/fp16/8-gpu/ \
--workers 2
@ -123,7 +122,7 @@ trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
--workers 2
# share embedding table between embedding() and lm_head() layers
# To reduce the generated engine size, we have to use the gemm and lookup plugins (--gemm_plugin --lookup_plugin) and must shard the embedding table in the vocab dimension.
# To reduce the generated engine size, we can turn off the gemm plugin and shard the embedding table in the vocab dimension.
python convert_checkpoint.py --model_dir ./bloom/176B/ \
--dtype float16 \
--output_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
@ -132,8 +131,6 @@ python convert_checkpoint.py --model_dir ./bloom/176B/ \
--embedding_sharding_dim 0 \
--use_embedding_sharing
trtllm-build --checkpoint_dir ./bloom/176B/trt_ckpt/fp16/8-gpu/ \
--gemm_plugin float16 \
--lookup_plugin float16 \
--output_dir ./bloom/176B/trt_engines/fp16/8-gpu/ \
--workers 2
```
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2
@ -1,8 +1,10 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
protobuf
rouge_score~=0.1.2
sentencepiece
tiktoken
# https://github.com/THUDM/ChatGLM3/issues/1324
transformers<=4.43.0
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99
@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.15.0.dev2024102200
tensorrt_llm==0.15.0.dev2024102900
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99