TensorRT-LLM v0.18 release (#3231)

Commit: 3c0462002c (parent 258c7540c0)
Author: Kaiyu Xie
Date: 2025-04-02 17:01:16 +08:00, committed via GitHub
113 changed files with 1001 additions and 707 deletions

View File

@ -4,7 +4,7 @@ repos:
hooks:
- id: isort
- repo: https://github.com/Lucas-C/pre-commit-hooks.git
rev: v1.1.13
rev: v1.5.5
hooks:
- id: remove-crlf
- repo: https://github.com/google/yapf
@ -24,7 +24,7 @@ repos:
- id: check-yaml
- id: trailing-whitespace
- repo: https://github.com/PyCQA/autoflake
rev: v1.6.1
rev: v2.3.1
hooks:
- id: autoflake
args: ['--in-place', '--remove-all-unused-imports', '--remove-unused-variables']

View File

@ -7,9 +7,9 @@ TensorRT-LLM
[![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/)
[![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/)
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.17.0-green)](./tensorrt_llm/version.py)
[![cuda](https://img.shields.io/badge/cuda-12.8.1-green)](https://developer.nvidia.com/cuda-downloads)
[![trt](https://img.shields.io/badge/TRT-10.9.0-green)](https://developer.nvidia.com/tensorrt)
[![version](https://img.shields.io/badge/release-0.18.0-green)](./tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE)
[Architecture](./docs/source/architecture/overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing)

View File

@ -94,6 +94,7 @@ private:
[[nodiscard]] SizeType32 getMaxSequenceLen() const;
[[nodiscard]] SizeType32 getMaxNumSequences() const;
[[nodiscard]] SizeType32 getMaxDraftLen() const;
[[nodiscard]] SizeType32 getVocabSizePadded() const;
void validateLlmRequest(
LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const;

View File

@ -26,8 +26,6 @@
namespace tensorrt_llm::batch_manager
{
namespace tle = tensorrt_llm::executor;
class AllocateKvCache : Algorithm
{
using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager;

View File

@ -56,7 +56,7 @@ public:
struct ExpiringBlockComparator
{
inline bool operator()(BlockPtr const& a, BlockPtr const& b) const
bool operator()(BlockPtr const& a, BlockPtr const& b) const
{
// If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the
// raw pointer values.
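The comparator above keeps evicted blocks ordered by expiration time and, when two blocks expire in the same millisecond, falls back to pointer identity so the ordering stays strict. A minimal sketch of that pattern, using a hypothetical `Block` type rather than the real kv_cache_manager classes:
```cpp
#include <chrono>
#include <functional>
#include <memory>
#include <set>

// Hypothetical stand-in for the real KV-cache block type.
struct Block
{
    std::chrono::steady_clock::time_point expirationTime;
};
using BlockPtr = std::shared_ptr<Block>;

struct ExpiringBlockComparator
{
    bool operator()(BlockPtr const& a, BlockPtr const& b) const
    {
        // Primary key: earlier expiration sorts first.
        if (a->expirationTime != b->expirationTime)
        {
            return a->expirationTime < b->expirationTime;
        }
        // Tie-break on the raw pointers so distinct blocks never compare
        // equivalent, which keeps std::set from collapsing them into one entry.
        return std::less<Block*>{}(a.get(), b.get());
    }
};

// Blocks ordered cheapest-to-evict first.
using ExpiringBlockSet = std::set<BlockPtr, ExpiringBlockComparator>;
```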

View File

@ -166,6 +166,8 @@ class KVCacheBlock
public:
using IdType = std::int32_t;
static constexpr IdType kCachedBlocksRootId = -1;
explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx);
void startScheduling();
@ -379,6 +381,16 @@ public:
return mKvCacheRetentionConfig.getDecodeDurationMs();
}
[[nodiscard]] bool getContextRequiresCyclicKvCache() const
{
return mContextRequiresCyclicKvCache;
}
void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache)
{
mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache;
}
private:
// Request id of the sequence
LlmRequest::RequestIdType mRequestId;
@ -392,6 +404,9 @@ private:
runtime::ITensor::SharedPtr mCacheBlockIndices;
// The retention priority to assign to decode blocks
executor::KvCacheRetentionConfig mKvCacheRetentionConfig;
// A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache.
bool mContextRequiresCyclicKvCache{false};
};
// attach metadata to a pool pointer
@ -443,7 +458,7 @@ public:
SizeType32 maxNumSequences, std::shared_ptr<runtime::CudaStream> stream, bool onboardBlocks,
CacheType cacheType = CacheType::kSELF,
std::optional<executor::RetentionPriority> secondaryOffloadMinPriority = std::nullopt,
std::shared_ptr<KVCacheEventManager> eventManager = nullptr);
std::shared_ptr<KVCacheEventManager> eventManager = nullptr, bool enableHashKey = false);
~BlockManager();
@ -712,6 +727,9 @@ private:
SizeType32 mMissedBlocks;
std::set<KVCacheBlock::IdType> reusedBlockIds;
// Whether or not to maintain a hashmap of blocks.
bool mEnableHashKey;
private:
friend class KVCacheManager;
};
@ -818,16 +836,18 @@ public:
//! \details These blocks become reusable from next step.
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0;
virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
[[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0;
virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0;
[[nodiscard]] virtual std::vector<std::vector<SizeType32>> const& getCacheBlockIds(
LlmRequest::RequestIdType requestId) const
= 0;
virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
[[nodiscard]] virtual std::vector<std::vector<std::vector<SizeType32>>> getBatchCacheBlockIds(
std::vector<LlmRequest::RequestIdType> const& requestIds) const
= 0;
virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
[[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0;
[[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0;
virtual void refreshBlocks() = 0;
virtual void flushIterationEvents() = 0;
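The accessors in this hunk gain `[[nodiscard]]`, so a caller that silently drops the returned value now triggers a compiler diagnostic. A small illustration with a hypothetical trimmed-down interface (not the real class hierarchy):
```cpp
#include <cstdint>
#include <vector>

// Hypothetical trimmed-down interface illustrating the [[nodiscard]] annotations.
class ICacheManager
{
public:
    virtual ~ICacheManager() = default;

    // Ignoring these return values is almost certainly a bug, so the compiler flags it.
    [[nodiscard]] virtual bool schedulingHasFreeBlocks(std::int32_t numRequired = 1) const = 0;
    [[nodiscard]] virtual std::vector<std::int32_t> const& getCacheBlockIds(std::uint64_t requestId) const = 0;
};

// In caller code:
//   manager.schedulingHasFreeBlocks();  // warning: ignoring return value declared [[nodiscard]]
```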
@ -846,7 +866,7 @@ public:
* 2 * modelConfig.getSizePerHead();
}
[[nodiscard]] static std::tuple<SizeType32, SizeType32> const calculateMaxNumBlocks(KvCacheConfig const& config,
[[nodiscard]] static std::tuple<SizeType32, SizeType32> calculateMaxNumBlocks(KvCacheConfig const& config,
nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig,
tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager);
@ -924,7 +944,7 @@ public:
return mBlockManager.getNumFreeBlocks();
}
[[nodiscard]] virtual SizeType32 getNumPools() const override
[[nodiscard]] SizeType32 getNumPools() const override
{
return mBlockManager.getNumPools();
}
@ -994,8 +1014,6 @@ public:
/// @return The number of blocks
[[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override;
void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens);
/// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed.
void addToken(LlmRequest::RequestIdType requestId) override;

View File

@ -91,7 +91,7 @@ private:
runtime::ITensor::SharedPtr mPool;
runtime::ITensor::SharedPtr mCurrent;
const std::vector<SizeType32> mBlockIds;
std::vector<SizeType32> const mBlockIds;
size_t mIdx;
};

View File

@ -490,9 +490,14 @@ public:
initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs);
}
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen,
void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded,
std::optional<SizeType32> maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false)
{
if (mEndId.has_value())
{
TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded,
"EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded);
}
TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()),
"Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(),
maxEncoderInputLen.value());
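The added check constrains `endId` to the half-open range [-1, vocabSizePadded): -1 is the "no end token" sentinel, and any id at or beyond the padded vocabulary size could never be produced. A standalone sketch of the same rule, assuming a plain exception in place of `TLLM_CHECK_WITH_INFO`:
```cpp
#include <stdexcept>
#include <string>

// Hypothetical standalone version of the endId range check.
void validateEndId(int endId, int vocabSizePadded)
{
    // -1 disables EOS-based stopping; valid token ids are [0, vocabSizePadded).
    if (endId < -1 || endId >= vocabSizePadded)
    {
        throw std::invalid_argument("EndId (" + std::to_string(endId)
            + ") is not within acceptable range [-1, " + std::to_string(vocabSizePadded) + ").");
    }
}
```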

View File

@ -21,6 +21,7 @@
#include "tensorrt_llm/layers/defaultDecodingParams.h"
#include "tensorrt_llm/runtime/common.h"
#include <algorithm>
#include <functional>
#include <optional>
#include <vector>

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d
size 9125406
oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1
size 9123884

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d5c2aac4ca61d80f8a61d06a7e17fcfb0c9428a3dd89ade705f14cf651acd4b
size 9169292
oid sha256:e7a942b813cd05c5d21c82a7e5b5988227988668bf960b3a954443998b4e2a2b
size 9167324

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c067e858d968710bfe4575694ff5cabf1fb9d29924f6bd4cee552c4fd485a8ca
size 2026192
oid sha256:a11179efe519b2b001d6021c7cbea949c81b3618395fa2ce44da9b09d7d35d14
size 2029704

View File

@ -1,2 +1,2 @@
ca50ae76421863dfebf6080b7f4f6b29 libtensorrt_llm_ucx_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
9f9942768fd5b0cf5ed19860ad539dc9 libtensorrt_llm_ucx_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:092faef60d09d008331b2b09453d89563b5fe6c49a6a195051ad51ca80f79277
size 8407972
oid sha256:e2ec997b71d8b990eecc0034930b24319916ed3615a618f982e1b780d7643bc6
size 8408224

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d00bce3aa2f555f98cb2213d183a3174bb083cd62179ac65dce24d75bd648eb
size 8374854
oid sha256:2662dfb4833b41f71f4ef9cfe6a46d6312a04065c2343a899e1476102019a180
size 8374456

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ac4ad59907a322e6fbb85b9e88cea587cc4f5e4018017726895f64bd800c8552
oid sha256:1a1d23d99e2b91fa11ee3e1fb94179c0731065059eb981da9434a42d98ffa4d8
size 15592

View File

@ -1,2 +1,2 @@
0ec83a0451530fcf0e3f325cdc185043 libtensorrt_llm_ucx_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
e383212a40dca932c7b77bf4544dab80 libtensorrt_llm_ucx_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a64b3c16150b34ad6437862eaf95c5c35acdf69facc40af14bc0632a16b7d162
size 54093198

View File

@ -1,2 +0,0 @@
41725f3b40ca44954bb9de6e7dcbfd2c tensorrt_llm_batch_manager_static.lib
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4d333952a574e9056a0f01f788f9c053d92a7a9bc988a335df663697405f5659
size 3102572
oid sha256:d6ef115e34695dd0bec9df6069dd2e95615f401546ce275b133145fdb7568c6c
size 3102764

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:aa6dcdbe9501429192102c1094be664d0ab07199acc4882ab061eb48b699d83c
size 3145248
oid sha256:3bc68d4aec21a361954dd144084edb050e19390b87d6668f88b9e7f110f717a0
size 3145744

View File

@ -1,3 +1,3 @@
15c05b1921f3f8cbb7bc1e53f189c661 libtensorrt_llm_executor_static.a
b586d90eac1293ea656ff2db8a35cd92 libtensorrt_llm_executor_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
288c6aa23b993d60d758107188c37d78 libtensorrt_llm_executor_static.a
20e46fb4b5b23a0f27eb3f8dd7d4d3bf libtensorrt_llm_executor_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a4b144867eb79d269db173338ccae8dc6c68a8374a49fe17a555034ac433f46f
size 3457528
oid sha256:5b819d5cf3f7d9bd1ee69427db4b7ce3eb9c17f9e2cfa319540dad51ed6648e7
size 3457520

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a7b5f70dda7b8042d09098859a4bd1851159d3c750da97f3bfd55bf713c7a1cc
size 3447846
oid sha256:9f123e25ff2e046afff092b082a60b461f3f13853630857bd166e5e8a084e1ee
size 3448406

View File

@ -1,3 +1,3 @@
d573456348a96fd7a97aa832f1113986 libtensorrt_llm_executor_static.a
b3ba7776e3b5bb6e750e8412fc3b5c60 libtensorrt_llm_executor_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
eebaf66c6ac42645a9bf12a0b013ac4b libtensorrt_llm_executor_static.a
1fc4cc62abfb31ad51f6ae3620641c04 libtensorrt_llm_executor_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3722133c17bdad38c3bbd3a5caa4eafbe17805d3a7fa784f10c313902020a13b
size 26350954

View File

@ -1,2 +0,0 @@
5beaadd32fc3dd25770746016b293229 tensorrt_llm_executor_static.lib
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:41cb6788cd975f0f2ef1de0bdff9d3178280a3c2683abb11929e0d5ccecc9d76
size 126752312
oid sha256:f82db62eaeeb8a02d44b4cad97ae050cc447eea8a3e48b03f56f6040d1aaccc8
size 126824176

View File

@ -1,2 +1,2 @@
200a2d19469277b9906a00f7da83fd04 libtensorrt_llm_nvrtc_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
f3143205203b038b9dca6dd32cf02f59 libtensorrt_llm_nvrtc_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:738668482149971eaa849b863360e21fe66781c5eeaadab8263c83a3b67637dc
size 133824576
oid sha256:bdb100ae1f96025c5dd7e977cbae05005c2b2b3f36b902edc3a4f24d85ec3731
size 133867944

View File

@ -1,2 +1,2 @@
20761e50ba44b91a7a2d1f8d1c5c780b libtensorrt_llm_nvrtc_wrapper.so
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
770ca93818f3f04837a67353e3f71fbc libtensorrt_llm_nvrtc_wrapper.so
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:5926bdebb3d02686d81b3e29edbdf3fb89e44f1518ae187a66284175fb980613
size 1230336

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:97eb854ba561c25d46c3e9a074dc5a8ba873923fd28d649c869d9db450e26a8a
size 3488

View File

@ -1,3 +0,0 @@
3082017cee538017c343567d938bb106 tensorrt_llm_nvrtc_wrapper.lib
b9b4bf6a2d38abae1d3e038ad73b3890 tensorrt_llm_nvrtc_wrapper.dll
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2c9e20ddfa9d8b200a595a3a59afd53356afe4ad1b030f0b2cf446cac7271c58
size 53382370
oid sha256:cb21d66bb8b8eec2f6e11696b2b9b4b629b92ab299fec6702f2102277bb453bf
size 53355202

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b615014ee1f3e9ef8fd498a72ef8f699b92727177416c05d08bbf1c95d3ff52e
size 53479636
oid sha256:7be13e39772baa2ade76d60407fa0d56ecca58c39b24e020f1f0b58c0eede5f0
size 53469348

View File

@ -1,3 +1,3 @@
32e01331abfcacf5b70854104ca4bf20 libtensorrt_llm_internal_cutlass_kernels_static.a
662d22cefd410c1851ac701e6e3bcbbf libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
6bf0ba4e9b8b1152a21316243d30bec6 libtensorrt_llm_internal_cutlass_kernels_static.a
96f8a359c84a78ba415f4d98ef1c4e1d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:baf28ca8666062b417d251b608f31227cfac15676aa0bbbaacb1263befc9d408
size 68138662
oid sha256:d5cb27ba31185f16333b697bf9d913015ee85508e96aea2248162f3ff9a618b9
size 68126454

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:05358d9334e8976d463a40b4c6b0b5d780561556cb89194a8f243b0b69f59f33
size 68308080
oid sha256:c22c8b6856111183fc44fb11c8843ea8506f7297a97fee1e9a98414f9526118a
size 68295728

View File

@ -1,3 +1,3 @@
d8af682c4274543b06992255e727f52e libtensorrt_llm_internal_cutlass_kernels_static.a
c797baf2a0a7538eb8f75e0f898ae208 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit
64df74edb7e87b69478e4f9a2c0b3bb7 libtensorrt_llm_internal_cutlass_kernels_static.a
68a43f555a5b930950a436ebb54a1267 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
3c5fe5eb86077f67febc42070be11f11de17c1e2 commit

View File

@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b3856b1a5ae0f8f8e489f79d6396f2cc2e823536728b042082892502bcd33d76
size 246691546

View File

@ -1,2 +0,0 @@
ab51496e515622f560f4b989ed1d7e63 tensorrt_llm_internal_cutlass_kernels_static.lib
f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit

View File

@ -213,8 +213,9 @@ __global__ void batchApplyPenalty(T const* const* inputLogits, T* outputLogits,
if (hasMinLength)
{
__syncthreads();
// Min length
if ((threadIdx.x == 0) && (currentStep - inputLen < minLength))
// If current generation length is too short, make sure EOS doesn't have high probability.
// This check is not needed when endId is already -1 as generation won't stop on EOS anyway.
if ((threadIdx.x == 0) && (currentStep - inputLen < minLength) && endIds[batchSlot] > -1)
{
outLogitsPtr[endIds[batchSlot]] = MASK_VAL;
}
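The new `endIds[batchSlot] > -1` guard skips EOS masking when no end token is configured, since -1 means generation does not stop on EOS and using it as an index would be invalid. A host-side sketch of the same rule, with hypothetical names (`logits`, `applyMinLengthPenalty`) standing in for the kernel's buffers:
```cpp
#include <limits>
#include <vector>

// Hypothetical host-side equivalent of the kernel's min-length handling.
void applyMinLengthPenalty(
    std::vector<float>& logits, int currentStep, int inputLen, int minLength, int endId)
{
    constexpr float MASK_VAL = -std::numeric_limits<float>::infinity();
    // Mask EOS only while the generated length is below minLength,
    // and only if an end token is actually configured (endId == -1 disables it).
    if (currentStep - inputLen < minLength && endId > -1)
    {
        logits[endId] = MASK_VAL;
    }
}
```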

View File

@ -79,7 +79,8 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr
if (tid < k && endIds != nullptr) // if returnAllSelectedToken, endIds would not be an input
{
auto const index = tmpTopKBufIndex + tid;
if (blockLane == 0 && tid == 0)
// endId=-1 means generation doesn't stop upon encountering a certain token.
if (blockLane == 0 && tid == 0 && endIds[batchSlot] > -1)
{
auto const endId = endIds[batchSlot];
topKTmpIdBuf[index] = tmpLogBufIndex + endId;

View File

@ -252,29 +252,49 @@ __device__ __forceinline__ void MULTIMEM_ST2(ValType& val, PtrType ptr)
asm volatile("multimem.st.global.v2.f32 [%0], {%1,%2};" ::"l"(ptr), "r"(val.x), "r"(val.y) : "memory");
}
template <typename DType, typename ValType, typename PtrType>
template <typename DType, bool const DISABLE_FP32_ACC, typename ValType, typename PtrType>
__device__ __forceinline__ void MULTIMEM_LD(ValType& val, PtrType ptr)
{
if constexpr (std::is_same_v<DType, half>)
{
asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
if (!DISABLE_FP32_ACC)
{
asm("multimem.ld_reduce.global.add.v4.f16x2.acc::f32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
else
{
asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
}
#ifdef ENABLE_BF16
if constexpr (std::is_same_v<DType, __nv_bfloat16>)
{
asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
if (!DISABLE_FP32_ACC)
{
asm("multimem.ld_reduce.global.add.v4.bf16x2.acc::f32 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
else
{
asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];"
: "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w)
: "l"(ptr)
: "memory");
}
}
#endif
}
// All MC kernels here
template <typename DType, int RANKS>
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
void** commbuff, int const handleridx, float4* mc_ptr)
@ -310,7 +330,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
uint4 val[UNROLL_MC];
#pragma unroll
for (int i = 0; i < UNROLL_MC; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
#pragma unroll
for (int i = 0; i < UNROLL_MC; i++)
MULTIMEM_ST(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
@ -318,7 +338,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
for (int line = end_aligned; line < end_elem; line += loop_step0)
{
uint4 val;
MULTIMEM_LD<DType>(val, mc_ptr + (lineoffset + line));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val, mc_ptr + (lineoffset + line));
MULTIMEM_ST(val, mc_ptr + (lineoffset + line));
}
__syncthreads();
@ -336,7 +356,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
} // fp16 inplace reduce kernel (Hopper) MC
#else
template <typename DType, int RANKS>
template <typename DType, int RANKS, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
void** commbuff, int const handleridx, float4* mc_ptr)
@ -382,8 +402,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg3), reinterpret_cast<void*>(&arg4), reinterpret_cast<void*>(&arg5), \
reinterpret_cast<void*>(&arg6), reinterpret_cast<void*>(&arg7), reinterpret_cast<void*>(&arg8), \
reinterpret_cast<void*>(&arg9), reinterpret_cast<void*>(&arg10)}; \
TLLM_CUDA_CHECK( \
cudaLaunchKernelExC(&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
}
struct LaunchConfig
@ -529,7 +549,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4_mc(PackedVec<Type>& vec, float SFScaleV
#endif
}
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4(int const op, int const flagoffset, int const firstrank,
int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff,
@ -574,7 +594,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -643,7 +663,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
#endif
}
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS)
userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot(int const op, int const flagoffset,
int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines,
@ -687,7 +707,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -744,7 +764,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
#if __CUDA_ARCH__ >= 900
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -786,7 +806,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -848,7 +868,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
*reduceidptr = reduce_id;
} // quant kernel fp16->fp8 twoshot
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -889,7 +909,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
DType* x = reinterpret_cast<DType*>(&val[0]);
#pragma unroll
for (int i = 0; i < UNROLL_NLINES; i++)
MULTIMEM_LD<DType>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
MULTIMEM_LD<DType, DISABLE_FP32_ACC>(val[i], mc_ptr + (lineoffset + line + i * loop_step0));
if (residual_in != nullptr)
{
@ -997,7 +1017,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
} // residual allgather kernel
#else
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -1018,7 +1038,7 @@ __global__ void __launch_bounds__(MAX_THREADS)
asm volatile("brkpt;\n");
}
template <typename DType, int UNROLL_NLINES>
template <typename DType, int UNROLL_NLINES, bool DISABLE_FP32_ACC>
__global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op,
int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset,
int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma,
@ -1059,8 +1079,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg12), reinterpret_cast<void*>(&arg13), reinterpret_cast<void*>(&arg14), \
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
}
#define callranksMC_RMSNORM_QUANT_ONESHOT(x) \
@ -1091,8 +1111,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg12), reinterpret_cast<void*>(&arg13), reinterpret_cast<void*>(&arg14), \
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot<DType, x, DISABLE_FP32_ACC>), \
kernelArgs)); \
}
#define callranksMC_RMSNORM_QUANT_FP4(x) \
@ -1127,8 +1148,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20), \
reinterpret_cast<void*>(&arg21), reinterpret_cast<void*>(&arg22), reinterpret_cast<void*>(&arg23)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4<DType, x, DISABLE_FP32_ACC>), kernelArgs)); \
}
#define callranksMC_RMSNORM_QUANT_FP4_ONESHOT(x) \
@ -1163,8 +1184,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
reinterpret_cast<void*>(&arg15), reinterpret_cast<void*>(&arg16), reinterpret_cast<void*>(&arg17), \
reinterpret_cast<void*>(&arg18), reinterpret_cast<void*>(&arg19), reinterpret_cast<void*>(&arg20), \
reinterpret_cast<void*>(&arg21), reinterpret_cast<void*>(&arg22)}; \
TLLM_CUDA_CHECK(cudaLaunchKernelExC( \
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot<DType, x>), kernelArgs)); \
TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \
(void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot<DType, x, DISABLE_FP32_ACC>), \
kernelArgs)); \
}
#define callranksMC_RES_AG(x) \
if (nlines == x) \
@ -1189,7 +1211,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_
&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_res_allgather<DType, x>), kernelArgs)); \
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size_t const offset, size_t const elements,
int const blocksize, communicator* comm, cudaStream_t stream, int op)
{
@ -1226,7 +1248,7 @@ int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size
return sms;
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
void allreduce_nonsharp_inplace(
int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream, int op)
{
@ -1234,14 +1256,25 @@ void allreduce_nonsharp_inplace(
return;
int blocksize = elements * 2;
int maxcredit = 0;
int sms = allreduce2_userbuff_inplace_gpu<DType>(maxcredit, handler, offset, elements, blocksize, comm, stream, op);
int sms;
if (DISABLE_FP32_ACC)
{
sms = allreduce2_userbuff_inplace_gpu<DType, true>(
maxcredit, handler, offset, elements, blocksize, comm, stream, op);
}
else
{
sms = allreduce2_userbuff_inplace_gpu<DType, false>(
maxcredit, handler, offset, elements, blocksize, comm, stream, op);
}
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
void allreduce2_userbuff_inplace(
int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream)
{
allreduce_nonsharp_inplace<DType>(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2);
allreduce_nonsharp_inplace<DType, DISABLE_FP32_ACC>(
handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2);
}
bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size)
@ -1262,7 +1295,7 @@ bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size)
}
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const offset, int const out_handler,
size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps,
float* scalefactor, void* residual_in, void* residual_out, communicator* comm, cudaStream_t stream)
@ -1315,7 +1348,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const of
return sms;
}
template <typename DType>
template <typename DType, bool DISABLE_FP32_ACC>
int allreduce2_userbuff_inplace_rmsnorm_quant_fp4(int const handler, size_t const offset, int const out_handler,
size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements,
int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in,
@ -1422,11 +1455,31 @@ void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, si
{
switch (dataType)
{
case nvinfer1::DataType::kHALF: allreduce2_userbuff_inplace<half>(handler, offset, elements, comm, stream); break;
case nvinfer1::DataType::kHALF:
{
if (kDISABLE_FP32_ACCUMULATION)
{
allreduce2_userbuff_inplace<half, true>(handler, offset, elements, comm, stream);
}
else
{
allreduce2_userbuff_inplace<half, false>(handler, offset, elements, comm, stream);
}
break;
}
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
allreduce2_userbuff_inplace<__nv_bfloat16>(handler, offset, elements, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
allreduce2_userbuff_inplace<__nv_bfloat16, true>(handler, offset, elements, comm, stream);
}
else
{
allreduce2_userbuff_inplace<__nv_bfloat16, false>(handler, offset, elements, comm, stream);
}
break;
}
#endif
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_impl");
}
@ -1458,14 +1511,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_impl(int const handler, size_t con
switch (dataType)
{
case nvinfer1::DataType::kHALF:
return allreduce2_userbuff_inplace_rmsnorm_quant<half>(handler, offset, out_handler, out_offset, elements,
hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant<half, true>(handler, offset, out_handler, out_offset,
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant<half, false>(handler, offset, out_handler, out_offset,
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
}
break;
}
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16>(handler, offset, out_handler, out_offset,
elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, true>(handler, offset, out_handler,
out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm,
stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, false>(handler, offset, out_handler,
out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm,
stream);
}
break;
}
#endif
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl");
}
@ -1479,16 +1554,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t
switch (dataType)
{
case nvinfer1::DataType::kHALF:
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half, true>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<half, false>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
}
break;
#ifdef ENABLE_BF16
case nvinfer1::DataType::kBF16:
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16>(handler, offset, out_handler, out_offset,
scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in,
residual_out, comm, stream);
{
if (kDISABLE_FP32_ACCUMULATION)
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, true>(handler, offset, out_handler,
out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor,
residual_in, residual_out, comm, stream);
}
else
{
return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, false>(handler, offset, out_handler,
out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor,
residual_in, residual_out, comm, stream);
}
break;
}
#endif
default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl");
}

View File

@ -656,4 +656,7 @@ __inline__ __device__ T blockReduceSumV2(T* val)
warpReduceSumV2<T, NUM>(val);
return (T) 0.0f;
}
static bool const kDISABLE_FP32_ACCUMULATION = getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr;
} // namespace tensorrt_llm::runtime::ub
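`kDISABLE_FP32_ACCUMULATION` is read once from the environment and then branched on to choose between the `<DType, true>` and `<DType, false>` instantiations, as the userbuffers changes above show. A minimal sketch of that runtime-flag-to-template-parameter dispatch, with a hypothetical `reduceInplace` entry point:
```cpp
#include <cstdlib>

// Hypothetical templated entry point; the bool selects the kernel variant at compile time.
template <typename DType, bool DISABLE_FP32_ACC>
void reduceInplace(DType* data, int n)
{
    // Placeholder body; the real kernels pick the .acc::f32 or plain multimem PTX here.
    (void) data;
    (void) n;
}

// Read the switch once at start-up, mirroring TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION.
static bool const kDisableFp32Acc = std::getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr;

template <typename DType>
void reduceInplaceDispatch(DType* data, int n)
{
    // A runtime flag cannot be a template argument directly, so branch once and
    // instantiate both variants; each body is still selected at compile time.
    if (kDisableFp32Acc)
    {
        reduceInplace<DType, true>(data, n);
    }
    else
    {
        reduceInplace<DType, false>(data, n);
    }
}
```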

View File

@ -96,18 +96,7 @@ inline bool is_supported(int arch, KernelType kernel_type)
SUPPORT(KernelType::FP16Int8PerChannel);
SUPPORT(KernelType::FP16Int4PerChannel);
}
else if (arch >= 80 && arch < 90)
{
SUPPORT(KernelType::FP16Int8Groupwise);
SUPPORT(KernelType::BF16Int8Groupwise);
SUPPORT(KernelType::FP16Int4Groupwise);
SUPPORT(KernelType::BF16Int4Groupwise);
SUPPORT(KernelType::FP16Int8PerChannel);
SUPPORT(KernelType::BF16Int8PerChannel);
SUPPORT(KernelType::FP16Int4PerChannel);
SUPPORT(KernelType::BF16Int4PerChannel);
}
else if (arch >= 90 && arch != 120)
else if (arch >= 80)
{
SUPPORT(KernelType::FP16Int8Groupwise);
SUPPORT(KernelType::BF16Int8Groupwise);
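This change collapses the separate `80 <= arch < 90` and `arch >= 90 && arch != 120` branches into a single `arch >= 80` branch, so newer architectures (including SM120) share the same support list. A hedged sketch of the consolidated gate, with a hypothetical two-entry kernel-type list:
```cpp
// Hypothetical reduced version of the consolidated architecture gate.
enum class KernelType
{
    FP16Int8Groupwise,
    BF16Int8Groupwise
};

bool isSupported(int arch, KernelType kernelType)
{
    // SM80 and newer now share a single support list instead of separate
    // 80-90 and 90+ branches with an SM120 carve-out.
    if (arch >= 80)
    {
        switch (kernelType)
        {
        case KernelType::FP16Int8Groupwise:
        case KernelType::BF16Int8Groupwise: return true;
        }
    }
    return false;
}
```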

View File

@ -64,7 +64,7 @@ void initBindings(pybind11::module_& m)
py::classh<GenLlmReq>(m, "GenericLlmRequest")
.def("validate", &GenLlmReq::validate, py::arg("max_input_len"), py::arg("max_seq_len"),
py::arg("max_draft_len"), py::arg("max_endocer_input_len") = std::nullopt,
py::arg("max_draft_len"), py::arg("vocab_size_padded"), py::arg("max_endocer_input_len") = std::nullopt,
py::arg("enable_kv_cache_reuse") = false)
.def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, py::arg("exclude"))
.def("get_num_tokens", &GenLlmReq::getNumTokens, py::arg("beam"))

View File

@ -96,7 +96,10 @@ def build_engines(model_cache: typing.Optional[str] = None,
world_size: int = 1,
clean: Optional[bool] = False):
for model_name in ["chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b"]:
for model_name in [
"chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b", "glm-4-9b",
"chatglm3-6b-32k"
]:
is_chatglm_6b_or_glm_10b = model_name in ["chatglm-6b", "glm-10b"]
if model_cache and (Path(model_cache) / model_name).is_dir():
model_cache_dir = Path(model_cache) / model_name
@ -136,20 +139,12 @@ def build_engines(model_cache: typing.Optional[str] = None,
if ckpt_dir.is_dir():
shutil.rmtree(ckpt_dir, ignore_errors=True)
# Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B, hope to remove this in the future
if model_name == "chatglm-6b":
# Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B / ChatGLM3-6B-32K, hope to remove this in the future
if model_name in [
"chatglm-6b", "glm-4-9b", "chatglm2-6b", "chatglm3-6b-32k"
]:
shutil.copy(
chatglm_example_dir / "chatglm-6b/tokenization_chatglm.py",
hf_dir,
)
if model_name == "glm-4-9b":
shutil.copy(
chatglm_example_dir / "glm-4-9b/tokenization_chatglm.py",
hf_dir,
)
if model_name == "chatglm2-6b":
shutil.copy(
chatglm_example_dir / "chatglm2-6b/tokenization_chatglm.py",
chatglm_example_dir / f"{model_name}/tokenization_chatglm.py",
hf_dir,
)

View File

@ -530,12 +530,6 @@ def prepare_multi_gpu_model_tests(python_exe: str,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)
prepare_model_tests(model_name="chatglm",
python_exe=python_exe,
root_dir=root_dir,
resources_dir=resources_dir,
model_cache_arg=model_cache_arg)
def prepare_model_tests(model_name: str,
python_exe: str,

View File

@ -1,9 +1,9 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=25.01-py3
ARG BASE_TAG=25.03-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} as base
FROM ${BASE_IMAGE}:${BASE_TAG} AS base
# https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html
# The default values come from `nvcr.io/nvidia/pytorch`
@ -14,7 +14,10 @@ ENV GITHUB_MIRROR=$GITHUB_MIRROR
RUN echo "Using GitHub mirror: $GITHUB_MIRROR"
SHELL ["/bin/bash", "-c"]
FROM base as devel
# Clean up the pip constraint file from the base NGC PyTorch image.
RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true
FROM base AS devel
ARG PYTHON_VERSION="3.12.3"
RUN echo "Using Python version: $PYTHON_VERSION"
@ -59,7 +62,7 @@ ARG TORCH_INSTALL_TYPE="skip"
COPY docker/common/install_pytorch.sh install_pytorch.sh
RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh
FROM ${DEVEL_IMAGE} as wheel
FROM ${DEVEL_IMAGE} AS wheel
WORKDIR /src/tensorrt_llm
COPY benchmarks benchmarks
COPY cpp cpp
@ -77,7 +80,7 @@ ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings -
RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \
python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS}
FROM ${DEVEL_IMAGE} as release
FROM ${DEVEL_IMAGE} AS release
# Create a cache directory for pip
RUN mkdir -p /root/.cache/pip

View File

@ -152,16 +152,16 @@ jenkins-aarch64_%: STAGE = devel
jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"')
jenkins-rockylinux8_%: STAGE = devel
jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda
jenkins-rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8
jenkins-rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8
rockylinux8_%: STAGE = devel
rockylinux8_%: BASE_IMAGE = nvidia/cuda
rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8
rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8
# For x86_64 and aarch64
ubuntu22_%: STAGE = devel
ubuntu22_%: BASE_IMAGE = nvidia/cuda
ubuntu22_%: BASE_TAG = 12.6.3-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 12.8.1-devel-ubuntu22.04
trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0

View File

@ -5,7 +5,7 @@ set -ex
# This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
# Only use when public CUDA image is not ready.
CUDA_VER="12.8.0_570.86.10"
CUDA_VER="12.8.1_570.124.06"
CUDA_VER_SHORT="${CUDA_VER%_*}"
NVCC_VERSION_OUTPUT=$(nvcc --version)

View File

@ -4,9 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01
# PyTorch v2.5.1 has a fix for https://github.com/pytorch/pytorch/issues/138324.
TORCH_VERSION="2.5.1"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03
TORCH_VERSION="2.6.0"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {

View File

@ -2,20 +2,20 @@
set -ex
TRT_VER="10.8.0.43"
TRT_VER="10.9.0.34"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01
CUDA_VER="12.8" # 12.8.0
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03
CUDA_VER="12.8" # 12.8.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.7.0.66-1"
CUDNN_VER="9.8.0.87-1"
NCCL_VER="2.25.1-1+cuda12.8"
CUBLAS_VER="12.8.3.14-1"
CUBLAS_VER="12.8.4.1-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
NVRTC_VER="12.8.61-1"
CUDA_RUNTIME="12.8.57-1"
CUDA_DRIVER_VERSION="570.86.10-1.el8"
NVRTC_VER="12.8.93-1"
CUDA_RUNTIME="12.8.90-1"
CUDA_DRIVER_VERSION="570.124.06-1.el8"
for i in "$@"; do
case $i in
@ -116,7 +116,7 @@ install_tensorrt() {
if [ -z "$ARCH" ];then ARCH=$(uname -m);fi
if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz"
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

View File

@ -1,7 +1,7 @@
sphinx>=7.0
sphinx-argparse
sphinx-click
sphinx-rtd-theme
nvidia-sphinx-theme
myst_parser
breathe
pygit2

View File

@ -67,7 +67,7 @@ source_suffix = {
'.md': 'markdown',
}
html_theme = 'sphinx_rtd_theme'
html_theme = 'nvidia_sphinx_theme'
html_static_path = ['_static']
# ------------------------ C++ Doc related --------------------------

View File

@ -27,8 +27,6 @@ Welcome to TensorRT-LLM's Documentation!
installation/linux.md
installation/build-from-source-linux.md
installation/windows.md
installation/build-from-source-windows.md
installation/grace-hopper.md

View File

@ -1,199 +0,0 @@
(build-from-source-windows)=
# Building from Source Code on Windows
```{note}
This section is for advanced users. Skip this section if you plan to use the pre-built TensorRT-LLM release wheel.
```
## Prerequisites
1. Install prerequisites listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document.
2. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path.
3. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/).
4. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip).
## Building a TensorRT-LLM Docker Image
### Docker Desktop
1. Install [Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/).
2. Set the following configurations:
1. Right-click the Docker icon in the Windows system tray (bottom right of your taskbar) and select **Switch to Windows containers...**.
2. In the Docker Desktop settings on the **General** tab, uncheck **Use the WSL 2 based image**.
3. On the **Docker Engine** tab, set your configuration file to:
```
{
"experimental": true
}
```
```{note}
After building, copy the files out of your container. `docker cp` is not supported on Windows for Hyper-V based images. Unless you are using WSL 2 based images, mount a folder, for example, `trt-llm-build`, to your container when you run it for moving files between the container and host system.
```
### Acquire an Image
The Docker container will be hosted for public download in a future release. At this time, it must be built manually. From the `TensorRT-LLM\windows\` folder, run the build command:
```bash
docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest .
```
And your image is now ready for use.
### Run the Container
Run the container in interactive mode with your build folder mounted. Specify a memory limit with the `-m` flag. By default, the limit is 2 GB, which is not sufficient to build TensorRT-LLM.
```bash
docker run -it -m 12g -v .\trt-llm-build:C:\workspace\trt-llm-build tensorrt-llm-windows-build:latest
```
### Build and Extract Files
1. Clone and setup the TensorRT-LLM repository within the container.
```bash
git clone https://github.com/NVIDIA/TensorRT-LLM.git
cd TensorRT-LLM
git submodule update --init --recursive
```
2. Build TensorRT-LLM. This command generates `build\tensorrt_llm-*.whl`.
```bash
python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-10.8.0.43\
```
3. Copy or move `build\tensorrt_llm-*.whl` into your mounted folder so it can be accessed on your host machine. If you intend to use the C++ runtime, you'll also need to gather various DLLs from the build into your mounted folder. For more information, refer to [C++ Runtime Usage](#c-runtime-usage).
## Building TensorRT-LLM on Bare Metal
**Prerequisites**
1. Install all prerequisites (`git`, `python`, `CUDA`) listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document.
2. Install Nsight NVTX. TensorRT-LLM on Windows currently depends on NVTX assets that do not come packaged with the CUDA 12.8.0 installer. To install these assets, download the [CUDA 11.8 Toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows&target_arch=x86_64).
1. During installation, select **Advanced installation**.
2. Nsight NVTX is located in the CUDA drop-down.
3. Deselect all packages, and select **Nsight NVTX**.
3. Install the dependencies one of two ways:
1. Run the `setup_build_env.ps1` script, which installs CMake, Microsoft Visual Studio Build Tools, and TensorRT automatically with default settings.
1. Run PowerShell as Administrator to use the script.
```bash
./setup_build_env.ps1 -TRTPath <TRT-containing-folder> [-skipCMake] [-skipVSBuildTools] [-skipTRT]
```
2. Close and reopen PowerShell after running the script so that `Path` changes take effect.
3. Supply a directory that already exists to contain TensorRT to `-TRTPath`, for example, `-TRTPath ~/inference` may be valid, but `-TRTPath ~/inference/TensorRT` will not be valid if `TensorRT` does not exist. `-TRTPath` isn't required if `-skipTRT` is supplied.
2. Install the dependencies one at a time.
1. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path.
2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). When prompted to select more Workloads, check **Desktop development with C++**.
3. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`.
1. Add the libraries for TensorRT to your system's `Path` environment variable. Your `Path` should include a line like this:
```bash
%USERPROFILE%\inference\TensorRT\lib
```
2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.
3. Remove existing `tensorrt` wheels first by executing
```bash
pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings
pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12
```
4. Install the TensorRT core libraries, run PowerShell, and use `pip` to install the Python wheel.
```bash
pip install %USERPROFILE%\inference\TensorRT\python\tensorrt-*.whl
```
5. Verify that your TensorRT installation is working properly.
```bash
python -c "import tensorrt as trt; print(trt.__version__)"
```
**Steps**
1. Launch a 64-bit Developer PowerShell. From your usual PowerShell terminal, run one of the following two commands.
1. If you installed Visual Studio Build Tools (that is, used the `setup_build_env.ps1` script):
```bash
& 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64
```
2. If you installed Visual Studio Community (e.g. via manual GUI setup):
```bash
& 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64
```
2. In PowerShell, from the `TensorRT-LLM` root folder, run:
```bash
python .\scripts\build_wheel.py -a "89-real" --trt_root <path_to_trt_root>
```
The `-a` flag specifies the device architecture. `"89-real"` supports GeForce 40-series cards.
The flag `-D "ENABLE_MULTI_DEVICE=0"`, while not specified here, is implied on Windows. Multi-device inference is supported on Linux, but not on Windows.
This command generates `build\tensorrt_llm-*.whl`.
(c-runtime-usage)=
## Linking with the TensorRT-LLM C++ Runtime
```{note}
This section is for advanced users. Skip this section if you do not intend to use the TensorRT-LLM C++ runtime directly. You must build from source to use the C++ runtime.
```
Building from source creates libraries that can be used if you wish to directly link against the C++ runtime for TensorRT-LLM. These libraries are also required if you wish to run C++ unit tests and some benchmarks.
Building from source produces the following library files.
- `tensorrt_llm` libraries located in `cpp\build\tensorrt_llm`
- `tensorrt_llm.dll` - Shared library
- `tensorrt_llm.exp` - Export file
- `tensorrt_llm.lib` - Stub for linking to `tensorrt_llm.dll`
- Dependency libraries (these get copied to `tensorrt_llm\libs\`)
- `nvinfer_plugin_tensorrt_llm` libraries located in `cpp\build\tensorrt_llm\plugins\`
- `nvinfer_plugin_tensorrt_llm.dll`
- `nvinfer_plugin_tensorrt_llm.exp`
- `nvinfer_plugin_tensorrt_llm.lib`
- `th_common` libraries located in `cpp\build\tensorrt_llm\thop\`
- `th_common.dll`
- `th_common.exp`
- `th_common.lib`
The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these:
```bash
%USERPROFILE%\inference\TensorRT\lib
%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs
%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib
```
Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder.
Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`.

View File

@ -5,7 +5,7 @@
1. Install TensorRT-LLM (tested on Ubuntu 24.04).
```bash
pip3 install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126
sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm --extra-index-url https://pypi.nvidia.com
```

View File

@ -1,81 +0,0 @@
(windows)=
# Installing on Windows
```{note}
The Windows release of TensorRT-LLM is currently in beta.
We recommend checking out the [v0.17.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.17.0) for the most stable experience.
```
```{note}
TensorRT-LLM on Windows only supports single-GPU execution.
```
**Prerequisites**
1. Clone this repository using [Git for Windows](https://git-scm.com/download/win).
2. Install the dependencies one of two ways:
1. Install all dependencies together.
1. Run the provided PowerShell script `setup_env.ps1` located under the `/windows/` folder which installs Python and CUDA 12.8.0 automatically with default settings. Run PowerShell as Administrator to use the script.
```bash
./setup_env.ps1 [-skipCUDA] [-skipPython]
```
2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path` modified by the `setup_env.ps1` script above.
2. Install the dependencies one at a time.
1. Install [Python 3.10](https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe).
1. Select **Add python.exe to PATH** at the start of the installation. The installation may only add the `python` command, but not the `python3` command.
2. Navigate to the installation path `%USERPROFILE%\AppData\Local\Programs\Python\Python310` (`AppData` is a hidden folder) and copy `python.exe` to `python3.exe`.
2. Install [CUDA 12.8.0 Toolkit](https://developer.nvidia.com/cuda-12-8-0-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart.
3. If using conda environment, run the following command before installing TensorRT-LLM.
```bash
conda install -c conda-forge pyarrow
```
**Steps**
1. Install TensorRT-LLM.
If you have an existing TensorRT installation (from older versions of `tensorrt_llm`), please execute
```bash
pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings
pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12
```
before installing TensorRT-LLM with the following command.
```bash
pip install tensorrt_llm==0.17.0.post1 --extra-index-url https://download.pytorch.org/whl/ --extra-index-url https://pypi.nvidia.com
```
Run the following command to verify that your TensorRT-LLM installation is working properly.
```bash
python -c "import tensorrt_llm; print(tensorrt_llm._utils.trt_version())"
```
2. Build the model.
3. Deploy the model.
**Known Issue**
1. `OSError: exception: access violation reading 0x0000000000000000` during `import tensorrt_llm` or `trtllm-build`.
This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install
[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version)
and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL).
2. `OSError: [WinError 126] The specified module could not be found. Error loading "...\Lib\site-packages\torch\lib\fbgemm.dll" or one of its dependencies.`
Installing the latest [Build Tools for Visual Studio 2022](https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue.

View File

@ -35,7 +35,7 @@ Certain limitations might apply. Refer to the {ref}`support-matrix` for more inf
### Native Windows Support
Application developers and AI enthusiasts can now benefit from accelerated LLMs running locally on PCs and Workstations powered by NVIDIA RTX and NVIDIA GeForce RTX GPUs. Refer to the {ref}`windows` section for more information.
Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases.
## What Can You Do With TensorRT-LLM?

View File

@ -90,7 +90,7 @@ In addition, older architectures can have limitations for newer software release
* -
- Hardware Compatibility
* - Operating System
- TensorRT-LLM requires Linux x86_64, Linux aarch64 or Windows.
- TensorRT-LLM requires Linux x86_64 or Linux aarch64.
* - GPU Model Architectures
-
- [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/)
@ -112,9 +112,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [25.01](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [25.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.8](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.9](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

View File

@ -5,6 +5,23 @@
All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/).
## TensorRT-LLM Release 0.18.0
### Key Features and Enhancements
- **Features that were previously available in the 0.18.0.dev pre-releases are not included in this release**.
- [BREAKING CHANGE] Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases.
### Known Issues
- The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms.
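For illustration, a minimal sketch of launching that container (assuming Docker with the NVIDIA Container Toolkit is installed; adjust the tag to the container release you need):
```bash
docker run --gpus all --ipc=host -it --rm nvcr.io/nvidia/pytorch:25.03-py3
```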
### Infrastructure Changes
- The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.03-py3`.
- The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.03-py3`.
- The dependent TensorRT version is updated to 10.9.
- The dependent CUDA version is updated to 12.8.1.
- The dependent NVIDIA ModelOpt version is updated to 0.25 for Linux platform.
## TensorRT-LLM Release 0.17.0
### Key Features and Enhancements

View File

@ -33,7 +33,7 @@ Or you can try the following commands to get a quantized model by yourself:
```bash
git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git
cd TensorRT-Model-Optimizer/llm_ptq
cd TensorRT-Model-Optimizer/examples/llm_ptq
scripts/huggingface_example.sh --model <huggingface_model_card> --quant fp8 --export_fmt hf
```
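As a hedged sketch only: one way to consume the exported Hugging Face-format checkpoint is through the `tensorrt_llm.LLM` API. The `<quantized_checkpoint_dir>` path is a hypothetical placeholder for the directory produced by the script above, and whether it loads end-to-end depends on the model and the backend in use:
```python
from tensorrt_llm import LLM, SamplingParams

# Hypothetical path: wherever huggingface_example.sh wrote the quantized checkpoint.
llm = LLM(model="<quantized_checkpoint_dir>")
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(max_tokens=32))
print(outputs[0].outputs[0].text)
```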

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -0,0 +1,313 @@
import json
import os
from typing import Dict, List, Optional, Union

from sentencepiece import SentencePieceProcessor
from transformers import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy


class SPTokenizer:

    def __init__(self, model_path: str):
        # reload tokenizer
        assert os.path.isfile(model_path), model_path
        self.sp_model = SentencePieceProcessor(model_file=model_path)

        # BOS / EOS token IDs
        self.n_words: int = self.sp_model.vocab_size()
        self.bos_id: int = self.sp_model.bos_id()
        self.eos_id: int = self.sp_model.eos_id()
        self.pad_id: int = self.sp_model.unk_id()
        assert self.sp_model.vocab_size() == self.sp_model.get_piece_size()

        special_tokens = [
            "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>",
            "<|user|>", "<|assistant|>", "<|observation|>"
        ]
        self.special_tokens = {}
        self.index_special_tokens = {}
        for token in special_tokens:
            self.special_tokens[token] = self.n_words
            self.index_special_tokens[self.n_words] = token
            self.n_words += 1

    def tokenize(self, s: str):
        return self.sp_model.EncodeAsPieces(s)

    def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]:
        assert type(s) is str
        t = self.sp_model.encode(s)
        if bos:
            t = [self.bos_id] + t
        if eos:
            t = t + [self.eos_id]
        return t

    def decode(self, t: List[int]) -> str:
        text, buffer = "", []
        for token in t:
            if token in self.index_special_tokens:
                if buffer:
                    text += self.sp_model.decode(buffer)
                    buffer = []
                text += self.index_special_tokens[token]
            else:
                buffer.append(token)
        if buffer:
            text += self.sp_model.decode(buffer)
        return text

    def decode_tokens(self, tokens: List[str]) -> str:
        text = self.sp_model.DecodePieces(tokens)
        return text

    def convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        if token in self.special_tokens:
            return self.special_tokens[token]
        return self.sp_model.PieceToId(token)

    def convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        if index in self.index_special_tokens:
            return self.index_special_tokens[index]
        if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0:
            return ""
        return self.sp_model.IdToPiece(index)


class ChatGLMTokenizer(PreTrainedTokenizer):

    vocab_files_names = {"vocab_file": "tokenizer.model"}

    model_input_names = ["input_ids", "attention_mask", "position_ids"]

    def __init__(self,
                 vocab_file,
                 padding_side="left",
                 clean_up_tokenization_spaces=False,
                 **kwargs):
        self.name = "GLMTokenizer"

        self.vocab_file = vocab_file
        self.tokenizer = SPTokenizer(vocab_file)
        self.special_tokens = {
            "<bos>": self.tokenizer.bos_id,
            "<eos>": self.tokenizer.eos_id,
            "<pad>": self.tokenizer.pad_id
        }
        super().__init__(
            padding_side=padding_side,
            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
            **kwargs)

    def get_command(self, token):
        if token in self.special_tokens:
            return self.special_tokens[token]
        assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}"
        return self.tokenizer.special_tokens[token]

    @property
    def unk_token(self) -> str:
        return "<unk>"

    @property
    def pad_token(self) -> str:
        return "<unk>"

    @property
    def pad_token_id(self):
        return self.get_command("<pad>")

    @property
    def eos_token(self) -> str:
        return "</s>"

    @property
    def eos_token_id(self):
        return self.get_command("<eos>")

    @property
    def vocab_size(self):
        return self.tokenizer.n_words

    def get_vocab(self):
        """ Returns vocab as a dict """
        vocab = {
            self._convert_id_to_token(i): i
            for i in range(self.vocab_size)
        }
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text, **kwargs):
        return self.tokenizer.tokenize(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.tokenizer.convert_token_to_id(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer.convert_id_to_token(index)

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self.tokenizer.decode_tokens(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """
        Save the vocabulary and special tokens file to a directory.

        Args:
            save_directory (`str`):
                The directory in which to save the vocabulary.
            filename_prefix (`str`, *optional*):
                An optional prefix to add to the named of the saved files.

        Returns:
            `Tuple(str)`: Paths to the files saved.
        """
        if os.path.isdir(save_directory):
            vocab_file = os.path.join(save_directory,
                                      self.vocab_files_names["vocab_file"])
        else:
            vocab_file = save_directory

        with open(self.vocab_file, 'rb') as fin:
            proto_bytes = fin.read()

        with open(vocab_file, "wb") as writer:
            writer.write(proto_bytes)

        return (vocab_file, )

    def get_prefix_tokens(self):
        prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")]
        return prefix_tokens

    def build_single_message(self, role, metadata, message):
        assert role in ["system", "user", "assistant", "observation"], role
        role_tokens = [self.get_command(f"<|{role}|>")
                       ] + self.tokenizer.encode(f"{metadata}\n")
        message_tokens = self.tokenizer.encode(message)
        tokens = role_tokens + message_tokens
        return tokens

    def build_chat_input(self, query, history=None, role="user"):
        if history is None:
            history = []
        input_ids = []
        for item in history:
            content = item["content"]
            if item["role"] == "system" and "tools" in item:
                content = content + "\n" + json.dumps(
                    item["tools"], indent=4, ensure_ascii=False)
            input_ids.extend(
                self.build_single_message(item["role"],
                                          item.get("metadata", ""), content))
        input_ids.extend(self.build_single_message(role, "", query))
        input_ids.extend([self.get_command("<|assistant|>")])
        return self.batch_encode_plus([input_ids],
                                      return_tensors="pt",
                                      is_split_into_words=True)

    def build_inputs_with_special_tokens(
            self,
            token_ids_0: List[int],
            token_ids_1: Optional[List[int]] = None) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
        adding special tokens. A BERT sequence has the following format:

        - single sequence: `[CLS] X [SEP]`
        - pair of sequences: `[CLS] A [SEP] B [SEP]`

        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.

        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        prefix_tokens = self.get_prefix_tokens()
        token_ids_0 = prefix_tokens + token_ids_0
        if token_ids_1 is not None:
            token_ids_0 = token_ids_0 + token_ids_1 + [
                self.get_command("<eos>")
            ]
        return token_ids_0

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        return_attention_mask: Optional[bool] = None,
        padding_side: str = "left",  # wili, fix for new transformers
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.
                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in self.padding_side:
                    - 'left': pads on the left of the sequences
                    - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        # Load from model defaults
        assert self.padding_side == "left"

        required_input = encoded_inputs[self.model_input_names[0]]
        seq_length = len(required_input)

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if max_length is not None and pad_to_multiple_of is not None and (
                max_length % pad_to_multiple_of != 0):
            max_length = (
                (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(
            required_input) != max_length

        # Initialize attention mask if not present.
        if "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * seq_length

        if "position_ids" not in encoded_inputs:
            encoded_inputs["position_ids"] = list(range(seq_length))

        if needs_to_be_padded:
            difference = max_length - len(required_input)

            if "attention_mask" in encoded_inputs:
                encoded_inputs["attention_mask"] = [
                    0
                ] * difference + encoded_inputs["attention_mask"]
            if "position_ids" in encoded_inputs:
                encoded_inputs["position_ids"] = [
                    0
                ] * difference + encoded_inputs["position_ids"]
            encoded_inputs[self.model_input_names[
                0]] = [self.pad_token_id] * difference + required_input

        return encoded_inputs
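For context, a minimal usage sketch of the tokenizer defined above (the module name and the local `tokenizer.model` path are hypothetical placeholders, assuming the file is saved as `tokenization_chatglm.py` next to a ChatGLM SentencePiece model file):
```python
# Hypothetical module name for the file shown above.
from tokenization_chatglm import ChatGLMTokenizer

tokenizer = ChatGLMTokenizer(vocab_file="tokenizer.model")  # hypothetical local path
# Build chat-formatted input ids for a single user turn; returns a BatchEncoding
# with "input_ids", "attention_mask" and "position_ids" tensors.
encoded = tokenizer.build_chat_input("What is TensorRT-LLM?", role="user")
print(encoded["input_ids"].shape)
```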

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
SentencePiece~=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -2,7 +2,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,5 +1,5 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.43.0
datasets==2.14.6
evaluate~=0.4.1

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,3 +1,3 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
nemo-toolkit[all]==2.0.0rc1
megatron-core==0.8.0
datasets~=2.14.5

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets>=2.14.4
nemo-toolkit[all]==2.0.0rc1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
tensorrt_llm==0.17.0.post1
git+https://github.com/google-deepmind/recurrentgemma.git
tensorrt_llm==0.18.0
git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365
flax>=0.8.2
jax~=0.4.23
orbax-checkpoint==0.5.7

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece>=0.1.99

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,4 +1,4 @@
tensorrt_llm==0.17.0.post1
tensorrt_llm==0.18.0
tiktoken
datasets
kaldialign

Some files were not shown because too many files have changed in this diff.