From 3c0462002cbefde475d46acb5d5edefec92dfb6a Mon Sep 17 00:00:00 2001 From: Kaiyu Xie <26294424+kaiyux@users.noreply.github.com> Date: Wed, 2 Apr 2025 17:01:16 +0800 Subject: [PATCH] TensorRT-LLM v0.18 release (#3231) --- .pre-commit-config.yaml | 4 +- README.md | 6 +- .../tensorrt_llm/batch_manager/GptManager.h | 1 + .../batch_manager/allocateKvCache.h | 2 - .../batch_manager/evictionPolicy.h | 2 +- .../batch_manager/kvCacheManager.h | 38 ++- .../tensorrt_llm/batch_manager/kvCacheUtils.h | 2 +- .../tensorrt_llm/batch_manager/llmRequest.h | 7 +- .../tensorrt_llm/runtime/samplingConfig.h | 1 + .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../libtensorrt_llm_ucx_wrapper.so | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- .../libtensorrt_llm_batch_manager_static.a | 4 +- ...sorrt_llm_batch_manager_static.pre_cxx11.a | 4 +- .../libtensorrt_llm_ucx_wrapper.so | 2 +- .../x86_64-linux-gnu/version.txt | 4 +- .../tensorrt_llm_batch_manager_static.lib | 3 - .../x86_64-windows-msvc/version.txt | 2 - .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/aarch64-linux-gnu/version.txt | 6 +- .../libtensorrt_llm_executor_static.a | 4 +- ...ibtensorrt_llm_executor_static.pre_cxx11.a | 4 +- .../executor/x86_64-linux-gnu/version.txt | 6 +- .../tensorrt_llm_executor_static.lib | 3 - .../executor/x86_64-windows-msvc/version.txt | 2 - .../libtensorrt_llm_nvrtc_wrapper.so | 4 +- .../aarch64-linux-gnu/version.txt | 4 +- .../libtensorrt_llm_nvrtc_wrapper.so | 4 +- .../nvrtcWrapper/x86_64-linux-gnu/version.txt | 4 +- .../tensorrt_llm_nvrtc_wrapper.dll | 3 - .../tensorrt_llm_nvrtc_wrapper.lib | 3 - .../x86_64-windows-msvc/version.txt | 3 - ...orrt_llm_internal_cutlass_kernels_static.a | 4 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 4 +- .../aarch64-linux-gnu/version.txt | 6 +- ...orrt_llm_internal_cutlass_kernels_static.a | 4 +- ...nternal_cutlass_kernels_static.pre_cxx11.a | 4 +- .../x86_64-linux-gnu/version.txt | 6 +- ...rt_llm_internal_cutlass_kernels_static.lib | 3 - .../x86_64-windows-msvc/version.txt | 2 - cpp/tensorrt_llm/kernels/penaltyKernels.cu | 5 +- .../kernels/samplingTopKKernels.cu | 3 +- .../kernels/userbuffers/userbuffers.cu | 199 ++++++++--- cpp/tensorrt_llm/kernels/userbuffers/utils.h | 3 + .../weightOnlyBatchedGemv/kernelLauncher.h | 13 +- .../pybind/batch_manager/bindings.cpp | 2 +- .../scripts/build_chatglm_engines.py | 23 +- cpp/tests/resources/scripts/test_cpp.py | 6 - docker/Dockerfile.multi | 13 +- docker/Makefile | 6 +- docker/common/install_cuda_toolkit.sh | 2 +- docker/common/install_pytorch.sh | 5 +- docker/common/install_tensorrt.sh | 18 +- docs/requirements.txt | 2 +- docs/source/conf.py | 2 +- docs/source/index.rst | 2 - .../installation/build-from-source-windows.md | 199 ----------- docs/source/installation/grace-hopper.md | 2 +- docs/source/installation/windows.md | 81 ----- docs/source/overview.md | 2 +- docs/source/reference/support-matrix.md | 6 +- docs/source/release-notes.md | 17 + docs/source/torch.md | 2 +- examples/baichuan/requirements.txt | 2 +- examples/bloom/requirements.txt | 2 +- .../chatglm3-6b-32k/tokenization_chatglm.py | 313 ++++++++++++++++++ examples/chatglm/requirements.txt | 2 +- examples/commandr/requirements.txt | 2 +- examples/dbrx/requirements.txt | 2 +- examples/deepseek_v1/requirements.txt | 2 +- examples/draft_target_model/requirements.txt | 2 +- examples/eagle/requirements.txt | 2 +- examples/falcon/requirements.txt | 2 +- 
examples/gemma/requirements.txt | 2 +- examples/gpt/requirements.txt | 2 +- examples/gptj/requirements.txt | 2 +- examples/gptneox/requirements.txt | 2 +- examples/grok/requirements.txt | 2 +- examples/internlm/requirements.txt | 2 +- examples/jais/requirements.txt | 2 +- examples/llama/requirements.txt | 2 +- examples/lookahead/requirements.txt | 2 +- examples/mamba/requirements.txt | 2 +- examples/medusa/requirements.txt | 2 +- examples/mixtral/requirements.txt | 2 +- examples/mpt/requirements.txt | 2 +- examples/nemotron/requirements.txt | 2 +- examples/opt/requirements.txt | 2 +- examples/phi/requirements.txt | 2 +- examples/prompt_lookup/requirements.txt | 2 +- examples/quantization/requirements.txt | 2 +- examples/qwen/requirements.txt | 2 +- examples/qwenvl/requirements.txt | 2 +- examples/recurrentgemma/requirements.txt | 4 +- examples/redrafter/requirements.txt | 2 +- examples/skywork/requirements.txt | 2 +- examples/smaug/requirements.txt | 2 +- examples/whisper/requirements.txt | 2 +- requirements.txt | 12 +- .../_torch/attention_backend/flashinfer.py | 246 +++++++------- .../_torch/attention_backend/interface.py | 61 ++-- .../attention_backend/star_flashinfer.py | 5 +- .../_torch/attention_backend/trtllm.py | 4 +- .../_torch/attention_backend/vanilla.py | 6 +- .../models/modeling_multimodal_utils.py | 149 +++++++++ tensorrt_llm/_torch/models/modeling_vit.py | 3 +- tensorrt_llm/llmapi/_perf_evaluator.py | 4 +- .../runtime/multimodal_model_runner.py | 27 +- tensorrt_llm/version.py | 2 +- tests/_torch/test_fp4_gemm_quantize.py | 2 + tests/llmapi/test_llm_models.py | 2 +- 113 files changed, 1001 insertions(+), 707 deletions(-) delete mode 100644 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib delete mode 100644 cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt delete mode 100644 cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib delete mode 100644 cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib delete mode 100644 cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib delete mode 100644 cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt delete mode 100644 docs/source/installation/build-from-source-windows.md delete mode 100644 docs/source/installation/windows.md create mode 100644 examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py create mode 100644 tensorrt_llm/_torch/models/modeling_multimodal_utils.py diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 6c65a0b401..4224abebba 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -4,7 +4,7 @@ repos: hooks: - id: isort - repo: https://github.com/Lucas-C/pre-commit-hooks.git - rev: v1.1.13 + rev: v1.5.5 hooks: - id: remove-crlf - repo: https://github.com/google/yapf @@ -24,7 +24,7 @@ repos: - id: check-yaml - id: trailing-whitespace - repo: https://github.com/PyCQA/autoflake - rev: v1.6.1 + rev: v2.3.1 hooks: - id: autoflake args: ['--in-place', 
'--remove-all-unused-imports', '--remove-unused-variables'] diff --git a/README.md b/README.md index dc17c77dff..da4975cf1c 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@ TensorRT-LLM [![Documentation](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](https://nvidia.github.io/TensorRT-LLM/) [![python](https://img.shields.io/badge/python-3.12-green)](https://www.python.org/downloads/release/python-3123/) [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) -[![cuda](https://img.shields.io/badge/cuda-12.8.0-green)](https://developer.nvidia.com/cuda-downloads) -[![trt](https://img.shields.io/badge/TRT-10.8.0-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.17.0-green)](./tensorrt_llm/version.py) +[![cuda](https://img.shields.io/badge/cuda-12.8.1-green)](https://developer.nvidia.com/cuda-downloads) +[![trt](https://img.shields.io/badge/TRT-10.9.0-green)](https://developer.nvidia.com/tensorrt) +[![version](https://img.shields.io/badge/release-0.18.0-green)](./tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture/overview.md)   |   [Performance](./docs/source/performance/perf-overview.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/)   |   [Roadmap](https://docs.google.com/presentation/d/1gycPmtdh7uUcH6laOvW65Dbp9F1McUkGDIcAyjicBZs/edit?usp=sharing) diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index 40034b995f..aa9c5dc92c 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -94,6 +94,7 @@ private: [[nodiscard]] SizeType32 getMaxSequenceLen() const; [[nodiscard]] SizeType32 getMaxNumSequences() const; [[nodiscard]] SizeType32 getMaxDraftLen() const; + [[nodiscard]] SizeType32 getVocabSizePadded() const; void validateLlmRequest( LlmRequest& newReq, runtime::ModelConfig const& modelConfig, runtime::WorldConfig const& worldConfig) const; diff --git a/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h b/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h index 5236f902c8..5ccfe0ee11 100644 --- a/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h +++ b/cpp/include/tensorrt_llm/batch_manager/allocateKvCache.h @@ -26,8 +26,6 @@ namespace tensorrt_llm::batch_manager { -namespace tle = tensorrt_llm::executor; - class AllocateKvCache : Algorithm { using BaseKVCacheManager = tensorrt_llm::batch_manager::kv_cache_manager::BaseKVCacheManager; diff --git a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h index 17d558e675..2a288a42f4 100644 --- a/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h +++ b/cpp/include/tensorrt_llm/batch_manager/evictionPolicy.h @@ -56,7 +56,7 @@ public: struct ExpiringBlockComparator { - inline bool operator()(BlockPtr const& a, BlockPtr const& b) const + bool operator()(BlockPtr const& a, BlockPtr const& b) const { // If two blocks expire in the same millisecond, their expiration times will be equal. As a fallback, check the // raw pointer values. 
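The ExpiringBlockComparator hunk above ends on the comment explaining why the comparison falls back to raw pointer values when two blocks expire in the same millisecond: the tie-break keeps the ordering a strict weak ordering and makes iteration over the container deterministic. Below is a minimal standalone C++ sketch of that pattern; the Block type and field name here are hypothetical stand-ins, not the actual TensorRT-LLM kv-cache block class.

#include <chrono>
#include <memory>
#include <set>

// Hypothetical stand-in for a cache block that carries an expiration time.
struct Block
{
    std::chrono::steady_clock::time_point expirationTime;
};
using BlockPtr = std::shared_ptr<Block>;

// Order blocks by expiration time; when two blocks expire at the same
// instant, fall back to the raw pointer value so distinct blocks never
// compare equivalent and the set's iteration order stays deterministic.
struct ExpiringBlockComparator
{
    bool operator()(BlockPtr const& a, BlockPtr const& b) const
    {
        return a->expirationTime != b->expirationTime ? a->expirationTime < b->expirationTime
                                                      : a.get() < b.get();
    }
};

int main()
{
    std::set<BlockPtr, ExpiringBlockComparator> expiringBlocks;
    auto const now = std::chrono::steady_clock::now();
    expiringBlocks.insert(std::make_shared<Block>(Block{now}));
    // Same timestamp: the pointer tie-break keeps both entries in the set.
    expiringBlocks.insert(std::make_shared<Block>(Block{now}));
    return expiringBlocks.size() == 2 ? 0 : 1;
}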
diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 7596130fb0..fa5cc7fe9d 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -166,6 +166,8 @@ class KVCacheBlock public: using IdType = std::int32_t; + static constexpr IdType kCachedBlocksRootId = -1; + explicit KVCacheBlock(IdType blockId, kernels::KVCacheIndex blockIdx); void startScheduling(); @@ -379,6 +381,16 @@ public: return mKvCacheRetentionConfig.getDecodeDurationMs(); } + [[nodiscard]] bool getContextRequiresCyclicKvCache() const + { + return mContextRequiresCyclicKvCache; + } + + void setContextRequiresCyclicKvCache(bool contextRequiresCyclicKvCache) + { + mContextRequiresCyclicKvCache = contextRequiresCyclicKvCache; + } + private: // Request id of the sequence LlmRequest::RequestIdType mRequestId; @@ -392,6 +404,9 @@ private: runtime::ITensor::SharedPtr mCacheBlockIndices; // The retention priority to assign to decode blocks executor::KvCacheRetentionConfig mKvCacheRetentionConfig; + + // A value indicating whether or not the context is long enough to warrant the use of cyclic kv-cache. + bool mContextRequiresCyclicKvCache{false}; }; // attach metadata to a pool pointer @@ -443,7 +458,7 @@ public: SizeType32 maxNumSequences, std::shared_ptr stream, bool onboardBlocks, CacheType cacheType = CacheType::kSELF, std::optional secondaryOffloadMinPriority = std::nullopt, - std::shared_ptr eventManager = nullptr); + std::shared_ptr eventManager = nullptr, bool enableHashKey = false); ~BlockManager(); @@ -712,6 +727,9 @@ private: SizeType32 mMissedBlocks; std::set reusedBlockIds; + // Whether or not to maintain a hashmap of blocks. + bool mEnableHashKey; + private: friend class KVCacheManager; }; @@ -818,16 +836,18 @@ public: //! \details These blocks become reusable from next step. 
virtual void storeContextBlocks(LlmRequest const& llmRequest) = 0; - virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0; + [[nodiscard]] virtual bool schedulingHasFreeBlocks(SizeType32 numRequired = 1) const = 0; - virtual std::vector> const& getCacheBlockIds(LlmRequest::RequestIdType requestId) const = 0; + [[nodiscard]] virtual std::vector> const& getCacheBlockIds( + LlmRequest::RequestIdType requestId) const + = 0; - virtual std::vector>> getBatchCacheBlockIds( + [[nodiscard]] virtual std::vector>> getBatchCacheBlockIds( std::vector const& requestIds) const = 0; - virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0; - virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0; + [[nodiscard]] virtual runtime::ITensor::SharedPtr getPrimaryPool(SizeType32 layer_idx) const = 0; + [[nodiscard]] virtual SizeType32 getPoolLayerIdx(SizeType32 layer_idx) const = 0; virtual void refreshBlocks() = 0; virtual void flushIterationEvents() = 0; @@ -846,7 +866,7 @@ public: * 2 * modelConfig.getSizePerHead(); } - [[nodiscard]] static std::tuple const calculateMaxNumBlocks(KvCacheConfig const& config, + [[nodiscard]] static std::tuple calculateMaxNumBlocks(KvCacheConfig const& config, nvinfer1::DataType dtype, tensorrt_llm::runtime::ModelConfig const& modelConfig, tensorrt_llm::runtime::WorldConfig const& worldConfig, runtime::BufferManager const& bufferManager); @@ -924,7 +944,7 @@ public: return mBlockManager.getNumFreeBlocks(); } - [[nodiscard]] virtual SizeType32 getNumPools() const override + [[nodiscard]] SizeType32 getNumPools() const override { return mBlockManager.getNumPools(); } @@ -994,8 +1014,6 @@ public: /// @return The number of blocks [[nodiscard]] SizeType32 getRemainingBlocksToCompletion(LlmRequest const& req) const override; - void addContextTokens(LlmRequest::RequestIdType requestId, SizeType32 numTokens); - /// @brief Increase size for request with requestId. Allocate new KV cache block(s) if needed. 
void addToken(LlmRequest::RequestIdType requestId) override; diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h index 6febf09887..41a34fed80 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheUtils.h @@ -91,7 +91,7 @@ private: runtime::ITensor::SharedPtr mPool; runtime::ITensor::SharedPtr mCurrent; - const std::vector mBlockIds; + std::vector const mBlockIds; size_t mIdx; }; diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 4d80636647..c1aaebeca4 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -490,9 +490,14 @@ public: initialize(req.getInputTokenIds(), req.getOutputConfig().returnLogProbs); } - void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, + void validate(SizeType32 maxInputLen, SizeType32 maxSequenceLen, SizeType32 maxDraftLen, SizeType32 vocabSizePadded, std::optional maxEncoderInputLen = std::nullopt, bool enableKVCacheReuse = false) { + if (mEndId.has_value()) + { + TLLM_CHECK_WITH_INFO(*mEndId >= -1 && *mEndId < vocabSizePadded, + "EndId (%d) is not within acceptable range [-1, %d).", *mEndId, vocabSizePadded); + } TLLM_CHECK_WITH_INFO(!(maxEncoderInputLen.has_value() && getEncoderInputLen() > maxEncoderInputLen.value()), "Encoder length (%d) exceeds maximum encoder input length (%d).", getEncoderInputLen(), maxEncoderInputLen.value()); diff --git a/cpp/include/tensorrt_llm/runtime/samplingConfig.h b/cpp/include/tensorrt_llm/runtime/samplingConfig.h index 923dca81cd..d4b4ee3c26 100644 --- a/cpp/include/tensorrt_llm/runtime/samplingConfig.h +++ b/cpp/include/tensorrt_llm/runtime/samplingConfig.h @@ -21,6 +21,7 @@ #include "tensorrt_llm/layers/defaultDecodingParams.h" #include "tensorrt_llm/runtime/common.h" +#include #include #include #include diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index b08c13c0cb..1e363f63a4 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54aeaec28cc8cd7e5f62829fecf5af5be192e906333b108028af951fc6b6346d -size 9125406 +oid sha256:2d361766d0a13d5d88071e546f5d7ca51fef92300fcc7b261337c638746cbff1 +size 9123884 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 42ffff1710..23f1466689 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d5c2aac4ca61d80f8a61d06a7e17fcfb0c9428a3dd89ade705f14cf651acd4b -size 9169292 +oid sha256:e7a942b813cd05c5d21c82a7e5b5988227988668bf960b3a954443998b4e2a2b +size 9167324 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so index 
e9f4b7568b..88bcdfcdba 100755 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_ucx_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c067e858d968710bfe4575694ff5cabf1fb9d29924f6bd4cee552c4fd485a8ca -size 2026192 +oid sha256:a11179efe519b2b001d6021c7cbea949c81b3618395fa2ce44da9b09d7d35d14 +size 2029704 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 3d6f0c6ee3..1730f2bc44 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -ca50ae76421863dfebf6080b7f4f6b29 libtensorrt_llm_ucx_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +9f9942768fd5b0cf5ed19860ad539dc9 libtensorrt_llm_ucx_wrapper.so +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index e279e66354..de1c8e8b1d 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:092faef60d09d008331b2b09453d89563b5fe6c49a6a195051ad51ca80f79277 -size 8407972 +oid sha256:e2ec997b71d8b990eecc0034930b24319916ed3615a618f982e1b780d7643bc6 +size 8408224 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index e2cd91c68a..f6828f3a5b 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d00bce3aa2f555f98cb2213d183a3174bb083cd62179ac65dce24d75bd648eb -size 8374854 +oid sha256:2662dfb4833b41f71f4ef9cfe6a46d6312a04065c2343a899e1476102019a180 +size 8374456 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so index ec4557bc66..33347341b9 100755 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_ucx_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac4ad59907a322e6fbb85b9e88cea587cc4f5e4018017726895f64bd800c8552 +oid sha256:1a1d23d99e2b91fa11ee3e1fb94179c0731065059eb981da9434a42d98ffa4d8 size 15592 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt index 13f916f8de..bc11eb8b02 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -0ec83a0451530fcf0e3f325cdc185043 libtensorrt_llm_ucx_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +e383212a40dca932c7b77bf4544dab80 libtensorrt_llm_ucx_wrapper.so 
+3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib deleted file mode 100644 index f5ccca6cfe..0000000000 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/tensorrt_llm_batch_manager_static.lib +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a64b3c16150b34ad6437862eaf95c5c35acdf69facc40af14bc0632a16b7d162 -size 54093198 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt deleted file mode 100644 index edb78ee9fb..0000000000 --- a/cpp/tensorrt_llm/batch_manager/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,2 +0,0 @@ -41725f3b40ca44954bb9de6e7dcbfd2c tensorrt_llm_batch_manager_static.lib -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a index c3faa14335..719ef20171 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d333952a574e9056a0f01f788f9c053d92a7a9bc988a335df663697405f5659 -size 3102572 +oid sha256:d6ef115e34695dd0bec9df6069dd2e95615f401546ce275b133145fdb7568c6c +size 3102764 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index b8d3c22ade..a41548be59 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa6dcdbe9501429192102c1094be664d0ab07199acc4882ab061eb48b699d83c -size 3145248 +oid sha256:3bc68d4aec21a361954dd144084edb050e19390b87d6668f88b9e7f110f717a0 +size 3145744 diff --git a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt index c01167e075..5d29f71099 100644 --- a/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -15c05b1921f3f8cbb7bc1e53f189c661 libtensorrt_llm_executor_static.a -b586d90eac1293ea656ff2db8a35cd92 libtensorrt_llm_executor_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +288c6aa23b993d60d758107188c37d78 libtensorrt_llm_executor_static.a +20e46fb4b5b23a0f27eb3f8dd7d4d3bf libtensorrt_llm_executor_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a index 6594703a05..d568a4076a 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid 
sha256:a4b144867eb79d269db173338ccae8dc6c68a8374a49fe17a555034ac433f46f -size 3457528 +oid sha256:5b819d5cf3f7d9bd1ee69427db4b7ce3eb9c17f9e2cfa319540dad51ed6648e7 +size 3457520 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a index 296dcd3894..108882da9c 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/libtensorrt_llm_executor_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7b5f70dda7b8042d09098859a4bd1851159d3c750da97f3bfd55bf713c7a1cc -size 3447846 +oid sha256:9f123e25ff2e046afff092b082a60b461f3f13853630857bd166e5e8a084e1ee +size 3448406 diff --git a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt index 4abbd8fc13..0d7ff37de2 100644 --- a/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/executor/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -d573456348a96fd7a97aa832f1113986 libtensorrt_llm_executor_static.a -b3ba7776e3b5bb6e750e8412fc3b5c60 libtensorrt_llm_executor_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +eebaf66c6ac42645a9bf12a0b013ac4b libtensorrt_llm_executor_static.a +1fc4cc62abfb31ad51f6ae3620641c04 libtensorrt_llm_executor_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib deleted file mode 100644 index 827b388048..0000000000 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/tensorrt_llm_executor_static.lib +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3722133c17bdad38c3bbd3a5caa4eafbe17805d3a7fa784f10c313902020a13b -size 26350954 diff --git a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt deleted file mode 100644 index fbfc2fbcdf..0000000000 --- a/cpp/tensorrt_llm/executor/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,2 +0,0 @@ -5beaadd32fc3dd25770746016b293229 tensorrt_llm_executor_static.lib -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 9d01b7d0fe..aee08cf678 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41cb6788cd975f0f2ef1de0bdff9d3178280a3c2683abb11929e0d5ccecc9d76 -size 126752312 +oid sha256:f82db62eaeeb8a02d44b4cad97ae050cc447eea8a3e48b03f56f6040d1aaccc8 +size 126824176 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt index d36de4e84d..dba2e06400 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/aarch64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -200a2d19469277b9906a00f7da83fd04 libtensorrt_llm_nvrtc_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +f3143205203b038b9dca6dd32cf02f59 libtensorrt_llm_nvrtc_wrapper.so +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so index 5ae764d3e1..5167fc4b57 100755 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/libtensorrt_llm_nvrtc_wrapper.so @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:738668482149971eaa849b863360e21fe66781c5eeaadab8263c83a3b67637dc -size 133824576 +oid sha256:bdb100ae1f96025c5dd7e977cbae05005c2b2b3f36b902edc3a4f24d85ec3731 +size 133867944 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt index fcd6037b22..d5ce681f16 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-linux-gnu/version.txt @@ -1,2 +1,2 @@ -20761e50ba44b91a7a2d1f8d1c5c780b libtensorrt_llm_nvrtc_wrapper.so -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +770ca93818f3f04837a67353e3f71fbc libtensorrt_llm_nvrtc_wrapper.so +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll deleted file mode 100644 index 383dc043e8..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.dll +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5926bdebb3d02686d81b3e29edbdf3fb89e44f1518ae187a66284175fb980613 -size 1230336 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib deleted file mode 100644 index 51ec0f17b5..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/tensorrt_llm_nvrtc_wrapper.lib 
+++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:97eb854ba561c25d46c3e9a074dc5a8ba873923fd28d649c869d9db450e26a8a -size 3488 diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt deleted file mode 100644 index 788d1bb79d..0000000000 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQAImplJIT/nvrtcWrapper/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,3 +0,0 @@ -3082017cee538017c343567d938bb106 tensorrt_llm_nvrtc_wrapper.lib -b9b4bf6a2d38abae1d3e038ad73b3890 tensorrt_llm_nvrtc_wrapper.dll -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index 435b00d8fd..4ad2b77974 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c9e20ddfa9d8b200a595a3a59afd53356afe4ad1b030f0b2cf446cac7271c58 -size 53382370 +oid sha256:cb21d66bb8b8eec2f6e11696b2b9b4b629b92ab299fec6702f2102277bb453bf +size 53355202 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index b61ada9906..55eb2f88cd 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b615014ee1f3e9ef8fd498a72ef8f699b92727177416c05d08bbf1c95d3ff52e -size 53479636 +oid sha256:7be13e39772baa2ade76d60407fa0d56ecca58c39b24e020f1f0b58c0eede5f0 +size 53469348 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt index f8182838cb..e79ef6bc1b 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -32e01331abfcacf5b70854104ca4bf20 libtensorrt_llm_internal_cutlass_kernels_static.a -662d22cefd410c1851ac701e6e3bcbbf libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +6bf0ba4e9b8b1152a21316243d30bec6 libtensorrt_llm_internal_cutlass_kernels_static.a +96f8a359c84a78ba415f4d98ef1c4e1d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a 
b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a index f25f1646be..1198abdc61 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baf28ca8666062b417d251b608f31227cfac15676aa0bbbaacb1263befc9d408 -size 68138662 +oid sha256:d5cb27ba31185f16333b697bf9d913015ee85508e96aea2248162f3ff9a618b9 +size 68126454 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a index 3758e6c124..b3257f7cb4 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05358d9334e8976d463a40b4c6b0b5d780561556cb89194a8f243b0b69f59f33 -size 68308080 +oid sha256:c22c8b6856111183fc44fb11c8843ea8506f7297a97fee1e9a98414f9526118a +size 68295728 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt index 199022625a..67f5b21039 100644 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -d8af682c4274543b06992255e727f52e libtensorrt_llm_internal_cutlass_kernels_static.a -c797baf2a0a7538eb8f75e0f898ae208 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file +64df74edb7e87b69478e4f9a2c0b3bb7 libtensorrt_llm_internal_cutlass_kernels_static.a +68a43f555a5b930950a436ebb54a1267 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a +3c5fe5eb86077f67febc42070be11f11de17c1e2 commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib deleted file mode 100644 index 0baeacdc3d..0000000000 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/tensorrt_llm_internal_cutlass_kernels_static.lib +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b3856b1a5ae0f8f8e489f79d6396f2cc2e823536728b042082892502bcd33d76 -size 246691546 diff --git a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt b/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt deleted file mode 100644 index 8ca26b27ce..0000000000 --- a/cpp/tensorrt_llm/kernels/internal_cutlass_kernels/x86_64-windows-msvc/version.txt +++ /dev/null @@ -1,2 +0,0 @@ -ab51496e515622f560f4b989ed1d7e63 tensorrt_llm_internal_cutlass_kernels_static.lib -f8c0381a2bc50ee2739c3d8c2be481b31e5f00bd commit \ No newline at end of file diff --git a/cpp/tensorrt_llm/kernels/penaltyKernels.cu 
b/cpp/tensorrt_llm/kernels/penaltyKernels.cu index bb78192c24..257ce8a51f 100644 --- a/cpp/tensorrt_llm/kernels/penaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/penaltyKernels.cu @@ -213,8 +213,9 @@ __global__ void batchApplyPenalty(T const* const* inputLogits, T* outputLogits, if (hasMinLength) { __syncthreads(); - // Min length - if ((threadIdx.x == 0) && (currentStep - inputLen < minLength)) + // If current generation length is too short, make sure EOS doesn't have high probability. + // This check is not needed when endId is already -1 as generation won't stop on EOS anyway. + if ((threadIdx.x == 0) && (currentStep - inputLen < minLength) && endIds[batchSlot] > -1) { outLogitsPtr[endIds[batchSlot]] = MASK_VAL; } diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index a573d3911d..d6770adec9 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -79,7 +79,8 @@ __global__ void topKStage1(T const* __restrict logProbs, T const* const* __restr if (tid < k && endIds != nullptr) // if returnAllSelectedToken, endIds would not be an input { auto const index = tmpTopKBufIndex + tid; - if (blockLane == 0 && tid == 0) + // endId=-1 means generation doesn't stop upon encountering a certain token. + if (blockLane == 0 && tid == 0 && endIds[batchSlot] > -1) { auto const endId = endIds[batchSlot]; topKTmpIdBuf[index] = tmpLogBufIndex + endId; diff --git a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu index f797440c61..d211a99b81 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu +++ b/cpp/tensorrt_llm/kernels/userbuffers/userbuffers.cu @@ -252,29 +252,49 @@ __device__ __forceinline__ void MULTIMEM_ST2(ValType& val, PtrType ptr) asm volatile("multimem.st.global.v2.f32 [%0], {%1,%2};" ::"l"(ptr), "r"(val.x), "r"(val.y) : "memory"); } -template +template __device__ __forceinline__ void MULTIMEM_LD(ValType& val, PtrType ptr) { if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) - : "l"(ptr) - : "memory"); + if (!DISABLE_FP32_ACC) + { + asm("multimem.ld_reduce.global.add.v4.f16x2.acc::f32 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } + else + { + asm("multimem.ld_reduce.global.add.v4.f16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } } #ifdef ENABLE_BF16 if constexpr (std::is_same_v) { - asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" - : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) - : "l"(ptr) - : "memory"); + if (!DISABLE_FP32_ACC) + { + asm("multimem.ld_reduce.global.add.v4.bf16x2.acc::f32 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } + else + { + asm("multimem.ld_reduce.global.add.v4.bf16x2 {%0,%1,%2,%3}, [%4];" + : "=r"(val.x), "=r"(val.y), "=r"(val.z), "=r"(val.w) + : "l"(ptr) + : "memory"); + } } #endif } // All MC kernels here -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr) @@ -310,7 +330,7 @@ __global__ void __launch_bounds__(MAX_THREADS) 
userbuffers_fp16_sum_inplace_gpu_ uint4 val[UNROLL_MC]; #pragma unroll for (int i = 0; i < UNROLL_MC; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); #pragma unroll for (int i = 0; i < UNROLL_MC; i++) MULTIMEM_ST(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); @@ -318,7 +338,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ for (int line = end_aligned; line < end_elem; line += loop_step0) { uint4 val; - MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); + MULTIMEM_LD(val, mc_ptr + (lineoffset + line)); MULTIMEM_ST(val, mc_ptr + (lineoffset + line)); } __syncthreads(); @@ -336,7 +356,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ } // fp16 inplace reduce kernel (Hopper) MC #else -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr) @@ -382,8 +402,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg3), reinterpret_cast(&arg4), reinterpret_cast(&arg5), \ reinterpret_cast(&arg6), reinterpret_cast(&arg7), reinterpret_cast(&arg8), \ reinterpret_cast(&arg9), reinterpret_cast(&arg10)}; \ - TLLM_CUDA_CHECK( \ - cudaLaunchKernelExC(&cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ + &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc), kernelArgs)); \ } struct LaunchConfig @@ -529,7 +549,7 @@ __device__ uint32_t cvt_warp_fp16_to_fp4_mc(PackedVec& vec, float SFScaleV #endif } -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, @@ -574,7 +594,7 @@ __global__ void __launch_bounds__(MAX_THREADS) DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -643,7 +663,7 @@ __global__ void __launch_bounds__(MAX_THREADS) #endif } -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, @@ -687,7 +707,7 @@ __global__ void __launch_bounds__(MAX_THREADS) DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -744,7 +764,7 @@ __global__ void __launch_bounds__(MAX_THREADS) #if __CUDA_ARCH__ >= 900 -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -786,7 
+806,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -848,7 +868,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ *reduceidptr = reduce_id; } // quant kernel fp16->fp8 twoshot -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -889,7 +909,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ DType* x = reinterpret_cast(&val[0]); #pragma unroll for (int i = 0; i < UNROLL_NLINES; i++) - MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); + MULTIMEM_LD(val[i], mc_ptr + (lineoffset + line + i * loop_step0)); if (residual_in != nullptr) { @@ -997,7 +1017,7 @@ __global__ void __launch_bounds__(MAX_THREADS) } // residual allgather kernel #else -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -1018,7 +1038,7 @@ __global__ void __launch_bounds__(MAX_THREADS) asm volatile("brkpt;\n"); } -template +template __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot(int const op, int const flagoffset, int const firstrank, int const myrank, int const gpustep, size_t const lineoffset, int const numlines, void** commbuff, int const handleridx, float4* mc_ptr, DType const* beta, DType const* gamma, @@ -1059,8 +1079,8 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg12), reinterpret_cast(&arg13), reinterpret_cast(&arg14), \ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant), kernelArgs)); \ } #define callranksMC_RMSNORM_QUANT_ONESHOT(x) \ @@ -1091,8 +1111,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg12), reinterpret_cast(&arg13), reinterpret_cast(&arg14), \ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_oneshot), \ + kernelArgs)); \ } #define callranksMC_RMSNORM_QUANT_FP4(x) \ @@ -1127,8 +1148,8 @@ __global__ void __launch_bounds__(MAX_THREADS) 
userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20), \ reinterpret_cast(&arg21), reinterpret_cast(&arg22), reinterpret_cast(&arg23)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4), kernelArgs)); \ } #define callranksMC_RMSNORM_QUANT_FP4_ONESHOT(x) \ @@ -1163,8 +1184,9 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ reinterpret_cast(&arg15), reinterpret_cast(&arg16), reinterpret_cast(&arg17), \ reinterpret_cast(&arg18), reinterpret_cast(&arg19), reinterpret_cast(&arg20), \ reinterpret_cast(&arg21), reinterpret_cast(&arg22)}; \ - TLLM_CUDA_CHECK(cudaLaunchKernelExC( \ - &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot), kernelArgs)); \ + TLLM_CUDA_CHECK(cudaLaunchKernelExC(&cfg, \ + (void*) (userbuffers_fp16_sum_inplace_gpu_mc_rmsnorm_quant_fp4_oneshot), \ + kernelArgs)); \ } #define callranksMC_RES_AG(x) \ if (nlines == x) \ @@ -1189,7 +1211,7 @@ __global__ void __launch_bounds__(MAX_THREADS) userbuffers_fp16_sum_inplace_gpu_ &cfg, (void*) (userbuffers_fp16_sum_inplace_gpu_mc_res_allgather), kernelArgs)); \ } -template +template int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size_t const offset, size_t const elements, int const blocksize, communicator* comm, cudaStream_t stream, int op) { @@ -1226,7 +1248,7 @@ int allreduce2_userbuff_inplace_gpu(int const maxcredit, int const handler, size return sms; } -template +template void allreduce_nonsharp_inplace( int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream, int op) { @@ -1234,14 +1256,25 @@ void allreduce_nonsharp_inplace( return; int blocksize = elements * 2; int maxcredit = 0; - int sms = allreduce2_userbuff_inplace_gpu(maxcredit, handler, offset, elements, blocksize, comm, stream, op); + int sms; + if (DISABLE_FP32_ACC) + { + sms = allreduce2_userbuff_inplace_gpu( + maxcredit, handler, offset, elements, blocksize, comm, stream, op); + } + else + { + sms = allreduce2_userbuff_inplace_gpu( + maxcredit, handler, offset, elements, blocksize, comm, stream, op); + } } -template +template void allreduce2_userbuff_inplace( int const handler, size_t const offset, size_t const elements, communicator* comm, cudaStream_t stream) { - allreduce_nonsharp_inplace(handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2); + allreduce_nonsharp_inplace( + handler, offset, elements, comm, stream, userbuffers_allreduceop_nonsharp2); } bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size) @@ -1262,7 +1295,7 @@ bool use_oneshot_kernel(communicator* comm, size_t elements, int hidden_size) } } -template +template int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const offset, int const out_handler, size_t const out_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, void* residual_out, communicator* comm, cudaStream_t stream) @@ -1315,7 +1348,7 @@ int allreduce2_userbuff_inplace_rmsnorm_quant(int const handler, size_t const of return sms; } -template +template int allreduce2_userbuff_inplace_rmsnorm_quant_fp4(int const handler, size_t const offset, int 
const out_handler, size_t const out_offset, int const scale_handler, size_t const scale_offset, size_t const elements, int const hidden_size, void* beta, void* gamma, float eps, float* scalefactor, void* residual_in, @@ -1422,11 +1455,31 @@ void allreduce2_userbuff_inplace_impl(int const handler, size_t const offset, si { switch (dataType) { - case nvinfer1::DataType::kHALF: allreduce2_userbuff_inplace(handler, offset, elements, comm, stream); break; + case nvinfer1::DataType::kHALF: + { + if (kDISABLE_FP32_ACCUMULATION) + { + allreduce2_userbuff_inplace(handler, offset, elements, comm, stream); + } + else + { + allreduce2_userbuff_inplace(handler, offset, elements, comm, stream); + } + break; + } #ifdef ENABLE_BF16 case nvinfer1::DataType::kBF16: - allreduce2_userbuff_inplace<__nv_bfloat16>(handler, offset, elements, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + allreduce2_userbuff_inplace<__nv_bfloat16, true>(handler, offset, elements, comm, stream); + } + else + { + allreduce2_userbuff_inplace<__nv_bfloat16, false>(handler, offset, elements, comm, stream); + } break; + } #endif default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_impl"); } @@ -1458,14 +1511,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_impl(int const handler, size_t con switch (dataType) { case nvinfer1::DataType::kHALF: - return allreduce2_userbuff_inplace_rmsnorm_quant(handler, offset, out_handler, out_offset, elements, - hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant(handler, offset, out_handler, out_offset, + elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + } + else + { + return allreduce2_userbuff_inplace_rmsnorm_quant(handler, offset, out_handler, out_offset, + elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + } break; + } #ifdef ENABLE_BF16 case nvinfer1::DataType::kBF16: - return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16>(handler, offset, out_handler, out_offset, - elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, true>(handler, offset, out_handler, + out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, + stream); + } + else + { + return allreduce2_userbuff_inplace_rmsnorm_quant<__nv_bfloat16, false>(handler, offset, out_handler, + out_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, residual_out, comm, + stream); + } break; + } #endif default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl"); } @@ -1479,16 +1554,36 @@ int allreduce2_userbuff_inplace_rmsnorm_quant_fp4_impl(int const handler, size_t switch (dataType) { case nvinfer1::DataType::kHALF: - return allreduce2_userbuff_inplace_rmsnorm_quant_fp4(handler, offset, out_handler, out_offset, - scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, - residual_out, comm, stream); + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant_fp4(handler, offset, out_handler, out_offset, + scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, + residual_out, comm, stream); + } + else + { + return 
allreduce2_userbuff_inplace_rmsnorm_quant_fp4(handler, offset, out_handler, out_offset, + scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, + residual_out, comm, stream); + } break; #ifdef ENABLE_BF16 case nvinfer1::DataType::kBF16: - return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16>(handler, offset, out_handler, out_offset, - scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, residual_in, - residual_out, comm, stream); + { + if (kDISABLE_FP32_ACCUMULATION) + { + return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, true>(handler, offset, out_handler, + out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, + residual_in, residual_out, comm, stream); + } + else + { + return allreduce2_userbuff_inplace_rmsnorm_quant_fp4<__nv_bfloat16, false>(handler, offset, out_handler, + out_offset, scale_handler, scale_offset, elements, hidden_size, beta, gamma, eps, scalefactor, + residual_in, residual_out, comm, stream); + } break; + } #endif default: TLLM_THROW("Unsupported dataType for allreduce2_userbuff_inplace_rmsnorm_quant_impl"); } diff --git a/cpp/tensorrt_llm/kernels/userbuffers/utils.h b/cpp/tensorrt_llm/kernels/userbuffers/utils.h index 60a7f714ba..c280ab72e3 100644 --- a/cpp/tensorrt_llm/kernels/userbuffers/utils.h +++ b/cpp/tensorrt_llm/kernels/userbuffers/utils.h @@ -656,4 +656,7 @@ __inline__ __device__ T blockReduceSumV2(T* val) warpReduceSumV2(val); return (T) 0.0f; } + +static bool const kDISABLE_FP32_ACCUMULATION = getenv("TRTLLM_UB_AR_DISABLE_FP32_ACCUMULATION") != nullptr; + } // namespace tensorrt_llm::runtime::ub diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index aa27ac6b3b..73343ec2b7 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -96,18 +96,7 @@ inline bool is_supported(int arch, KernelType kernel_type) SUPPORT(KernelType::FP16Int8PerChannel); SUPPORT(KernelType::FP16Int4PerChannel); } - else if (arch >= 80 && arch < 90) - { - SUPPORT(KernelType::FP16Int8Groupwise); - SUPPORT(KernelType::BF16Int8Groupwise); - SUPPORT(KernelType::FP16Int4Groupwise); - SUPPORT(KernelType::BF16Int4Groupwise); - SUPPORT(KernelType::FP16Int8PerChannel); - SUPPORT(KernelType::BF16Int8PerChannel); - SUPPORT(KernelType::FP16Int4PerChannel); - SUPPORT(KernelType::BF16Int4PerChannel); - } - else if (arch >= 90 && arch != 120) + else if (arch >= 80) { SUPPORT(KernelType::FP16Int8Groupwise); SUPPORT(KernelType::BF16Int8Groupwise); diff --git a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp index 491d362b85..ce6d8b05f2 100644 --- a/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/batch_manager/bindings.cpp @@ -64,7 +64,7 @@ void initBindings(pybind11::module_& m) py::classh(m, "GenericLlmRequest") .def("validate", &GenLlmReq::validate, py::arg("max_input_len"), py::arg("max_seq_len"), - py::arg("max_draft_len"), py::arg("max_endocer_input_len") = std::nullopt, + py::arg("max_draft_len"), py::arg("vocab_size_padded"), py::arg("max_endocer_input_len") = std::nullopt, py::arg("enable_kv_cache_reuse") = false) .def("set_exclude_input_from_output", &GenLlmReq::setExcludeInputFromOutput, py::arg("exclude")) .def("get_num_tokens", &GenLlmReq::getNumTokens, py::arg("beam")) diff 
--git a/cpp/tests/resources/scripts/build_chatglm_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py index 7cd1d686fd..35efba1eb4 100644 --- a/cpp/tests/resources/scripts/build_chatglm_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -96,7 +96,10 @@ def build_engines(model_cache: typing.Optional[str] = None, world_size: int = 1, clean: Optional[bool] = False): - for model_name in ["chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b"]: + for model_name in [ + "chatglm-6b", "chatglm2-6b", "chatglm3-6b", "glm-10b", "glm-4-9b", + "chatglm3-6b-32k" + ]: is_chatglm_6b_or_glm_10b = model_name in ["chatglm-6b", "glm-10b"] if model_cache and (Path(model_cache) / model_name).is_dir(): model_cache_dir = Path(model_cache) / model_name @@ -136,20 +139,12 @@ def build_engines(model_cache: typing.Optional[str] = None, if ckpt_dir.is_dir(): shutil.rmtree(ckpt_dir, ignore_errors=True) - # Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B, hope to remove this in the future - if model_name == "chatglm-6b": + # Fix HF error for ChatGLM-6B / GLM-4-9B / ChatGLM2-6B / ChatGLM3-6B-32K, hope to remove this in the future + if model_name in [ + "chatglm-6b", "glm-4-9b", "chatglm2-6b", "chatglm3-6b-32k" + ]: shutil.copy( - chatglm_example_dir / "chatglm-6b/tokenization_chatglm.py", - hf_dir, - ) - if model_name == "glm-4-9b": - shutil.copy( - chatglm_example_dir / "glm-4-9b/tokenization_chatglm.py", - hf_dir, - ) - if model_name == "chatglm2-6b": - shutil.copy( - chatglm_example_dir / "chatglm2-6b/tokenization_chatglm.py", + chatglm_example_dir / f"{model_name}/tokenization_chatglm.py", hf_dir, ) diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index 200ed5baa8..cca154b1a0 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -530,12 +530,6 @@ def prepare_multi_gpu_model_tests(python_exe: str, resources_dir=resources_dir, model_cache_arg=model_cache_arg) - prepare_model_tests(model_name="chatglm", - python_exe=python_exe, - root_dir=root_dir, - resources_dir=resources_dir, - model_cache_arg=model_cache_arg) - def prepare_model_tests(model_name: str, python_exe: str, diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index d1b13d5f84..1593f242ac 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,9 +1,9 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch -ARG BASE_TAG=25.01-py3 +ARG BASE_TAG=25.03-py3 ARG DEVEL_IMAGE=devel -FROM ${BASE_IMAGE}:${BASE_TAG} as base +FROM ${BASE_IMAGE}:${BASE_TAG} AS base # https://www.gnu.org/software/bash/manual/html_node/Bash-Startup-Files.html # The default values come from `nvcr.io/nvidia/pytorch` @@ -14,7 +14,10 @@ ENV GITHUB_MIRROR=$GITHUB_MIRROR RUN echo "Using GitHub mirror: $GITHUB_MIRROR" SHELL ["/bin/bash", "-c"] -FROM base as devel +# Clean up the pip constraint file from the base NGC PyTorch image. 
+RUN [ -f /etc/pip/constraint.txt ] && : > /etc/pip/constraint.txt || true + +FROM base AS devel ARG PYTHON_VERSION="3.12.3" RUN echo "Using Python version: $PYTHON_VERSION" @@ -59,7 +62,7 @@ ARG TORCH_INSTALL_TYPE="skip" COPY docker/common/install_pytorch.sh install_pytorch.sh RUN bash ./install_pytorch.sh $TORCH_INSTALL_TYPE && rm install_pytorch.sh -FROM ${DEVEL_IMAGE} as wheel +FROM ${DEVEL_IMAGE} AS wheel WORKDIR /src/tensorrt_llm COPY benchmarks benchmarks COPY cpp cpp @@ -77,7 +80,7 @@ ARG BUILD_WHEEL_ARGS="--clean --trt_root /usr/local/tensorrt --python_bindings - RUN --mount=type=cache,target=/root/.cache/pip --mount=type=cache,target=/root/.cache/ccache \ python3 scripts/build_wheel.py ${BUILD_WHEEL_ARGS} -FROM ${DEVEL_IMAGE} as release +FROM ${DEVEL_IMAGE} AS release # Create a cache directory for pip RUN mkdir -p /root/.cache/pip diff --git a/docker/Makefile b/docker/Makefile index 9ea6eac62a..a6ebd81091 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -152,16 +152,16 @@ jenkins-aarch64_%: STAGE = devel jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell grep 'LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = ' ../jenkins/L0_MergeRequest.groovy | grep -o '".*"' | tr -d '"') jenkins-rockylinux8_%: STAGE = devel jenkins-rockylinux8_%: BASE_IMAGE = nvidia/cuda -jenkins-rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8 +jenkins-rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8 rockylinux8_%: STAGE = devel rockylinux8_%: BASE_IMAGE = nvidia/cuda -rockylinux8_%: BASE_TAG = 12.6.3-devel-rockylinux8 +rockylinux8_%: BASE_TAG = 12.8.1-devel-rockylinux8 # For x86_64 and aarch64 ubuntu22_%: STAGE = devel ubuntu22_%: BASE_IMAGE = nvidia/cuda -ubuntu22_%: BASE_TAG = 12.6.3-devel-ubuntu22.04 +ubuntu22_%: BASE_TAG = 12.8.1-devel-ubuntu22.04 trtllm_%: STAGE = release trtllm_%: PUSH_TO_STAGING := 0 diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh index 1045cd1237..042c26a0a8 100644 --- a/docker/common/install_cuda_toolkit.sh +++ b/docker/common/install_cuda_toolkit.sh @@ -5,7 +5,7 @@ set -ex # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file. # CUDA version is usually aligned with the latest NGC CUDA image tag. # Only use when public CUDA image is not ready. -CUDA_VER="12.8.0_570.86.10" +CUDA_VER="12.8.1_570.124.06" CUDA_VER_SHORT="${CUDA_VER%_*}" NVCC_VERSION_OUTPUT=$(nvcc --version) diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 605420b0e7..85371b81f1 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,9 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 -# PyTorch v2.5.1 has a fix for https://github.com/pytorch/pytorch/issues/138324. 
-TORCH_VERSION="2.5.1" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03 +TORCH_VERSION="2.6.0" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index f158cb3491..e0cc092666 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,20 +2,20 @@ set -ex -TRT_VER="10.8.0.43" +TRT_VER="10.9.0.34" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 -CUDA_VER="12.8" # 12.8.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03 +CUDA_VER="12.8" # 12.8.1 # Keep the installation for cuDNN if users want to install PyTorch with source codes. # PyTorch 2.x can compile with cuDNN v9. -CUDNN_VER="9.7.0.66-1" +CUDNN_VER="9.8.0.87-1" NCCL_VER="2.25.1-1+cuda12.8" -CUBLAS_VER="12.8.3.14-1" +CUBLAS_VER="12.8.4.1-1" # Align with the pre-installed CUDA / NVCC / NVRTC versions from # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html -NVRTC_VER="12.8.61-1" -CUDA_RUNTIME="12.8.57-1" -CUDA_DRIVER_VERSION="570.86.10-1.el8" +NVRTC_VER="12.8.93-1" +CUDA_RUNTIME="12.8.90-1" +CUDA_DRIVER_VERSION="570.124.06-1.el8" for i in "$@"; do case $i in @@ -116,7 +116,7 @@ install_tensorrt() { if [ -z "$ARCH" ];then ARCH=$(uname -m);fi if [ "$ARCH" = "arm64" ];then ARCH="aarch64";fi if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi - RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz" + RELEASE_URL_TRT="https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.9.0/tars/TensorRT-${TRT_VER}.Linux.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz" fi wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar tar -xf /tmp/TensorRT.tar -C /usr/local/ diff --git a/docs/requirements.txt b/docs/requirements.txt index b696542fed..cf58a547b5 100644 --- a/docs/requirements.txt +++ b/docs/requirements.txt @@ -1,7 +1,7 @@ sphinx>=7.0 sphinx-argparse sphinx-click -sphinx-rtd-theme +nvidia-sphinx-theme myst_parser breathe pygit2 diff --git a/docs/source/conf.py b/docs/source/conf.py index d0434836ab..c9964c09be 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -67,7 +67,7 @@ source_suffix = { '.md': 'markdown', } -html_theme = 'sphinx_rtd_theme' +html_theme = 'nvidia_sphinx_theme' html_static_path = ['_static'] # ------------------------ C++ Doc related -------------------------- diff --git a/docs/source/index.rst b/docs/source/index.rst index f4405ccf99..50f321e54a 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -27,8 +27,6 @@ Welcome to TensorRT-LLM's Documentation! installation/linux.md installation/build-from-source-linux.md - installation/windows.md - installation/build-from-source-windows.md installation/grace-hopper.md diff --git a/docs/source/installation/build-from-source-windows.md b/docs/source/installation/build-from-source-windows.md deleted file mode 100644 index de31e4fcce..0000000000 --- a/docs/source/installation/build-from-source-windows.md +++ /dev/null @@ -1,199 +0,0 @@ -(build-from-source-windows)= - -# Building from Source Code on Windows - -```{note} -This section is for advanced users. Skip this section if you plan to use the pre-built TensorRT-LLM release wheel. 
-``` - -## Prerequisites - -1. Install prerequisites listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document. -2. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path. -3. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). -4. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). - -## Building a TensorRT-LLM Docker Image - -### Docker Desktop - -1. Install [Docker Desktop on Windows](https://docs.docker.com/desktop/install/windows-install/). -2. Set the following configurations: - - 1. Right-click the Docker icon in the Windows system tray (bottom right of your taskbar) and select **Switch to Windows containers...**. - 2. In the Docker Desktop settings on the **General** tab, uncheck **Use the WSL 2 based image**. - 3. On the **Docker Engine** tab, set your configuration file to: - - ``` - { - "experimental": true - } - ``` - -```{note} -After building, copy the files out of your container. `docker cp` is not supported on Windows for Hyper-V based images. Unless you are using WSL 2 based images, mount a folder, for example, `trt-llm-build`, to your container when you run it for moving files between the container and host system. -``` - -### Acquire an Image - -The Docker container will be hosted for public download in a future release. At this time, it must be built manually. From the `TensorRT-LLM\windows\` folder, run the build command: - -```bash -docker build -f .\docker\Dockerfile -t tensorrt-llm-windows-build:latest . -``` - -And your image is now ready for use. - -### Run the Container - -Run the container in interactive mode with your build folder mounted. Specify a memory limit with the `-m` flag. By default, the limit is 2 GB, which is not sufficient to build TensorRT-LLM. - -```bash -docker run -it -m 12g -v .\trt-llm-build:C:\workspace\trt-llm-build tensorrt-llm-windows-build:latest -``` - -### Build and Extract Files - -1. Clone and setup the TensorRT-LLM repository within the container. - -```bash -git clone https://github.com/NVIDIA/TensorRT-LLM.git -cd TensorRT-LLM -git submodule update --init --recursive -``` - -2. Build TensorRT-LLM. This command generates `build\tensorrt_llm-*.whl`. - -```bash -python .\scripts\build_wheel.py -a "89-real" --trt_root C:\workspace\TensorRT-10.8.0.43\ -``` - -3. Copy or move `build\tensorrt_llm-*.whl` into your mounted folder so it can be accessed on your host machine. If you intend to use the C++ runtime, you'll also need to gather various DLLs from the build into your mounted folder. For more information, refer to [C++ Runtime Usage](#c-runtime-usage). - - - -## Building TensorRT-LLM on Bare Metal - -**Prerequisites** - -1. Install all prerequisites (`git`, `python`, `CUDA`) listed in our [Installing on Windows](https://nvidia.github.io/TensorRT-LLM/installation/windows.html) document. -2. Install Nsight NVTX. TensorRT-LLM on Windows currently depends on NVTX assets that do not come packaged with the CUDA 12.8.0 installer. To install these assets, download the [CUDA 11.8 Toolkit](https://developer.nvidia.com/cuda-11-8-0-download-archive?target_os=Windows&target_arch=x86_64). - - 1. During installation, select **Advanced installation**. - - 2. Nsight NVTX is located in the CUDA drop-down. - - 3. Deselect all packages, and select **Nsight NVTX**. - -3. 
Install the dependencies one of two ways: - - 1. Run the `setup_build_env.ps1` script, which installs CMake, Microsoft Visual Studio Build Tools, and TensorRT automatically with default settings. - - 1. Run PowerShell as Administrator to use the script. - - ```bash - ./setup_build_env.ps1 -TRTPath [-skipCMake] [-skipVSBuildTools] [-skipTRT] - ``` - - 2. Close and reopen PowerShell after running the script so that `Path` changes take effect. - - 3. Supply a directory that already exists to contain TensorRT to `-TRTPath`, for example, `-TRTPath ~/inference` may be valid, but `-TRTPath ~/inference/TensorRT` will not be valid if `TensorRT` does not exist. `-TRTPath` isn't required if `-skipTRT` is supplied. - - 2. Install the dependencies one at a time. - - 1. Install [CMake](https://cmake.org/download/), version 3.27.7 is recommended, and select the option to add it to the system path. - 2. Download and install [Visual Studio 2022](https://visualstudio.microsoft.com/). When prompted to select more Workloads, check **Desktop development with C++**. - 3. Download and unzip [TensorRT 10.8.0.43](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.8.0/zip/TensorRT-10.8.0.43.Windows.win10.cuda-12.8.zip). Move the folder to a location you can reference later, such as `%USERPROFILE%\inference\TensorRT`. - - 1. Add the libraries for TensorRT to your system's `Path` environment variable. Your `Path` should include a line like this: - - ```bash - %USERPROFILE%\inference\TensorRT\lib - ``` - - 2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. - - 3. Remove existing `tensorrt` wheels first by executing - - ```bash - pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings - pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 - ``` - - 4. Install the TensorRT core libraries, run PowerShell, and use `pip` to install the Python wheel. - - ```bash - pip install %USERPROFILE%\inference\TensorRT\python\tensorrt-*.whl - ``` - - 5. Verify that your TensorRT installation is working properly. - - ```bash - python -c "import tensorrt as trt; print(trt.__version__)" - ``` - - -**Steps** - -1. Launch a 64-bit Developer PowerShell. From your usual PowerShell terminal, run one of the following two commands. - - 1. If you installed Visual Studio Build Tools (that is, used the `setup_build_env.ps1` script): - - ```bash - & 'C:\Program Files (x86)\Microsoft Visual Studio\2022\BuildTools\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64 - ``` - - 2. If you installed Visual Studio Community (e.g. via manual GUI setup): - - ```bash - & 'C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1' -Arch amd64 - ``` - -2. In PowerShell, from the `TensorRT-LLM` root folder, run: - -```bash -python .\scripts\build_wheel.py -a "89-real" --trt_root -``` - -The `-a` flag specifies the device architecture. `"89-real"` supports GeForce 40-series cards. - -The flag `-D "ENABLE_MULTI_DEVICE=0"`, while not specified here, is implied on Windows. Multi-device inference is supported on Linux, but not on Windows. - -This command generates `build\tensorrt_llm-*.whl`. - -(c-runtime-usage)= -## Linking with the TensorRT-LLM C++ Runtime - -```{note} -This section is for advanced users. Skip this section if you do not intend to use the TensorRT-LLM C++ runtime directly. You must build from source to use the C++ runtime. 
-``` - -Building from source creates libraries that can be used if you wish to directly link against the C++ runtime for TensorRT-LLM. These libraries are also required if you wish to run C++ unit tests and some benchmarks. - -Building from source produces the following library files. -- `tensorrt_llm` libraries located in `cpp\build\tensorrt_llm` - - `tensorrt_llm.dll` - Shared library - - `tensorrt_llm.exp` - Export file - - `tensorrt_llm.lib` - Stub for linking to `tensorrt_llm.dll` -- Dependency libraries (these get copied to `tensorrt_llm\libs\`) - - `nvinfer_plugin_tensorrt_llm` libraries located in `cpp\build\tensorrt_llm\plugins\` - - `nvinfer_plugin_tensorrt_llm.dll` - - `nvinfer_plugin_tensorrt_llm.exp` - - `nvinfer_plugin_tensorrt_llm.lib` - - `th_common` libraries located in `cpp\build\tensorrt_llm\thop\` - - `th_common.dll` - - `th_common.exp` - - `th_common.lib` - -The locations of the DLLs, in addition to some `torch` DLLs and `TensorRT` DLLs, must be added to the Windows `Path` in order to use the TensorRT-LLM C++ runtime. Append the locations of these libraries to your `Path`. When complete, your `Path` should include lines similar to these: - -```bash -%USERPROFILE%\inference\TensorRT\lib -%USERPROFILE%\inference\TensorRT-LLM\cpp\build\tensorrt_llm -%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\tensorrt_llm\libs -%USERPROFILE%\AppData\Local\Programs\Python\Python310\Lib\site-packages\torch\lib -``` - -Your `Path` additions may differ, particularly if you used the Docker method and copied all the relevant DLLs into a single folder. - -Again, close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path`. diff --git a/docs/source/installation/grace-hopper.md b/docs/source/installation/grace-hopper.md index 98d8e5189c..bc48c702eb 100644 --- a/docs/source/installation/grace-hopper.md +++ b/docs/source/installation/grace-hopper.md @@ -5,7 +5,7 @@ 1. Install TensorRT-LLM (tested on Ubuntu 24.04). ```bash - pip3 install torch==2.5.1 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124 + pip3 install torch==2.6.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126 sudo apt-get -y install libopenmpi-dev && pip3 install --upgrade pip setuptools && pip3 install tensorrt_llm --extra-index-url https://pypi.nvidia.com ``` diff --git a/docs/source/installation/windows.md b/docs/source/installation/windows.md deleted file mode 100644 index 9f6bd51b71..0000000000 --- a/docs/source/installation/windows.md +++ /dev/null @@ -1,81 +0,0 @@ -(windows)= - -# Installing on Windows - -```{note} -The Windows release of TensorRT-LLM is currently in beta. -We recommend checking out the [v0.17.0 tag](https://github.com/NVIDIA/TensorRT-LLM/releases/tag/v0.17.0) for the most stable experience. -``` - -```{note} -TensorRT-LLM on Windows only supports single-GPU execution. -``` - -**Prerequisites** - -1. Clone this repository using [Git for Windows](https://git-scm.com/download/win). - -2. Install the dependencies one of two ways: - - 1. Install all dependencies together. - - 1. Run the provided PowerShell script `setup_env.ps1` located under the `/windows/` folder which installs Python and CUDA 12.8.0 automatically with default settings. Run PowerShell as Administrator to use the script. - - ```bash - ./setup_env.ps1 [-skipCUDA] [-skipPython] - ``` - - 2. Close and re-open any existing PowerShell or Git Bash windows so they pick up the new `Path` modified by the `setup_env.ps1` script above. - - 2. 
Install the dependencies one at a time. - - 1. Install [Python 3.10](https://www.python.org/ftp/python/3.10.11/python-3.10.11-amd64.exe). - - 1. Select **Add python.exe to PATH** at the start of the installation. The installation may only add the `python` command, but not the `python3` command. - 2. Navigate to the installation path `%USERPROFILE%\AppData\Local\Programs\Python\Python310` (`AppData` is a hidden folder) and copy `python.exe` to `python3.exe`. - - 2. Install [CUDA 12.8.0 Toolkit](https://developer.nvidia.com/cuda-12-8-0-download-archive?target_os=Windows&target_arch=x86_64). Use the Express Installation option. Installation may require a restart. - - 3. If using conda environment, run the following command before installing TensorRT-LLM. - ```bash - conda install -c conda-forge pyarrow - ``` - - -**Steps** - -1. Install TensorRT-LLM. - - If you have an existing TensorRT installation (from older versions of `tensorrt_llm`), please execute - - ```bash - pip uninstall -y tensorrt tensorrt_libs tensorrt_bindings - pip uninstall -y nvidia-cublas-cu12 nvidia-cuda-nvrtc-cu12 nvidia-cuda-runtime-cu12 nvidia-cudnn-cu12 - ``` - - before installing TensorRT-LLM with the following command. - - ```bash - pip install tensorrt_llm==0.17.0.post1 --extra-index-url https://download.pytorch.org/whl/ --extra-index-url https://pypi.nvidia.com - ``` - - Run the following command to verify that your TensorRT-LLM installation is working properly. - - ```bash - python -c "import tensorrt_llm; print(tensorrt_llm._utils.trt_version())" - ``` - -2. Build the model. -3. Deploy the model. - -**Known Issue** - -1. `OSError: exception: access violation reading 0x0000000000000000` during `import tensorrt_llm` or `trtllm-build`. - -This may be caused by an outdated Microsoft Visual C++ Redistributable Version. Please install -[the latest MSVC](https://learn.microsoft.com/en-us/cpp/windows/latest-supported-vc-redist?view=msvc-170#latest-microsoft-visual-c-redistributable-version) -and retry. Check the system path to make sure the latest version installed in `System32` is searched first. Check dependencies to make sure no other packages are using an outdated version (e.g. package `pyarrow` might contain an outdated MSVC DLL). - -2. OSError: [WinError 126] The specified module could not be found. Error loading “...\Lib\site-packages\torch\lib\fbgemm.dll” or one of its dependencies. - -Installing the latest [Build Tools for Visual Studio 2022] (https://visualstudio.microsoft.com/downloads/#build-tools-for-visual-studio-2022) will resolve the issue. diff --git a/docs/source/overview.md b/docs/source/overview.md index 2a2f8dadae..2c8f49bdcd 100644 --- a/docs/source/overview.md +++ b/docs/source/overview.md @@ -35,7 +35,7 @@ Certain limitations might apply. Refer to the {ref}`support-matrix` for more inf ### Native Windows Support -Application developers and AI enthusiasts can now benefit from accelerated LLMs running locally on PCs and Workstations powered by NVIDIA RTX and NVIDIA GeForce RTX GPUs. Refer to the {ref}`windows` section for more information. +Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases. ## What Can You Do With TensorRT-LLM? 
diff --git a/docs/source/reference/support-matrix.md b/docs/source/reference/support-matrix.md index bcbcb196de..f072f48015 100644 --- a/docs/source/reference/support-matrix.md +++ b/docs/source/reference/support-matrix.md @@ -90,7 +90,7 @@ In addition, older architectures can have limitations for newer software release * - - Hardware Compatibility * - Operating System - - TensorRT-LLM requires Linux x86_64, Linux aarch64 or Windows. + - TensorRT-LLM requires Linux x86_64 or Linux aarch64. * - GPU Model Architectures - - [NVIDIA Blackwell Architecture](https://www.nvidia.com/en-us/data-center/technologies/blackwell-architecture/) @@ -112,9 +112,9 @@ The following table shows the supported software for TensorRT-LLM. * - - Software Compatibility * - Container - - [25.01](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) + - [25.03](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html) * - TensorRT - - [10.8](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) + - [10.9](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html) * - Precision - - Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4 diff --git a/docs/source/release-notes.md b/docs/source/release-notes.md index 8ed209badc..ea75de3c34 100644 --- a/docs/source/release-notes.md +++ b/docs/source/release-notes.md @@ -5,6 +5,23 @@ All published functionality in the Release Notes has been fully tested and verified with known limitations documented. To share feedback about this release, access our [NVIDIA Developer Forum](https://forums.developer.nvidia.com/). +## TensorRT-LLM Release 0.18.0 + +### Key Features and Enhancements + - **Features that were previously available in the 0.18.0.dev pre-releases are not included in this release**. + - [BREAKING CHANGE] Windows platform support is deprecated as of v0.18.0. All Windows-related code and functionality will be completely removed in future releases. + +### Known Issues + - The PyTorch workflow on SBSA is incompatible with bare metal environments like Ubuntu 24.04. Please use the [PyTorch NGC Container](https://catalog.ngc.nvidia.com/orgs/nvidia/containers/pytorch) for optimal support on SBSA platforms. + +### Infrastructure Changes + - The base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:25.03-py3`. + - The base Docker image for TensorRT-LLM Backend is updated to `nvcr.io/nvidia/tritonserver:25.03-py3`. + - The dependent TensorRT version is updated to 10.9. + - The dependent CUDA version is updated to 12.8.1. + - The dependent NVIDIA ModelOpt version is updated to 0.25 for Linux platform. 
+ + ## TensorRT-LLM Release 0.17.0 ### Key Features and Enhancements diff --git a/docs/source/torch.md b/docs/source/torch.md index 1522d1dd3d..fc76269382 100644 --- a/docs/source/torch.md +++ b/docs/source/torch.md @@ -33,7 +33,7 @@ Or you can try the following commands to get a quantized model by yourself: ```bash git clone https://github.com/NVIDIA/TensorRT-Model-Optimizer.git -cd TensorRT-Model-Optimizer/llm_ptq +cd TensorRT-Model-Optimizer/examples/llm_ptq scripts/huggingface_example.sh --model --quant fp8 --export_fmt hf ``` diff --git a/examples/baichuan/requirements.txt b/examples/baichuan/requirements.txt index 8ff315a52c..b333586508 100644 --- a/examples/baichuan/requirements.txt +++ b/examples/baichuan/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.15.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/bloom/requirements.txt b/examples/bloom/requirements.txt index 8c677a9944..59c42359ab 100644 --- a/examples/bloom/requirements.txt +++ b/examples/bloom/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py b/examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py new file mode 100644 index 0000000000..a260656e7d --- /dev/null +++ b/examples/chatglm/chatglm3-6b-32k/tokenization_chatglm.py @@ -0,0 +1,313 @@ +import json +import os +from typing import Dict, List, Optional, Union + +from sentencepiece import SentencePieceProcessor +from transformers import PreTrainedTokenizer +from transformers.tokenization_utils_base import BatchEncoding, EncodedInput +from transformers.utils import PaddingStrategy + + +class SPTokenizer: + + def __init__(self, model_path: str): + # reload tokenizer + assert os.path.isfile(model_path), model_path + self.sp_model = SentencePieceProcessor(model_file=model_path) + + # BOS / EOS token IDs + self.n_words: int = self.sp_model.vocab_size() + self.bos_id: int = self.sp_model.bos_id() + self.eos_id: int = self.sp_model.eos_id() + self.pad_id: int = self.sp_model.unk_id() + assert self.sp_model.vocab_size() == self.sp_model.get_piece_size() + + special_tokens = [ + "[MASK]", "[gMASK]", "[sMASK]", "sop", "eop", "<|system|>", + "<|user|>", "<|assistant|>", "<|observation|>" + ] + self.special_tokens = {} + self.index_special_tokens = {} + for token in special_tokens: + self.special_tokens[token] = self.n_words + self.index_special_tokens[self.n_words] = token + self.n_words += 1 + + def tokenize(self, s: str): + return self.sp_model.EncodeAsPieces(s) + + def encode(self, s: str, bos: bool = False, eos: bool = False) -> List[int]: + assert type(s) is str + t = self.sp_model.encode(s) + if bos: + t = [self.bos_id] + t + if eos: + t = t + [self.eos_id] + return t + + def decode(self, t: List[int]) -> str: + text, buffer = "", [] + for token in t: + if token in self.index_special_tokens: + if buffer: + text += self.sp_model.decode(buffer) + buffer = [] + text += self.index_special_tokens[token] + else: + buffer.append(token) + if buffer: + text += self.sp_model.decode(buffer) + return text + + def decode_tokens(self, tokens: List[str]) -> str: + text = self.sp_model.DecodePieces(tokens) + return text + + def convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. 
""" + if token in self.special_tokens: + return self.special_tokens[token] + return self.sp_model.PieceToId(token) + + def convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + if index in self.index_special_tokens: + return self.index_special_tokens[index] + if index in [self.eos_id, self.bos_id, self.pad_id] or index < 0: + return "" + return self.sp_model.IdToPiece(index) + + +class ChatGLMTokenizer(PreTrainedTokenizer): + vocab_files_names = {"vocab_file": "tokenizer.model"} + + model_input_names = ["input_ids", "attention_mask", "position_ids"] + + def __init__(self, + vocab_file, + padding_side="left", + clean_up_tokenization_spaces=False, + **kwargs): + self.name = "GLMTokenizer" + + self.vocab_file = vocab_file + self.tokenizer = SPTokenizer(vocab_file) + self.special_tokens = { + "": self.tokenizer.bos_id, + "": self.tokenizer.eos_id, + "": self.tokenizer.pad_id + } + super().__init__( + padding_side=padding_side, + clean_up_tokenization_spaces=clean_up_tokenization_spaces, + **kwargs) + + def get_command(self, token): + if token in self.special_tokens: + return self.special_tokens[token] + assert token in self.tokenizer.special_tokens, f"{token} is not a special token for {self.name}" + return self.tokenizer.special_tokens[token] + + @property + def unk_token(self) -> str: + return "" + + @property + def pad_token(self) -> str: + return "" + + @property + def pad_token_id(self): + return self.get_command("") + + @property + def eos_token(self) -> str: + return "" + + @property + def eos_token_id(self): + return self.get_command("") + + @property + def vocab_size(self): + return self.tokenizer.n_words + + def get_vocab(self): + """ Returns vocab as a dict """ + vocab = { + self._convert_id_to_token(i): i + for i in range(self.vocab_size) + } + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text, **kwargs): + return self.tokenizer.tokenize(text) + + def _convert_token_to_id(self, token): + """ Converts a token (str) in an id using the vocab. """ + return self.tokenizer.convert_token_to_id(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) in a token (str) using the vocab.""" + return self.tokenizer.convert_id_to_token(index) + + def convert_tokens_to_string(self, tokens: List[str]) -> str: + return self.tokenizer.decode_tokens(tokens) + + def save_vocabulary(self, save_directory, filename_prefix=None): + """ + Save the vocabulary and special tokens file to a directory. + + Args: + save_directory (`str`): + The directory in which to save the vocabulary. + filename_prefix (`str`, *optional*): + An optional prefix to add to the named of the saved files. + + Returns: + `Tuple(str)`: Paths to the files saved. 
+ """ + if os.path.isdir(save_directory): + vocab_file = os.path.join(save_directory, + self.vocab_files_names["vocab_file"]) + else: + vocab_file = save_directory + + with open(self.vocab_file, 'rb') as fin: + proto_bytes = fin.read() + + with open(vocab_file, "wb") as writer: + writer.write(proto_bytes) + + return (vocab_file, ) + + def get_prefix_tokens(self): + prefix_tokens = [self.get_command("[gMASK]"), self.get_command("sop")] + return prefix_tokens + + def build_single_message(self, role, metadata, message): + assert role in ["system", "user", "assistant", "observation"], role + role_tokens = [self.get_command(f"<|{role}|>") + ] + self.tokenizer.encode(f"{metadata}\n") + message_tokens = self.tokenizer.encode(message) + tokens = role_tokens + message_tokens + return tokens + + def build_chat_input(self, query, history=None, role="user"): + if history is None: + history = [] + input_ids = [] + for item in history: + content = item["content"] + if item["role"] == "system" and "tools" in item: + content = content + "\n" + json.dumps( + item["tools"], indent=4, ensure_ascii=False) + input_ids.extend( + self.build_single_message(item["role"], + item.get("metadata", ""), content)) + input_ids.extend(self.build_single_message(role, "", query)) + input_ids.extend([self.get_command("<|assistant|>")]) + return self.batch_encode_plus([input_ids], + return_tensors="pt", + is_split_into_words=True) + + def build_inputs_with_special_tokens( + self, + token_ids_0: List[int], + token_ids_1: Optional[List[int]] = None) -> List[int]: + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. A BERT sequence has the following format: + + - single sequence: `[CLS] X [SEP]` + - pair of sequences: `[CLS] A [SEP] B [SEP]` + + Args: + token_ids_0 (`List[int]`): + List of IDs to which the special tokens will be added. + token_ids_1 (`List[int]`, *optional*): + Optional second list of IDs for sequence pairs. + + Returns: + `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens. + """ + prefix_tokens = self.get_prefix_tokens() + token_ids_0 = prefix_tokens + token_ids_0 + if token_ids_1 is not None: + token_ids_0 = token_ids_0 + token_ids_1 + [ + self.get_command("") + ] + return token_ids_0 + + def _pad( + self, + encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding], + max_length: Optional[int] = None, + padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD, + pad_to_multiple_of: Optional[int] = None, + return_attention_mask: Optional[bool] = None, + padding_side: str = "left", # wili, fix for new transformers + ) -> dict: + """ + Pad encoded inputs (on left/right and up to predefined length or max length in the batch) + + Args: + encoded_inputs: + Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`). + max_length: maximum length of the returned list and optionally padding length (see below). + Will truncate by taking into account the special tokens. + padding_strategy: PaddingStrategy to use for padding. 
+ + - PaddingStrategy.LONGEST Pad to the longest sequence in the batch + - PaddingStrategy.MAX_LENGTH: Pad to the max length (default) + - PaddingStrategy.DO_NOT_PAD: Do not pad + The tokenizer padding sides are defined in self.padding_side: + + - 'left': pads on the left of the sequences + - 'right': pads on the right of the sequences + pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value. + This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability + `>= 7.5` (Volta). + return_attention_mask: + (optional) Set to False to avoid returning attention mask (default: set to model specifics) + """ + # Load from model defaults + assert self.padding_side == "left" + + required_input = encoded_inputs[self.model_input_names[0]] + seq_length = len(required_input) + + if padding_strategy == PaddingStrategy.LONGEST: + max_length = len(required_input) + + if max_length is not None and pad_to_multiple_of is not None and ( + max_length % pad_to_multiple_of != 0): + max_length = ( + (max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of + + needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len( + required_input) != max_length + + # Initialize attention mask if not present. + if "attention_mask" not in encoded_inputs: + encoded_inputs["attention_mask"] = [1] * seq_length + + if "position_ids" not in encoded_inputs: + encoded_inputs["position_ids"] = list(range(seq_length)) + + if needs_to_be_padded: + difference = max_length - len(required_input) + + if "attention_mask" in encoded_inputs: + encoded_inputs["attention_mask"] = [ + 0 + ] * difference + encoded_inputs["attention_mask"] + if "position_ids" in encoded_inputs: + encoded_inputs["position_ids"] = [ + 0 + ] * difference + encoded_inputs["position_ids"] + encoded_inputs[self.model_input_names[ + 0]] = [self.pad_token_id] * difference + required_input + + return encoded_inputs diff --git a/examples/chatglm/requirements.txt b/examples/chatglm/requirements.txt index ccfdf00b22..9af012fb93 100644 --- a/examples/chatglm/requirements.txt +++ b/examples/chatglm/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 protobuf diff --git a/examples/commandr/requirements.txt b/examples/commandr/requirements.txt index 39a247e708..f5d7970be4 100644 --- a/examples/commandr/requirements.txt +++ b/examples/commandr/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/dbrx/requirements.txt b/examples/dbrx/requirements.txt index e05fb8b017..b5041429fe 100644 --- a/examples/dbrx/requirements.txt +++ b/examples/dbrx/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/deepseek_v1/requirements.txt b/examples/deepseek_v1/requirements.txt index 4cf08453e3..752f91c48c 100644 --- a/examples/deepseek_v1/requirements.txt +++ b/examples/deepseek_v1/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/draft_target_model/requirements.txt b/examples/draft_target_model/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/draft_target_model/requirements.txt +++ b/examples/draft_target_model/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 
datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/eagle/requirements.txt b/examples/eagle/requirements.txt index bcc8468154..851cd3e45d 100644 --- a/examples/eagle/requirements.txt +++ b/examples/eagle/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 SentencePiece~=0.1.99 diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 012045e5f1..af1fce6b5c 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers>=4.31.0 datasets~=2.14.5 evaluate~=0.4.1 diff --git a/examples/gemma/requirements.txt b/examples/gemma/requirements.txt index e3de0c99ce..9733032d05 100644 --- a/examples/gemma/requirements.txt +++ b/examples/gemma/requirements.txt @@ -2,7 +2,7 @@ # WAR the new posting of "nvidia-cudnn-cu12~=9.0". # "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9". nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64" -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 flax~=0.8.0 # jax[cuda12_pip]~=0.4.19; platform_system != "Windows" jax~=0.4.19; platform_system == "Windows" diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 49d198a5b6..a147c2f04f 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptj/requirements.txt b/examples/gptj/requirements.txt index ed02cf1ea5..e82e5403ca 100644 --- a/examples/gptj/requirements.txt +++ b/examples/gptj/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/gptneox/requirements.txt b/examples/gptneox/requirements.txt index 4ade6c457e..61e9be0a1c 100644 --- a/examples/gptneox/requirements.txt +++ b/examples/gptneox/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 evaluate~=0.4.1 diff --git a/examples/grok/requirements.txt b/examples/grok/requirements.txt index 8bd9d2e519..1ed21f8a72 100644 --- a/examples/grok/requirements.txt +++ b/examples/grok/requirements.txt @@ -1,5 +1,5 @@ -f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt index f3384bb7d2..3583d8cc11 100644 --- a/examples/internlm/requirements.txt +++ b/examples/internlm/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/jais/requirements.txt b/examples/jais/requirements.txt index 49d198a5b6..a147c2f04f 100644 --- a/examples/jais/requirements.txt +++ b/examples/jais/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/llama/requirements.txt b/examples/llama/requirements.txt index 6c8bfdb84b..817652fff7 100644 --- a/examples/llama/requirements.txt +++ b/examples/llama/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers>=4.43.0 datasets==2.14.6 evaluate~=0.4.1 diff --git 
a/examples/lookahead/requirements.txt b/examples/lookahead/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/lookahead/requirements.txt +++ b/examples/lookahead/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/mamba/requirements.txt b/examples/mamba/requirements.txt index 4c74bdb527..991a392421 100644 --- a/examples/mamba/requirements.txt +++ b/examples/mamba/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers>=4.39.0 datasets~=2.14.5 evaluate diff --git a/examples/medusa/requirements.txt b/examples/medusa/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/medusa/requirements.txt +++ b/examples/medusa/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/mixtral/requirements.txt b/examples/mixtral/requirements.txt index 5918f86123..d8d78c6726 100644 --- a/examples/mixtral/requirements.txt +++ b/examples/mixtral/requirements.txt @@ -1,3 +1,3 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 transformers==4.38.2 accelerate==0.25.0 diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index ed02cf1ea5..e82e5403ca 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/nemotron/requirements.txt b/examples/nemotron/requirements.txt index 810efadc3f..714eba33dd 100644 --- a/examples/nemotron/requirements.txt +++ b/examples/nemotron/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 nemo-toolkit[all]==2.0.0rc1 megatron-core==0.8.0 datasets~=2.14.5 diff --git a/examples/opt/requirements.txt b/examples/opt/requirements.txt index ed02cf1ea5..e82e5403ca 100644 --- a/examples/opt/requirements.txt +++ b/examples/opt/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/phi/requirements.txt b/examples/phi/requirements.txt index c86d655add..bef827d3ca 100644 --- a/examples/phi/requirements.txt +++ b/examples/phi/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/prompt_lookup/requirements.txt b/examples/prompt_lookup/requirements.txt index 32b0788b09..654b312853 100644 --- a/examples/prompt_lookup/requirements.txt +++ b/examples/prompt_lookup/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://pypi.nvidia.com -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 diff --git a/examples/quantization/requirements.txt b/examples/quantization/requirements.txt index 81d80a1eda..3bc34bba56 100644 --- a/examples/quantization/requirements.txt +++ b/examples/quantization/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets>=2.14.4 nemo-toolkit[all]==2.0.0rc1 rouge_score~=0.1.2 diff --git a/examples/qwen/requirements.txt b/examples/qwen/requirements.txt index 6f015adc09..630b9bfa44 100644 --- a/examples/qwen/requirements.txt +++ b/examples/qwen/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git 
a/examples/qwenvl/requirements.txt b/examples/qwenvl/requirements.txt index bd5966a120..debbcdaf91 100644 --- a/examples/qwenvl/requirements.txt +++ b/examples/qwenvl/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.16.0 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/recurrentgemma/requirements.txt b/examples/recurrentgemma/requirements.txt index 26cc3f714d..0d98bb13b2 100644 --- a/examples/recurrentgemma/requirements.txt +++ b/examples/recurrentgemma/requirements.txt @@ -1,5 +1,5 @@ -tensorrt_llm==0.17.0.post1 -git+https://github.com/google-deepmind/recurrentgemma.git +tensorrt_llm==0.18.0 +git+https://github.com/google-deepmind/recurrentgemma.git@8a32e365 flax>=0.8.2 jax~=0.4.23 orbax-checkpoint==0.5.7 diff --git a/examples/redrafter/requirements.txt b/examples/redrafter/requirements.txt index d6aece6754..5621123750 100644 --- a/examples/redrafter/requirements.txt +++ b/examples/redrafter/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece>=0.1.99 diff --git a/examples/skywork/requirements.txt b/examples/skywork/requirements.txt index 8e3e9b528b..e164ec3254 100644 --- a/examples/skywork/requirements.txt +++ b/examples/skywork/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets~=2.16.1 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/smaug/requirements.txt b/examples/smaug/requirements.txt index fef42c7e41..8f8bec16d6 100644 --- a/examples/smaug/requirements.txt +++ b/examples/smaug/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 datasets==2.14.6 evaluate~=0.4.1 rouge_score~=0.1.2 diff --git a/examples/whisper/requirements.txt b/examples/whisper/requirements.txt index a1fab87089..6c82f0c0d4 100644 --- a/examples/whisper/requirements.txt +++ b/examples/whisper/requirements.txt @@ -1,4 +1,4 @@ -tensorrt_llm==0.17.0.post1 +tensorrt_llm==0.18.0 tiktoken datasets kaldialign diff --git a/requirements.txt b/requirements.txt index 2ebec77668..d764bbeb20 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,11 +17,11 @@ pandas h5py==3.12.1 StrEnum sentencepiece>=0.1.99 -tensorrt~=10.8.0 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-01.html#rel-25-01 uses 2.6.0a0. -torch>=2.5.1,<=2.6.0a0 +tensorrt~=10.9.0 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-03.html#rel-25-03 uses 2.7.0a0. +torch>=2.6.0,<=2.7.0a0 torchvision -nvidia-modelopt[torch]~=0.23.0 +nvidia-modelopt[torch]~=0.25.0 nvidia-nccl-cu12 nvidia-cuda-nvrtc-cu12 # TODO: Remove the <4.48.0 constraint once https://github.com/huggingface/transformers/issues/35625 is resolved. 
@@ -41,5 +41,5 @@ uvicorn httpx setuptools ordered-set -flashinfer @ git+https://github.com/flashinfer-ai/flashinfer.git@06309c4e -s2wrapper @ git+https://github.com/bfshi/scaling_on_scales.git@60da2afe +einops +flashinfer-python~=0.2.0 diff --git a/tensorrt_llm/_torch/attention_backend/flashinfer.py b/tensorrt_llm/_torch/attention_backend/flashinfer.py index 78f7104417..aeeee513c0 100644 --- a/tensorrt_llm/_torch/attention_backend/flashinfer.py +++ b/tensorrt_llm/_torch/attention_backend/flashinfer.py @@ -8,7 +8,8 @@ import torch from flashinfer.jit.core import check_cuda_arch from tensorrt_llm._torch.attention_backend.interface import ( - AttentionBackend, AttentionMask, AttentionMetadata, PredefinedAttentionMask) + AttentionBackend, AttentionMask, AttentionMetadata, PredefinedAttentionMask, + dummy_forward) from tensorrt_llm.functional import AttentionMaskType from tensorrt_llm.models.modeling_utils import QuantConfig @@ -420,123 +421,6 @@ class FlashInferAttention(AttentionBackend[FlashInferAttentionMetadata]): if quant_mode.has_fp8_kv_cache(): self.has_fp8_kv_cache = True - @torch.library.custom_op("trtllm::flashinfer_forward", mutates_args=()) - @staticmethod - def forward_pattern( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - num_heads: int, - head_dim: int, - num_kv_heads: int, - layer_idx: int, - has_fp8_kv_cache: bool, - attention_mask_type: int, - attention_mask_data: Optional[torch.Tensor], - ) -> torch.Tensor: - ''' - Wrapping the flashinfer forward as a custom op is required to fix `torch.compile` graph breaks, - otherwise it will graph break when calling `metadata.num_contexts` since it convert tensor's sum directly to int. - ''' - # torch.compile does not support custom object as arguments, so we have to use global function to get the metadata. - metadata = get_metadata() - - # This is only for memory estimation for now. - # NOTE: this method is not accurate while it works for most scenario. 
- if metadata is None or metadata.kv_cache_manager is None: - q = q.view(1, -1, num_heads, head_dim) - k = k.view(1, -1, num_kv_heads, head_dim) - v = v.view(1, -1, num_kv_heads, head_dim) - return AttentionBackend.dummy_forward(q, k, v) - - assert isinstance( - metadata, - FlashInferAttentionMetadata, - ) - - # Query - q = q.view(-1, num_heads, head_dim) - - # Key and Value - kv_cache = metadata.kv_cache_manager.get_buffers(layer_idx) - - if k is not None and v is not None: - k = k.view(-1, num_kv_heads, head_dim) - v = v.view(-1, num_kv_heads, head_dim) - - if has_fp8_kv_cache: - assert kv_cache.dtype == torch.float8_e4m3fn, f"KV cache should have fp8 dtype, but get {kv_cache.dtype}" - k = k.to(torch.float8_e4m3fn) - v = v.to(torch.float8_e4m3fn) - assert k.dtype == v.dtype == kv_cache.dtype, f"KV cache dtype {kv_cache.dtype} does not match k/v dtype {k.dtype}/{v.dtype}" - - flashinfer.page.append_paged_kv_cache( - append_key=k, - append_value=v, - batch_indices=metadata.batch_indices, - positions=metadata.positions, - paged_kv_cache=kv_cache, - kv_indices=metadata.paged_kv_indices, - kv_indptr=metadata.paged_kv_indptr, - kv_last_page_len=metadata.paged_kv_last_page_len, - kv_layout=metadata.kv_layout) - - num_contexts = metadata.num_contexts - num_generations = metadata.num_generations - num_ctx_tokens = metadata.num_ctx_tokens - - def prefill_forward(plan_params: PlanParams): - wrapper = metadata.get_prefill_wrapper(plan_params) - output = wrapper.run(q[:num_ctx_tokens], kv_cache) - output = output.view(num_ctx_tokens, -1) - return output - - def decode_forward(plan_params: PlanParams): - wrapper = metadata.get_decode_wrapper(plan_params) - output = wrapper.run(q[num_ctx_tokens:], kv_cache) - output = output.view(num_generations, -1) - return output - - # this will do nothing if the last forward pass had the same parameters - plan_params = metadata.plan(num_heads, - num_kv_heads, - head_dim, - q_dtype=q.dtype, - kv_dtype=kv_cache.dtype, - attention_mask_type=attention_mask_type, - attention_mask_data=attention_mask_data) - - if num_contexts > 0: - ctx_output = prefill_forward(plan_params) - - if num_generations > 0: - gen_output = decode_forward(plan_params) - - if num_contexts > 0 and num_generations > 0: - output = torch.cat([ctx_output, gen_output], dim=0) - elif num_contexts > 0: - output = ctx_output - elif num_generations > 0: - output = gen_output - - return output - - @forward_pattern.register_fake - @staticmethod - def _( - q: torch.Tensor, - k: torch.Tensor, - v: torch.Tensor, - num_heads: int, - head_dim: int, - num_kv_heads: int, - layer_idx: int, - has_fp8_kv_cache: bool, - attention_mask_type: int, - attention_mask_data: Optional[torch.Tensor], - ): - return torch.empty_like(q) - def forward(self, q: torch.Tensor, k: Optional[torch.Tensor], @@ -554,7 +438,125 @@ class FlashInferAttention(AttentionBackend[FlashInferAttentionMetadata]): else: raise ValueError("Unexpected attention mask type") - return FlashInferAttention.forward_pattern( - q, k, v, self.num_heads, self.head_dim, self.num_kv_heads, - self.layer_idx, self.has_fp8_kv_cache, attention_mask_type, - attention_mask_data) + return forward_pattern(q, k, v, self.num_heads, self.head_dim, + self.num_kv_heads, self.layer_idx, + self.has_fp8_kv_cache, attention_mask_type, + attention_mask_data) + + +@torch.library.custom_op("trtllm::flashinfer_forward", mutates_args=()) +def forward_pattern( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + num_heads: int, + head_dim: int, + num_kv_heads: int, + layer_idx: int, 
+ has_fp8_kv_cache: bool, + attention_mask_type: int, + attention_mask_data: Optional[torch.Tensor], +) -> torch.Tensor: + ''' + Wrapping the flashinfer forward as a custom op is required to fix `torch.compile` graph breaks, + otherwise it will graph break when calling `metadata.num_contexts` since it convert tensor's sum directly to int. + ''' + # torch.compile does not support custom object as arguments, so we have to use global function to get the metadata. + metadata = get_metadata() + + # This is only for memory estimation for now. + # NOTE: this method is not accurate while it works for most scenario. + if metadata is None or metadata.kv_cache_manager is None: + q = q.view(1, -1, num_heads, head_dim) + k = k.view(1, -1, num_kv_heads, head_dim) + v = v.view(1, -1, num_kv_heads, head_dim) + return dummy_forward(q, k, v) + + assert isinstance( + metadata, + FlashInferAttentionMetadata, + ) + + # Query + q = q.view(-1, num_heads, head_dim) + + # Key and Value + kv_cache = metadata.kv_cache_manager.get_buffers(layer_idx) + + if k is not None and v is not None: + k = k.view(-1, num_kv_heads, head_dim) + v = v.view(-1, num_kv_heads, head_dim) + + if has_fp8_kv_cache: + assert kv_cache.dtype == torch.float8_e4m3fn, f"KV cache should have fp8 dtype, but get {kv_cache.dtype}" + k = k.to(torch.float8_e4m3fn) + v = v.to(torch.float8_e4m3fn) + assert k.dtype == v.dtype == kv_cache.dtype, f"KV cache dtype {kv_cache.dtype} does not match k/v dtype {k.dtype}/{v.dtype}" + + flashinfer.page.append_paged_kv_cache( + append_key=k, + append_value=v, + batch_indices=metadata.batch_indices, + positions=metadata.positions, + paged_kv_cache=kv_cache, + kv_indices=metadata.paged_kv_indices, + kv_indptr=metadata.paged_kv_indptr, + kv_last_page_len=metadata.paged_kv_last_page_len, + kv_layout=metadata.kv_layout) + + num_contexts = metadata.num_contexts + num_generations = metadata.num_generations + num_ctx_tokens = metadata.num_ctx_tokens + + def prefill_forward(plan_params: PlanParams): + wrapper = metadata.get_prefill_wrapper(plan_params) + output = wrapper.run(q[:num_ctx_tokens], kv_cache) + output = output.view(num_ctx_tokens, -1) + return output + + def decode_forward(plan_params: PlanParams): + wrapper = metadata.get_decode_wrapper(plan_params) + output = wrapper.run(q[num_ctx_tokens:], kv_cache) + output = output.view(num_generations, -1) + return output + + # this will do nothing if the last forward pass had the same parameters + plan_params = metadata.plan(num_heads, + num_kv_heads, + head_dim, + q_dtype=q.dtype, + kv_dtype=kv_cache.dtype, + attention_mask_type=attention_mask_type, + attention_mask_data=attention_mask_data) + + if num_contexts > 0: + ctx_output = prefill_forward(plan_params) + + if num_generations > 0: + gen_output = decode_forward(plan_params) + + if num_contexts > 0 and num_generations > 0: + output = torch.cat([ctx_output, gen_output], dim=0) + elif num_contexts > 0: + output = ctx_output + elif num_generations > 0: + output = gen_output + + return output + + +@forward_pattern.register_fake +@staticmethod +def _( + q: torch.Tensor, + k: torch.Tensor, + v: torch.Tensor, + num_heads: int, + head_dim: int, + num_kv_heads: int, + layer_idx: int, + has_fp8_kv_cache: bool, + attention_mask_type: int, + attention_mask_data: Optional[torch.Tensor], +): + return torch.empty_like(q) diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 5e55bc582c..ee18a37e92 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ 
@@ -412,34 +412,35 @@ class AttentionBackend(Generic[TMetadata]):
         """
         raise NotImplementedError
-    @torch.library.custom_op("trtllm::attn_dummy_fwd", mutates_args=())
-    @staticmethod
-    def dummy_forward(q: torch.Tensor, k: torch.Tensor,
-                      v: torch.Tensor) -> torch.Tensor:
-        """
-        Dummy attention forward function to estimate memory usage.
-        Args:
-            q (torch.Tensor): Query tensor with shape (1, num_q_tokens, num_heads, head_dim),.
-            k (torch.Tensor): Key tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim)
-            v (torch.Tensor): Value tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim)
-        Returns:
-            torch.Tensor with shape (num_q_tokens, num_heads * head_dim)
-        """
-        head_dim = q.shape[3]
-        assert q.dim() == 4 and q.size()[0] == 1
-        assert k.dim() == 4 and k.size()[0] == 1 and k.size()[3] == head_dim
-        assert v.dim() == 4 and v.size()[0] == 1 and v.size()[3] == head_dim
-        # This is only for memory estimation for now.
-        # NOTE: this method is not accurate while it works for most scenario.
-        o = _flash_attention_forward(q,
-                                     k,
-                                     v,
-                                     attention_mask=None,
-                                     query_length=q.size(1),
-                                     is_causal=True)
-        return o.reshape(o.size(1), -1)
-    @dummy_forward.register_fake
-    def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
-        num_q_tokens = q.size()[1]
-        return torch.empty_like(q).reshape(num_q_tokens, -1)
+@torch.library.custom_op("trtllm::attn_dummy_fwd", mutates_args=())
+def dummy_forward(q: torch.Tensor, k: torch.Tensor,
+                  v: torch.Tensor) -> torch.Tensor:
+    """
+    Dummy attention forward function to estimate memory usage.
+    Args:
+        q (torch.Tensor): Query tensor with shape (1, num_q_tokens, num_heads, head_dim).
+        k (torch.Tensor): Key tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim).
+        v (torch.Tensor): Value tensor with shape (1, num_new_kv_tokens, num_kv_heads, head_dim).
+    Returns:
+        torch.Tensor with shape (num_q_tokens, num_heads * head_dim).
+    """
+    head_dim = q.shape[3]
+    assert q.dim() == 4 and q.size()[0] == 1
+    assert k.dim() == 4 and k.size()[0] == 1 and k.size()[3] == head_dim
+    assert v.dim() == 4 and v.size()[0] == 1 and v.size()[3] == head_dim
+    # This is only for memory estimation for now.
+    # NOTE: this method is not accurate, but it works for most scenarios.
+    o = _flash_attention_forward(q,
+                                 k,
+                                 v,
+                                 attention_mask=None,
+                                 query_length=q.size(1),
+                                 is_causal=True)
+    return o.reshape(o.size(1), -1)
+
+
+@dummy_forward.register_fake
+def _(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor) -> torch.Tensor:
+    num_q_tokens = q.size()[1]
+    return torch.empty_like(q).reshape(num_q_tokens, -1)
diff --git a/tensorrt_llm/_torch/attention_backend/star_flashinfer.py b/tensorrt_llm/_torch/attention_backend/star_flashinfer.py
index b697a8994c..ae9d17470e 100644
--- a/tensorrt_llm/_torch/attention_backend/star_flashinfer.py
+++ b/tensorrt_llm/_torch/attention_backend/star_flashinfer.py
@@ -4,6 +4,7 @@ import torch
 from ..distributed import allgather
 from ..modules.linear import ParallelConfig
 from .flashinfer import *
+from .interface import dummy_forward
 
 # Please sync with flashinfer's DISPATCH_GQA_GROUP_SIZE in include/flashinfer/utils.cuh
@@ -320,9 +321,7 @@ class StarAttention(AttentionBackend[StarAttentionMetadata]):
 
         # This is only for memory estimation for now.
         # NOTE: this method is not accurate while it works for most scenario.
if metadata is None or metadata.kv_cache_manager is None: - return AttentionBackend.dummy_forward(q.unsqueeze(0), - k.unsqueeze(0), - v.unsqueeze(0)) + return dummy_forward(q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)) num_contexts = metadata.num_contexts num_queries = metadata.num_queries diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py index a7ad5a9879..65a2a98e97 100644 --- a/tensorrt_llm/_torch/attention_backend/trtllm.py +++ b/tensorrt_llm/_torch/attention_backend/trtllm.py @@ -10,6 +10,8 @@ from tensorrt_llm.functional import AttentionMaskType from tensorrt_llm.logger import logger from tensorrt_llm.models.modeling_utils import QuantConfig +from .interface import dummy_forward + @dataclass(kw_only=True, init=False) class TrtllmAttentionWrapper: @@ -432,7 +434,7 @@ class TrtllmAttention(AttentionBackend[TrtllmAttentionMetadata]): q = q.reshape(1, -1, num_heads, head_dim).contiguous() k = k.reshape(1, -1, num_kv_heads, head_dim).contiguous() v = v.reshape(1, -1, num_kv_heads, head_dim).contiguous() - return AttentionBackend.dummy_forward(q, k, v) + return dummy_forward(q, k, v) assert isinstance( metadata, diff --git a/tensorrt_llm/_torch/attention_backend/vanilla.py b/tensorrt_llm/_torch/attention_backend/vanilla.py index c0e0643a44..c343f2e7bd 100644 --- a/tensorrt_llm/_torch/attention_backend/vanilla.py +++ b/tensorrt_llm/_torch/attention_backend/vanilla.py @@ -10,7 +10,7 @@ except ImportError: AttentionMaskConverter = None from .interface import (AttentionBackend, AttentionMask, AttentionMetadata, - PredefinedAttentionMask) + PredefinedAttentionMask, dummy_forward) def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: @@ -160,9 +160,7 @@ class VanillaAttention(AttentionBackend[VanillaAttentionMetadata]): # This is only for memory estimation for now. # NOTE: this method is not accurate while it works for most scenario. if metadata is None or metadata.kv_cache_manager is None: - return AttentionBackend.dummy_forward(q.unsqueeze(0), - k.unsqueeze(0), - v.unsqueeze(0)) + return dummy_forward(q.unsqueeze(0), k.unsqueeze(0), v.unsqueeze(0)) past_seen_tokens = metadata.kv_cache_params.num_cached_tokens_per_seq cache_indices = [ diff --git a/tensorrt_llm/_torch/models/modeling_multimodal_utils.py b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py new file mode 100644 index 0000000000..f6c2ac3499 --- /dev/null +++ b/tensorrt_llm/_torch/models/modeling_multimodal_utils.py @@ -0,0 +1,149 @@ +# Copyright 2024 NVIDIA CORPORATION & AFFILIATES +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+#
+# SPDX-License-Identifier: Apache-2.0
+# This file is based on official VILA: https://github.com/NVlabs/VILA/
+# and s2wrapper: https://github.com/bfshi/scaling_on_scales
+
+import math
+
+import torch
+import torch.nn.functional as F
+from einops import rearrange
+
+# ------------------------------------------------------------------------------------------
+# Original code by Baifeng Shi, licensed under the MIT License:
+# https://github.com/bfshi/scaling_on_scales/blob/master/LICENSE.md
+# ------------------------------------------------------------------------------------------
+
+
+def s2_split_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Divide x into num_split**2 sub-squares and concatenate the sub-squares along the batch dimension.
+    """
+    B, C, H, W = x.shape
+    assert H % num_split == 0 and W % num_split == 0
+    x_split = rearrange(x,
+                        'b c (nh h) (nw w) -> (nh nw b) c h w',
+                        nh=num_split,
+                        nw=num_split)
+    return x_split
+
+
+def s2_merge_chessboard(x, num_split):
+    """
+    x: b * c * h * w
+    Assuming x contains num_split**2 sub-squares concatenated along the batch dimension, merge the sub-squares back into the original whole square
+    (inverse of s2_split_chessboard).
+    """
+    B, C, H, W = x.shape
+    assert B % (num_split**2) == 0
+    x_merge = rearrange(x,
+                        '(nh nw b) c h w -> b c (nh h) (nw w)',
+                        nh=num_split,
+                        nw=num_split)
+
+    return x_merge
+
+
+def s2_batched_forward(model, x, batch_size=-1):
+    if batch_size == -1:
+        return model(x)
+    else:
+        x_batched = x.split(batch_size)
+        outs = [model(x) for x in x_batched]
+        return torch.cat(outs, dim=0)
+
+
+def multiscale_forward(model,
+                       input,
+                       scales=None,
+                       img_sizes=None,
+                       max_split_size=None,
+                       resize_output_to_idx=0,
+                       num_prefix_token=0,
+                       output_shape='bnc',
+                       split_forward=False):
+
+    assert input.dim() == 4, "Input image must be in the shape of BxCxHxW."
+    assert input.shape[2] == input.shape[
+        3], "Currently only square images are supported."
+    assert output_shape in [
+        'bnc', 'bchw'
+    ], "Output shape should be either BxNxC (e.g., ViT) or BxCxHxW (e.g., ConvNet)."
+    assert output_shape == 'bnc' or num_prefix_token == 0, "For ConvNet there shouldn't be any prefix token."
+
+    b, c, input_size, _ = input.shape
+
+    # image size for each scale
+    assert scales is not None or img_sizes is not None, "Please assign either scales or img_sizes."
+    img_sizes = img_sizes or [int(input_size * scale) for scale in scales]
+
+    # prepare multiscale inputs
+    max_split_size = max_split_size or input_size  # The maximum size of each split of the image. 
Set as the input size by default + num_splits = [math.ceil(size / max_split_size) + for size in img_sizes] # number of splits each scale + input_multiscale = [] + for size, num_split in zip(img_sizes, num_splits): + x = F.interpolate(input.to(torch.float32), size=size, + mode='bicubic').to(input.dtype) + x = s2_split_chessboard(x, num_split=num_split) + input_multiscale.append(x) + + # run feedforward on each scale + outs_multiscale = [ + s2_batched_forward(model, x, b) if split_forward else model(x) + for x in input_multiscale + ] + if num_prefix_token > 0: + outs_prefix_multiscale = [ + out[:, :num_prefix_token] for out in outs_multiscale + ] + outs_multiscale = [out[:, num_prefix_token:] for out in outs_multiscale] + if output_shape == 'bnc': + outs_multiscale = [ + rearrange(out, + 'b (h w) c -> b c h w', + h=int(out.shape[1]**0.5), + w=int(out.shape[1]**0.5)) for out in outs_multiscale + ] + + # merge outputs of different splits for each scale separately + outs_multiscale = [ + s2_merge_chessboard(out, num_split=num_split) + for num_split, out in zip(num_splits, outs_multiscale) + ] + + # interpolate outputs from different scales and concat together + output_size = outs_multiscale[resize_output_to_idx].shape[-2] + out = torch.cat([ + F.interpolate(outs_multiscale[i].to(torch.float32), + size=output_size, + mode='area').to(outs_multiscale[i].dtype) + for i in range(len(outs_multiscale)) + ], + dim=1) + if output_shape == 'bnc': + out = rearrange(out, 'b c h w -> b (h w) c') + if num_prefix_token > 0: + # take the mean of prefix tokens from different splits for each scale + outs_prefix_multiscale = [ + torch.stack(out.split(b, dim=0), dim=0).mean(dim=0) + for out in outs_prefix_multiscale + ] + out_prefix_multiscale = torch.cat(outs_prefix_multiscale, dim=-1) + out = torch.cat([out_prefix_multiscale, out], dim=1) + + return out diff --git a/tensorrt_llm/_torch/models/modeling_vit.py b/tensorrt_llm/_torch/models/modeling_vit.py index eee67edcfd..c6b634eeab 100644 --- a/tensorrt_llm/_torch/models/modeling_vit.py +++ b/tensorrt_llm/_torch/models/modeling_vit.py @@ -19,12 +19,13 @@ import torch import torch.nn as nn from accelerate.hooks import add_hook_to_module -from s2wrapper import forward as multiscale_forward from transformers import (CLIPImageProcessor, CLIPVisionModel, PretrainedConfig, PreTrainedModel) from transformers.image_processing_utils import BaseImageProcessor from transformers.integrations.deepspeed import is_deepspeed_zero3_enabled +from .modeling_multimodal_utils import multiscale_forward + class VisionTower(nn.Module): diff --git a/tensorrt_llm/llmapi/_perf_evaluator.py b/tensorrt_llm/llmapi/_perf_evaluator.py index aa048524ac..ddb341204a 100644 --- a/tensorrt_llm/llmapi/_perf_evaluator.py +++ b/tensorrt_llm/llmapi/_perf_evaluator.py @@ -366,8 +366,8 @@ class LLMPerfEvaluator: sample = self.samples[sample_offset] sample_offset += 1 sampling_params.max_tokens = sample.output_len - sampling_params.end_id = -2 - sampling_params.pad_id = -2 + sampling_params.end_id = -1 + sampling_params.pad_id = -1 if self.sampling_extra_params is not None: for key, value in self.sampling_extra_params.items(): setattr(sampling_params, key, value) diff --git a/tensorrt_llm/runtime/multimodal_model_runner.py b/tensorrt_llm/runtime/multimodal_model_runner.py index 7723520395..37f965ac4a 100644 --- a/tensorrt_llm/runtime/multimodal_model_runner.py +++ b/tensorrt_llm/runtime/multimodal_model_runner.py @@ -643,12 +643,18 @@ class MultimodalModelRunner: other_vision_inputs): # same prompt for 
single/multiple image(s) n_prompts_n_images = False - if isinstance( - post_prompt, - list) and len(post_prompt) > 1 and image is not None and len( - post_prompt) == image.shape[0]: - # n prompts and n images - n_prompts_n_images = True + if isinstance(post_prompt, + list) and len(post_prompt) > 1 and image is not None: + if hasattr(image, "pixel_values"): + if len(post_prompt) == image["pixel_values"].shape[0]: + n_prompts_n_images = True + # n prompts and n images + else: + if isinstance( + image, + torch.Tensor) and len(post_prompt) == image.shape[0]: + n_prompts_n_images = True + # n prompts and n images if self.model_type == 'kosmos-2': input_ids = image['input_ids'].clone() @@ -867,7 +873,14 @@ class MultimodalModelRunner: 1] + visual_atts.shape[1] else: post_input_ids = None - length = pre_input_ids.shape[1] + visual_atts.shape[1] + assert pre_input_ids.shape[0] == visual_atts.shape[0] + if visual_atts.shape[0] == 1: + length = pre_input_ids.shape[1] + visual_atts.shape[1] + else: + length = [ + pre_input_ids.shape[1] + visual_atts.shape[1] + for _ in range(visual_atts.shape[0]) + ] if n_prompts_n_images: if isinstance(length, int): length = [length] diff --git a/tensorrt_llm/version.py b/tensorrt_llm/version.py index 7c13fa986e..2b89953871 100644 --- a/tensorrt_llm/version.py +++ b/tensorrt_llm/version.py @@ -12,4 +12,4 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "0.17.0.post1" +__version__ = "0.18.0" diff --git a/tests/_torch/test_fp4_gemm_quantize.py b/tests/_torch/test_fp4_gemm_quantize.py index 58d6210bd9..ac47c58e26 100644 --- a/tests/_torch/test_fp4_gemm_quantize.py +++ b/tests/_torch/test_fp4_gemm_quantize.py @@ -17,6 +17,7 @@ import os import sys import unittest +import pytest import torch from parameterized import parameterized @@ -56,6 +57,7 @@ class TestFunctional(unittest.TestCase): ) @skip_pre_blackwell_unittest def test_fp4_quantize_gemm_torch(self, m, n, k): + pytest.skip("https://nvbugs/5100633") a = torch.randn([m, k], dtype=torch.float32) b = torch.randn([n, k], dtype=torch.float32) a_global_sf = (448 * 6) / a.abs().max().float() diff --git a/tests/llmapi/test_llm_models.py b/tests/llmapi/test_llm_models.py index bc63cfda9a..559b365625 100644 --- a/tests/llmapi/test_llm_models.py +++ b/tests/llmapi/test_llm_models.py @@ -37,7 +37,7 @@ gpt_neox_20b_model_path = get_model_path('gpt-neox-20b') commandr_v01_model_path = get_model_path('c4ai-command-r-v01') commandr_plus_model_path = get_model_path('c4ai-command-r-plus') deepseek_v1_model_path = get_model_path("deepseek-moe-16b-base") -sampling_params = SamplingParams(max_tokens=10) +sampling_params = SamplingParams(max_tokens=10, end_id=-1) @force_ampere