Update TensorRT-LLM (#2253)

* Update TensorRT-LLM

---------

Co-authored-by: Ivan Sorokin <isorokin@nvidia.com>
Co-authored-by: lkm2835 <lkm2835@gmail.com>
Kaiyu Xie 2024-09-24 23:27:31 +08:00 committed by GitHub
parent a65dba7aaf
commit e153372759
98 changed files with 1719 additions and 512 deletions

View File

@ -17,6 +17,15 @@ TensorRT-LLM
<div align="left">
## Latest News
* [2024/09/17] ✨ NVIDIA TensorRT-LLM Meetup
[➡️ link](https://drive.google.com/file/d/1RR8GqC-QbuaKuHj82rZcXb3MS20SWo6F/view?usp=share_link)
* [2024/09/17] ✨ Accelerating LLM Inference at Databricks with TensorRT-LLM
[➡️ link](https://drive.google.com/file/d/1NeSmrLaWRJAY1rxD9lJmzpB9rzr38j8j/view?usp=sharing)
* [2024/09/17] ✨ TensorRT-LLM @ Baseten
[➡️ link](https://drive.google.com/file/d/1Y7L2jqW-aRmt31mCdqhwvGMmCSOzBUjG/view?usp=share_link)
* [2024/09/04] 🏎️🏎️🏎️ Best Practices for Tuning TensorRT-LLM for Optimal Serving with BentoML
[➡️ link](https://www.bentoml.com/blog/tuning-tensor-rt-llm-for-optimal-serving-with-bentoml)
@ -46,6 +55,9 @@ TensorRT-LLM
* [2024/07/02] Let the @MistralAI MoE tokens fly 📈 🚀 #Mixtral 8x7B with NVIDIA #TensorRT #LLM on #H100.
[➡️ Tech blog](https://developer.nvidia.com/blog/achieving-high-mixtral-8x7b-performance-with-nvidia-h100-tensor-core-gpus-and-tensorrt-llm?ncid=so-twit-928467)
<details close>
<summary>Previous News</summary>
* [2024/06/24] Enhanced with NVIDIA #TensorRT #LLM, @upstage.ai's solar-10.7B-instruct is ready to power your developer projects through our API catalog 🏎️. ✨[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
* [2024/06/18] CYMI: 🤩 Stable Diffusion 3 dropped last week 🎊 🏎️ Speed up your SD3 with #TensorRT INT8 Quantization[➡️ link](https://build.nvidia.com/upstage/solar-10_7b-instruct?snippet_tab=Try )
@ -58,10 +70,6 @@ Technical Deep Dive for serious coders ✅+99% compression ✅1 set of weights
* [2024/06/04] ✨ #TensorRT and GeForce #RTX unlock ComfyUI SD superhero powers 🦸⚡ 🎥 Demo: [➡️ link](https://youtu.be/64QEVfbPHyg)
📗 DIY notebook: [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&name=ComfyUI_TensorRT&instance=L4%40g2-standard-4%3Anvidia-l4%3A1&diskStorage=500&cloudID=GCP&baseImage=docker.io%2Fpytorch%2Fpytorch%3A2.2.0-cuda12.1-cudnn8-runtime&ports=ComfUI%3A8188&file=https%3A%2F%2Fgithub.com%2Fbrevdev%2Fnotebooks%2Fblob%2Fmain%2Ftensorrt-comfyui.ipynb&launchableID=env-2hQX3n7ae5mq3NjNZ32DfAG0tJf)
<details close>
<summary>Previous News</summary>
* [2024/05/28] ✨#TensorRT weight stripping for ResNet-50 ✨ ✅+99% compression
✅1 set of weights → ** GPUs\ ✅0 performance loss ✅** models…LLM, CNN, etc
👀 📚 DIY [➡️ link](https://console.brev.dev/launchable/deploy?userID=2x2sil999&orgID=ktj33l4xj&launchableID=env-2h6bym7h5GFNho3vpWQQeUYMwTM&instance=L4%40g6.xlarge&diskStorage=500&cloudID=devplane-brev-1&baseImage=nvcr.io%2Fnvidia%2Ftensorrt%3A24.05-py3&file=https%3A%2F%2Fgithub.com%2FNVIDIA%2FTensorRT%2Fblob%2Frelease%2F10.0%2Fsamples%2Fpython%2Fsample_weight_stripping%2Fnotebooks%2Fweight_stripping.ipynb&name=tensorrt_weight_stripping_resnet50)
@ -71,10 +79,8 @@ Serverless TensorRT-LLM (LLaMA 3 8B) | Modal Docs [➡️ link](https://modal.co
* [2024/05/08] NVIDIA TensorRT Model Optimizer -- the newest member of the #TensorRT ecosystem is a library of post-training and training-in-the-loop model optimization techniques ✅quantization ✅sparsity ✅QAT [➡️ blog](https://developer.nvidia.com/blog/accelerate-generative-ai-inference-performance-with-nvidia-tensorrt-model-optimizer-now-publicly-available/)
* [2024/05/07] 🦙🦙🦙 24,000 tokens per second 🛫Meta Llama 3 takes off with #TensorRT #LLM 📚[➡️ link](https://blogs.nvidia.com/blog/meta-llama3-inference-acceleration/)
* [2024/02/06] [🚀 Speed up inference with SOTA quantization techniques in TRT-LLM](./docs/source/blogs/quantization-in-TRT-LLM.md)
* [2024/01/30] [ New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget](./docs/source/blogs/XQA-kernel.md)
* [2023/12/04] [Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100](./docs/source/blogs/Falcon180B-H200.md)

View File

@ -40,24 +40,21 @@ namespace tensorrt_llm::batch_manager
* @brief The state of the request.
*
* Enum order must follow chronological order for state dependency check, @see hasReachedState().
*
* @todo(rkobus): refactor
*/
enum LlmRequestState_t
enum class LlmRequestState : int32_t
{
REQUEST_STATE_UNKNOWN = 0, ///< Unknown state
REQUEST_STATE_ENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models)
REQUEST_STATE_CONTEXT_INIT = 2, ///< Context phase starts
REQUEST_STATE_GENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
REQUEST_STATE_GENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
REQUEST_STATE_GENERATION_COMPLETE = 5, ///< Generation phase completed
REQUEST_STATE_DISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only:
/// new Generation request arrived at generation model
REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only:
/// Waiting context-only request transmitting the kv cache
REQUEST_STATE_DISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission.
REQUEST_STATE_DISAGG_GENERATION_TRANS_IN_PROGRESS
= 9, ///< For disaggregated serving only: transmitting the kv cache
kUNKNOWN = 0, ///< Unknown state
kENCODER_INIT = 1, ///< Encoder phase starts (for encoder-decoder models)
kCONTEXT_INIT = 2, ///< Context phase starts
kGENERATION_IN_PROGRESS = 3, ///< Generation phase is in progress
kGENERATION_TO_COMPLETE = 4, ///< Generation phase is to be completed
kGENERATION_COMPLETE = 5, ///< Generation phase completed
kDISAGG_GENERATION_INIT = 6, ///< For disaggregated serving only:
/// new Generation request arrived at generation model
kDISAGG_CONTEXT_TRANS_IN_PROGRESS = 7, ///< For disaggregated serving only:
/// Waiting context-only request transmitting the kv cache
kDISAGG_CONTEXT_COMPLETE = 8, ///< Context-only request finished kv cache transmission.
kDISAGG_GENERATION_TRANS_IN_PROGRESS = 9, ///< For disaggregated serving only: transmitting the kv cache
};
enum LlmRequestType
@ -115,7 +112,7 @@ public:
, mPromptLen(inputTokens->size())
, mMaxNewTokens(maxNewTokens)
, mSamplingConfig(samplingConfig)
, mState(REQUEST_STATE_CONTEXT_INIT)
, mState(LlmRequestState::kCONTEXT_INIT)
, mEndId(endId)
, mPadId(padId)
, mLogitsPostProcessor(logitsPostProcessor)
@ -160,7 +157,7 @@ public:
{
if (mEncoderTokens.has_value() || encoderInputFeatures.has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
mState = LlmRequestState::kENCODER_INIT;
}
initialize(*inputTokens, returnLogProbs);
@ -171,7 +168,7 @@ public:
, mPromptLen(req.getInputTokenIds().size())
, mMaxNewTokens(req.getMaxTokens())
, mSamplingConfig(req.getSamplingConfig(), req.getExternalDraftTokensConfig())
, mState(REQUEST_STATE_CONTEXT_INIT)
, mState(LlmRequestState::kCONTEXT_INIT)
, mEndId(req.getEndId())
, mPadId(req.getPadId())
, mClientId(req.getClientId())
@ -213,7 +210,7 @@ public:
{
if (req.getRequestType() == executor::RequestType::REQUEST_TYPE_GENERATION_ONLY)
{
mState = REQUEST_STATE_DISAGG_GENERATION_INIT;
mState = LlmRequestState::kDISAGG_GENERATION_INIT;
}
if (mIsStreaming && mSamplingConfig.beamWidth > 1 && !mReturnAllGeneratedTokens)
{
@ -237,7 +234,7 @@ public:
if (req.getEncoderInputTokenIds().has_value() || req.getEncoderInputFeatures().has_value())
{
mState = REQUEST_STATE_ENCODER_INIT;
mState = LlmRequestState::kENCODER_INIT;
if (req.getEncoderInputTokenIds().has_value())
{
mEncoderTokens = std::make_shared<VecTokens>(req.getEncoderInputTokenIds().value());
@ -716,8 +713,8 @@ public:
}
// for enc-dec models, pause means saving generated tokens to prompt but need to re-do encoder phase
mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? REQUEST_STATE_ENCODER_INIT
: REQUEST_STATE_CONTEXT_INIT;
mState = mEncoderTokens.has_value() || mEncoderInputFeatures ? LlmRequestState::kENCODER_INIT
: LlmRequestState::kCONTEXT_INIT;
mContextCurrentPosition = 0;
mContextChunkSize = std::nullopt;
mSeqSlot.reset();
@ -1101,44 +1098,44 @@ public:
mGenerationLogitsFragments.clear();
}
[[nodiscard]] bool hasReachedState(LlmRequestState_t state) const noexcept
[[nodiscard]] bool hasReachedState(LlmRequestState state) const noexcept
{
return mState >= state;
}
[[nodiscard]] bool isEncoderInitState() const noexcept
{
return mState == REQUEST_STATE_ENCODER_INIT;
return mState == LlmRequestState::kENCODER_INIT;
}
[[nodiscard]] bool isContextInitState() const noexcept
{
return mState == REQUEST_STATE_CONTEXT_INIT;
return mState == LlmRequestState::kCONTEXT_INIT;
}
[[nodiscard]] bool isGenerationInProgressState() const noexcept
{
return mState == REQUEST_STATE_GENERATION_IN_PROGRESS || mState == REQUEST_STATE_GENERATION_TO_COMPLETE;
return mState == LlmRequestState::kGENERATION_IN_PROGRESS || mState == LlmRequestState::kGENERATION_TO_COMPLETE;
}
[[nodiscard]] bool isGenerationCompleteState() const noexcept
{
return mState == REQUEST_STATE_GENERATION_COMPLETE;
return mState == LlmRequestState::kGENERATION_COMPLETE;
}
[[nodiscard]] bool isDisaggGenerationInitState() const noexcept
{
return mState == REQUEST_STATE_DISAGG_GENERATION_INIT;
return mState == LlmRequestState::kDISAGG_GENERATION_INIT;
}
[[nodiscard]] bool isDisaggContextTransmissionState() const noexcept
{
return mState == REQUEST_STATE_DISAGG_CONTEXT_TRANS_IN_PROGRESS;
return mState == LlmRequestState::kDISAGG_CONTEXT_TRANS_IN_PROGRESS;
}
[[nodiscard]] bool isDisaggContextCompleteState() const noexcept
{
return mState == REQUEST_STATE_DISAGG_CONTEXT_COMPLETE;
return mState == LlmRequestState::kDISAGG_CONTEXT_COMPLETE;
}
/// To determine whether the context is unchunked. When a context is chunked into only a part, it
@ -1252,7 +1249,7 @@ public:
std::optional<executor::Response> createResponse()
{
TLLM_CHECK(!isDisaggContextCompleteState());
if (isGenerationCompleteState() || (mIsStreaming && isGenerationInProgressState())
if (isGenerationCompleteState() || (mIsStreaming && mState == LlmRequestState::kGENERATION_IN_PROGRESS)
|| isDisaggContextTransmissionState())
{
TLLM_LOG_DEBUG("Creating response for request %lu", mRequestId);
@ -1400,7 +1397,7 @@ public:
SizeType32 mMaxNewTokens;
// Tokens [beam_size, mPromptLen + getMaxNumGeneratedTokens()]
runtime::SamplingConfig mSamplingConfig;
LlmRequestState_t mState;
LlmRequestState mState;
std::optional<TokenIdType> mEndId;
std::optional<TokenIdType> mPadId;
std::optional<SizeType32> mSeqSlot;
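
As an aside, here is a self-contained sketch (not part of this commit) of why the doc comment above insists on chronological enumerator order: the `>=` comparison used by `hasReachedState` only answers "has the request at least reached this phase?" when later phases map to larger values. The enum below mirrors a subset of the enumerators above purely for illustration.

```cpp
// Standalone illustration; mirrors a subset of LlmRequestState shown above.
#include <cstdint>

enum class LlmRequestState : std::int32_t
{
    kUNKNOWN = 0,
    kCONTEXT_INIT = 2,
    kGENERATION_IN_PROGRESS = 3,
    kGENERATION_COMPLETE = 5,
};

constexpr bool hasReachedState(LlmRequestState current, LlmRequestState target)
{
    return current >= target; // same check as the hasReachedState() member above
}

static_assert(hasReachedState(LlmRequestState::kGENERATION_IN_PROGRESS, LlmRequestState::kCONTEXT_INIT),
    "a generating request has already passed through the context phase");
static_assert(!hasReachedState(LlmRequestState::kCONTEXT_INIT, LlmRequestState::kGENERATION_COMPLETE),
    "a request still in context init has not completed generation");
```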

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:679f0879be2232dc93113c5d96b128628fa2f518cc7aebffd12cbd6b06d68573
size 4667768
oid sha256:e08b60b89bb4934490ee61383c55c22d831fa1cfcccedea5735400e3574aadbc
size 4671466

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:6dd4a4c8600f6da076c9c60047043c23a2d020528d833572f3c2a0fcfce8cf12
size 4772870
oid sha256:2b6b3bf449c4b4d67f0bb9879af6b8eda6f46f272eaa5b7305582a2cc8c73e17
size 4775694

View File

@ -1,3 +1,3 @@
8a04a7d0057b71b63a9c6e4f33cc30e7 libtensorrt_llm_batch_manager_static.a
915451635c4e57cd8fd49a6dedb22ab2 libtensorrt_llm_batch_manager_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
f229593e4699180b52e38f99c8ac31dc libtensorrt_llm_batch_manager_static.a
440b3ae47982d88fc8517c5f01f67b3c libtensorrt_llm_batch_manager_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0d63dbd6150c592cea517b6490cc69a5d60aede23362b885300de4c0b248ba50
size 4519402
oid sha256:1a71c70d555673ce9a5086c27cbd27642f940d2439effb72a75f1302725a2513
size 4522988

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4be8b156080d2bb8b3f80f1ecd02a01b9dfad6f8044ba1f87a416c4d8e7dd1f3
size 4483702
oid sha256:93c436037b41d06d735acfbf065ccef4ea50085052920cb6a54fb3f84c59fb12
size 4486958

View File

@ -1,3 +1,3 @@
a46c69375658ab41016ef6e7c4744135 libtensorrt_llm_batch_manager_static.a
eb86de29ef2413010975fed7106356b7 libtensorrt_llm_batch_manager_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
f402983564a358853384650313301b32 libtensorrt_llm_batch_manager_static.a
44558f89a86de69191f18a2704cff505 libtensorrt_llm_batch_manager_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:4a356ece970fe47f6e41dbe1c98b551b5623341eb209294e27fd932cf12b0ee0
size 28212158
oid sha256:1f8f3d6e22edead45c5bde864b541311a4b9a28f1916cd7b5bbf1292746c06c5
size 28211626

View File

@ -1,2 +1,2 @@
51e88c8d94071a4dc24f8eea43bf8c97 tensorrt_llm_batch_manager_static.lib
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
7b8f52e42d11765115c185d7d13e40a3 tensorrt_llm_batch_manager_static.lib
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -16,6 +16,7 @@
#include <algorithm>
#include <numeric>
#include <unordered_set>
#include "tensorrt_llm/common/mpiUtils.h"
@ -127,7 +128,6 @@ std::vector<int> getWorldRanks(MpiComm const& comm)
MPICHECK(MPI_Group_translate_ranks(group, groupSize, ranks.data(), worldGroup, worldRanks.data()));
MPICHECK(MPI_Group_free(&group));
MPICHECK(MPI_Group_free(&worldGroup));
std::sort(worldRanks.begin(), worldRanks.end());
#else
std::vector<int> worldRanks{0};
#endif
@ -391,31 +391,30 @@ MpiComm& MpiComm::mutableLocalSession()
void MpiComm::refreshLocalSession()
{
#if ENABLE_MULTI_DEVICE
static std::vector<int> initSessionRanks;
static std::mutex mutex;
std::unique_lock lock(mutex);
if (initSessionRanks.empty())
{
auto initSessionRanks = getWorldRanks(MpiComm::session());
auto localSessionRanks = getWorldRanks(MpiComm::localSession());
std::vector<int> intersectionRanks;
std::set_intersection(initSessionRanks.begin(), initSessionRanks.end(), localSessionRanks.begin(),
localSessionRanks.end(), std::back_inserter(intersectionRanks));
auto initSessionRanks = getWorldRanks(MpiComm::session());
auto localSessionRanks = getWorldRanks(MpiComm::localSession());
MPI_Group worldGroup;
MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
MPI_Group localGroup;
MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup));
MPI_Comm localComm;
MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm));
MpiComm::mutableLocalSession().mFreeComm = true;
MpiComm::mutableLocalSession() = MpiComm{localComm, false};
}
else
// Add to intersectionRanks in order of initSessionRanks
std::vector<int> intersectionRanks;
std::unordered_set<int> localSessionRanksSet(localSessionRanks.begin(), localSessionRanks.end());
for (auto rank : initSessionRanks)
{
TLLM_CHECK_WITH_INFO(getWorldRanks(MpiComm::session()) == initSessionRanks,
"Executors in the same process must use the same participant IDs.");
if (localSessionRanksSet.find(rank) != localSessionRanksSet.end())
{
intersectionRanks.push_back(rank);
}
}
MPI_Group worldGroup;
MPICHECK(MPI_Comm_group(MPI_COMM_WORLD, &worldGroup));
MPI_Group localGroup;
MPICHECK(MPI_Group_incl(worldGroup, intersectionRanks.size(), intersectionRanks.data(), &localGroup));
MPI_Comm localComm;
MPICHECK(MPI_Comm_create_group(MPI_COMM_WORLD, localGroup, intersectionRanks.front(), &localComm));
MpiComm::mutableLocalSession().mFreeComm = true;
MpiComm::mutableLocalSession() = MpiComm{localComm, false};
TLLM_LOG_INFO("Refreshed the MPI local session");
#endif // ENABLE_MULTI_DEVICE
}
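
As an aside, here is a standalone sketch (not TensorRT-LLM code) of the intersection strategy introduced above: unlike `std::set_intersection`, which expects sorted ranges and yields a sorted result, the new loop keeps ranks in the order of the initial session, so `intersectionRanks.front()` stays the first session rank rather than the numerically smallest one.

```cpp
// Standalone illustration of the order-preserving intersection used in refreshLocalSession():
// keep every rank of sessionRanks that also appears in localRanks, preserving sessionRanks order.
#include <unordered_set>
#include <vector>

std::vector<int> orderedIntersection(std::vector<int> const& sessionRanks, std::vector<int> const& localRanks)
{
    std::unordered_set<int> localSet(localRanks.begin(), localRanks.end());
    std::vector<int> intersection;
    for (int rank : sessionRanks)
    {
        if (localSet.count(rank) != 0)
        {
            intersection.push_back(rank); // the first element becomes the root of the new communicator
        }
    }
    return intersection;
}
```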

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:674d432118c54977329079a06882c84d0d25b764759dfbbaaff0c7bc666eef57
size 1741228
oid sha256:954f77299e1d61a038c90bc578936ec06da842909ace2e8ba978fd0c0da0cc1f
size 1782460

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9c5fed43853462c21cddc12ff3439c410f413c24fd5c970f9a315265df1bb932
size 1768982
oid sha256:4ac77ca2662830a5990dbd06ee4d664f8ac97dc342206f5c51ca9f9ca6cb6ce1
size 1808956

View File

@ -1,3 +1,3 @@
0171ced884334f5d26ff73a701cf5343 libtensorrt_llm_executor_static.a
afc62322817744d37b422e870f202c23 libtensorrt_llm_executor_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
425c9af7b0ae82e622771fc3ef7e3f01 libtensorrt_llm_executor_static.a
efa5708f62775822591ebd50974ccfd8 libtensorrt_llm_executor_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2faea1a5e1ef32e99fe8570650bc1b300cea4c5c5b1a21652005685b45cf26d7
size 1807914
oid sha256:d74551056bd413556a9485bdb7b084e14264e78f12ae6037472878371f2b3b62
size 1846866

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:192efad1021271bbdb03d1bb82699f761f0f463a3d85cb4d4a8421685fb9a3e4
size 1717616
oid sha256:393724263d6f08b831b8d8b56a3e0677f51df80183ba0d4b1fa7f40c2f8611ca
size 1757514

View File

@ -1,3 +1,3 @@
789c268816a73f2723a9d3d85e02987e libtensorrt_llm_executor_static.a
ca0412898bdc79b3508c68394dd5e3ea libtensorrt_llm_executor_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
12fd54cc3b5ec8462a9db96d646ea578 libtensorrt_llm_executor_static.a
75fcebc1eae90d05bd3e2d3321a50041 libtensorrt_llm_executor_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f7e9b154c122fedc45327e35b7f46aacb4cd43784f7b6273ee4207c8e2639169
size 19234764
oid sha256:98dcfc32cb6a6fcbd84625f0232f9f6d8305b38a7e8380b7e228be5f820c0dd4
size 19615228

View File

@ -1,2 +1,2 @@
a5b2ac5b4fcc5dfb46b97d0e74fd88b6 tensorrt_llm_executor_static.lib
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
422bcd24325bf7ec0b26a9e4a23cce63 tensorrt_llm_executor_static.lib
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,2 +1,2 @@
88c30973b9b3452baa3f063d34d08169 libtensorrt_llm_nvrtc_wrapper.so
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,2 +1,2 @@
95e9f87610383348e444d2d0b8396f2d libtensorrt_llm_nvrtc_wrapper.so
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a16c9f1c7c85ab6302be0401c370ddcfb2b5a493cf23348ee8c665ca0af50593
oid sha256:a9a2ccc0462e815aae0e7fd458c0423f76b3c0bb63ecc7a8902b94194803c4bc
size 1128448

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e207a8f57b944529163c7ed2ab30639a5f2779c5118602c6ebd50a623d16f845
oid sha256:e74ab8e65851dfc44e015714fe166f521649b781c85bd0215d42b488218e9ca5
size 3488

View File

@ -1,3 +1,3 @@
b7e624ba775e9f5090ef4b67bcdbd7a2 tensorrt_llm_nvrtc_wrapper.lib
3e1e3245888be6fd9b7d5934fb2a7718 tensorrt_llm_nvrtc_wrapper.dll
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
700fc148d9a0f939e0088bf69e899360 tensorrt_llm_nvrtc_wrapper.lib
de95527e8d24da458cf9e3c30b21abea tensorrt_llm_nvrtc_wrapper.dll
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:82d12e1445abc726a172d3383a2d7759f8a4309656b84d767a98aba7e2876e2c
oid sha256:e29b1c9c454f90b68feffb65f270bba757a649c4cfa26134d21a5c71ecee9d17
size 25364090

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dbb231f365dc916557d7c2407008072f4bcd8a54c1196c2c4b08b0cbb91fe1d4
oid sha256:616fd0c5db05f9ba95fec4180907a890e470a42fe0d361a212b161c915d61a7b
size 25768990

View File

@ -1,3 +1,3 @@
7379ff8f89a0af16364f4440cbcf53bd libtensorrt_llm_internal_cutlass_kernels_static.a
014c7b9c179959a57faaa2398b63082b libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
dc54f44211570918194805b138a0c5eb libtensorrt_llm_internal_cutlass_kernels_static.a
7b74b643b98590d9540d62e22764a45d libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:03fff62938a4aea11bbd9a3a01d06e1be1eb07333b56084a2523fe3aa771e653
oid sha256:04908e6011b2cda19555da0ca257e0f562719a32da76ba288cd35ae17b19c762
size 44173632

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b70d2553c65a07f5b5ad0bcecd91c9970e84bf8adcfd70ee13316a8a25787e3e
oid sha256:de44f2b89ef65a7016bd074d6a69610f81752e1cebd6e86b2314c4348cc48cc9
size 43561142

View File

@ -1,3 +1,3 @@
6c7dbbe475d18ef707233096bca3ffcd libtensorrt_llm_internal_cutlass_kernels_static.a
02ea1e93d2dbd74f7c89289256ee7d95 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
52fd02865207f1e48afb7e2090c750c3 libtensorrt_llm_internal_cutlass_kernels_static.a
6d1e31066fce1e3c70ec0c56c4b0abb5 libtensorrt_llm_internal_cutlass_kernels_static.pre_cxx11.a
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b59abe44ebe9a79100bed9318941895cd09cdcbebea00b06d6ee29cc177368f9
size 88130432
oid sha256:04f935b72c3af6c2081f19b94216f0bbcecae37511d9958ff6c62304aa022cab
size 88141376

View File

@ -1,2 +1,2 @@
bfe49c735d179edaa07fd0833eab824d tensorrt_llm_internal_cutlass_kernels_static.lib
bb95842cfaecdd3437b38fdb5c45b7d523bc5d36 commit
1ce107485d9c6133e135f4dbb0a5c4dc tensorrt_llm_internal_cutlass_kernels_static.lib
0144f02d9054532ac5b39f5accd89e607f5966a2 commit

View File

@ -121,14 +121,20 @@ private:
class DebugTensor
{
public:
DebugTensor(runtime::ITensor const& tensor, char const* name)
DebugTensor(runtime::ITensor const& tensor, char const* name,
std::shared_ptr<runtime::BufferManager> bufferManager = nullptr,
std::shared_ptr<runtime::CudaStream> stream = nullptr)
: mTensor(tensor)
, mName(name)
, mBufferManager(bufferManager)
, mStream(stream)
{
}
DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name)
: DebugTensor(*tensor, name)
DebugTensor(runtime::ITensor::SharedConstPtr tensor, char const* name,
std::shared_ptr<runtime::BufferManager> bufferManager = nullptr,
std::shared_ptr<runtime::CudaStream> stream = nullptr)
: DebugTensor(*tensor, name, bufferManager, stream)
{
}
@ -187,9 +193,11 @@ public:
runtime::BufferManager::ITensorPtr hostPtr{nullptr};
if (mTensor.getMemoryType() == runtime::MemoryType::kGPU)
{
runtime::BufferManager manager{std::make_shared<runtime::CudaStream>()};
hostPtr = manager.copyFrom(mTensor, runtime::MemoryType::kCPU);
manager.getStream().synchronize();
auto theManager = mBufferManager
? mBufferManager
: std::make_shared<runtime::BufferManager>(mStream ? mStream : std::make_shared<runtime::CudaStream>());
hostPtr = theManager->copyFrom(mTensor, runtime::MemoryType::kCPU);
theManager->getStream().synchronize();
}
return hostPtr;
}
@ -343,12 +351,80 @@ public:
TLLM_LOG_DEBUG(shape());
}
template <typename T>
void randomize(runtime::SizeType32 vtype)
{
runtime::BufferRange<T> tensorRange(const_cast<runtime::ITensor&>(mTensor));
for (auto& item : tensorRange)
{
item = vtype == 0 ? 0 : vtype == 1 ? 1 : rand();
}
}
void randomize(void)
{
if (mTensor.getMemoryType() == runtime::MemoryType::kGPU)
{
runtime::ITensor& nonConstTensor = const_cast<runtime::ITensor&>(mTensor);
runtime::BufferManager manager{std::make_shared<runtime::CudaStream>()};
runtime::ITensor::SharedConstPtr cpuBuffer = manager.cpu(mTensor.getShape(), mTensor.getDataType());
DebugTensor(cpuBuffer, "cpuBuffer").randomize();
manager.copy(*cpuBuffer, nonConstTensor);
manager.getStream().synchronize();
}
else
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return randomize<bool>(3);
case nvinfer1::DataType::kFLOAT: return randomize<float>(3);
case nvinfer1::DataType::kINT8: return randomize<std::int8_t>(3);
case nvinfer1::DataType::kINT32: return randomize<std::int32_t>(3);
case nvinfer1::DataType::kINT64: return randomize<std::int64_t>(3);
case nvinfer1::DataType::kUINT8: return randomize<std::uint8_t>(3);
default: return;
}
}
}
void setZeros(void)
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return randomize<bool>(0);
case nvinfer1::DataType::kFLOAT: return randomize<float>(0);
case nvinfer1::DataType::kINT8: return randomize<std::int8_t>(0);
case nvinfer1::DataType::kINT32: return randomize<std::int32_t>(0);
case nvinfer1::DataType::kINT64: return randomize<std::int64_t>(0);
case nvinfer1::DataType::kUINT8: return randomize<std::uint8_t>(0);
default: return;
}
}
void setOnes(void)
{
switch (mTensor.getDataType())
{
case nvinfer1::DataType::kBOOL: return randomize<bool>(1);
case nvinfer1::DataType::kFLOAT: return randomize<float>(1);
case nvinfer1::DataType::kINT8: return randomize<std::int8_t>(1);
case nvinfer1::DataType::kINT32: return randomize<std::int32_t>(1);
case nvinfer1::DataType::kINT64: return randomize<std::int64_t>(1);
case nvinfer1::DataType::kUINT8: return randomize<std::uint8_t>(1);
default: return;
}
}
private:
runtime::ITensor const& mTensor;
std::string mName;
std::shared_ptr<runtime::BufferManager> mBufferManager;
std::shared_ptr<runtime::CudaStream> mStream;
};
#define D(x) tensorrt_llm::layers::DebugTensor(x, #x)
#define Db(x, bufferManager) tensorrt_llm::layers::DebugTensor(x, #x, bufferManager, nullptr)
#define Ds(x, stream) tensorrt_llm::layers::DebugTensor(x, #x, nullptr, stream)
#define PRINT_TOKENS(x) D(x).print_tokens()
#define PRINT_VALUES(x) D(x).print_values()
#define PRINT_SHAPE(x) D(x).print_shape()
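
For reference, a hedged usage sketch (not part of the commit) of the extended debug helpers; it assumes the TensorRT-LLM header defining `DebugTensor` and these macros is already included, and `logits` is just an illustrative name for a GPU-resident tensor.

```cpp
// Illustrative only (not part of this commit). Assumes the TensorRT-LLM header that
// defines tensorrt_llm::layers::DebugTensor and the D/Db/Ds macros is already included.
void debugLogits(tensorrt_llm::runtime::ITensor::SharedConstPtr const& logits,
    std::shared_ptr<tensorrt_llm::runtime::BufferManager> const& bufferManager)
{
    PRINT_SHAPE(logits);                      // expands to D(logits).print_shape()
    Db(logits, bufferManager).print_values(); // reuse an existing BufferManager for the device-to-host copy
    D(logits).setZeros();                     // overwrite the tensor in place (setOnes()/randomize() work the same way)
}
```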

View File

@ -317,13 +317,13 @@ PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m)
py::overload_cast<tr::WorldConfig const&>(&tr::GptJsonConfig::engineFilename, py::const_),
py::arg("world_config"));
py::enum_<tb::LlmRequestState_t>(m, "LlmRequestState")
.value("REQUEST_STATE_UNKNOWN", tb::LlmRequestState_t::REQUEST_STATE_UNKNOWN)
.value("REQUEST_STATE_ENCODER_INIT", tb::LlmRequestState_t::REQUEST_STATE_ENCODER_INIT)
.value("REQUEST_STATE_CONTEXT_INIT", tb::LlmRequestState_t::REQUEST_STATE_CONTEXT_INIT)
.value("REQUEST_STATE_GENERATION_IN_PROGRESS", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_IN_PROGRESS)
.value("REQUEST_STATE_GENERATION_TO_COMPLETE", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_TO_COMPLETE)
.value("REQUEST_STATE_GENERATION_COMPLETE", tb::LlmRequestState_t::REQUEST_STATE_GENERATION_COMPLETE);
py::enum_<tb::LlmRequestState>(m, "LlmRequestState")
.value("UNKNOWN", tb::LlmRequestState::kUNKNOWN)
.value("ENCODER_INIT", tb::LlmRequestState::kENCODER_INIT)
.value("CONTEXT_INIT", tb::LlmRequestState::kCONTEXT_INIT)
.value("GENERATION_IN_PROGRESS", tb::LlmRequestState::kGENERATION_IN_PROGRESS)
.value("GENERATION_TO_COMPLETE", tb::LlmRequestState::kGENERATION_TO_COMPLETE)
.value("GENERATION_COMPLETE", tb::LlmRequestState::kGENERATION_COMPLETE);
tpb::NamedTensor::initBindings(m);
tpb::LlmRequest::initBindings(m);

View File

@ -54,7 +54,7 @@ LookaheadRuntimeBuffers::LookaheadRuntimeBuffers(SizeType32 maxBatchSize, SizeTy
packedMaskHost = manager.cpu(packedMasksDevice->getShape(), nvinfer1::DataType::kINT32);
positionOffsetsHost = manager.cpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32);
generationLengthsHost = manager.cpu(generationLengthsDevice->getShape(), nvinfer1::DataType::kINT32);
positionIdsHost = manager.gpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32);
positionIdsHost = manager.cpu(positionIdsDevice->getShape(), nvinfer1::DataType::kINT32);
packedMaskHostCopy = manager.cpu(packedMasksDevice->getShape(), nvinfer1::DataType::kINT32);
positionOffsetsHostCopy = manager.cpu(positionOffsetsDevice->getShape(), nvinfer1::DataType::kINT32);

View File

@ -96,7 +96,7 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
if (curDepth != depth)
{
TLLM_CHECK(depth + 1 == curDepth);
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
TLLM_CHECK_WITH_INFO(curDepth <= getMaxDraftPathLen(),
"Medusa choices require more Medusa heads than the engine was built with.");
// Save TopK
topKs[depth - 1] = maxTopK;

View File

@ -45,12 +45,20 @@ set(TOP_LEVEL_DIR "${PROJECT_SOURCE_DIR}/..")
add_custom_target(google-tests)
set(CASE_REPORT_WRAPPER
${CMAKE_CURRENT_SOURCE_DIR}/resources/scripts/case_report_wrapper.py)
function(add_gtest test_name test_src)
set(options NO_GTEST_MAIN NO_TLLM_LINKAGE)
cmake_parse_arguments(ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}"
${ARGN})
add_executable(${test_name} ${test_src})
if($ENV{LLM_MEMORY_PROFILING})
set_property(TARGET ${test_name} PROPERTY TEST_LAUNCHER
${CASE_REPORT_WRAPPER})
endif()
target_link_libraries(${test_name} PUBLIC gmock_main nvonnxparser)
if(NOT ARGS_NO_GTEST_MAIN)
target_link_libraries(${test_name} PUBLIC gtest_main)

View File

@ -1087,11 +1087,37 @@ protected:
void BasicPermuteTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4);
std::vector<int> calcPermuteMapExpertParallel(std::vector<int> const& expected_experts);
void ExpertParallelTest(int k = 1);
void TensorParallelTest(int k = 1);
void ExpertParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4)
{
// 2 experts per rank
ParallelelismTest(k, 1, num_experts / 2, hidden_size, num_experts);
// 1 expert per rank
ParallelelismTest(k, 1, num_experts, hidden_size, num_experts);
}
void MixedParallelTest(int k = 1);
void TensorParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4)
{
ParallelelismTest(k, 2, 1, hidden_size, num_experts);
ParallelelismTest(k, 4, 1, hidden_size, num_experts);
ParallelelismTest(k, 8, 1, hidden_size, num_experts);
}
void MixedParallelTest(int k = 1, int64_t hidden_size = DEFAULT_HIDDEN_SIZE, int64_t num_experts = 4)
{
// 2 experts per rank
ParallelelismTest(k, 2, num_experts / 2, hidden_size, num_experts);
ParallelelismTest(k, 4, num_experts / 2, hidden_size, num_experts);
ParallelelismTest(k, 8, num_experts / 2, hidden_size, num_experts);
// 1 expert per rank
ParallelelismTest(k, 2, num_experts, hidden_size, num_experts);
ParallelelismTest(k, 4, num_experts, hidden_size, num_experts);
ParallelelismTest(k, 8, num_experts, hidden_size, num_experts);
}
void ParallelelismTest(int k = 1, int tp_size = 4, int ep_size = 2, int64_t hidden_size = DEFAULT_HIDDEN_SIZE,
int64_t num_experts = 4);
};
template <class WeightParams>
@ -1276,6 +1302,7 @@ TYPED_TEST(MixtureOfExpertsTest, PermuteMixtral8x7b)
{
this->mUseBias = false;
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->mNormMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode::RENORMALIZE;
this->BasicPermuteTest(2, 4096, 8);
}
@ -1299,7 +1326,8 @@ std::vector<int> MixtureOfExpertsTest<TypeParam_>::calcPermuteMapExpertParallel(
}
template <class TypeParam_>
void MixtureOfExpertsTest<TypeParam_>::ExpertParallelTest(int k)
void MixtureOfExpertsTest<TypeParam_>::ParallelelismTest(
int k, int tp_size, int ep_size, int64_t hidden_size, int64_t num_experts)
{
if (FP8)
{
@ -1307,15 +1335,20 @@ void MixtureOfExpertsTest<TypeParam_>::ExpertParallelTest(int k)
mUseBias = false;
}
ASSERT_LE(ep_size, num_experts);
if (tp_size == 1)
{
// Only the first 4 experts are ever used. They should be split across at least 2 ranks
ASSERT_LT(num_experts / ep_size, 4)
<< "Expert parallelism must have less than 4 experts per rank or the test is ineffective";
}
auto test_archs = getAllTileConfigsToTest();
for (auto [gemm1, gemm2] : test_archs)
{
mInternalSelectedConfig1 = gemm1;
mInternalSelectedConfig2 = gemm2;
int64_t hidden_size = DEFAULT_HIDDEN_SIZE;
int parallelism = 2;
int64_t num_experts = 4;
int64_t num_tokens = 3;
std::vector<DataType> hidden_states(hidden_size * num_tokens);
@ -1327,122 +1360,9 @@ void MixtureOfExpertsTest<TypeParam_>::ExpertParallelTest(int k)
0.25, 0.21, 0.35, 0.19, //
};
std::vector<int> expected_experts{0, 3, 2};
if (k == 2)
expected_experts = {0, 2, 3, 1, 2, 0};
else if (k == 3)
expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1};
std::vector<OutputType> results(hidden_states.size(), 0);
for (int i = 0; i < parallelism; i++)
{
if (i == 0)
{
// Only need to init the inputs on the first iteration
runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {},
MOEParallelismConfig{1, 0, parallelism, i});
}
else
{
runMoEPermute(MOEParallelismConfig{1, 0, parallelism, i});
}
auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k);
// Experts should only be selected when we are on the right node
// Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node
int const start_expert = i * (mNumExperts / parallelism);
std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(),
[&](int val) { return val >= mNumExperts ? val : val + start_expert; });
auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, parallelism, i);
ASSERT_EQ(selected_expert, masked_expected_experts);
auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k);
auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts);
ASSERT_EQ(permute_map, proj_map) << "Iteration " << i;
compareSoftmax(expected_experts, probs);
// Do the final reduce
auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size);
std::transform(
iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{});
}
compareFinal(expected_experts, probs, raw_unquant_input, results);
}
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallel)
{
this->ExpertParallelTest();
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelK2)
{
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelNoBias)
{
this->mUseBias = false;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelRenorm)
{
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelSparseMixer)
{
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelGeglu)
{
this->mActType = tensorrt_llm::ActivationType::Geglu;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, ExpertParallelSwiglu)
{
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->ExpertParallelTest();
this->ExpertParallelTest(2);
}
template <class TypeParam_>
void MixtureOfExpertsTest<TypeParam_>::TensorParallelTest(int k)
{
if (FP8)
{
// TODO Remove this when bias + FP8 is supported
mUseBias = false;
}
auto test_archs = getAllTileConfigsToTest();
for (auto [gemm1, gemm2] : test_archs)
{
mInternalSelectedConfig1 = gemm1;
mInternalSelectedConfig2 = gemm2;
int64_t hidden_size = DEFAULT_HIDDEN_SIZE;
int parallelism = 8;
int64_t num_experts = 4;
int64_t num_tokens = 3;
std::vector<DataType> hidden_states(hidden_size * num_tokens);
auto raw_unquant_input = populateTokens(hidden_states);
std::vector<float> probs = {
0.5, 0.1, 0.25, 0.15, //
0.03, 0.2, 0.07, 0.7, //
0.25, 0.21, 0.35, 0.19, //
};
std::vector<std::vector<DataType>> hidden_input = {hidden_states};
std::vector<std::vector<float>> router_input = {probs};
resizeRouterInputs(router_input, num_experts, num_tokens);
std::vector<int> expected_experts{0, 3, 2};
if (k == 2)
@ -1450,159 +1370,34 @@ void MixtureOfExpertsTest<TypeParam_>::TensorParallelTest(int k)
else if (k == 3)
expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1};
std::vector<OutputType> results(hidden_states.size(), 0);
for (int i = 0; i < parallelism; i++)
for (int i = 0; i < tp_size; i++)
{
if (i == 0)
{
// Only need to init the inputs on the first iteration
runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {},
MOEParallelismConfig{parallelism, i, 1, 0});
}
else
{
runMoEPermute(MOEParallelismConfig{parallelism, i, 1, 0});
}
auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k);
EXPECT_EQ(selected_expert, expected_experts);
auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k);
std::vector<int> permute_map{0, 2, 1};
if (k == 2)
permute_map = {0, 5, 4, 3, 2, 1};
if (k == 3)
permute_map = {0, 8, 6, 4, 2, 1, 7, 5, 3};
ASSERT_EQ(permute_map, proj_map) << "Iteration " << i;
// Do the final reduce
auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size);
std::transform(
iter_results.cbegin(), iter_results.cend(), results.cbegin(), results.begin(), std::plus<>{});
}
compareFinal(expected_experts, probs, raw_unquant_input, results);
}
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallel)
{
this->TensorParallelTest();
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelK2)
{
this->TensorParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelK3)
{
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelNoBias)
{
this->mUseBias = false;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelRenorm)
{
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelSparseMixer)
{
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER;
this->TensorParallelTest();
this->TensorParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelGeglu)
{
this->mActType = tensorrt_llm::ActivationType::Geglu;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
TYPED_TEST(MixtureOfExpertsTest, TensorParallelSwiglu)
{
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->TensorParallelTest();
this->TensorParallelTest(2);
this->TensorParallelTest(3);
}
template <class TypeParam_>
void MixtureOfExpertsTest<TypeParam_>::MixedParallelTest(int k)
{
if (FP8)
{
// TODO Remove this when bias + FP8 is supported
mUseBias = false;
}
auto test_archs = getAllTileConfigsToTest();
for (auto [gemm1, gemm2] : test_archs)
{
mInternalSelectedConfig1 = gemm1;
mInternalSelectedConfig2 = gemm2;
int64_t hidden_size = DEFAULT_HIDDEN_SIZE;
int tp_parallelism = 2;
int ep_parallelism = 2;
int64_t num_experts = 4;
int64_t num_tokens = 3;
std::vector<DataType> hidden_states(hidden_size * num_tokens);
auto raw_unquant_input = populateTokens(hidden_states);
std::vector<float> probs = {
0.5, 0.1, 0.25, 0.15, //
0.03, 0.2, 0.07, 0.7, //
0.25, 0.21, 0.35, 0.19, //
};
std::vector<int> expected_experts{0, 3, 2};
if (k == 2)
expected_experts = {0, 2, 3, 1, 2, 0};
else if (k == 3)
expected_experts = {0, 2, 3, 3, 1, 2, 2, 0, 1};
std::vector<OutputType> results(hidden_states.size(), 0);
for (int i = 0; i < tp_parallelism; i++)
{
for (int j = 0; j < ep_parallelism; j++)
for (int j = 0; j < ep_size; j++)
{
if (i == 0 && j == 0)
{
// Only need to init the inputs on the first iteration
runMoEPermute({hidden_states}, {probs}, hidden_size, num_experts, k, {},
MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j});
runMoEPermute(hidden_input, router_input, hidden_size, num_experts, k, {},
MOEParallelismConfig{tp_size, i, ep_size, j});
}
else
{
runMoEPermute(MOEParallelismConfig{tp_parallelism, i, ep_parallelism, j});
runMoEPermute(MOEParallelismConfig{tp_size, i, ep_size, j});
}
auto selected_expert = getDataFromDevice(mSelectedExpert, num_tokens * k);
// Experts should only be selected when we are on the right node
// Note the index is [0,num_experts_per_node), so we offset the experts by the start for this node
int const start_expert = j * (mNumExperts / ep_parallelism);
int const start_expert = j * (mNumExperts / ep_size);
std::transform(selected_expert.begin(), selected_expert.end(), selected_expert.begin(),
[&](int val) { return val >= mNumExperts ? val : val + start_expert; });
auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_parallelism, j);
auto masked_expected_experts = maskSelectedExpertsForTP(expected_experts, ep_size, j);
ASSERT_EQ(selected_expert, masked_expected_experts);
auto proj_map = getDataFromDevice(mSourceToExpandedMap, num_tokens * k);
auto permute_map = calcPermuteMapExpertParallel(masked_expected_experts);
ASSERT_EQ(permute_map, proj_map) << "Iteration " << i << " " << j;
compareSoftmax(expected_experts, probs);
compareSoftmax(expected_experts, router_input[0]);
// Do the final reduce
auto iter_results = getDataFromDevice(mFinalOutput, num_tokens * hidden_size);
@ -1611,54 +1406,76 @@ void MixtureOfExpertsTest<TypeParam_>::MixedParallelTest(int k)
}
}
compareFinal(expected_experts, probs, raw_unquant_input, results);
compareFinal(expected_experts, router_input[0], raw_unquant_input, results);
}
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallel)
{
this->MixedParallelTest();
}
#define PARALLEL_TEST_SUITE(ParallelismType) \
TYPED_TEST(MixtureOfExpertsTest, ParallelismType) \
{ \
this->ParallelismType##Test(); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##K2) \
{ \
this->ParallelismType##Test(2); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##K3) \
{ \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##NoBias) \
{ \
this->mUseBias = false; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Renorm) \
{ \
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##SparseMixer) \
{ \
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
/* k=3 is not supported for sparse mixer tests */ \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Geglu) \
{ \
this->mActType = tensorrt_llm::ActivationType::Geglu; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Swiglu) \
{ \
this->mActType = tensorrt_llm::ActivationType::Swiglu; \
this->ParallelismType##Test(); \
this->ParallelismType##Test(2); \
this->ParallelismType##Test(3); \
} \
\
TYPED_TEST(MixtureOfExpertsTest, ParallelismType##Mixtral8x7b) \
{ \
this->mUseBias = false; \
this->mActType = tensorrt_llm::ActivationType::Swiglu; \
this->mNormMode = tensorrt_llm::kernels::MOEExpertScaleNormalizationMode::RENORMALIZE; \
this->ParallelismType##Test(2, 4096, 8); \
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelK2)
{
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelNoBias)
{
this->mUseBias = false;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelRenorm)
{
this->mNormMode = MOEExpertScaleNormalizationMode::RENORMALIZE;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelSparseMixer)
{
this->mNormMode = MOEExpertScaleNormalizationMode::SPARSE_MIXER;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelGeglu)
{
this->mActType = tensorrt_llm::ActivationType::Geglu;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
TYPED_TEST(MixtureOfExpertsTest, MixedParallelSwiglu)
{
this->mActType = tensorrt_llm::ActivationType::Swiglu;
this->MixedParallelTest();
this->MixedParallelTest(2);
}
PARALLEL_TEST_SUITE(ExpertParallel)
PARALLEL_TEST_SUITE(TensorParallel)
PARALLEL_TEST_SUITE(MixedParallel)
TYPED_TEST(MixtureOfExpertsTest, ConfigSweep)
{

View File

@ -0,0 +1,42 @@
#!/usr/bin/env python3
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import subprocess
import sys
import time
if __name__ == '__main__':
case = ''
for arg in sys.argv[1:]:
if '--gtest_filter=' in arg:
case = arg.removeprefix('--gtest_filter=')
gtest = subprocess.Popen(sys.argv[1:])
if case:
import multiprocessing.connection
with multiprocessing.connection.Client("/tmp/profiling_scribe.unix",
"AF_UNIX") as client:
client.send({
"type": "gtest_case",
"timestamp": time.time(),
"case": case,
"pid": gtest.pid
})
gtest.wait()
exit(gtest.returncode)

View File

@ -16,13 +16,16 @@
import argparse as _arg
import copy
import functools
import glob
import logging as _log
import os as _os
import pathlib as _pl
import platform
import signal
import subprocess as _sp
import sys as _sys
import time as _time
import typing as _tp
build_script_dir = _pl.Path(
@ -556,6 +559,31 @@ def build_tests(build_dir: _pl.Path):
run_command(make_google_tests, cwd=build_dir, timeout=300)
def with_memory_monitor(func):
if not _os.environ.get('LLM_MEMORY_PROFILING', False):
return func
@functools.wraps(func)
def wrapper(*args, **kwargs):
memory_collector = _sp.Popen([
"/usr/bin/python3",
find_root_dir() /
"tests/llm-test-defs/turtle/defs/memory_collector.py",
"-p",
str(_os.getpid()),
"-i",
"0.2",
])
try:
func(*args, **kwargs)
finally:
memory_collector.send_signal(signal.SIGINT)
memory_collector.wait()
return wrapper
@with_memory_monitor
def run_unit_tests(build_dir: _pl.Path, timeout=1800):
build_tests(build_dir=build_dir)
@ -579,6 +607,7 @@ def run_unit_tests(build_dir: _pl.Path, timeout=1800):
parallel_run_ctest(ctest, cwd=build_dir, env=cpp_env, timeout=timeout)
@with_memory_monitor
def run_single_gpu_tests(build_dir: _pl.Path,
run_gpt,
run_gptj,
@ -646,6 +675,7 @@ def produce_mpirun_command(*, global_commands, nranks, local_commands,
return l[:-1]
@with_memory_monitor
def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
build_tests(build_dir=build_dir)
@ -1068,4 +1098,24 @@ if __name__ == "__main__":
del test_args.run_all_models
run_tests(**vars(test_args))
do_memory_profiling = _os.environ.get('LLM_MEMORY_PROFILING', False)
if do_memory_profiling:
unix_socket = "/tmp/profiling_scribe.unix"
scribe = _sp.Popen([
"/usr/bin/python3",
find_root_dir() /
"tests/llm-test-defs/turtle/defs/profiling_scribe.py", "-l",
unix_socket
])
while not _os.path.exists(unix_socket):
_time.sleep(0.1)
try:
run_tests(**vars(test_args))
finally:
if do_memory_profiling:
scribe.send_signal(signal.SIGINT)
scribe.wait(timeout=10)
scribe.kill()

View File

@ -1,3 +1,5 @@
(kv-cache-reuse)=
# KV cache reuse
This document describes how kv cache pages can be shared and reused by requests that start with the same prompt. This can greatly lower first token latency, the time it takes before the first output token is generated. Many use cases can benefit from this, including multi-turn requests and system prompts.
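
As a quick illustration (not from the original document), block reuse is switched on through the runtime's KV cache configuration. The sketch below uses a stand-in `KvCacheConfig` struct because the exact TensorRT-LLM type is not shown here; only the `enableBlockReuse` flag is taken from these docs.

```cpp
// Hedged sketch: KvCacheConfig below is a stand-in for the real TensorRT-LLM configuration type;
// only the enableBlockReuse flag comes from this documentation.
struct KvCacheConfig
{
    bool enableBlockReuse{false};
};

KvCacheConfig makeReuseFriendlyConfig()
{
    KvCacheConfig config;
    // Allow requests that start with the same prompt to share previously computed kv cache pages,
    // which lowers time-to-first-token for multi-turn requests and shared system prompts.
    config.enableBlockReuse = true;
    return config;
}
```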

View File

@ -1,3 +1,5 @@
(speculative-decoding)=
# Speculative Sampling
Speculative Sampling (also referred to as Speculative Decoding) is a set of techniques designed to allow generation of more than one token per forward pass iteration. This can lead to a reduction in the average per-token latency **in situations where the GPU
@ -30,7 +32,7 @@ may prove simpler than generating a summary for an article.
Furthermore, when integrating Medusa with a standard PyTorch model implementation which may not be as finely
tuned as TensorRT-LLM, the potential time savings are more pronounced.
# Draft Model Approach
## Draft Model Approach
The Draft model approach involves the use of two distinct models trained independently
but sharing the same vocabulary: a smaller Draft model and a larger Target model.
@ -58,7 +60,7 @@ it is advisable to enable KV cache reuse for both models.
This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command
and setting `enableBlockReuse=true` in the `KVCacheConfig`.
## Using Draft model approach with Triton Inference Server
### Using Draft model approach with Triton Inference Server
+ The Draft model approach has been supported since TensorRT-LLM-0.7.0 (using two separate Triton servers to host the draft and target models respectively) and was significantly optimized in TensorRT-LLM-0.10.0 (using one Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS).
+ The source file of Draft model with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py).
@ -218,7 +220,7 @@ and setting `enableBlockReuse=true` in the `KVCacheConfig`.
pkill -9 -f tritonserver
```
# Medusa
## Medusa
This approach leverages a single model to both generate and verify draft tokens.
It enhances the existing model by adding multiple extra language model heads, known as Medusa heads.
@ -249,7 +251,7 @@ In the TensorRT-LLM implementation of Medusa, the configuration of the tree is a
This flexibility allows you to experiment and identify the optimal tree structure for your use case,
which can then be utilized in a production environment.
## Medusa Tree
### Medusa Tree
Consider the following diagram, which illustrates how the hidden states from the last layer of the base model
are passed to the base model's language model (LM) head and to four Medusa heads (MHs).
@ -294,11 +296,11 @@ So, only `9` candidates are specified.
**Specifying paths-only instead of all choices is currently supported only in the Python runtime.**
## Using Medusa with TensorRT-LLM
### Using Medusa with TensorRT-LLM
For guidance on constructing and executing Medusa with the Python runtime, consult the [Medusa README](https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/medusa/README.md). When utilizing Inflight Fused Batching (IFB) with the C++ API, it is necessary to define `medusa_choices` explicitly within the model configuration. For detailed instructions, refer to the [model configuration in TensorRT-LLM backend](https://github.com/triton-inference-server/tensorrtllm_backend?tab=readme-ov-file#modify-the-model-configuration).
### Limitations
#### Limitations
- TensorRT-LLM supports Medusa only for Vicuna (fine-tuned LLaMA).
However, similar to any new model, you can follow the same approach to define your own Medusa model and deploy it with TensorRT-LLM.
@ -306,7 +308,7 @@ However, similar to any new model, you can follow the same approach to define yo
- Beam search is **not** compatible with Medusa.
# ReDrafter
## ReDrafter
This approach enhances the single-model Medusa method by predicting and verifying tokens using the same model. However, unlike Medusa, it predicts draft tokens using a recurrent predictor, where each draft token depends on the previous one. This method also allows the use of beam search to identify more prominent draft tokens. For more details, please read [the ReDrafter paper](https://arxiv.org/html/2403.09919v1).

View File

@ -205,7 +205,7 @@ void invokeQuantization(...) {
```
For more details on how TensorRT-LLM implements the GPT Attention operator, see
the [Multi-head, Multi-query and Group-query Attention](gpt_attention.md) document.
the [Multi-head, Multi-query and Group-query Attention](../advanced/gpt-attention.md) document.
# Runtime
@ -214,7 +214,7 @@ the runtime components is to load the TensorRT engines and drive their
execution. Typically, for an auto-regressive model like GPT, the runtime is in
charge of loading the engine that implements both the processing of the input
sequence as well as the body of the generation loop. See the [GPT C++
Runtime](gpt_runtime.md) document for details on the C++ Runtime.
Runtime](../advanced/gpt-runtime.md) document for details on the C++ Runtime.
(multi-gpu-multi-node)=

View File

@ -96,11 +96,14 @@ Welcome to TensorRT-LLM's Documentation!
advanced/gpt-attention.md
advanced/gpt-runtime.md
advanced/executor.md
advanced/graph-rewriting.md
advanced/batch-manager.md
advanced/inference-request.md
advanced/lora.md
advanced/expert-parallelism.md
advanced/kv-cache-reuse.md
advanced/speculative-decoding.md
.. toctree::
:maxdepth: 2

View File

@ -377,11 +377,11 @@ All published functionality in the Release Notes has been fully tested and verif
### Key Features and Enhancements
- Chunked context support (see docs/source/gpt_attention.md#chunked-context)
- Chunked context support (see docs/source/advanced/gpt-attention.md#chunked-context)
- LoRA support for C++ runtime (see docs/source/lora.md)
- Medusa decoding support (see examples/medusa/README.md)
- The support is limited to Python runtime for Ampere or newer GPUs with fp16 and bf16 accuracy, and the `temperature` parameter of sampling configuration should be 0
- StreamingLLM support for LLaMA (see docs/source/gpt_attention.md#streamingllm)
- StreamingLLM support for LLaMA (see docs/source/advanced/gpt-attention.md#streamingllm)
- Support for batch manager to return logits from context and/or generation phases
- Include support in the Triton backend
- Support AWQ and GPTQ for QWEN

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -0,0 +1,77 @@
# Deepseek-v1
This document shows how to build and run the [deepseek-v1](https://arxiv.org/pdf/2401.06066) model in TensorRT-LLM.
- [Deepseek-v1](#deepseek-v1)
- [Prerequisite](#prerequisite)
- [Hardware](#hardware)
- [Overview](#overview)
- [Support Matrix](#support-matrix)
- [Usage](#usage)
- [Build TensorRT engine(s)](#build-tensorrt-engines)
## Prerequisite
First, please download Deepseek-v1 weights from HF https://huggingface.co/deepseek-ai/deepseek-moe-16b-base.
```bash
git lfs install
git clone https://huggingface.co/deepseek-ai/deepseek-moe-16b-base
```
## Hardware
The Deepseek-v1 model requires a single GPU with 80 GB of memory.
## Overview
The TensorRT-LLM Deepseek-v1 implementation can be found in [tensorrt_llm/models/deepseek_v1/model.py](../../tensorrt_llm/models/deepseek_v1/model.py). The TensorRT-LLM Deepseek-v1 example code is located in [`examples/deepseek_v1`](./). There is one main file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert the Deepseek-v1 model into tensorrt-llm checkpoint format.
In addition, there are three shared files in the parent folder [`examples`](../) that can be used for inference and evaluation:
* [`../run.py`](../run.py) to run model inference and generate output given an input text.
* [`../summarize.py`](../summarize.py) to summarize articles from the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset; it can run summarization for both the HF model and the TensorRT-LLM model.
* [`../mmlu.py`](../mmlu.py) to run the scoring script from https://github.com/declare-lab/instruct-eval and compare the HF model and the TensorRT-LLM model on the MMLU dataset.
## Support Matrix
- [x] FP16
- [x] TENSOR PARALLEL
- [ ] FP8
## Usage
The TensorRT-LLM Deepseek-v1 example code is located at [examples/deepseek_v1](./). It takes PyTorch weights as input and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
### Build TensorRT engine(s)
The following steps show how to run Deepseek-v1 with TensorRT-LLM.
First, the HF checkpoint is converted to the TensorRT-LLM checkpoint format by applying [`convert_checkpoint.py`](./convert_checkpoint.py). After that, the TensorRT engine(s) can be built from the TensorRT-LLM checkpoint.
```bash
# Build the bfloat16 engine from Deepseek-v1 HF weights.
python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \
--output_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \
--dtype bfloat16 \
--tp_size 1
trtllm-build --checkpoint_dir ./trtllm_checkpoint_deepseek_v1_1gpu_bf16 \
--output_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
--gpt_attention_plugin bfloat16 \
--gemm_plugin bfloat16 \
--moe_plugin bfloat16 \
```
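The commands above build a single-rank (single-GPU) engine. For multi-GPU inference, the sketch below shows how a 2-rank engine could be built and launched, assuming a checkpoint converted with `--tp_size 2` (see [`convert_checkpoint.py`](./convert_checkpoint.py)). The directory names are illustrative, and the `mpirun` launch mirrors other TensorRT-LLM multi-GPU examples rather than being specific to this one:
```bash
# Build one engine per rank from a 2-rank TensorRT-LLM checkpoint.
trtllm-build --checkpoint_dir ./trtllm_checkpoint_deepseek_v1_2gpu_bf16 \
             --output_dir ./trtllm_engines/deepseek_v1/bf16/tp2 \
             --gpt_attention_plugin bfloat16 \
             --gemm_plugin bfloat16 \
             --moe_plugin bfloat16

# Launch one process per GPU with MPI.
mpirun -n 2 --allow-run-as-root \
    python ../run.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp2 \
                     --tokenizer_dir ./deepseek_moe_16b/ \
                     --max_output_len 32 \
                     --input_text "The president of the United States is person who"
```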
Then, test the engine with the [run.py](../run.py) script:
```bash
python ../run.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
--tokenizer_dir ./deepseek_moe_16b/ \
--max_output_len 32 \
--top_p 0 \
--input_text "The president of the United States is person who"
```
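The shared [`../summarize.py`](../summarize.py) and [`../mmlu.py`](../mmlu.py) scripts mentioned in the overview can evaluate the same engine. A possible invocation is sketched below; the flags follow the pattern of other TensorRT-LLM examples and may need adjustment (for MMLU, the dataset must be downloaded first as described in the header of [`../mmlu.py`](../mmlu.py)):
```bash
# Summarization on cnn_dailymail, comparing the HF model and the TensorRT-LLM engine.
python ../summarize.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
                       --hf_model_dir ./deepseek_moe_16b/ \
                       --data_type bf16 \
                       --test_hf \
                       --test_trt_llm

# MMLU scoring of the TensorRT-LLM engine.
python ../mmlu.py --engine_dir ./trtllm_engines/deepseek_v1/bf16/tp1 \
                  --hf_model_dir ./deepseek_moe_16b/ \
                  --test_trt_llm
```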
## Credits
This Deepseek-v1 model example exists thanks to the community contribution of [@akhoroshev](https://github.com/akhoroshev)!

View File

@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,215 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import tensorrt_llm
from tensorrt_llm._utils import release_gc
from tensorrt_llm.layers import MoeConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import DeepseekForCausalLM
from tensorrt_llm.models.deepseek_v1.convert import load_hf_deepseek
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, default=None, required=True)
parser.add_argument('--tp_size',
type=int,
default=1,
help='N-way tensor parallelism size')
parser.add_argument('--pp_size',
type=int,
default=1,
help='N-way pipeline parallelism size')
parser.add_argument(
'--moe_tp_size',
type=int,
default=-1,
help=
'N-way tensor parallelism size for MoE, default is tp_size, which will do tp-only for MoE'
)
parser.add_argument(
'--moe_ep_size',
type=int,
default=-1,
help=
'N-way expert parallelism size for MoE, default is 1, which will do tp-only for MoE'
)
parser.add_argument('--dtype',
type=str,
default='float16',
choices=['float32', 'bfloat16', 'float16'])
parser.add_argument(
'--use_parallel_embedding',
action="store_true",
default=False,
help=
'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled'
)
parser.add_argument(
'--embedding_sharding_dim',
type=int,
default=0,
choices=[0, 1],
help=
'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0)'
'To shard it along hidden dimension, set embedding_sharding_dim=1'
'Note: embedding sharing is only enabled when embedding_sharding_dim=0')
parser.add_argument(
'--use_embedding_sharing',
action="store_true",
default=False,
help=
'Try to reduce the engine size by sharing the embedding lookup table between two layers'
'Note: the flag might not take effect when the criteria are not met')
parser.add_argument('--output_dir',
type=str,
default='trtllm_checkpoint',
required=True,
help='The path to save the TensorRT-LLM checkpoint')
parser.add_argument(
'--workers',
type=int,
default=1,
help='The number of workers for converting checkpoint in parallel')
parser.add_argument(
'--moe_num_experts',
type=int,
default=0,
help='Specify the number of experts to use for MOE layers')
parser.add_argument(
'--moe_top_k',
type=int,
default=0,
help=
'Specify the top_k value to use for MOE layers. Default to 1 if --moe_num_experts is set'
)
parser.add_argument(
'--moe_renorm_mode',
type=int,
default=MoeConfig.ExpertScaleNormalizationMode.RENORMALIZE,
help=
'Controls renormalization after gate logits. Check layers/moe.py for accepted values'
)
parser.add_argument(
'--save_config_only',
action="store_true",
default=False,
help=
'Only save the model config w/o read and converting weights, be careful, this is for debug only'
)
parser.add_argument(
'--disable_weight_only_quant_plugin',
default=False,
action="store_true",
help=
'By default, using plugin implementation for weight quantization. Enabling disable_weight_only_quant_plugin flag will use ootb implementation instead of plugin.'
'You must also use --use_weight_only for that argument to have an impact'
)
# Add quantization related feature later
args = parser.parse_args()
return args
def args_to_build_options(args):
return {
'use_parallel_embedding': args.use_parallel_embedding,
'embedding_sharding_dim': args.embedding_sharding_dim,
'share_embedding_table': args.use_embedding_sharing,
'disable_weight_only_quant_plugin':
args.disable_weight_only_quant_plugin
}
def execute(workers, func, args):
if workers == 1:
for rank, f in enumerate(func):
f(args, rank)
else:
with ThreadPoolExecutor(max_workers=workers) as p:
futures = [p.submit(f, args, rank) for rank, f in enumerate(func)]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."
def convert_and_save_hf(args):
model_dir = args.model_dir
world_size = args.tp_size * args.pp_size
# Need to convert the cli args to key-value pairs and override them in the generated config dict.
# Ideally these fields will be moved out of the config and pass them into build API, keep them here for compatibility purpose for now,
# before the refactor is done.
override_fields = {}
override_fields.update(args_to_build_options(args))
hf_model = load_hf_deepseek(model_dir)
def convert_and_save_rank(args, rank):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=args.tp_size,
pp_size=args.pp_size,
moe_tp_size=args.moe_tp_size,
moe_ep_size=args.moe_ep_size)
deepseekv1 = DeepseekForCausalLM.from_hugging_face(
hf_model, args.model_dir, args.dtype, mapping, **override_fields)
deepseekv1.save_checkpoint(args.output_dir, save_config=(rank == 0))
del deepseekv1
execute(args.workers, [convert_and_save_rank] * world_size, args)
release_gc()
def main():
print(tensorrt_llm.__version__)
args = parse_arguments()
if (args.moe_tp_size == -1 and args.moe_ep_size == -1):
# moe default to tp-only
args.moe_tp_size = args.tp_size
args.moe_ep_size = 1
elif (args.moe_tp_size == -1):
args.moe_tp_size = args.tp_size // args.moe_ep_size
elif (args.moe_ep_size == -1):
args.moe_ep_size = args.tp_size // args.moe_tp_size
assert (args.moe_tp_size * args.moe_ep_size == args.tp_size
), "moe_tp_size * moe_ep_size must equal to tp_size"
tik = time.time()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
assert args.model_dir is not None
convert_and_save_hf(args)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Total time of converting checkpoints: {t}')
if __name__ == '__main__':
main()
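For context on the MoE parallelism flags resolved in `main()` above: when `--moe_tp_size` and `--moe_ep_size` are both omitted, the MoE layers default to tensor parallelism only; when only one is given, the other is derived so that `moe_tp_size * moe_ep_size == tp_size`. The sketch below illustrates both cases with hypothetical output directories:
```bash
# Tensor-parallel-only MoE: moe_tp_size defaults to tp_size (here 2) and moe_ep_size to 1.
python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \
                             --output_dir ./trtllm_checkpoint_deepseek_v1_2gpu_bf16 \
                             --dtype bfloat16 \
                             --tp_size 2

# Expert parallelism: with --moe_ep_size 2, moe_tp_size is derived as tp_size / moe_ep_size = 1,
# satisfying the assertion moe_tp_size * moe_ep_size == tp_size.
python convert_checkpoint.py --model_dir ./deepseek_moe_16b/ \
                             --output_dir ./trtllm_checkpoint_deepseek_v1_2gpu_bf16_ep2 \
                             --dtype bfloat16 \
                             --tp_size 2 \
                             --moe_ep_size 2
```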

View File

@ -0,0 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0
datasets~=2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,6 +1,6 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,2 +1,2 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
nemo-toolkit[all]==2.0.0rc1
megatron-core==0.8.0
datasets~=2.14.5

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets>=2.14.4
nemo-toolkit[all]<=1.20.0,>=1.18.0
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
git+https://github.com/google-deepmind/recurrentgemma.git
flax>=0.8.2
jax~=0.4.23

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.14.0.dev2024091700
tensorrt_llm==0.14.0.dev2024092400
tiktoken
datasets
kaldialign

View File

@ -18,3 +18,4 @@ bandit==1.7.7
jsonlines==4.0.0
jieba==0.42.1
rouge==1.0.1
pytest-rerunfailures

View File

@ -74,7 +74,7 @@ def parse_arguments():
parser.add_argument(
'--max_batch_size',
type=int,
default=256,
default=2048,
help="Maximum number of requests that the engine can schedule.")
parser.add_argument('--max_input_len',
type=int,

View File

@ -4063,7 +4063,7 @@ def bert_attention(tensor: Tensor,
The maximum distance of relative position in attention, for implicit mode.
Default value is 0, meaning to use the regular mode of relative attention bias.
Implicit mode is only enabled when passing in non-zero positive max_distance value.
See relative attention bias in docs/gpt_attention.md
See relative attention bias in docs/source/advanced/gpt-attention.md
max_input_length: Tensor = None
The maximum input sequence length represented by Tensor shape. Requires for remove_input_padding to pre-define plugin workspace size.
@ -4619,19 +4619,19 @@ def gpt_attention(
arguments that are likely to be removed or merged with others in the future
release.
See docs/gpt_attention.md for the documentation of that function.
See docs/source/advanced/gpt-attention.md for the documentation of that function.
Parameters:
qkv: Tensor (On GPU)
The input QKV tensor. Its shape is [batch_beam_size, max_seqlen, qkv_dim] in padded mode and [1, num_tokens, qkv_dim] in
packed mode. Where qkv_dim depends on using MQA, GQA, or MHA. See QKV Input in docs/gpt_attention.md,
packed mode. Where qkv_dim depends on using MQA, GQA, or MHA. See QKV Input in docs/source/advanced/gpt-attention.md,
past_key_value: Tensor (On GPU)
The tensor that stores KV cache data. Its shape is
[max_batch_size * max_beam_width, 2, num_kv_heads, max_seqlen, hidden_dim_per_head]
in contiguous mode and
[max_blocks, 2, num_kv_heads, num_tokens_per_block, hidden_dim_per_head]
in paged mode. See KV Cache in docs/gpt_attention.md,
in paged mode. See KV Cache in docs/source/advanced/gpt-attention.md,
context_fmha_custom_mask: Tensor (On GPU)
The tensor that stores the packed custom mask for fmha.
@ -4639,7 +4639,7 @@ def gpt_attention(
sequence_lengths: Tensor (On GPU)
The tensor that stores the length of each sequence. Its shape is
[batch_size]. See QKV Input in docs/gpt_attention.md,
[batch_size]. See QKV Input in docs/source/advanced/gpt-attention.md,
host_past_key_value_lengths: Tensor (On CPU)
An INT32 tensor of shape [batch_size],
@ -4657,12 +4657,12 @@ def gpt_attention(
cache_indirection: Tensor (On GPU)
The tensor to reconstruct the paths when using beam-search. Its
shape is [batch_size, beam_width, max_seqlen]. See Beam-Search in
docs/gpt_attention.md,
docs/source/advanced/gpt-attention.md,
host_request_types: Tensor = None (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
layer_idx: int
The index of this attention layer, used to access kv_cache_block_offsets,
@ -4678,7 +4678,7 @@ def gpt_attention(
q_scaling: float
The value used to compute the scaling factor applied to the output
of the Q*K^T product. See Scaling Factors in docs/gpt_attention.md,
of the Q*K^T product. See Scaling Factors in docs/source/advanced/gpt-attention.md,
qk_tanh_scale: float
The scale * tanh(value / scale) used to compute the scaling factor applied to the output
@ -4726,12 +4726,12 @@ def gpt_attention(
kv_orig_quant_scale: Tensor
The tensor to store the scaling factor for quantization to INT8/FP8
in the KV cache. Its shape is [1]. See INT8/FP8 KV Cache in
docs/gpt_attention.md,
docs/source/advanced/gpt-attention.md,
kv_quant_orig_scale: Tensor
The tensor to store the scaling factor for dequantization from
INT8/FP8 in the KV cache. Its shape is [1]. See INT8/FP8 KV Cache
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
attention_output_orig_quant_scale: Tensor
The tensor to store the scaling factor for quantization to FP8
@ -4742,7 +4742,7 @@ def gpt_attention(
max_context_length: int32_t
The length of the longest input sequence. See QKV Input in
docs/gpt_attention.md,
docs/source/advanced/gpt-attention.md,
mask_type: int = 1
The type of mask:
@ -4779,14 +4779,14 @@ def gpt_attention(
kv_cache_block_offsets:
The tensor of block offsets for the KV cache. Its shape is
[num_layers, max_batch_size, max_beam_width, 2, max_blocks_per_sequence * 2],
See KV cache section in docs/gpt_attention.md, on gpu,
See KV cache section in docs/source/advanced/gpt-attention.md, on gpu,
host_kv_cache_block_offsets:
The same as kv_cache_block_offsets, but on cpu,
host_kv_cache_pool_pointers:
The tensor of pool pointers for the KV cache. Its shape is [2],
See KV cache section in docs/gpt_attention.md, on gpu,
See KV cache section in docs/source/advanced/gpt-attention.md, on gpu,
do_cross_attention: bool = False
Do we use this as cross attention instead of self attention,
@ -4809,7 +4809,7 @@ def gpt_attention(
The maximum distance of relative position in attention, for implicit mode.
Default value is 0, meaning to use the regular mode of relative attention bias.
Implicit mode is only enabled when passing in non-zero positive max_distance value.
See relative attention bias in docs/gpt_attention.md
See relative attention bias in docs/source/advanced/gpt-attention.md
host_context_lengths: Tensor = None (On CPU)
A host tensor that contains the lengths of the different inputs,
@ -5609,7 +5609,7 @@ def lora_plugin(
host_request_types : Tensor = None
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
transa : bool
Is the first input transposed? Set to 'True' if you want the first
@ -5736,7 +5736,7 @@ def mamba_conv1d(input: Tensor,
host_request_types : Tensor (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
last_token_ids : Tensor (On GPU)
The inclusive prefix-sum of the lengths or the lengths of the
@ -5883,7 +5883,7 @@ def selective_scan(input: Tensor,
host_request_types : Tensor (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md
in docs/source/advanced/gpt-attention.md
last_token_ids : Tensor (On GPU)
The inclusive prefix-sum of the lengths or the lengths of the
@ -6029,7 +6029,7 @@ def rg_lru(input: Tensor,
host_request_types : Tensor (On CPU)
The tensor on the host that indicates if a request is in context or
generation phase. Its shape is [batch_size]. See Inflight Batching
in docs/gpt_attention.md,
in docs/source/advanced/gpt-attention.md,
last_token_ids : Tensor (On GPU)
The inclusive prefix-sum of the lengths or the lengths of the

View File

@ -23,7 +23,7 @@ from .embedding import Embedding, PromptTuningEmbedding
from .linear import ColumnLinear, Linear, RowLinear
from .lora import Lora, LoraParams, LoraRuntimeParams
from .mlp import MLP, FusedGatedMLP, GatedMLP
from .moe import MOE, MoeConfig
from .moe import MOE, MoeConfig, SharedMoE
from .normalization import GroupNorm, LayerNorm, RmsNorm
from .pooling import AvgPool2d
from .recurrent import FusedRgLru, GroupedLinear, Recurrent, RgLru
@ -61,6 +61,7 @@ __all__ = [
'LoraRuntimeParams',
'MOE',
'MoeConfig',
'SharedMoE',
'Mamba',
'Mamba2',
'Recurrent',

View File

@ -90,15 +90,10 @@ class Embedding(Module):
param.value = loaded_weight
def postprocess(self, tllm_key, weights, **kwargs):
config = kwargs.get("config", None)
if weights is None:
return {}
weights = weights.to(str_dtype_to_torch(self.dtype))
if config.share_embedding_table:
return {}
else:
weights = weights.clone()
return {tllm_key: weights}
return {tllm_key: weights}
class PromptTuningEmbedding(Embedding):

View File

@ -61,6 +61,9 @@ class MoeConfig:
SPARSE_MIXER = 2
num_experts: int = 0
moe_intermediate_size: int = 0 # Add moe inter size (shanshan)
num_shared_experts: int = 0 # Add number of shared experts (shanshan)
top_k: int = 0
normalization_mode: ExpertScaleNormalizationMode = ExpertScaleNormalizationMode.RENORMALIZE
sparse_mixer_epsilon: float = 0.01
@ -832,3 +835,51 @@ class MoeOOTB(MOE):
if is_gated_act:
expert.gate.bias.value = experts_bias_1_raw[
i, :self.expert_inter_size]
# Add SharedMoE class (shanshan)
class SharedMoE(Module):
def __init__(self,
moe_config: MoeConfig,
hidden_size: int,
ffn_hidden_size: int,
hidden_act: str,
mapping: Mapping = Mapping(),
bias: bool = True,
dtype=None,
**kwargs):
super().__init__()
self.moe_config = moe_config
self.hidden_size = hidden_size
self.ffn_hidden_size = ffn_hidden_size
self.hidden_act = hidden_act
self.mapping = mapping
self.bias = bias
self.dtype = dtype
self.moe = MOE(hidden_size=self.hidden_size,
moe_config=self.moe_config,
mapping=self.mapping,
ffn_hidden_size=self.moe_config.moe_intermediate_size,
hidden_act=self.hidden_act,
dtype=self.dtype,
bias=False,
tp_group=self.mapping.tp_group,
tp_size=self.mapping.tp_size)
ClsMLP = GatedMLP if is_gated_activation(self.hidden_act) else MLP
self.shared_experts = ClsMLP(
hidden_size=self.hidden_size,
ffn_hidden_size=self.ffn_hidden_size,
hidden_act=non_gated_version(self.hidden_act), # deepseek use SiLU
bias=False,
dtype=self.dtype,
tp_group=self.mapping.tp_group,
tp_size=self.mapping.tp_size)
def forward(self, hidden_states):
if self.moe_config.num_shared_experts > 0:
return self.moe(hidden_states) + self.shared_experts(hidden_states)
else:
return self.moe(hidden_states)

View File

@ -23,6 +23,7 @@ from .cogvlm.model import CogVLMForCausalLM
from .dbrx.config import DbrxConfig
from .dbrx.model import DbrxForCausalLM
from .deci.model import DeciLMForCausalLM
from .deepseek_v1.model import DeepseekForCausalLM
from .dit.model import DiT
from .enc_dec.model import DecoderModel, EncoderModel, WhisperEncoder
from .falcon.config import FalconConfig
@ -57,6 +58,7 @@ __all__ = [
'BloomModel',
'BloomForCausalLM',
'DiT',
'DeepseekForCausalLM',
'FalconConfig',
'FalconForCausalLM',
'FalconModel',
@ -158,5 +160,6 @@ MODEL_MAP = {
'RecurrentGemmaForCausalLM': RecurrentGemmaForCausalLM,
'CogVLMForCausalLM': CogVLMForCausalLM,
'DiT': DiT,
'DeepseekForCausalLM': DeepseekForCausalLM,
'DeciLMForCausalLM': DeciLMForCausalLM,
}

View File

@ -0,0 +1,14 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,361 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import torch
from transformers import AutoConfig, AutoModelForCausalLM
from tensorrt_llm.layers import MoeConfig
from ..._utils import pad_vocab_size, release_gc
from ...mapping import Mapping
## Convert config parameters to dict
def create_trt_config_from_hf(model_dir,
dtype,
mapping: Mapping,
override_fields: dict = {}):
config = {}
assert isinstance(model_dir, str)
hf_config = AutoConfig.from_pretrained(model_dir, trust_remote_code=True)
dtype = dtype
n_layer = hf_config.num_hidden_layers
n_head = hf_config.num_attention_heads
n_embd = hf_config.hidden_size
inter_size = hf_config.intermediate_size
n_kv_head = hf_config.num_key_value_heads
vocab_size = hf_config.vocab_size
n_positions = hf_config.max_position_embeddings
hidden_act = 'swiglu' # TRT-LLM requires the gated activation to be explicit for the MoE implementation
rotary_base = hf_config.rope_theta
rms_norm_eps = hf_config.rms_norm_eps
moe_num_experts = hf_config.n_routed_experts
moe_top_k = hf_config.num_experts_per_tok
## shanshan fix
moe_renorm_mode = MoeConfig.ExpertScaleNormalizationMode.NONE
moe_num_shared_experts = hf_config.n_shared_experts
moe_inter_size = hf_config.moe_intermediate_size
rotary_scaling = hf_config.rope_scaling
config = {
'architecture': "DeepseekForCausalLM",
'dtype': dtype,
'logits_type': 'float32',
'num_hidden_layers': n_layer,
'num_attention_heads': n_head,
'hidden_size': n_embd,
'intermediate_size': inter_size,
'num_key_value_heads': n_kv_head,
'vocab_size': vocab_size,
'position_embedding_type': 'rope_gpt_neox',
'max_position_embeddings': n_positions,
'hidden_act': hidden_act,
'rotary_base': rotary_base,
'norm_epsilon': rms_norm_eps,
'rotary_scaling': rotary_scaling,
'moe_num_experts': moe_num_experts,
'moe_top_k': moe_top_k,
'moe_renorm_mode': moe_renorm_mode,
'moe_num_shared_experts': moe_num_shared_experts,
'moe_inter_size': moe_inter_size,
'mapping': {
'world_size': mapping.tp_size * mapping.pp_size,
'tp_size': mapping.tp_size,
'pp_size': mapping.pp_size,
'moe_tp_size': mapping.moe_tp_size,
'moe_ep_size': mapping.moe_ep_size,
},
}
config.update(override_fields)
moe_config = MoeConfig(num_experts=config['moe_num_experts'],
moe_intermediate_size=config['moe_inter_size'],
num_shared_experts=config['moe_num_shared_experts'],
top_k=config['moe_top_k'],
normalization_mode=config['moe_renorm_mode'])
moe_config.validate()
return config
## Get HF model
def load_hf_deepseek(model_dir):
model = AutoModelForCausalLM.from_pretrained(model_dir,
device_map='auto',
torch_dtype='auto',
trust_remote_code=True)
return model
## Prepare weights for TP
def split(v, tp_size, idx, dim=0):
if tp_size == 1:
return v
if len(v.shape) == 1:
return torch.chunk(v, tp_size)[idx].contiguous()
else:
return torch.chunk(v, tp_size, dim=dim)[idx].contiguous()
def split_qkv_tp(v, n_head, n_hidden, tensor_parallel, rank):
"""
Splits the QKV matrix according to tensor parallelism
"""
v = v.reshape(3, n_hidden, n_hidden)
split_v = split(v, tensor_parallel, rank, dim=1)
split_v = split_v.reshape(3 * (n_hidden // tensor_parallel), n_hidden)
return split_v.contiguous()
def split_matrix_tp(v, tensor_parallel, rank, dim):
return split(v, tensor_parallel, rank, dim=dim)
def get_weight(config, prefix, dtype, postfix='.weight'):
if config[prefix + postfix].dtype != dtype:
config[prefix + postfix].data = config[prefix + postfix].to(dtype)
return config[prefix + postfix].detach().cpu()
def get_trtllm_linear_weight(weight, prefix, postfix='weight'):
results = {}
results[prefix + postfix] = weight
return results
def convert_deepseek(hf_model,
config,
mapping,
dtype='float32',
use_parallel_embedding=False,
sharding_dim=0,
share_embedding_table=False):
weights = {}
tik = time.time()
model_params = dict(hf_model.named_parameters())
dtype = getattr(torch, dtype)
moe_config = MoeConfig(num_experts=config['moe_num_experts'],
moe_intermediate_size=config['moe_inter_size'],
num_shared_experts=config['moe_num_shared_experts'],
top_k=config['moe_top_k'],
normalization_mode=config['moe_renorm_mode'])
layers_range = mapping.pp_layers(config['num_hidden_layers'])
def convert_layer(l):
prefix = f'model.layers.{l}.'
print(prefix)
trtllm_prex = f'transformer.layers.{l - layers_range[0]}.'
q_weight = get_weight(model_params, prefix + 'self_attn.q_proj', dtype)
k_weight = get_weight(model_params, prefix + 'self_attn.k_proj', dtype)
v_weight = get_weight(model_params, prefix + 'self_attn.v_proj', dtype)
qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
split_v = split_qkv_tp(qkv_weight, config['num_attention_heads'],
config['hidden_size'], mapping.tp_size,
mapping.tp_rank)
weights.update(
get_trtllm_linear_weight(split_v, trtllm_prex + 'attention.qkv.'))
attn_dense_weight = get_weight(model_params,
prefix + 'self_attn.o_proj', dtype)
split_v = split_matrix_tp(attn_dense_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_trtllm_linear_weight(split_v, trtllm_prex + 'attention.dense.'))
if moe_config.has_moe() and l > 0:
rank_experts = list(range(moe_config.num_experts))
if mapping.has_moe_ep():
rank_experts = mapping.ep_experts(moe_config.num_experts)
for suffix in ["gate_proj", "down_proj", "up_proj"]:
model_params[f'model.layers.{l}.mlp.experts.{suffix}.weight'] = \
torch.stack([model_params[f'model.layers.{l}.mlp.experts.{expert}.{suffix}.weight'].detach().cpu()
for expert in rank_experts])
gate_proj = model_params[
f'model.layers.{l}.mlp.experts.gate_proj.weight']
down_proj = model_params[
f'model.layers.{l}.mlp.experts.down_proj.weight']
up_proj = model_params[
f'model.layers.{l}.mlp.experts.up_proj.weight']
if mapping.has_moe_tp():
gate_proj = split(gate_proj,
mapping.tp_size,
mapping.tp_rank,
dim=1)
down_proj = split(down_proj,
mapping.tp_size,
mapping.tp_rank,
dim=2)
up_proj = split(up_proj,
mapping.tp_size,
mapping.tp_rank,
dim=1)
model_params[
f'model.layers.{l}.mlp.experts.up_gate_proj.weight'] = torch.concat(
[up_proj, gate_proj], dim=-2)
model_params[
f'model.layers.{l}.mlp.experts.down_proj.weight'] = down_proj
## mlp.experts.down_proj.weight
moe_experts_down_proj_weights = get_weight(
model_params, prefix + 'mlp.experts.down_proj', dtype)
weights.update(
get_trtllm_linear_weight(moe_experts_down_proj_weights,
trtllm_prex + 'mlp.moe.proj.'))
##mlp.experts.up_gate.weight
moe_experts_up_gate_proj_weights = get_weight(
model_params, prefix + 'mlp.experts.up_gate_proj', dtype)
weights.update(
get_trtllm_linear_weight(moe_experts_up_gate_proj_weights,
trtllm_prex + 'mlp.moe.fc.'))
## MOE hardcodes routing_input to trt.float32; please refer to moe.py line 397
moe_experts_gate_weights = get_weight(model_params,
prefix + 'mlp.gate',
torch.float32)
weights.update(
get_trtllm_linear_weight(moe_experts_gate_weights,
trtllm_prex + 'mlp.moe.router.'))
if moe_config.num_shared_experts > 0:
## mlp.shared_experts.gate_proj.weight
shared_moe_gate_proj_weights = get_weight(
model_params, prefix + 'mlp.shared_experts.gate_proj',
dtype)
split_v = split_matrix_tp(shared_moe_gate_proj_weights,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(
split_v, trtllm_prex + 'mlp.shared_experts.fc.'))
# mlp.shared_experts.down_proj.weight
shared_moe_down_proj_weights = get_weight(
model_params, prefix + 'mlp.shared_experts.down_proj',
dtype)
split_v = split_matrix_tp(shared_moe_down_proj_weights,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_trtllm_linear_weight(
split_v, trtllm_prex + 'mlp.shared_experts.proj.'))
## mlp.shared_experts.up_proj.weight
shared_moe_up_proj_weights = get_weight(
model_params, prefix + 'mlp.shared_experts.up_proj', dtype)
split_v = split_matrix_tp(shared_moe_up_proj_weights,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(
split_v, trtllm_prex + 'mlp.shared_experts.gate.'))
else:
## The current Deepseek model has only one dense MLP layer; if the model grows larger, consider fusing it
mlp_gate_weight = get_weight(model_params, prefix + 'mlp.up_proj',
dtype)
split_gate = split_matrix_tp(mlp_gate_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(split_gate, trtllm_prex + 'mlp.gate.'))
mlp_fc_weight = get_weight(model_params, prefix + 'mlp.gate_proj',
dtype)
split_fc = split_matrix_tp(mlp_fc_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_trtllm_linear_weight(split_fc, trtllm_prex + 'mlp.fc.'))
mlp_proj_weight = get_weight(model_params, prefix + 'mlp.down_proj',
dtype)
split_proj = split_matrix_tp(mlp_proj_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_trtllm_linear_weight(split_proj, trtllm_prex + 'mlp.proj.'))
# Layer norms do not use tensor parallelism
input_ln_weight = get_weight(model_params, prefix + 'input_layernorm',
dtype)
weights[trtllm_prex + 'input_layernorm.weight'] = input_ln_weight
post_ln_weight = get_weight(model_params,
prefix + 'post_attention_layernorm', dtype)
weights[trtllm_prex + 'post_layernorm.weight'] = post_ln_weight
for l in layers_range:
convert_layer(l)
release_gc()
v = get_weight(model_params, 'model.embed_tokens', dtype)
if hf_model.config.tie_word_embeddings:
# lm_head.weight has the same weights as embedding
if mapping.is_last_pp_rank():
if config['vocab_size'] % mapping.tp_size != 0:
# padding
vocab_size_padded = pad_vocab_size(config['vocab_size'],
mapping.tp_size)
pad_width = vocab_size_padded - config['vocab_size']
v = torch.nn.functional.pad(v, (0, 0, 0, pad_width), 'constant',
0)
weights['lm_head.weight'] = split(v, mapping.tp_size,
mapping.tp_rank)
if use_parallel_embedding:
v = split_matrix_tp(v,
mapping.tp_size,
mapping.tp_rank,
                            dim=sharding_dim)
if mapping.is_first_pp_rank():
weights['transformer.vocab_embedding.weight'] = v
lm_head_weights = get_weight(model_params, 'lm_head', dtype)
if mapping.is_last_pp_rank():
if config['vocab_size'] % mapping.tp_size != 0:
# padding
vocab_size_padded = pad_vocab_size(config['vocab_size'],
mapping.tp_size)
pad_width = vocab_size_padded - config['vocab_size']
lm_head_weights = torch.nn.functional.pad(lm_head_weights,
(0, 0, 0, pad_width),
'constant',
value=0)
weights['lm_head.weight'] = split_matrix_tp(lm_head_weights,
mapping.tp_size,
mapping.tp_rank,
dim=0)
ln_f_w = get_weight(model_params, 'model.norm', dtype)
weights['transformer.ln_f.weight'] = ln_f_w
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Weights loaded. Total time: {t}')
#print(set(weights.keys()))
return weights

View File

@ -0,0 +1,257 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional
import torch
from ..._utils import pad_vocab_size, torch_dtype_to_str
from ...functional import Tensor, non_gated_version, recv, send
from ...layers import (Attention, AttentionMaskType, ColumnLinear, Embedding,
GatedMLP, MoeConfig, PositionEmbeddingType, RmsNorm,
SharedMoE)
from ...logger import logger
from ...mapping import Mapping
from ...module import Module
from ...plugin import init_all_reduce_helper
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
from .convert import convert_deepseek, create_trt_config_from_hf
class DeepseekDecoderLayer(Module):
def __init__(self, config: PretrainedConfig, layer_idx: int):
super().__init__()
self.layer_idx = layer_idx
self.config = config
### Input layernorm in Deepseek v1 is the same as in Llama
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
### Deepseek v1 model with standard attention
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
attention_head_size=config.head_size,
num_attention_heads=config.num_attention_heads,
num_kv_heads=config.num_key_value_heads,
max_position_embeddings=config.max_position_embeddings,
dtype=config.dtype,
attention_mask_type=AttentionMaskType.causal,
bias=False,
position_embedding_type=PositionEmbeddingType.rope_gpt_neox,
rotary_embedding_base=config.rotary_base,
rotary_embedding_scaling=config.rotary_scaling,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
tp_rank=config.mapping.tp_rank)
ClsMLP = GatedMLP
moe_config = MoeConfig(num_experts=config.moe_num_experts,
moe_intermediate_size=config.moe_inter_size,
num_shared_experts=config.moe_num_shared_experts,
top_k=config.moe_top_k,
normalization_mode=config.moe_renorm_mode)
mlp_kwargs = {}
if config.moe_num_experts > 0 and layer_idx > 0:
mlp_hidden_size = moe_config.num_shared_experts * moe_config.moe_intermediate_size
hidden_act = config.hidden_act
ClsMLP = SharedMoE
mlp_kwargs = {"moe_config": moe_config, "mapping": config.mapping}
else:
ClsMLP = GatedMLP
mlp_hidden_size = config.intermediate_size
hidden_act = non_gated_version(
config.hidden_act) # back to non gated for dense layers
self.mlp = ClsMLP(hidden_size=config.hidden_size,
ffn_hidden_size=mlp_hidden_size,
hidden_act=hidden_act,
dtype=config.dtype,
bias=False,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
**mlp_kwargs)
### Post layernorm in Deepseek v1 is the same as in Llama
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
def forward(self,
hidden_states,
attention_mask=None,
use_cache=False,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None):
residual = hidden_states
hidden_states = self.input_layernorm(hidden_states)
attention_output = self.attention(
hidden_states,
attention_mask=attention_mask,
use_cache=use_cache,
spec_decoding_params=spec_decoding_params,
kv_cache_params=kv_cache_params,
attention_params=attention_params)
if use_cache:
attention_output, presents = attention_output
hidden_states = residual + attention_output
residual = hidden_states
hidden_states = self.post_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states)
hidden_states = residual + hidden_states
if use_cache:
return (hidden_states, presents)
return hidden_states
class DeepseekModel(Module):
def __init__(self, config: PretrainedConfig) -> None:
super().__init__()
init_all_reduce_helper() # enable use_custom_all_reduce
self.mapping = config.mapping
if self.mapping.is_first_pp_rank():
self.vocab_embedding = Embedding(config.vocab_size,
config.hidden_size,
dtype=config.dtype)
self.layers = DecoderLayerList(DeepseekDecoderLayer, config)
if self.mapping.is_last_pp_rank():
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
def forward(self,
input_ids,
position_ids=None,
use_cache=False,
attention_mask=None,
spec_decoding_params=None,
kv_cache_params=None,
attention_params=None,
hidden_states=None,
prompt_embedding_table: Optional[Tensor] = None,
prompt_tasks: Optional[Tensor] = None,
prompt_vocab_size: Optional[Tensor] = None):
ptuning_args = [
prompt_embedding_table, prompt_tasks, prompt_vocab_size
] if prompt_embedding_table is not None else []
if self.mapping.is_first_pp_rank():
hidden_states = self.vocab_embedding(input_ids, *ptuning_args)
else:
hidden_states = recv(hidden_states, self.mapping.prev_pp_rank())
hidden_states = self.layers.forward(
hidden_states,
use_cache=use_cache,
attention_mask=attention_mask,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
spec_decoding_params=spec_decoding_params)
if use_cache:
hidden_states, presents = hidden_states
if self.mapping.is_last_pp_rank():
hidden_states = self.ln_f(hidden_states)
else:
hidden_states = send(hidden_states, self.mapping.next_pp_rank())
if use_cache:
return (hidden_states, tuple(presents))
return hidden_states
class DeepseekForCausalLM(DecoderModelForCausalLM):
def __init__(self, config: PretrainedConfig):
transformer = DeepseekModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
if config.mapping.is_last_pp_rank():
lm_head = ColumnLinear(config.hidden_size,
vocab_size_padded,
bias=False,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)
else:
lm_head = None
self.mapping = config.mapping
super().__init__(config, transformer, lm_head)
@classmethod
def from_hugging_face(cls,
hf_model,
model_dir,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
override_fields={},
**kwargs):
assert hf_model is not None
if mapping is None:
mapping = Mapping()
config = create_trt_config_from_hf(model_dir,
dtype,
mapping=mapping,
override_fields=override_fields)
print(config)
pretrained_config = PretrainedConfig.from_dict(config)
pretrained_config.set_rank(mapping.rank) # TODO:remove this hack
if dtype == 'auto':
dtype = getattr(config, 'torch_dtype', None)
if dtype is None:
dtype = 'float16'
if isinstance(dtype, torch.dtype):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32': # should remove "float32"
dtype = 'float16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
deepseek = cls.from_config(pretrained_config)
weights = convert_deepseek(
hf_model,
config,
mapping,
dtype=dtype,
use_parallel_embedding=config.get('use_parallel_embedding', False),
sharding_dim=config.get('embedding_sharding_dim', 0),
share_embedding_table=config.get('share_embedding_table', False))
#check_share_embedding(weights, config)
deepseek.load(weights)
return deepseek

View File

@ -352,8 +352,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
hf_model_dir = quant_ckpt_path
loader = ModelWeightsLoader(hf_model_dir, custom_dict)
if config.share_embedding_table:
config.share_embedding_table = loader.check_share_embedding()
loader.check_share_embedding(config)
model = cls(config)
loader.generate_tllm_weights(model)
else:

View File

@ -283,23 +283,49 @@ class ModelWeightsLoader:
return weight_dict
def check_share_embedding(self):
def check_share_embedding(self, config):
# TODO: Remove after --use_share_embedding is removed
if not config.share_embedding_table:
return
from ..logger import logger
lm_head_weights = self.load_tensor(
self.translate_to_external_key("lm_head.weight",
self.tllm_to_externel_key_dict))
vocab_embed_weights = self.load_tensor(
self.translate_to_external_key("transformer.vocab_embedding.weight",
self.tllm_to_externel_key_dict))
share_embedding_table = False
if lm_head_weights is not None and vocab_embed_weights is not None:
if lm_head_weights.shape == vocab_embed_weights.shape:
if not (lm_head_weights - vocab_embed_weights).any():
return True
from ..logger import logger
logger.warning(
"lm_head.weight and transformer.vocab_embedding.weight are not identical, "
"share_embedding_table cannot be enabled; setting share_embedding_table=False."
)
return False
share_embedding_table = True
elif lm_head_weights is None and vocab_embed_weights is not None:
self.tllm_to_externel_key_dict[
'lm_head'] = self.tllm_to_externel_key_dict[
'transformer'] + '.' + self.tllm_to_externel_key_dict[
'vocab_embedding']
share_embedding_table = True
elif lm_head_weights is not None and vocab_embed_weights is None:
self.tllm_to_externel_key_dict[
'vocab_embedding'] = self.tllm_to_externel_key_dict['lm_head']
share_embedding_table = True
# Validation
mapping = config.mapping
if mapping.tp_size > 1:
if (not config.use_parallel_embedding) or (
config.use_parallel_embedding
and config.embedding_sharding_dim == 1):
share_embedding_table = False
if mapping.pp_size > 1:
share_embedding_table = False
if mapping.cp_size > 1:
share_embedding_table = False
config.share_embedding_table = share_embedding_table
if config.share_embedding_table:
logger.info("share_embedding_table enabled.")
def update_key_mapping(self, model):
self.model = weakref.ref(model)()
@ -313,6 +339,13 @@ class ModelWeightsLoader:
pp_layers)
})
# Share embedding
if self.tllm_to_externel_key_dict[
'vocab_embedding'] == self.tllm_to_externel_key_dict['lm_head']:
self.model.transformer.vocab_embedding.tllm_to_externel_key_dict = {
self.tllm_to_externel_key_dict['transformer']: '',
}
def fill(self, weights):
for tllm_key, param in self.model.named_parameters():
if param.is_buffer:

View File

@ -1286,6 +1286,9 @@ def preprocess_weights(weights: Dict[str, torch.Tensor],
def check_share_embedding(weights: Dict[str, torch.Tensor],
model_config: PretrainedConfig):
if model_config.share_embedding_table:
if "lm_head.weight" in weights:
if weights["lm_head.weight"] is None:
weights.pop("lm_head.weight")
if "lm_head.weight" in weights and "transformer.vocab_embedding.weight" in weights:
if (weights["lm_head.weight"] -
weights["transformer.vocab_embedding.weight"]).any():

View File

@ -25,6 +25,7 @@ from ...functional import Tensor, allreduce, recv, send, sigmoid
from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear,
Embedding, GatedMLP, RmsNorm, RowLinear)
from ...layers.moe import MOEWeightWrapper
from ...logger import logger
from ...lora_manager import (LoraConfig,
get_default_trtllm_modules_to_hf_modules, use_lora)
from ...mapping import Mapping
@ -427,12 +428,18 @@ class QWenForCausalLM(DecoderModelForCausalLM):
else:
if not use_preloading:
hf_model = load_hf_qwen(hf_model_dir, load_model_on_cpu)
logger.debug(f"HuggingFace model: {hf_model}")
model = QWenForCausalLM(config)
logger.debug(f"TensorRT-LLM model: {model}")
if use_hf_gptq_checkpoint:
weights = load_weights_from_hf_gptq_model(hf_model, config)
else:
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = QWenForCausalLM(config)
model.load(weights)
return model

View File

@ -18,6 +18,18 @@ from ._common import default_net
from .logger import logger
def _addindent(s_, numSpaces):
s = s_.split('\n')
# don't do anything for single-line stuff
if len(s) == 1:
return s_
first = s.pop(0)
s = [(numSpaces * ' ') + line for line in s]
s = '\n'.join(s)
s = first + '\n' + s
return s
class Module(object):
def __init__(self) -> None:
@ -191,6 +203,23 @@ class Module(object):
for k, v in self.named_parameters():
v.value = tm[k].detach().cpu().numpy()
def _get_name(self):
return self.__class__.__name__
def __repr__(self):
# We treat the extra repr like the sub-module, one item per line
child_lines = []
for key, module in self._modules.items():
mod_str = repr(module)
mod_str = _addindent(mod_str, 2)
child_lines.append('(' + key + '): ' + mod_str)
main_str = self._get_name() + '('
if child_lines:
# simple one-liner info, which most builtin Modules will use
main_str += '\n ' + '\n '.join(child_lines) + '\n'
main_str += ')'
return main_str
class ModuleList(Module):
@ -221,3 +250,35 @@ class ModuleList(Module):
def __len__(self):
return len(self._modules)
def __repr__(self):
"""Return a custom repr for ModuleList that compresses repeated module representations."""
list_of_reprs = [repr(item) for item in self]
if len(list_of_reprs) == 0:
return self._get_name() + "()"
start_end_indices = [[0, 0]]
repeated_blocks = [list_of_reprs[0]]
for i, r in enumerate(list_of_reprs[1:], 1):
if r == repeated_blocks[-1]:
start_end_indices[-1][1] += 1
continue
start_end_indices.append([i, i])
repeated_blocks.append(r)
lines = []
main_str = self._get_name() + "("
for (start_id, end_id), b in zip(start_end_indices, repeated_blocks):
local_repr = f"({start_id}): {b}" # default repr
if start_id != end_id:
n = end_id - start_id + 1
local_repr = f"({start_id}-{end_id}): {n} x {b}"
local_repr = _addindent(local_repr, 2)
lines.append(local_repr)
main_str += "\n " + "\n ".join(lines) + "\n"
main_str += ")"
return main_str

View File

@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.14.0.dev2024091700"
__version__ = "0.14.0.dev2024092400"

View File

@ -314,7 +314,7 @@ def test_llm_request():
assert llm_request.max_num_generated_tokens == 2
llm_request.pause(0)
assert llm_request.state == _tb.LlmRequestState.REQUEST_STATE_CONTEXT_INIT
assert llm_request.state == _tb.LlmRequestState.CONTEXT_INIT
llm_request.max_sent_token_len = 1
assert llm_request.max_sent_token_len == 1

tests/conftest.py Normal file
View File

@ -0,0 +1,106 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# # Force resource release after test
import gc
import multiprocessing.connection
import os
import sys
import time
import pytest
memory_profiling_enabled = os.environ.get("LLM_MEMORY_PROFILING", False)
if memory_profiling_enabled:
@pytest.hookimpl(trylast=True)
def pytest_sessionstart(session):
import xdist
session.stash["reporter"] = multiprocessing.connection.Client(
"/tmp/profiling_scribe.unix", "AF_UNIX")
session.stash["worker_id"] = xdist.get_xdist_worker_id(session)
session.stash["reporter"].send({
"type": "identity",
"identifier": "unittest",
"pid": os.getpid(),
"worker_id": session.stash["worker_id"]
})
@pytest.hookimpl(trylast=True)
def pytest_collection_modifyitems(session, config, items):
for item in items:
item.stash["reporter"] = session.stash["reporter"]
item.stash["worker_id"] = session.stash["worker_id"]
@pytest.hookimpl(trylast=True)
def pytest_sessionfinish(session):
session.stash["reporter"].close()
@pytest.hookimpl(tryfirst=True, wrapper=True)
def pytest_runtest_protocol(item, nextitem):
if memory_profiling_enabled:
path, line, name = item.reportinfo()
item.stash["reporter"].send({
"type": "unit_case",
"timestamp": time.time(),
"case": {
"path": str(path),
"line": line,
"name": name
},
"worker_id": item.stash["worker_id"],
"pid": os.getpid()
})
result = yield
if not any(module == 'torch' or module.startswith('torch.')
for module in sys.modules):
return result
import torch
if memory_profiling_enabled:
item.stash["reporter"].send({
"type": "torch_report",
"timestamp": time.time(),
"case": {
"path": str(path),
"line": line,
"name": name
},
"context": "unit",
"worker_id": item.stash["worker_id"],
"pid": os.getpid(),
"report": {
"allocated": torch.cuda.memory_allocated(),
"max_allocated": torch.cuda.max_memory_allocated(),
"reserved": torch.cuda.memory_reserved(),
"max_reserved": torch.cuda.max_memory_reserved(),
}
})
torch.cuda.reset_peak_memory_stats()
worker_count = int(os.environ.get('PYTEST_XDIST_WORKER_COUNT', 1))
if (torch.cuda.memory_reserved(0) + torch.cuda.memory_allocated(0)
) >= (torch.cuda.get_device_properties(0).total_memory //
worker_count) * 0.9:
gc.collect()
torch.cuda.empty_cache()
return result

View File

@ -72,6 +72,7 @@ class TestModule(unittest.TestCase):
def test_module(self):
m = Module3()
print(m)
m.forward()
self.assertEqual(4, len(list(m.named_modules())))
@ -88,6 +89,7 @@ class TestModule(unittest.TestCase):
def test_module_list(self):
m = Module4()
print(m)
m.forward()
self.assertEqual(8, len(list(m.named_modules())))