Update TensorRT-LLM (#1835)

* Update TensorRT-LLM

---------

Co-authored-by: Morgan Funtowicz <funtowiczmo@gmail.com>
Kaiyu Xie 2024-06-25 21:10:30 +08:00 committed by GitHub
parent 2a115dae84
commit 9691e12bce
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
94 changed files with 1129 additions and 988 deletions

View File

@ -165,6 +165,9 @@ struct BenchmarkParams
// Weights offloading
float gpuWeightsPercent{1.0};
// Decoding params
std::optional<std::vector<std::vector<SizeType32>>> medusaChoices;
};
class InferenceRequestsSyncSend
@ -791,6 +794,10 @@ public:
executorConfig.setMaxBatchSize(benchmarkParams.maxBatchSize.value());
}
executorConfig.setDecodingConfig(texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices));
mExecutor = std::make_unique<texec::Executor>(trtEnginePath, texec::ModelType::kDECODER_ONLY, executorConfig);
if (logIterationData)
@ -1346,6 +1353,9 @@ void benchmarkGptManager(std::filesystem::path const& engineDir, TrtGptModelType
optionalParams.maxBeamWidth = beamWidth;
optionalParams.maxBatchSize = benchmarkParams.maxBatchSize;
optionalParams.schedulerConfig = texec::SchedulerConfig{capacitySchedulerPolicy};
optionalParams.decodingConfig = texec::DecodingConfig(
benchmarkParams.medusaChoices.has_value() ? texec::DecodingMode::Medusa() : texec::DecodingMode::Auto(),
std::nullopt, benchmarkParams.medusaChoices);
auto const jsonConfig = GptJsonConfig::parse(engineDir / "config.json");
SizeType32 deviceCount{0};
@ -1600,6 +1610,32 @@ void benchmarkExecutor(std::filesystem::path const& engineDir, TrtGptModelType m
}
}
std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
{
std::vector<std::vector<SizeType32>> result;
std::regex outer_regex(R"(\[(.*?)\])");
std::regex inner_regex(R"(\d+)");
auto outer_begin = std::sregex_iterator(input.begin(), input.end(), outer_regex);
auto outer_end = std::sregex_iterator();
for (std::sregex_iterator i = outer_begin; i != outer_end; ++i)
{
std::smatch match = *i;
std::string inner_str = match.str(1);
std::vector<int> inner_vec;
auto inner_begin = std::sregex_iterator(inner_str.begin(), inner_str.end(), inner_regex);
auto inner_end = std::sregex_iterator();
for (std::sregex_iterator j = inner_begin; j != inner_end; ++j)
{
std::smatch inner_match = *j;
inner_vec.push_back(std::stoi(inner_match.str()));
}
result.push_back(inner_vec);
}
return result;
}
} // namespace
int main(int argc, char* argv[])
@ -1692,6 +1728,8 @@ int main(int argc, char* argv[])
options.add_options()("gpu_weights_percent",
"Specify the percentage of weights that reside on GPU (from 0.0 to 1.0).",
cxxopts::value<float>()->default_value("1.0"));
options.add_options()(
"medusa_choices", "Medusa choices in the format of [[0], [0, 1], [0, 0, 1]]", cxxopts::value<std::string>());
auto result = options.parse(argc, argv);
@ -1823,6 +1861,12 @@ int main(int argc, char* argv[])
// Argument: If offloaded blocks should be onboarded to primary memory before they are reused.
benchmarkParams.kvOnboardBlocks = !result["kv_dont_onboard_blocks"].as<bool>();
// Argument: Medusa choices for the Medusa speculative decoding.
if (result.count("medusa_choices"))
{
benchmarkParams.medusaChoices = parseVectorOfVectors(result["medusa_choices"].as<std::string>());
}
std::optional<TokenIdType> padId;
// Argument: Padding token id
if (result.count("pad_id"))
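For reference, the new `--medusa_choices` string is converted into a vector of vectors by `parseVectorOfVectors` above. Below is a minimal standalone sketch of that parsing (the `SizeType32` alias and the `main` driver are illustrative assumptions, not part of the change): the outer regex captures each bracketed group and the inner regex pulls out the integers, so `[[0], [0, 1], [0, 0, 1]]` becomes `{{0}, {0, 1}, {0, 0, 1}}`.

```cpp
// Standalone sketch of the nested-list parsing used for --medusa_choices.
// Assumes SizeType32 is a 32-bit integer; not the benchmark's actual build setup.
#include <cstdint>
#include <iostream>
#include <regex>
#include <string>
#include <vector>

using SizeType32 = std::int32_t;

std::vector<std::vector<SizeType32>> parseVectorOfVectors(std::string const& input)
{
    std::vector<std::vector<SizeType32>> result;
    std::regex outerRegex(R"(\[(.*?)\])"); // non-greedy match of each inner "[...]" group
    std::regex innerRegex(R"(\d+)");       // each integer inside a group
    for (auto it = std::sregex_iterator(input.begin(), input.end(), outerRegex); it != std::sregex_iterator(); ++it)
    {
        std::string inner = (*it).str(1);
        std::vector<SizeType32> innerVec;
        for (auto jt = std::sregex_iterator(inner.begin(), inner.end(), innerRegex); jt != std::sregex_iterator(); ++jt)
        {
            innerVec.push_back(std::stoi((*jt).str()));
        }
        result.push_back(innerVec);
    }
    return result;
}

int main()
{
    // "[[0], [0, 1], [0, 0, 1]]" -> {{0}, {0, 1}, {0, 0, 1}}
    auto choices = parseVectorOfVectors("[[0], [0, 1], [0, 0, 1]]");
    for (auto const& path : choices)
    {
        for (auto v : path)
        {
            std::cout << v << ' ';
        }
        std::cout << '\n';
    }
    return 0;
}
```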

View File

@ -944,6 +944,7 @@ def build_gpt(args):
network = builder.create_network()
network.trt_network.name = engine_name
network.plugin_config.to_legacy_setting()
network.plugin_config.dtype = args.dtype
# Plugins
if args.mode in ['plugin', 'plugin-ifb']:

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
size 3931604
oid sha256:8b6ad33047e2684c7d22471f87febbb96ae26f4eac6529e2f3b7c1469ec2ec6d
size 3931504

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:1fec0fdc00c076761ec48eb5e2ea93473a329e844a8091e26c6e3e02fd14a8b1
size 3931604
oid sha256:560f736af15a4dfba849ab29efc3520d6ec8c87bf2aa16589299b232dc171cca
size 3989220

View File

@ -1,3 +1,3 @@
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.a
93adf3003d7c422586a9bf892367371d libtensorrt_llm_batch_manager_static.pre_cxx11.a
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
f8538ac35803837e5d457ea8c1a58053 libtensorrt_llm_batch_manager_static.a
dc6fc82dc4ba319899e1d6777bd8c3a4 libtensorrt_llm_batch_manager_static.pre_cxx11.a
265b039443334094026fbd8f396d52fe29c2d9d1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bd757c26886a3ffd6947615d9f2829434e94839b693007a64b47c6b5c26416e4
size 3812158
oid sha256:74948e00ff7341914b1831ccfdce9ae242dd149603b1ba7e24ee993f08b63542
size 3812960

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:87321383075adf2d87cfbdc8a12a3d3815ef058d5da9b6aaa8d7d3f3263af439
size 3773896
oid sha256:0421ceacd5d07bc172bb4d0979edaf466aa8950290b4d6d1a7d355dbcefc2c84
size 3772832

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:58cdc0a330f8bfb7b50e3202aeac47bde0835b1dc600b4bfdcd2b30801e66e03
size 22381766
oid sha256:46eb1d351e3e8da3945a3f451166f12536aae3e440d57337d8891492424aff78
size 22387798

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
size 1396404
oid sha256:19585b7709736197d9c1762d1bb8e3099e298d6dcc1c521d51c83637cc624c20
size 1397814

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:18a967eaa1e9a7164e0b104a84b13ea95404f7c7c278375feb2513d5f063bafe
size 1396404
oid sha256:f5d5475663640c068af2e9b5772b9b602656641dd17ca473ce7125ef7f2ec855
size 1423172

View File

@ -1,3 +1,3 @@
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.a
7d12b9c04cb6738bb5f7747a88b00c1c libtensorrt_llm_executor_static.pre_cxx11.a
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
e18e84fb356995b11c04b79e55c4c3f5 libtensorrt_llm_executor_static.a
f0555b76f21d43e676e5808bf197cc58 libtensorrt_llm_executor_static.pre_cxx11.a
265b039443334094026fbd8f396d52fe29c2d9d1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e503b4cfb1c842850287a359ffed23a1773a67a96475d365b66d757a283ac218
size 1448772
oid sha256:8496c9e4a20efd3d2072520cf843dac70cbb0fe23621cfba2a1e0ef3e5fa22ed
size 1450288

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f8c80cf7aca2b135a656a060456fb30a820e459b4b36560162b02fa65121ef50
size 1375430
oid sha256:1b76267834252836e26ddecc2e1b9449e33a67fb1981e5d42f721bc439be1c02
size 1377018

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:cc65971d6d74260cb49b354aa4b0b82f92863cc722fbf206bf8a4919a4897532
size 14031364
oid sha256:9bd0faf48175745d7aeff58f539ae021db365b73933dab9c51329de9e92f2d86
size 14039826

View File

@ -424,7 +424,8 @@ std::vector<cutlass_extensions::CutlassGemmConfig> MoeGemmRunner<T, WeightType>:
template <typename T, typename WeightType>
bool MoeGemmRunner<T, WeightType>::isHopperSpecialised() const
{
bool config_is_sm90 = best_config_ && best_config_->is_sm90;
TLLM_CHECK_WITH_INFO(best_config_, "Cannot determine if hopper is specialised without a selected config");
bool config_is_sm90 = best_config_->is_sm90;
return supportsHopperSpecialisation() && config_is_sm90;
}
@ -440,7 +441,7 @@ int MoeGemmRunner<T, WeightType>::getSM() const
return this->sm_;
}
// currently support sm80 bf16/fp16 gate ativation, only set predication tensor for m direction
// currently support sm80 bf16/fp16 gate activation, only set predication tensor for m direction
template <typename T, typename WeightType>
bool MoeGemmRunner<T, WeightType>::isFusedGatedActivation(bool is_gated_activation, int gemm_n, int gemm_k) const
{

View File

@ -1,2 +1,2 @@
5b6c74ce66f62d2a58aa9cac16f11ad6 libtensorrt_llm_nvrtc_wrapper.so
c0bd2b69c932257678a2aad9bd8baba4b291795e commit
265b039443334094026fbd8f396d52fe29c2d9d1 commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:84319476e8ecf9666f40f69355f19ec3b585fc0987f940be14af9e11e3f524c3
oid sha256:9f2f97eb5b4181917a47b6028a857d7a597ca93faa5846af42c4cb24797d7fa7
size 1080832

View File

@ -1072,10 +1072,38 @@ std::vector<size_t> CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWo
size_t const hopper_size = using_hopper ? HopperGroupedGemmInput::workspaceSize(num_experts_per_node) : 0;
size_t const gemm_workspace_size = moe_gemm_runner_.getMaxWorkspaceSize(num_experts_per_node);
std::vector<size_t> workspace{source_rows_size, permuted_rows_size, permuted_experts_size, permuted_data_size,
total_rows_before_expert_size, softmax_out_size, glu_inter_size,
// We do some overlapping of the large workspace buffers. Although we could overlap some of the other buffers, they
// are small enough (i.e no factor of hidden size) they will only be a couple MiB at most, so we don't bother
// in the case of fused activation we overlap permuted_data and fc2_result
// in the case of unfused activation we overlap permuted_data and fc1_result
// we need to calculate the max possible size, so use the max of all three
size_t overlapped_gemm1_gemm2_inputs = std::max(permuted_data_size, fc2_result_size);
// When glu_inter_elems is 0 we are always fused, otherwise we may need the un-fused case
if (glu_inter_elems > 0)
{
overlapped_gemm1_gemm2_inputs = std::max(overlapped_gemm1_gemm2_inputs, fc1_result_size);
}
// if we have glu_inter we overlap it with fc2_result, otherwise we use fc1_result by itself
size_t overlapped_gemm1_gemm2_outputs = fc1_result_size;
if (glu_inter_elems > 0)
{
overlapped_gemm1_gemm2_outputs
= std::max(std::max(glu_inter_size, fc2_result_size), overlapped_gemm1_gemm2_outputs);
}
std::vector<size_t> workspace{ //
source_rows_size, //
permuted_rows_size, //
permuted_experts_size, //
total_rows_before_expert_size, //
softmax_out_size, //
sorter_size, //
// These pointers reuse the same memory
std::max(fc1_result_size, sorter_size), fc2_result_size, hopper_size, gemm_workspace_size};
overlapped_gemm1_gemm2_inputs, //
overlapped_gemm1_gemm2_outputs, //
hopper_size, //
gemm_workspace_size};
return workspace;
}
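As a concrete illustration of the overlap rule described in the comments above, here is a small sketch with made-up byte sizes (the variable names mirror the ones above; the numbers and the standalone `main` are assumptions for illustration only):

```cpp
// Sketch of the workspace overlap rule: two shared slots sized as the max of the
// buffers that alias them. Sizes below are hypothetical.
#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    std::size_t const permuted_data_size = 8u << 20; // hypothetical
    std::size_t const fc1_result_size = 16u << 20;   // hypothetical
    std::size_t const fc2_result_size = 8u << 20;    // hypothetical
    std::size_t const glu_inter_size = 16u << 20;    // hypothetical
    std::size_t const glu_inter_elems = 1;           // > 0 means the un-fused case may be needed

    // Slot shared by the GEMM1/GEMM2 inputs: permuted_data aliases fc2_result
    // (fused activation) or fc1_result (un-fused), so size it with the max of all.
    std::size_t overlapped_inputs = std::max(permuted_data_size, fc2_result_size);
    if (glu_inter_elems > 0)
    {
        overlapped_inputs = std::max(overlapped_inputs, fc1_result_size);
    }

    // Slot shared by the outputs: glu_inter (if present) aliases fc2_result,
    // otherwise fc1_result occupies it alone.
    std::size_t overlapped_outputs = fc1_result_size;
    if (glu_inter_elems > 0)
    {
        overlapped_outputs = std::max({glu_inter_size, fc2_result_size, overlapped_outputs});
    }

    std::cout << "inputs slot: " << overlapped_inputs << " B, outputs slot: " << overlapped_outputs << " B\n";
    return 0;
}
```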
@ -1088,7 +1116,9 @@ size_t CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::getWorkspaceSize(i
TLLM_CHECK_WITH_INFO(num_experts % ep_size == 0, "Number of experts must be a multiple of ep size");
auto workspace = getWorkspaceBufferSizes(
num_rows, hidden_size, inter_size, num_experts, num_experts / ep_size, k, activation_type);
return tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
auto ws_size = tensorrt_llm::common::calculateTotalWorkspaceSize(workspace.data(), workspace.size());
TLLM_LOG_DEBUG("Mixture Of Experts Plugin requires workspace of %2f MiB", ws_size / 1024.f / 1024.f);
return ws_size;
}
template <class T, class WeightType, class OutputType, class Enable>
@ -1109,29 +1139,38 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::configureWsPtrs(char
source_rows_ = (int*) ws_sliced[0];
permuted_rows_ = (int*) ws_sliced[1];
permuted_experts_ = (int*) ws_sliced[2];
permuted_data_ = (T*) ws_sliced[3];
total_rows_before_expert_ = (int64_t*) ws_sliced[4];
total_rows_before_expert_ = (int64_t*) ws_sliced[3];
softmax_out_ = nullptr;
bool const is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0);
if (!is_pow_2 || num_experts > 256)
{
softmax_out_ = (float*) ws_sliced[5];
softmax_out_ = (float*) ws_sliced[4];
}
glu_inter_result_ = (T*) ws_sliced[6];
sorter_ws_ = (char*) ws_sliced[5];
// These pointers are aliased. Since the sort ws can be overwritten after it is finished
sorter_ws_ = (char*) ws_sliced[7];
fc1_result_ = (T*) ws_sliced[7];
// Always 6, but overlapped with either fc1_result_ or fc2_result_
permuted_data_ = (T*) ws_sliced[6];
fc2_result_ = (T*) ws_sliced[8];
bool const is_gated_activation = isGatedActivation(activation_type);
bool const use_fused_moe = moe_gemm_runner_.isFusedGatedActivation(is_gated_activation, inter_size, hidden_size);
bool const using_hopper = moe_gemm_runner_.isHopperSpecialised();
bool const hopper_has_glu = using_hopper && (mayHaveDifferentGEMMOutputType() || is_gated_activation);
bool const non_hopper_has_glu = !using_hopper && !use_fused_moe && is_gated_activation;
bool const has_glu_inter_result = hopper_has_glu || non_hopper_has_glu;
// Always 7, ignored if not needed
glu_inter_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : nullptr;
// fc1 and fc2 alias one of the above pointers, but it depends on if actfn is fused/unfused which is overlapped
fc1_result_ = has_glu_inter_result ? (T*) ws_sliced[6] : (T*) ws_sliced[7];
fc2_result_ = has_glu_inter_result ? (T*) ws_sliced[7] : (T*) ws_sliced[6];
hopper_grouped_gemm_input_ = {};
if (moe_gemm_runner_.isHopperSpecialised())
{
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[9], num_experts_per_node, ws_sliced[10], ws_sizes[10]);
hopper_grouped_gemm_input_.configureWorkspace(ws_sliced[8], num_experts_per_node, ws_sliced[9], ws_sizes[9]);
}
}
@ -1293,6 +1332,7 @@ void CutlassMoeFCRunner<T, WeightType, OutputType, Enable>::runMoe(void const* i
}
else
{
// Run the GEMM with activation function overridden with `Identity`, we do the activation separately
ActivationType activation_type = (use_fused_moe) ? fc1_activation_type : ActivationType::Identity;
T* gemm_result = (use_fused_moe) ? fc1_result_ : static_cast<T*>(glu_inter_result_);

View File

@ -431,7 +431,8 @@ void InitBindings(pybind11::module_& m)
&tle::DecodingConfig::setLookaheadDecoding)
.def_property("medusa_choices", &tle::DecodingConfig::getMedusaChoices, &tle::DecodingConfig::setMedusaChoices);
auto executorConfigGetState = [&](tle::ExecutorConfig const& self)
auto executorConfigGetState = [&peftCacheConfigGetstate, &kvCacheConfigGetstate, &schedulerConfigGetstate,
&parallelConfigGetstate](tle::ExecutorConfig const& self)
{
py::object peftCacheConfigState = py::none();
@ -453,7 +454,8 @@ void InitBindings(pybind11::module_& m)
peftCacheConfigState, self.getLogitsPostProcessorMap(), self.getLogitsPostProcessorBatched(),
self.getDecodingConfig(), self.getGpuWeightsPercent());
};
auto executorConfigSetState = [&](py::tuple state)
auto executorConfigSetState = [&kvCacheConfigSetstate, &peftCacheConfigSetstate, &schedulerConfigSetstate,
&parallelConfigSetstate](py::tuple state)
{
if (state.size() != 15)
{

View File

@ -96,8 +96,11 @@ void MedusaModule::initMedusaTensorsFromChoices(MedusaChoices const& choices, st
if (curDepth != depth)
{
TLLM_CHECK(depth + 1 == curDepth);
TLLM_CHECK_WITH_INFO(depth <= getMaxDraftPathLen(),
"Medusa choices require more Medusa heads than the engine was built with.");
// Save TopK
topKs[depth - 1] = maxTopK;
// Accumulate TopK for global indexing in tree
globalNodeInTreeIdx += maxTopK;

View File

@ -257,7 +257,7 @@ protected:
template <class T>
T* allocBuffer(size_t size)
{
managed_buffers.emplace_back(mBufferManager->managed(size * sizeof(T)));
managed_buffers.emplace_back(mBufferManager->gpu(size * sizeof(T)));
EXPECT_EQ(cudaGetLastError(), cudaSuccess) << "Error allocating buffer of size: " << size;
T* ptr = static_cast<T*>(managed_buffers.back()->data());
return ptr;
@ -268,15 +268,27 @@ protected:
this->managed_buffers.clear(); // Make sure all the previous buffers are freed
check_cuda_error(cudaDeviceSynchronize()); // Sync to make sure all previous operations are resolved
size_t weight_size = hidden_size * hidden_size * 4 * num_experts * sizeof(WeightType);
// Skip the test if the GPU does not have enough memory
size_t workspace_size = this->mMoERunner.getWorkspaceSize(
// Calculate the size contributions for all the large buffers to check if the GPU has enough space
bool const is_gated = tensorrt_llm::isGatedActivation(mActType);
size_t const num_gemms = 2 + is_gated;
// Expert weights
size_t const weight_size = hidden_size * (hidden_size * 4) * num_experts * sizeof(WeightStorage) * num_gemms;
// Workspace size
size_t const workspace_size = this->mMoERunner.getWorkspaceSize(
num_tokens, hidden_size, hidden_size * 4, num_experts, k, this->mActType, {});
// The input/output buffers
size_t const in_out_size = 2 * num_tokens * hidden_size * sizeof(DataType);
size_t total_size = workspace_size + weight_size * 2;
// This should be correct to within 100MiB (on tests with 30GiB total)
size_t const total_size = workspace_size + weight_size + in_out_size;
size_t const memory_pool_free_mem_size = mBufferManager->memoryPoolFree();
auto const [freeMem, totalMem] = tensorrt_llm::common::getDeviceMemoryInfo(false);
return freeMem >= total_size;
float const freeMemBuffer = 0.9f; // Add some buffer so we aren't completely pushing the limits
std::cout << "Free memory is: " << freeMem << ", memory pool size is: " << memory_pool_free_mem_size
<< ", required memory is: " << total_size << ", device total memory capacity: " << totalMem
<< std::endl;
return (freeMem + memory_pool_free_mem_size) * freeMemBuffer >= total_size;
}
void initBuffersPermute(std::vector<std::vector<DataType>> h_hidden_states,
@ -362,7 +374,10 @@ protected:
initFP8Scales(mMaxInput);
}
mTpExpertScratch = allocBuffer<DataType>(mTpExpertScratchSize);
if (parallelism_config.tp_size > 1 || parallelism_config.ep_size > 1)
{
mTpExpertScratch = allocBuffer<DataType>(mTpExpertScratchSize);
}
mActiveRows = mTotalTokens;
mFinished = nullptr;
@ -475,10 +490,18 @@ protected:
ASSERT_NE(mExpertFP8Scale1, nullptr);
ASSERT_NE(mExpertFP8Scale2, nullptr);
ASSERT_NE(mExpertFP8Scale3, nullptr);
// Dequant values for each expert are 1/(w_i*a_i) calculated above
std::fill_n(mExpertFP8Scale1, mNumExperts, 1.f / (scaleW1 * scaleAct1));
std::fill_n(mExpertFP8Scale3, mNumExperts, 1.f / (scaleW2 * scaleAct2));
*mExpertFP8Scale2 = scaleAct2;
std::vector<float> scales_1(mNumExperts, 1.f / (scaleW1 * scaleAct1));
std::vector<float> scales_2(1, scaleAct2);
std::vector<float> scales_3(mNumExperts, 1.f / (scaleW2 * scaleAct2));
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale1, scales_1.data(), scales_1.size() * sizeof(float),
cudaMemcpyHostToDevice, mStream->get()));
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale2, scales_2.data(), scales_2.size() * sizeof(float),
cudaMemcpyHostToDevice, mStream->get()));
check_cuda_error(cudaMemcpyAsync(mExpertFP8Scale3, scales_3.data(), scales_3.size() * sizeof(float),
cudaMemcpyHostToDevice, mStream->get()));
check_cuda_error(cudaStreamSynchronize(mStream->get()));
}
@ -561,6 +584,13 @@ protected:
void* ep_scale_2 = FP8 ? (void*) mExpertFP8Scale2 : (void*) mExpertIntScale2;
void* ep_scale_3 = FP8 ? mExpertFP8Scale3 : nullptr;
// Handle the case with no parallelism to not require the extra alloc
if (parallelism_config.tp_size == 1 && parallelism_config.ep_size == 1)
{
return std::tuple{
mExpertWeight1, mExpertWeight2, mExpertBias1, mExpertBias2, ep_scale_1, ep_scale_2, ep_scale_3};
}
// Slice weights for EP
size_t const gated_inter = mInterSize * mGatedMultiplier;
size_t const experts_per_node = mNumExperts / parallelism_config.ep_size;

View File

@ -28,7 +28,7 @@ def build_engine(weight_dir: _pl.Path, medusa_dir: _pl.Path,
covert_cmd = [_sys.executable, "examples/medusa/convert_checkpoint.py"] + (
['--model_dir', str(weight_dir)] if weight_dir else []) + [
'--medusa_model_dir', str(medusa_dir), \
'--output_dir', str(engine_dir), '--dtype=float16', '--fixed_num_medusa_heads=4'
'--output_dir', str(engine_dir), '--dtype=float16', '--num_medusa_heads=4'
] + list(args)
run_command(covert_cmd)

View File

@ -1,6 +1,6 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG BASE_TAG=24.04-py3
ARG BASE_TAG=24.05-py3
ARG DEVEL_IMAGE=devel
FROM ${BASE_IMAGE}:${BASE_TAG} as base

View File

@ -4,8 +4,8 @@ set -ex
# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04
TORCH_VERSION="2.3.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05
TORCH_VERSION="2.3.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')
prepare_environment() {

View File

@ -2,12 +2,12 @@
set -ex
TRT_VER="10.0.1.6"
TRT_VER="10.1.0.27"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05
CUDA_VER="12.4" # 12.4.1
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.3.0 can compile with cuDNN v9.
# PyTorch 2.3.x can compile with cuDNN v9.
CUDNN_VER="9.1.0.70-1"
NCCL_VER="2.21.5-1+cuda12.4"
CUBLAS_VER="12.4.5.8-1"
@ -86,8 +86,7 @@ install_tensorrt() {
if [ "$ARCH" = "amd64" ];then ARCH="x86_64";fi
if [ "$ARCH" = "x86_64" ];then DIR_NAME="x64-agnostic"; else DIR_NAME=${ARCH};fi
if [ "$ARCH" = "aarch64" ];then OS1="Ubuntu22_04" && OS2="Ubuntu-22.04" && OS="ubuntu-22.04"; else OS1="Linux" && OS2="Linux" && OS="linux";fi
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.0.1/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
RELEASE_URL_TRT=https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/10.1.0/tars/TensorRT-${TRT_VER}.${OS2}.${ARCH}-gnu.cuda-${TRT_CUDA_VERSION}.tar.gz
fi
wget --no-verbose ${RELEASE_URL_TRT} -O /tmp/TensorRT.tar
tar -xf /tmp/TensorRT.tar -C /usr/local/

View File

@ -44,9 +44,9 @@ The following table shows the supported software for TensorRT-LLM.
* -
- Software Compatibility
* - Container
- [24.04](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
- [24.05](https://docs.nvidia.com/deeplearning/frameworks/support-matrix/index.html)
* - TensorRT
- [10.0](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
- [10.1](https://docs.nvidia.com/deeplearning/tensorrt/release-notes/index.html)
* - Precision
-
- Hopper (SM90) - FP32, FP16, BF16, FP8, INT8, INT4

View File

@ -23,10 +23,11 @@ All published functionality in the Release Notes has been fully tested and verif
- TBD
### Infrastructure changes
- Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.04-py3`.
- Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.04-py3`.
- Base Docker image for TensorRT-LLM is updated to `nvcr.io/nvidia/pytorch:24.05-py3`.
- Base Docker image for TensorRT-LLM backend is updated to `nvcr.io/nvidia/tritonserver:24.05-py3`.
- The dependent TensorRT version is updated to 10.1.0.
- The dependent CUDA version is updated to 12.4.1.
- The dependent PyTorch version is updated to 2.3.0.
- The dependent PyTorch version is updated to 2.3.1.
## TensorRT-LLM Release 0.10.0

View File

@ -15,10 +15,10 @@ If the first assumption holds true, the latency of speculative decoding will no
The combination of both these allows speculative decoding to result in reduced latency.
TensorRT-LLM supports several approaches for generating draft tokens, including:
1. Utilizing a smaller, auxiliary model, known as the draft model approach. For more information, refer to the [Fast Inference from Transformers via Speculative Decoding paper](https://arxiv.org/pdf/2211.17192.pdf).
2. Implementing additional language model heads that predict tokens for future positions, as detailed in the [Medusa: Simple LLM Inference Acceleration Framework with Multiple Decoding Heads paper](https://arxiv.org/abs/2401.10774).
## Performance Improvements
It's important to note that the effectiveness of speculative decoding techniques is highly dependent
@ -56,6 +56,166 @@ it is advisable to enable KV cache reuse for both models.
This can be achieved by adding the `--use_paged_context_fmha=enable` flag to the `trtllm-build` command
and setting `enableBlockReuse=true` in the `KVCacheConfig`.
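A minimal sketch of the executor-side setting, assuming the C++ executor API used elsewhere in this change (`texec::ExecutorConfig`, `texec::KvCacheConfig`); the constructor argument shown is an assumption, so check `executor.h` for the exact signature:

```cpp
// Sketch: enable KV cache block reuse when creating the executor, to pair with
// engines built with --use_paged_context_fmha=enable.
#include "tensorrt_llm/executor/executor.h"

namespace texec = tensorrt_llm::executor;

texec::ExecutorConfig makeConfigWithBlockReuse()
{
    // Assumption: enableBlockReuse is the first KvCacheConfig constructor argument.
    texec::KvCacheConfig kvCacheConfig(/*enableBlockReuse=*/true);
    texec::ExecutorConfig executorConfig;
    executorConfig.setKvCacheConfig(kvCacheConfig);
    return executorConfig;
}
```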
## Using Draft model approach with Triton Inference Server
+ The draft model approach has been supported since TensorRT-LLM-0.7.0 (using two separate Triton servers to host the draft and target models respectively), and was significantly optimized in TensorRT-LLM-0.10.0 (using one Triton server with [Business Logic Scripting](https://github.com/triton-inference-server/python_backend?tab=readme-ov-file#business-logic-scripting), BLS).
+ The source file of the draft model approach with BLS can be found [here](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/all_models/inflight_batcher_llm/tensorrt_llm_bls/1/lib/decode.py).
+ This example is based on TensorRT-LLM-0.10.0 and TRTLLM-backend-0.10.0, using the Docker image `nvcr.io/nvidia/tritonserver:24.05-trtllm-py3`.
+ Llama-7B-hf and Llama-30B-hf are used as the draft and target models respectively in this example, assuming the paths to the model repositories are `DRAFT_MODEL_PATH` and `TARGET_MODEL_PATH`.
+ The maximum number of draft tokens is set to 10 in this example.
1. Prepare TensorRT engine for inference
+ Here are the commands to build the draft / target engines in FP16 or FP8. All combinations of data types (Draft-FP16/FP8 + Target-FP16/FP8) are supported.
+ `--remove_input_padding=enable --paged_kv_cache=enable` are necessary for inflight batching.
+ `--context_fmha=enable --use_paged_context_fmha=enable` are optional, but recommended for performance.
+ `--gather_generation_logits` is necessary if generation logits are used for selecting tokens in the target model.
+ `--tp_size` can be adjusted if tensor parallelism is used for the draft / target model.
+ `--max_batch_size` greater than 1 is acceptable in general usage, but we use 1 in this example.
```bash
export MAX_DRAFT_LENGTH=10
export COMMON_COMMAND="--max_batch_size=1 --max_input_len=2048 --max_output_len=1024 --gpt_attention_plugin=float16 --gemm_plugin=float16 --remove_input_padding=enable --paged_kv_cache=enable --context_fmha=enable --use_paged_context_fmha=enable --gather_generation_logits"
export DRAFT_COMMAND_FP16="$COMMON_COMMAND"
export TARGET_COMMAND_FP16="$DRAFT_COMMAND_FP16 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external"
export DRAFT_COMMAND_FP8="$COMMON_COMMAND --strongly_typed --use_fp8_context_fmha=enable"
export TARGET_COMMAND_FP8="$DRAFT_COMMAND_FP8 --max_draft_len=$MAX_DRAFT_LENGTH --speculative_decoding_mode draft_tokens_external"
# Build checkpoints and engines in tensorrt_llm/examples/llama/
# FP16 mode
export DRAFT_NAME=llama-7b-fp16-tp1
export TARGET_NAME=llama-30b-fp16-tp1
python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1
python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1
trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP16
trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP16
export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME
export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME
# FP8 mode
export DRAFT_NAME=llama-7b-fp8-tp1
export TARGET_NAME=llama-30b-fp8-tp1
python3 convert_checkpoint.py --model_dir=$DRAFT_MODEL_PATH --output_dir=ckpt/$DRAFT_NAME --tp_size=1
python3 convert_checkpoint.py --model_dir=$TARGET_MODEL_PATH --output_dir=ckpt/$TARGET_NAME --tp_size=1
trtllm-build --checkpoint_dir=ckpt/$DRAFT_NAME --output_dir=engine/draft/$DRAFT_NAME $DRAFT_COMMAND_FP8
trtllm-build --checkpoint_dir=ckpt/$TARGET_NAME --output_dir=engine/target/$TARGET_NAME $TARGET_COMMAND_FP8
export DRAFT_ENGINE_PATH=$(pwd)/engine/draft/$DRAFT_NAME
export TARGET_ENGINE_PATH=$(pwd)/engine/target/$TARGET_NAME
```
2. Edit Triton configuration
+ If both the draft and target models fit on one GPU (for example, llama-7B-FP8 + llama-30B-FP8, roughly 40GiB in total on one H100-80GiB GPU), `DRAFT_GPU_DEVICE_IDS` and `TARGET_GPU_DEVICE_IDS` can be the same, for example `0`. This tends to perform better than placing the models on two separate GPUs.
+ Otherwise, the draft and target models can be placed on different GPUs, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1"`.
+ Furthermore, if TP mode is used, the value of `GPU_DEVICE_IDS` can be a list, for example `DRAFT_GPU_DEVICE_IDS="0"` and `TARGET_GPU_DEVICE_IDS="1,2,3,4"`.
+ For more configuration options when launching models with the Triton server, please visit the [TensorRT-LLM Backend repo](https://github.com/triton-inference-server/tensorrtllm_backend/blob/main/README.md).
```bash
ACCUMULATE_TOKEN="false"
BACKEND="tensorrtllm"
BATCH_SCHEDULER_POLICY="guaranteed_no_evict"
BATCHING_STRATEGY="inflight_fused_batching"
BLS_INSTANCE_COUNT="1"
DECODING_MODE="top_k_top_p"
DECOUPLED_MODE="False"
DRAFT_GPU_DEVICE_IDS="0"
E2E_MODEL_NAME="ensemble"
ENABLE_KV_CACHE_REUSE="true"
ENGINE_PATH=$TARGET_ENGINE_PATH
EXCLUDE_INPUT_IN_OUTPUT="false"
KV_CACHE_FREE_GPU_MEM_FRACTION="0.8"
MAX_ATTENTION_WINDOW_SIZE=""
MAX_BEAM_WIDTH="1"
MAX_QUEUE_DELAY_MICROSECONDS="0"
MAX_TOKENS_IN_KV_CACHE=""
NORMALIZE_LOG_PROBS="true"
POSTPROCESSING_INSTANCE_COUNT="1"
PREPROCESSING_INSTANCE_COUNT="1"
TARGET_GPU_DEVICE_IDS="1"
TENSORRT_LLM_DRAFT_MODEL_NAME="tensorrt_llm_draft"
TENSORRT_LLM_MODEL_NAME="tensorrt_llm"
TOKENIZER_PATH=$DRAFT_MODEL_PATH
TOKENIZER_TYPE=llama
TRITON_GRPC_PORT="8001"
TRITON_HTTP_PORT="8000"
TRITON_MAX_BATCH_SIZE="4"
TRITON_METRICS_PORT="8002"
TRITON_REPO="triton_repo"
USE_DRAFT_LOGITS="false"
# Make a copy of triton repo and replace the fields in the configuration files
cd /tensorrtllm_backend/
apt-get update && apt-get install -y build-essential cmake git-lfs
pip3 install git-lfs tritonclient grpcio
rm -rf ${TRITON_REPO}
cp -R all_models/inflight_batcher_llm ${TRITON_REPO}
python3 tools/fill_template.py -i ${TRITON_REPO}/ensemble/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE}
python3 tools/fill_template.py -i ${TRITON_REPO}/preprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},preprocessing_instance_count:${PREPROCESSING_INSTANCE_COUNT}
python3 tools/fill_template.py -i ${TRITON_REPO}/postprocessing/config.pbtxt tokenizer_dir:${TOKENIZER_PATH},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},postprocessing_instance_count:${POSTPROCESSING_INSTANCE_COUNT}
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_bls/config.pbtxt triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},decoupled_mode:${DECOUPLED_MODE},accumulate_tokens:${ACCUMULATE_TOKEN},bls_instance_count:${BLS_INSTANCE_COUNT},tensorrt_llm_model_name:${TENSORRT_LLM_MODEL_NAME},tensorrt_llm_draft_model_name:${TENSORRT_LLM_DRAFT_MODEL_NAME}
# Make a copy of tensorrt_llm as configurations of draft / target models.
cp -R ${TRITON_REPO}/tensorrt_llm ${TRITON_REPO}/tensorrt_llm_draft
sed -i 's/name: "tensorrt_llm"/name: "tensorrt_llm_draft"/g' ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm/config.pbtxt triton_backend:${BACKEND},engine_dir:${ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${TARGET_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE}
python3 tools/fill_template.py -i ${TRITON_REPO}/tensorrt_llm_draft/config.pbtxt triton_backend:${BACKEND},engine_dir:${DRAFT_ENGINE_PATH},decoupled_mode:${DECOUPLED_MODE},max_tokens_in_paged_kv_cache:${MAX_TOKENS_IN_KV_CACHE},max_attention_window_size:${MAX_ATTENTION_WINDOW_SIZE},batch_scheduler_policy:${BATCH_SCHEDULER_POLICY},batching_strategy:${BATCHING_STRATEGY},kv_cache_free_gpu_mem_fraction:${KV_CACHE_FREE_GPU_MEM_FRACTION},exclude_input_in_output:${EXCLUDE_INPUT_IN_OUTPUT},triton_max_batch_size:${TRITON_MAX_BATCH_SIZE},max_queue_delay_microseconds:${MAX_QUEUE_DELAY_MICROSECONDS},max_beam_width:${MAX_BEAM_WIDTH},enable_kv_cache_reuse:${ENABLE_KV_CACHE_REUSE},normalize_log_probs:${NORMALIZE_LOG_PROBS},enable_chunked_context:${ENABLE_CHUNKED_CONTEXT},gpu_device_ids:${DRAFT_GPU_DEVICE_IDS},decoding_mode:${DECODING_MODE}
```
3. Launch Triton server
+ `--multi-model` is necessary if TP mode is used for the target model.
```bash
python3 scripts/launch_triton_server.py \
--model_repo=${TRITON_REPO} \
--tensorrt_llm_model_name "tensorrt_llm,tensorrt_llm_draft" \
--multi-model \
--log &
```
+ The verbose log is written to the file `triton_log.txt`. The Triton server has launched successfully if you see the output below in that file:
```txt
Started HTTPService at 0.0.0.0:8000
Started GRPCInferenceService at 0.0.0.0:8001
Started Metrics Service at 0.0.0.0:8002
```
4. Send Requests
+ Prepare a JSON file `input_data.json` containing the input data as below (more requests are acceptable).
```json
[
{
"input": "James Best, best known for his ",
"instruction": "Continue writing the following story:",
"output": " "
}
]
```
+ Use the command below to send requests for inference.
+ `--num-draft-tokens` can be set to the desired runtime draft length; 4 is used in this example.
```bash
python3 tools/inflight_batcher_llm/speculative_decoding_test.py \
--max-input-len 2048 \
--dataset=input_data.json \
--url-target=localhost:8001 \
--url-draft=localhost:8001 \
--draft-tensorrt-llm-model-name="${TENSORRT_LLM_DRAFT_MODEL_NAME}" \
--target-tensorrt-llm-model-name="${TENSORRT_LLM_MODEL_NAME}" \
--bls-speculative-tensorrt-llm-model-name="tensorrt_llm_bls" \
--execute-bls-speculative-decoding \
--disable-output-comparison \
--num-draft-tokens=4 \
--verbose
```
5. Kill the Triton server after inference is finished
```bash
pkill -9 -f trtllmExecutorWorker
pkill -9 -f tritonserver
```
# Medusa
This approach leverages a single model to both generate and verify draft tokens.

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -119,7 +119,7 @@ def parse_arguments():
'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
)
parser.add_argument(
'--modelopt_quant_ckpt_path',
'--quant_ckpt_path',
type=str,
default=None,
help='Path of a quantized model checkpoint in .npz format')
@ -438,7 +438,7 @@ def main():
if args.use_weight_only and args.weight_only_precision == 'int4_gptq':
weights = load_weights_from_gptq(
args.modelopt_quant_ckpt_path,
args.quant_ckpt_path,
PretrainedConfig.from_dict(copy.deepcopy(config)),
)

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -71,7 +71,7 @@ def parse_arguments():
"By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV",
)
parser.add_argument(
"--modelopt_quant_ckpt_path",
"--quant_ckpt_path",
default=None,
help=
"Path of a directory to quantized model checkpoints in .safetensors format or \
@ -944,7 +944,7 @@ def convert(worker_rank, args, convert_kwargs):
weight_scales = quantize_fp8_weights(
weights, trt_llm_config.num_hidden_layers,
trt_llm_config.mapping)
scales = load_from_fp8_gemma(args.modelopt_quant_ckpt_path,
scales = load_from_fp8_gemma(args.quant_ckpt_path,
trt_llm_config.num_hidden_layers,
trt_llm_config.mapping,
args.fp8_kv_cache, weight_scales)

View File

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -167,7 +167,7 @@ sh gptq_convert.sh
### 3. Convert weights from HF Transformers to TensorRT-LLM format
To apply groupwise GPTQ quantization, additional command-line flags need to be passed to `convert_checkpoint.py`:
Here `--modelopt_quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script.
Here `--quant_ckpt_path` flag specifies the output safetensors of `gptq_convert.sh` script.
```bash
# Single GPU
@ -175,7 +175,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \
--dtype float16 \
--use_weight_only \
--weight_only_precision int4_gptq \
--modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--output_dir ./gptneox/20B/trt_ckpt/int4_gptq/1-gpu/
# With 2-way Tensor Parallel
python3 convert_checkpoint.py --model_dir ./gptneox_model \
@ -184,7 +184,7 @@ python3 convert_checkpoint.py --model_dir ./gptneox_model \
--weight_only_precision int4_gptq \
--tp_size 2 \
--workers 2 \
--modelopt_quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--quant_ckpt_path ./gptneox_model/gptneox-20b-4bit-gs128.safetensors \
--output_dir ./gptneox/20B/trt_ckpt/int4_gptq/2-gpu/
```

View File

@ -50,7 +50,7 @@ def parse_arguments():
'Define the precision for the weights when using weight-only quantization.'
'You must also use --use_weight_only for that argument to have an impact.'
)
parser.add_argument('--modelopt_quant_ckpt_path',
parser.add_argument('--quant_ckpt_path',
type=str,
default=None,
help='Path of a quantized model checkpoint')
@ -708,8 +708,7 @@ if __name__ == '__main__':
'has_zero_point':
True,
'group_size':
get_gptq_gptneox_group_size(args.modelopt_quant_ckpt_path,
hf_config)
get_gptq_gptneox_group_size(args.quant_ckpt_path, hf_config)
})
with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
@ -723,7 +722,7 @@ if __name__ == '__main__':
if args.use_weight_only and args.weight_only_precision == 'int4_gptq':
weights = load_from_gptq_gptneox(
args.modelopt_quant_ckpt_path,
args.quant_ckpt_path,
hf_config,
use_parallel_embedding=args.use_parallel_embedding,
sharding_dim=args.embedding_sharding_dim,

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,6 +1,6 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,2 +1,2 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -644,7 +644,7 @@ One can enable AWQ/GPTQ INT4 weight only quantization with these options when bu
- `--use_weight_only` enables weight only GEMMs in the network.
- `--per_group` enables groupwise weight only quantization; for the GPT-J example, we support AWQ with a default group size of 128.
- `--weight_only_precision` should specify the weight only quantization format. Supported formats are `int4_awq` or `int4_gptq`.
- `--modelopt_quant_ckpt_path` passes the quantized checkpoint to build the engine.
- `--quant_ckpt_path` passes the quantized checkpoint to build the engine.
The AWQ/GPTQ examples below involve 2 steps:
1. Weight quantization
@ -700,7 +700,7 @@ To run the GPTQ LLaMa example, the following steps are required:
python convert_checkpoint.py --model_dir /tmp/llama-7b-hf \
--output_dir ./tllm_checkpoint_2gpu_gptq \
--dtype float16 \
--modelopt_quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \
--quant_ckpt_path ./llama-7b-4bit-gs128.safetensors \
--use_weight_only \
--weight_only_precision int4_gptq \
--per_group \

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -29,9 +29,6 @@ Please install required packages first and setup `git-lfs`:
```bash
pip install -r requirements.txt
pip install "transformers>=4.39.0"
# Setup git-lfs
git lfs install
```
@ -98,8 +95,7 @@ The `trtllm-build` command builds TensorRT-LLM engines from TensorRT-LLM checkpo
# mamba-2.8b
trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin bfloat16 \
--mamba_conv1d_plugin bfloat16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -108,8 +104,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-2.8b/trt_ckpt/bf16/1-gpu/ \
# mamba-1.4b
trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -118,8 +113,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-1.4b/trt_ckpt/fp16/1-gpu/ \
# mamba-790m
trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -128,8 +122,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-790m/trt_ckpt/fp16/1-gpu/ \
# mamba-370m
trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \
@ -138,8 +131,7 @@ trtllm-build --checkpoint_dir ./mamba_model/mamba-370m/trt_ckpt/fp16/1-gpu/ \
# mamba-130m
trtllm-build --checkpoint_dir ./mamba_model/mamba-130m/trt_ckpt/fp16/1-gpu/ \
--paged_kv_cache disable \
--gemm_plugin float16 \
--mamba_conv1d_plugin float16 \
--gemm_plugin auto \
--max_batch_size 8 \
--max_input_len 924 \
--max_seq_len 1024 \

View File

@ -1,5 +1,6 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers>=4.39.0
datasets~=2.14.5
evaluate
rouge_score

View File

@ -13,6 +13,7 @@ For more info about Medusa visit [speculative decoding documentation](../../docs
* GPU Compute Capability >= 8.0 (Ampere or newer)
* FP16
* BF16
* FP8 (base model)
* PAGED_KV_CACHE
* Tensor Parallel
@ -32,7 +33,7 @@ https://huggingface.co/FasterDecoding/medusa-vicuna-7b-v1.3
```
We use `convert_checkpoint.py` script to convert the model for Medusa decoding into TensorRT-LLM checkpoint format.
Here we also add `--fixed_num_medusa_heads 4` as `medusa_num_heads` is 2 in `config.json` of `medusa-vicuna-7b-v1.3` but it actually has 4.
You can use `--num_medusa_heads` to set the number of Medusa heads to use. If it is not specified, `num_medusa_heads` will be set according to `medusa_num_heads` from the Medusa weights' `config.json`.
Here is the example:
```bash
@ -41,20 +42,21 @@ python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \
--medusa_model_dir medusa-vicuna-7b-v1.3 \
--output_dir ./tllm_checkpoint_1gpu_medusa \
--dtype float16 \
--fixed_num_medusa_heads 4
--num_medusa_heads 4
# Note: Increasing the batch size may have a negative impact on performance
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
--gemm_plugin float16 \
--speculative_decoding_mode medusa \
--max_batch_size 8
--max_batch_size 4
# Convert and Build Medusa decoding support for vicuna-13b-v1.3 with 4-way tensor parallelism.
python convert_checkpoint.py --model_dir ./vicuna-7b-v1.3 \
--medusa_model_dir medusa-vicuna-7b-v1.3 \
--output_dir ./tllm_checkpoint_1gpu_medusa \
--dtype float16 \
--fixed_num_medusa_heads 4 \
--num_medusa_heads 4 \
--tp_size 4 \
--workers 4
@ -62,7 +64,30 @@ trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_medusa \
--output_dir ./tmp/medusa/7B/trt_engines/fp16/1-gpu/ \
--gemm_plugin float16 \
--speculative_decoding_mode medusa \
--max_batch_size 8
--max_batch_size 4
```
### FP8 Post-Training Quantization for Base Model
The example below quantizes the base model to FP8, while keeping the weights of the Medusa heads unquantized.
```bash
# Quantize base model into FP8 and export trtllm checkpoint
python ../quantization/quantize.py --model_dir /path/to/base-model-hf/ \
--dtype float16 \
--qformat fp8 \
--kv_cache_dtype fp8 \
--output_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \
--calib_size 512 \
--tp_size 1 \
--medusa_model_dir /path/to/medusa_head/ \
--num_medusa_heads 4
# Build trtllm engines from the trtllm checkpoint
trtllm-build --checkpoint_dir ./tllm_checkpoint_1gpu_base_model_fp8_medusa_fp16 \
--output_dir ./trt_engine_1gpu_base_model_fp8_medusa_fp16 \
--gemm_plugin float16 \
--gpt_attention_plugin float16 \
--speculative_decoding_mode medusa \
--max_batch_size 4
```
### Run

View File

@ -19,12 +19,13 @@ from transformers.models.llama.modeling_llama import LlamaDecoderLayer
from transformers.pytorch_utils import Conv1D
import tensorrt_llm
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.logger import logger
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import PretrainedConfig
from tensorrt_llm.models.convert_utils import load_calib_dataset
from tensorrt_llm.models.llama.convert import load_weights_from_hf_by_shard
from tensorrt_llm.models.medusa.weight import (get_tllm_linear_weight,
load_medusa_hf)
from tensorrt_llm.quantization import QuantAlgo
try:
@ -108,11 +109,6 @@ def parse_arguments():
help=
'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV'
)
parser.add_argument(
'--modelopt_quant_ckpt_path',
type=str,
default=None,
help='Path of a quantized model checkpoint in .npz format')
parser.add_argument(
'--per_group',
@ -182,13 +178,6 @@ def parse_arguments():
help='The number of workers for converting checkpoint in parallel')
parser.add_argument('--num_medusa_heads', type=int, default=4)
parser.add_argument(
'--fixed_num_medusa_heads',
type=int,
default=None,
help="If exist, fix medusa_num_heads from config.json."
"num_medusa_heads < medusa_num_heads in config.json < fixed_num_medusa_heads"
)
parser.add_argument('--num_medusa_layers', type=int, default=1)
parser.add_argument('--max_medusa_token_len', type=int, default=63)
parser.add_argument('--medusa_hidden_act', type=str, default="silu")
@ -570,29 +559,6 @@ def get_weight_and_bias(config, prefix, dtype):
return get_weight(config, prefix, dtype), get_bias(config, prefix, dtype)
def get_tllm_linear_weight(weight,
prefix,
bias=None,
use_weight_only=False,
plugin_weight_only_quant_type=torch.int8,
postfix='weight'):
results = {}
if use_weight_only:
v = weight.t().contiguous().cpu()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[prefix + postfix] = processed_torch_weights
results[prefix + 'per_channel_scale'] = torch_weight_scales
else:
results[prefix + postfix] = weight.contiguous()
if bias is not None:
results[prefix + 'bias'] = bias
return results
def dup_kv_weight(v, num_head, tp_size):
assert tp_size % num_head == 0
reps = tp_size // num_head
@ -1189,77 +1155,28 @@ if __name__ == '__main__':
qkv_para=convert_args['llama_qkv_para'],
smoother=convert_args['llama_smoother'])
def load_medusa_hf(medusa_path: str,
mapping=Mapping(),
dtype='float32'):
logger.info("Loading Medusa heads' weights ...")
is_ckpt_safetensors = False
ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
if not ckpt_file.exists():
ckpt_file = Path(
medusa_path) / "medusa_lm_head.safetensors"
is_ckpt_safetensors = True
if is_ckpt_safetensors:
logger.info("Safetensors Found ...")
from safetensors.torch import load_file
state_dict = load_file(ckpt_file)
else:
state_dict = torch.load(ckpt_file, map_location="cpu")
torch_dtype = str_dtype_to_torch(dtype)
weights = {}
for h in range(args.num_medusa_heads):
for l in range(args.num_medusa_layers):
w = state_dict[f"{h}.{l}.linear.weight"].clone().to(
torch_dtype)
split_v = split(w, mapping.tp_size, mapping.tp_rank)
weights.update(
get_tllm_linear_weight(
split_v,
f'medusa_heads.{h}.medusa_layers.{l}.linear.',
None, args.use_weight_only,
plugin_weight_only_quant_type))
b = state_dict[f"{h}.{l}.linear.bias"].clone().to(
torch_dtype)
weights[
'medusa_heads.{}.medusa_layers.{}.linear.bias'.
format(h, l)] = split(b, mapping.tp_size,
mapping.tp_rank)
lm = state_dict[
f"{h}.{args.num_medusa_layers}.weight"].clone().to(
torch_dtype) # LM Head
weights['medusa_heads.{}.lm_head.weight'.format(
h)] = split(lm, mapping.tp_size, mapping.tp_rank)
return weights
if args.medusa_model_dir is not None:
config_file = Path(args.medusa_model_dir) / "config.json"
with open(config_file) as fp:
config = json.load(fp)
args.num_medusa_heads = config.get('medusa_num_heads',
args.num_medusa_heads)
num_medusa_heads_from_config = config.get(
'medusa_num_heads', args.num_medusa_heads)
args.num_medusa_layers = config.get('medusa_num_layers',
args.num_medusa_layers)
if args.fixed_num_medusa_heads is not None and args.fixed_num_medusa_heads != args.num_medusa_heads:
logger.info(
f"fixing num_medusa_heads from {args.num_medusa_heads} to {args.fixed_num_medusa_heads}"
)
args.num_medusa_heads = args.fixed_num_medusa_heads
if args.num_medusa_heads is None:
args.num_medusa_heads = num_medusa_heads_from_config
assert args.max_medusa_token_len > 0, "should have max_medusa_token_len > 0"
medusa_weights = load_medusa_hf(args.medusa_model_dir,
mapping,
dtype=args.dtype)
medusa_weights = load_medusa_hf(
medusa_path=args.medusa_model_dir,
num_medusa_heads=args.num_medusa_heads,
num_medusa_layers=args.num_medusa_layers,
mapping=mapping,
dtype=args.dtype,
use_weight_only=args.use_weight_only,
plugin_weight_only_quant_type=
plugin_weight_only_quant_type)
weights.update(medusa_weights)
safetensors.torch.save_file(

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers==4.38.2
accelerate==0.25.0

View File

@ -363,11 +363,13 @@ def main():
if args.test_trt_llm:
assert not args.test_hf, "Cannot test both TRT-LLM and HF"
runner_cls = ModelRunner if (args.debug_mode
or not PYTHON_BINDINGS) else ModelRunnerCpp
runner_cls = ModelRunner if not PYTHON_BINDINGS else ModelRunnerCpp
runner_kwargs = {}
if PYTHON_BINDINGS:
runner_kwargs.update(max_beam_width=1)
model = runner_cls.from_dir(args.engine_dir,
rank=runtime_rank,
debug_mode=args.debug_mode)
**runner_kwargs)
else:
assert args.test_hf, "Must test either TRT-LLM or HF"
if model_name == 'ChatGLMForCausalLM' and model_version == 'glm':

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
transformers==4.40.2
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -15,10 +15,9 @@ models using TensorRT-LLM and run on a single GPU.
## Overview
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./). There are two files:
The TensorRT-LLM Phi implementation can be found in [`tensorrt_llm/models/phi/model.py`](../../tensorrt_llm/models/phi/model.py) and [`tensorrt_llm/models/phi3/model.py`](../../tensorrt_llm/models/phi3/model.py). The TensorRT-LLM Phi example code is located in [`examples/phi`](./) with a single file:
* [`convert_checkpoint.py`](./convert_checkpoint.py) to convert a checkpoint from the [HuggingFace (HF) Transformers](https://github.com/huggingface/transformers) format to the TensorRT-LLM format
* [`postprocess_quant_checkpoint.py`](./postprocess_quant_checkpoint.py) to post-process FP8 or INT8 SmoothQuant quantized checkpoints for Phi-3-small variants.
In addition, there are two shared files in the parent folder [`examples`](../) for inference and evaluation:
@ -29,18 +28,19 @@ In addition, there are two shared files in the parent folder [`examples`](../) f
* FP16
* BF16
* FP8
* INT8
* Tensor Parallel
## Support Matrix
| Model Name | FP16 | BF16 | FP8 | TP |
| :--------------: | :---: | :---: | :---: | :---: |
| phi-2 | Y | Y | | Y |
| Phi-3-mini-4k-instruct | Y | Y | | |
| Phi-3-mini-128k-instruct | Y | Y | | |
| Phi-3-small-8k-instruct | Y | Y | Y | Y |
| Phi-3-small-128k-instruct | Y | Y | Y | Y |
| Phi-3-medium-8k-instruct | Y | Y | | | Y |
| Phi-3-medium-128k-instruct | Y | Y | | | Y |
| Model Name | FP16 | BF16 | FP8 | INT8 | TP |
| :--------------: | :---: | :---: | :---: | :---: | :---: |
| phi-2 | Y | Y | | | Y |
| Phi-3-mini-4k-instruct | Y | Y | Y | Y |
| Phi-3-mini-128k-instruct | Y | Y | Y | Y |
| Phi-3-small-8k-instruct | Y | Y | Y | Y | Y |
| Phi-3-small-128k-instruct | Y | Y | Y | Y | Y |
| Phi-3-medium-8k-instruct | Y | Y | Y | Y |
| Phi-3-medium-128k-instruct | Y | Y | Y | Y |
* Model Name: the name of the model, the same as the name on HuggingFace
* TP: Tensor Parallel
@ -128,9 +128,9 @@ python3 ../summarize.py --engine_dir ./phi-engine-tp2 \
```
### 5. Quantization options for Phi-3-small
### 5. Quantization
Phi-3-small variants support post-training quantization to FP8 and INT8 SmoothQuant formats.
All Phi-3 variants support post-training quantization to FP8 and INT8 SmoothQuant formats.
FP8 checkpoints can be built as follows:
@ -141,8 +141,6 @@ python3 ../quantization/quantize.py \
--output_dir ./phi3-checkpoint \
--dtype $DTYPE \
--qformat fp8 --kv_cache_dtype fp8
python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint
```
INT8 checkpoints can be built as follows:
@ -154,8 +152,6 @@ python3 ../quantization/quantize.py \
--output_dir ./phi3-checkpoint \
--dtype $DTYPE \
--qformat int8_sq --kv_cache_dtype int8
python3 postprocess_quant_checkpoint.py --checkpoint_dir ./phi3-checkpoint
```
The commands to [build TensorRT engines](#2-build-tensorrt-engines) from quantized checkpoints

View File

@ -19,8 +19,7 @@ import time
from transformers import AutoConfig
import tensorrt_llm
from tensorrt_llm.models import (Phi3ForCausalLM, Phi3SmallForCausalLM,
PhiForCausalLM)
from tensorrt_llm.models import Phi3ForCausalLM, PhiForCausalLM
def parse_arguments():
@ -81,16 +80,14 @@ if __name__ == '__main__':
model_config = AutoConfig.from_pretrained(args.model_dir,
trust_remote_code=True)
model_type = model_config.architectures[0]
supported_model = {
'PhiForCausalLM': PhiForCausalLM,
'Phi3ForCausalLM': Phi3ForCausalLM,
'Phi3VForCausalLM': Phi3ForCausalLM,
'Phi3SmallForCausalLM': Phi3SmallForCausalLM
}
supported_models = [
'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM',
'Phi3SmallForCausalLM'
]
modelForCausalLM = None
if model_type not in supported_model:
if model_type not in supported_models:
assert False, "Invalid model type"
modelForCausalLM = supported_model[model_type]
modelForCausalLM = PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM
modelForCausalLM.convert_hf_checkpoint(args.model_dir,
dtype=args.dtype,
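With the dedicated Phi3Small class gone, every supported architecture now resolves to one of two converter classes. A minimal, self-contained sketch of that dispatch (the placeholder classes stand in for the real tensorrt_llm imports):
```python
# Minimal sketch of the model-type dispatch above; placeholder classes stand in
# for tensorrt_llm.models.PhiForCausalLM and Phi3ForCausalLM.
class PhiForCausalLM: ...
class Phi3ForCausalLM: ...

SUPPORTED_MODELS = [
    'PhiForCausalLM', 'Phi3ForCausalLM', 'Phi3VForCausalLM',
    'Phi3SmallForCausalLM'
]

def select_model_class(model_type: str):
    """Map a HuggingFace architecture name to the TRT-LLM class that converts it."""
    if model_type not in SUPPORTED_MODELS:
        raise ValueError(f"Invalid model type: {model_type}")
    # Phi-2 keeps its own class; every Phi-3 variant (mini/small/vision) is
    # handled by the unified Phi3ForCausalLM after this change.
    return PhiForCausalLM if model_type == 'PhiForCausalLM' else Phi3ForCausalLM

assert select_model_class('Phi3SmallForCausalLM') is Phi3ForCausalLM
```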

View File

@ -1,63 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import json
import time
import safetensors
from safetensors.torch import save_file
import tensorrt_llm
from tensorrt_llm.models.phi3.phi3small.convert import shuffle_qkv_weights
def parse_arguments():
parser = argparse.ArgumentParser()
parser.add_argument('--checkpoint_dir', type=str, default=None)
args = parser.parse_args()
return args
if __name__ == '__main__':
print(tensorrt_llm.__version__)
args = parse_arguments()
tensorrt_llm.logger.set_level('info')
tik = time.time()
with open(f"{args.checkpoint_dir}/config.json", "r") as f:
config = json.load(f)
weights = {}
with safetensors.safe_open(f"{args.checkpoint_dir}/rank0.safetensors",
framework="pt") as f:
for k in f.keys():
weights[k] = f.get_tensor(k)
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
num_total_heads = config[
'num_attention_heads'] + 2 * config['num_key_value_heads']
for key, value in weights.items():
if "qkv." in key:
if 'scaling_factor' in key and value.shape[0] % num_total_heads != 0:
continue
weights[key] = shuffle_qkv_weights(value, config)
save_file(weights, f'{args.checkpoint_dir}/rank0.safetensors')
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Total time of converting checkpoints: {t}')

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -90,6 +90,17 @@ if __name__ == "__main__":
help="KV Cache dtype.",
default=None,
choices=["int8", "fp8", None])
# Medusa
parser.add_argument('--num_medusa_heads', type=int, default=4)
parser.add_argument('--num_medusa_layers', type=int, default=1)
parser.add_argument('--max_draft_len', type=int, default=63)
parser.add_argument('--medusa_hidden_act', type=str, default="silu")
parser.add_argument('--medusa_model_dir', type=str, default=None)
parser.add_argument('--quant_medusa_head',
default=False,
action='store_true',
help="whether to quantize the weights of medusa heads")
args = parser.parse_args()
if args.model_dir is not None:
@ -108,7 +119,13 @@ if __name__ == "__main__":
tp_size=args.tp_size,
pp_size=args.pp_size,
seed=args.seed,
tokenizer_max_seq_length=args.tokenizer_max_seq_length)
tokenizer_max_seq_length=args.tokenizer_max_seq_length,
num_medusa_heads=args.num_medusa_heads,
num_medusa_layers=args.num_medusa_layers,
max_draft_len=args.max_draft_len,
medusa_hidden_act=args.medusa_hidden_act,
medusa_model_dir=args.medusa_model_dir,
quant_medusa_head=args.quant_medusa_head)
elif args.nemo_ckpt_path is not None:
quantize_nemo_and_export(nemo_ckpt_path=args.nemo_ckpt_path,
decoder_type=args.decoder_type,

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets>=2.14.4
nemo-toolkit[all]<=1.20.0,>=1.18.0
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
git+https://github.com/google-deepmind/recurrentgemma.git
flax>=0.8.2
jax~=0.4.23

View File

@ -247,7 +247,7 @@ def main(args):
model_name, model_version = read_model_name(
args.engine_dir) if not is_enc_dec else ("", "")
if args.tokenizer_dir is None:
if args.tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS:
logger.warning(
"tokenizer_dir is not specified. Try to infer from model_name, but this may be incorrect."
)
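The added membership test means the script only guesses a tokenizer location when the engine's model name is actually registered. A hedged sketch of that guard; the lookup table and fallback below are illustrative, not the real `DEFAULT_HF_MODEL_DIRS` contents:
```python
# Illustrative stand-in for run.py's lookup table; real entries differ.
DEFAULT_HF_MODEL_DIRS = {"GPTForCausalLM": "gpt2"}

def resolve_tokenizer_dir(tokenizer_dir, model_name):
    """Fall back to a per-model default only when one is actually registered."""
    if tokenizer_dir is None and model_name in DEFAULT_HF_MODEL_DIRS:
        # Guessing from the model name may still be wrong, hence the warning
        # emitted by the real script.
        return DEFAULT_HF_MODEL_DIRS[model_name]
    return tokenizer_dir

print(resolve_tokenizer_dir(None, "UnknownModel"))  # None: no guess is made
```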

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.11.0.dev2024061800
tensorrt_llm==0.11.0.dev2024062500
tiktoken
datasets
kaldialign

View File

@ -16,12 +16,13 @@ pandas
h5py==3.10.0
StrEnum
sentencepiece>=0.1.99
tensorrt==10.0.1
# https://github.com/pytorch/pytorch/blob/v2.3.0/version.txt uses 2.3.0a0.
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-04.html#rel-24-04 uses 2.3.0a0.
torch>=2.3.0a,<=2.3.0
tensorrt==10.1.0
# https://github.com/pytorch/pytorch/blob/v2.3.1/version.txt uses 2.3.0a0.
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-24-05.html#rel-24-05 uses 2.4.0a0.
torch>=2.3.0a0,<=2.4.0a0
nvidia-modelopt~=0.11,<0.12
transformers>=4.38.2
pillow==10.2.0
wheel
optimum
evaluate

View File

@ -1146,8 +1146,8 @@ class GraphGroup(ABC):
num_kv_heads = plugin_info.pfc_as_ndarray["num_kv_heads"].copy()
tp_size = plugin_info.pfc_as_ndarray["tp_size"].copy()
tp_rank = plugin_info.pfc_as_ndarray["tp_rank"].copy()
num_kv_heads = num_kv_heads // kv_partition
num_heads = num_heads // partition
num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1)
num_heads = np.maximum(num_heads // partition, 1)
tp_size[0] = partition
tp_rank[0] = index
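The switch to `np.maximum(..., 1)` guards grouped-query / multi-query models whose KV-head count is smaller than the candidate partition: plain floor division would drive the per-rank KV-head count to zero. A small NumPy illustration with made-up head counts:
```python
import numpy as np

# Hypothetical head counts: 32 query heads, 4 KV heads (grouped-query attention).
num_heads = np.array([32])
num_kv_heads = np.array([4])

for partition in (2, 4, 8, 16):
    # KV heads are only split while there are enough of them; once the
    # partition exceeds the KV-head count, each rank keeps (replicates) one.
    kv_per_rank = np.maximum(num_kv_heads // partition, 1)
    q_per_rank = np.maximum(num_heads // partition, 1)
    print(partition, int(q_per_rank[0]), int(kv_per_rank[0]))
# partition=8 or 16 would have produced 0 KV heads with a plain floor division.
```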

View File

@ -15,6 +15,11 @@ class PluginNode(Node):
layer.name)
layer.to_base_class()
def _collect_strategies(self, device_mesh):
raise NotImplementedError(
f"Auto parallel does not support {self.plugin_type} plugin right now."
)
def _default_strategy(self, device_mesh):
strategies_vector = StrategiesVector(self)
dim_partition_dict_mapping = {}

View File

@ -1,5 +1,6 @@
from enum import Enum, auto
import numpy as np
import torch
from tensorrt_llm.functional import PositionEmbeddingType
@ -369,8 +370,8 @@ class GPTAttentionPlugin(PluginNode):
num_kv_heads = self.plugin_info.pfc_as_ndarray["num_kv_heads"].copy()
tp_size = self.plugin_info.pfc_as_ndarray["tp_size"].copy()
tp_rank = self.plugin_info.pfc_as_ndarray["tp_rank"].copy()
num_kv_heads = num_kv_heads // kv_partition
num_heads = num_heads // partition
num_kv_heads = np.maximum(num_kv_heads // kv_partition, 1)
num_heads = np.maximum(num_heads // partition, 1)
tp_size[0] = partition
tp_rank[0] = 0

View File

@ -30,10 +30,9 @@ from ..builder import BuildConfig, Engine, build
from ..logger import logger
from ..lora_manager import LoraConfig, LoraManager
from ..models import MODEL_MAP, PretrainedConfig
from ..models.modeling_utils import (WEIGHT_LOADER_MODELS, QuantConfig,
from ..models.modeling_utils import (WEIGHT_LOADER_MODELS,
SpeculativeDecodingMode)
from ..plugin import PluginConfig, add_plugin_argument
from ..quantization import QuantAlgo
def parse_arguments():
@ -133,10 +132,6 @@ def parse_arguments():
type=str,
default=None,
choices=['float16', 'float32'])
parser.add_argument('--weight_only_precision',
type=str,
default=None,
choices=['int8', 'int4'])
parser.add_argument('--weight_sparsity', default=False, action='store_true')
parser.add_argument(
'--max_draft_len',
@ -270,14 +265,6 @@ def build_model(build_config: BuildConfig,
if logits_dtype is not None:
model_config.logits_dtype = logits_dtype
weight_only_precision = kwargs.get('weight_only_precision', None)
if not model_config.quant_mode.has_any_quant(
) and weight_only_precision is not None:
if weight_only_precision == 'int4':
model_config.quantization = QuantConfig(QuantAlgo.W4A16)
else:
model_config.quantization = QuantConfig(QuantAlgo.W8A16)
architecture = model_config.architecture
assert not build_config.plugin_config.streamingllm or architecture == "LlamaForCausalLM", \
"StreamingLLM is only supported in the llama model."
@ -420,7 +407,6 @@ def main():
kwargs = {
'logits_dtype': args.logits_dtype,
'use_fused_mlp': args.use_fused_mlp,
'weight_only_precision': args.weight_only_precision,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
'lora_dir': args.lora_dir,

View File

@ -40,7 +40,6 @@ from .modeling_utils import (PretrainedConfig, PretrainedModel,
from .mpt.model import MPTForCausalLM, MPTModel
from .opt.model import OPTForCausalLM, OPTModel
from .phi3.model import Phi3ForCausalLM, Phi3Model
from .phi3.phi3small.model import Phi3SmallForCausalLM, Phi3SmallModel
from .phi.model import PhiForCausalLM, PhiModel
from .qwen.model import QWenForCausalLM
from .recurrentgemma.model import RecurrentGemmaForCausalLM
@ -70,10 +69,8 @@ __all__ = [
'GPTNeoXForCausalLM',
'PhiModel',
'Phi3Model',
'Phi3SmallModel',
'PhiForCausalLM',
'Phi3ForCausalLM',
'Phi3SmallForCausalLM',
'ChatGLMForCausalLM',
'ChatGLMModel',
'BaichuanForCausalLM',
@ -103,7 +100,7 @@ MODEL_MAP = {
'FalconForCausalLM': FalconForCausalLM,
'PhiForCausalLM': PhiForCausalLM,
'Phi3ForCausalLM': Phi3ForCausalLM,
'Phi3SmallForCausalLM': Phi3SmallForCausalLM,
'Phi3SmallForCausalLM': Phi3ForCausalLM,
'MambaForCausalLM': MambaForCausalLM,
'GPTNeoXForCausalLM': GPTNeoXForCausalLM,
'GPTJForCausalLM': GPTJForCausalLM,

View File

@ -292,6 +292,7 @@ class GemmaForCausalLM(DecoderModelForCausalLM):
return tllm_llama
def check_config(self, config):
config.set_if_not_exist("share_embedding_table", True)
config.set_if_not_exist('use_parallel_embedding', False)
config.set_if_not_exist('embedding_sharding_dim', 0)
config.set_if_not_exist('mlp_bias', False)

View File

@ -62,6 +62,80 @@ class GenerationMixin:
[split_point[-1], max_num_tokens, max_num_tokens])
return num_tokens_ranges
@staticmethod
def get_profiles_ranges(
*,
max_batch_size,
max_beam_width,
max_input_len,
max_num_tokens,
max_draft_len,
opt_batch_size,
opt_num_tokens,
enable_ctx_gen_opt_profiles,
multiple_profiles,
):
default_range = GenerationMixin.default_range
if opt_batch_size:
bb_range_cxt = [1, opt_batch_size, max_batch_size]
bb_range_gen = [
1, opt_batch_size * max_beam_width,
max_batch_size * max_beam_width
]
else:
bb_range_cxt = default_range(max_batch_size)
bb_range_gen = default_range(max_batch_size * max_beam_width)
tokens_per_engine_step = max_draft_len + 1
tokens_per_engine_step_range = [
1, tokens_per_engine_step, tokens_per_engine_step
]
bbd_range_ctx = [
bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_cxt))
]
bbd_range_gen = [
bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_gen))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, tokens_per_engine_step]
if enable_ctx_gen_opt_profiles:
num_profiles = 2
bb_range = [bb_range_cxt, bb_range_gen]
bbd_range = [bbd_range_ctx, bbd_range_gen]
inlen_range = [inlen_range_cxt, inlen_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
num_tokens_range_ctx = default_range(max_batch_size * max_input_len)
# Draft tokens cannot be combined with beam search
num_tokens_range_gen = default_range(
max_batch_size * max(tokens_per_engine_step, max_beam_width))
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
else:
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
if opt_num_tokens is None:
opt_num_tokens = min(max_num_tokens,
max_batch_size * max_beam_width)
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
bb_range = [bb_range_gen] * num_profiles
bbd_range = [bbd_range_gen] * num_profiles
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
tokens_per_engine_step_range = [tokens_per_engine_step_range
] * num_profiles
ranges = {
'bb_range': bb_range,
'bbd_range': bbd_range,
'inlen_range': inlen_range,
'position_ids_inlen_range': position_ids_inlen_range,
'num_tokens_range': num_tokens_range,
'tokens_per_engine_step_range': tokens_per_engine_step_range,
}
return num_profiles, ranges
def prepare_attention_inputs(self,
*,
max_batch_size,
@ -343,61 +417,26 @@ class GenerationMixin:
streamingllm: bool = False,
opt_batch_size=None):
default_range = GenerationMixin.default_range
tokens_per_engine_step = max_draft_len + 1
tokens_per_engine_step_range = [
1, tokens_per_engine_step, tokens_per_engine_step
]
if opt_batch_size:
bb_range_cxt = [1, opt_batch_size, max_batch_size]
bb_range_gen = [
1, opt_batch_size * max_beam_width,
max_batch_size * max_beam_width
]
else:
bb_range_cxt = default_range(max_batch_size)
bb_range_gen = default_range(max_batch_size * max_beam_width)
bbd_range_ctx = [
bb_range_cxt[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_cxt))
]
bbd_range_gen = [
bb_range_gen[i] * (tokens_per_engine_step if i != 0 else 1)
for i in range(len(bb_range_gen))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, tokens_per_engine_step]
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding,
paged_kv_cache)
if enable_ctx_gen_opt_profiles:
num_profiles = 2
bb_range = [bb_range_cxt, bb_range_gen]
bbd_range = [bbd_range_ctx, bbd_range_gen]
inlen_range = [inlen_range_cxt, inlen_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
num_tokens_range_ctx = default_range(max_batch_size * max_input_len)
# Draft tokens cannot be combined with beam search
num_tokens_range_gen = default_range(
max_batch_size * max(tokens_per_engine_step, max_beam_width))
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
else:
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
if opt_num_tokens is None:
opt_num_tokens = min(max_num_tokens,
max_batch_size * max_beam_width)
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
bb_range = [bb_range_gen] * num_profiles
bbd_range = [bbd_range_gen] * num_profiles
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
tokens_per_engine_step_range = [tokens_per_engine_step_range
] * num_profiles
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)
bb_range = ranges['bb_range']
bbd_range = ranges['bbd_range']
inlen_range = ranges['inlen_range']
num_tokens_range = ranges['num_tokens_range']
position_ids_inlen_range = ranges['position_ids_inlen_range']
tokens_per_engine_step_range = ranges['tokens_per_engine_step_range']
position_ids_num_tokens_range = num_tokens_range
input_ids = None
@ -597,12 +636,13 @@ class GenerationMixin:
spec_decoding_params = None
# Use positional offsets and packed mask only when not in SpS spec decoding
if speculative_decoding_draft_tokens_external == False and max_draft_len > 0:
tokens_per_engine_step = max_draft_len + 1
# 32 bits packed mask aligned.
num_packed_masks = (tokens_per_engine_step + 32 - 1) // 32
packed_mask_len_range = [[0, 1, num_packed_masks]] * num_profiles
# total number of spec decoding tokens for all sequences (sequence length can be variable).
num_gen_tokens_range = [
default_range(
GenerationMixin.default_range(
max_batch_size * max_beam_width * tokens_per_engine_step,
min_range=0)
] * num_profiles
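The new `get_profiles_ranges` helper centralizes the optimization-profile bookkeeping that `prepare_basic_inputs`, Mamba, and RecurrentGemma previously duplicated. A hedged usage sketch with made-up build limits (it assumes `tensorrt_llm` is installed and that the helper lives in `tensorrt_llm/models/generation_mixin.py` as shown above):
```python
# Hedged sketch: exercises the new static helper with made-up limits.
from tensorrt_llm.models.generation_mixin import GenerationMixin  # assumed module path

num_profiles, ranges = GenerationMixin.get_profiles_ranges(
    max_batch_size=8,
    max_beam_width=1,
    max_input_len=1024,
    max_num_tokens=4096,
    max_draft_len=0,          # no speculative decoding
    opt_batch_size=None,
    opt_num_tokens=None,
    enable_ctx_gen_opt_profiles=False,
    multiple_profiles=False,
)

# Every entry holds one [min, opt, max] triple per profile, so any model can
# feed them straight into its OrderedDict-based dim_range definitions.
for name in ('bb_range', 'bbd_range', 'inlen_range', 'num_tokens_range'):
    assert len(ranges[name]) == num_profiles
print(num_profiles, ranges['num_tokens_range'])
```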

View File

@ -1535,6 +1535,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
moe_config = config.moe
kv_tp_size = None
kv_tp_rank = None
if config.num_key_value_heads < mapping.tp_size:
kv_tp_size = config.num_key_value_heads
kv_tp_rank = mapping.tp_rank * kv_tp_size // mapping.tp_size
model_prefix = "model."
key_list = [
"embed_tokens.weight", # vocab_embedding
@ -1552,7 +1558,12 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
torch_dtype = str_dtype_to_torch(dtype)
def load(key, tp_dim=-1, no_prefix=0, is_expert_weights=False):
def load(key,
tp_dim=-1,
no_prefix=0,
is_expert_weights=False,
tp_size=None,
tp_rank=None):
if not no_prefix:
key = model_prefix + key
ptr_idx = safetensors_map[key] if key in safetensors_map else 0
@ -1560,38 +1571,28 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
if key not in safetensors_ptrs[ptr_idx].keys():
return None
tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key)
tensor_shape = tensor_slice.get_shape()
if tp_dim == -1:
res = safetensors_ptrs[ptr_idx].get_tensor(key)
else:
res = tensor_slice[:]
elif tp_dim >= 0 and tp_dim < len(tensor_shape):
if is_expert_weights:
tp_size = mapping.moe_tp_size
tp_rank = mapping.moe_tp_rank
else:
tp_size = mapping.tp_size
tp_rank = mapping.tp_rank
tensor_slice = safetensors_ptrs[ptr_idx].get_slice(key)
tensor_shape = tensor_slice.get_shape()
if len(tensor_shape) == 1:
if tp_dim == 0:
slice_width = tensor_shape[0] // tp_size
res = tensor_slice[slice_width * tp_rank:slice_width *
(tp_rank + 1)]
else:
res = tensor_slice[:]
else:
if tensor_shape[tp_dim] % tp_size != 0:
logger.error(
"Current weight shape is invalid for tp_size=" +
str(tp_size))
slice_width = tensor_shape[tp_dim] // tp_size
if tp_dim == 0:
res = tensor_slice[slice_width * tp_rank:slice_width *
(tp_rank + 1), :]
elif tp_dim == 1:
res = tensor_slice[:, slice_width * tp_rank:slice_width *
(tp_rank + 1)]
else:
assert False, "Invalid TP dim"
tp_size = tp_size or mapping.tp_size
tp_rank = tp_rank or mapping.tp_rank
dim_size = tensor_shape[tp_dim]
if dim_size % tp_size != 0:
logger.error(
f"Current weight shape {tensor_shape} is invalid at dimension {tp_dim} for TP size {tp_size}"
)
indices = [slice(None)] * len(tensor_shape)
indices[tp_dim] = slice(dim_size * tp_rank // tp_size,
dim_size * (tp_rank + 1) // tp_size)
res = tensor_slice[indices]
else:
raise ValueError(f"Invalid TP dim: {tp_dim}")
return res.to(torch_dtype).contiguous(
) if "block_sparse_moe.gate" not in key else res.to(torch.float32)
@ -1632,11 +1633,19 @@ def load_weights_from_hf_safetensors(model_dir: str, config: LLaMAConfig):
# Attention
qkv_list = []
for comp in ["q", "k", "v"]:
weight_part = load(prefix + key_list[3] + comp + key_list[4], 0)
tp_size = kv_tp_size if comp != "q" else None
tp_rank = kv_tp_rank if comp != "q" else None
weight_part = load(prefix + key_list[3] + comp + key_list[4],
0,
tp_size=tp_size,
tp_rank=tp_rank)
qkv_list.append(weight_part)
bias_part = load(
(prefix + key_list[3] + comp + key_list[4]).replace(
"weight", "bias"), 0)
"weight", "bias"),
0,
tp_size=tp_size,
tp_rank=tp_rank)
if bias_part is not None:
qkv_list.append(bias_part)
if len(qkv_list) == 3:
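The rewritten loader builds a full `slice(None)` per dimension and then narrows only `tp_dim`, which covers 1-D and 2-D tensors (and the separate KV tensor-parallel size) with one code path. A standalone sketch of the same slicing trick on a plain tensor; the shapes are made up:
```python
import torch

def shard(tensor: torch.Tensor, tp_dim: int, tp_size: int, tp_rank: int) -> torch.Tensor:
    """Return this rank's contiguous slice of `tensor` along `tp_dim`."""
    dim_size = tensor.shape[tp_dim]
    assert dim_size % tp_size == 0, (
        f"shape {tuple(tensor.shape)} is invalid at dim {tp_dim} for TP size {tp_size}")
    # Start from a full slice in every dimension, then narrow only tp_dim --
    # the same trick the safetensors loader above applies to lazy tensor slices.
    indices = [slice(None)] * tensor.ndim
    indices[tp_dim] = slice(dim_size * tp_rank // tp_size,
                            dim_size * (tp_rank + 1) // tp_size)
    return tensor[tuple(indices)]

w = torch.arange(24.0).reshape(4, 6)
print(shard(w, tp_dim=1, tp_size=2, tp_rank=1).shape)   # torch.Size([4, 3])
print(shard(w, tp_dim=0, tp_size=4, tp_rank=0).shape)   # torch.Size([1, 6])
```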

View File

@ -23,6 +23,7 @@ from ...functional import (Tensor, arange, cast, concat, expand,
gather_last_token_logits, shape, unsqueeze)
from ...layers import Embedding, LayerNorm, Linear, Mamba, RmsNorm
from ...module import Module, ModuleList
from ...plugin import current_all_reduce_helper
from ..generation_mixin import GenerationMixin
from ..modeling_utils import PretrainedConfig, PretrainedModel
@ -192,6 +193,7 @@ class MambaForCausalLM(PretrainedModel):
ssm_states,
host_request_types,
last_token_ids,
last_token_ids_for_logits,
host_context_lengths,
slot_mapping: Optional[Tensor] = None):
hidden_states, present_convs, present_ssms = self.backbone(
@ -200,7 +202,7 @@ class MambaForCausalLM(PretrainedModel):
if not self.gather_context_logits:
hidden_states = gather_last_token_logits(
hidden_states, last_token_ids,
hidden_states, last_token_ids_for_logits,
default_net().plugin_config.remove_input_padding)
lm_logits = self.lm_head(hidden_states)
@ -218,9 +220,9 @@ class MambaForCausalLM(PretrainedModel):
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
max_num_tokens: int = None,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
@ -235,56 +237,79 @@ class MambaForCausalLM(PretrainedModel):
@return: a list contains values which can be fed into the self.forward()
'''
assert speculative_decoding_draft_tokens_external == False, "Speculative decoding is not supported in Mamba"
assert max_beam_width == 1, "We don't support beam search for the Mamba model."
remove_input_padding = default_net().plugin_config.remove_input_padding
use_gemm_plugin = default_net().plugin_config.gemm_plugin
paged_state = default_net().plugin_config.paged_state
multiple_profiles = default_net().plugin_config.multiple_profiles
use_mamba_conv1d_plugin = default_net(
).plugin_config.mamba_conv1d_plugin
batch_range = [GenerationMixin.default_range(max_batch_size)]
use_custom_all_reduce = default_net(
).plugin_config.use_custom_all_reduce
self.gather_context_logits = gather_context_logits
mapping = self.config.mapping
# basic inputs
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
True, use_gemm_plugin, remove_input_padding, paged_state)
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)
if remove_input_padding:
assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding"
max_num_tokens = max(
max_input_len * max_batch_size,
max_beam_width * (max_draft_len + 1) * max_batch_size)
if opt_num_tokens is None:
opt_num_tokens = max_beam_width * (max_draft_len +
1) * max_batch_size
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
input_ids = Tensor(name='input_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('num_tokens', num_tokens_range),
('num_tokens', ranges['num_tokens_range']),
]))
else:
input_ids = Tensor(name='input_ids',
dtype=trt.int32,
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size', batch_range),
('input_len', [[1, 1, max_input_len]]),
('batch_size_beam_width',
ranges['bb_range']),
('input_len', ranges['inlen_range']),
]))
if use_custom_all_reduce and mapping.tp_size > 1:
current_all_reduce_helper().set_workspace_tensor(
mapping, num_profiles)
# recurrent inputs
conv_states = []
ssm_states = []
if use_mamba_conv1d_plugin:
conv_state_dim_range = OrderedDict([
('batch_size', batch_range),
('kernel_size', [self.d_conv - 1]),
('dim_size', [self.d_inner]),
('batch_size', ranges['bb_range']),
('kernel_size', [self.d_conv - 1] * num_profiles),
('dim_size', [self.d_inner] * num_profiles),
])
else:
conv_state_dim_range = OrderedDict([
('batch_size', batch_range),
('dim_size', [self.d_inner]),
('kernel_size', [self.d_conv - 1]),
('batch_size', ranges['bb_range']),
('dim_size', [self.d_inner] * num_profiles),
('kernel_size', [self.d_conv - 1] * num_profiles),
])
ssm_state_dim_range = OrderedDict([
('batch_size', batch_range),
('state_size', [self.d_state]),
('dim_size', [self.d_inner]),
('batch_size', ranges['bb_range']),
('state_size', [self.d_state] * num_profiles),
('dim_size', [self.d_inner] * num_profiles),
])
one_dim_range = OrderedDict([
('buffer_count', [1]),
('buffer_count', [1] * num_profiles),
])
for i in range(self.config.num_hidden_layers):
@ -324,7 +349,7 @@ class MambaForCausalLM(PretrainedModel):
name='host_request_types',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size', batch_range)]),
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
)
if use_mamba_conv1d_plugin and remove_input_padding:
@ -332,21 +357,22 @@ class MambaForCausalLM(PretrainedModel):
name='host_context_lengths',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size', batch_range)]),
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
)
else:
host_context_lengths = None
last_token_ids = None
last_token_ids = Tensor(
name='last_token_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size', ranges['bbd_range']),
]),
)
last_token_ids_for_logits = None
if not gather_context_logits:
last_token_ids = Tensor(
name='last_token_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size', batch_range),
]),
)
last_token_ids_for_logits = last_token_ids
return_dict = {
'input_ids': input_ids,
@ -354,6 +380,7 @@ class MambaForCausalLM(PretrainedModel):
'ssm_states': ssm_states,
'host_request_types': host_request_types,
'last_token_ids': last_token_ids,
'last_token_ids_for_logits': last_token_ids_for_logits,
'host_context_lengths': host_context_lengths,
}
@ -362,7 +389,7 @@ class MambaForCausalLM(PretrainedModel):
name='slot_mapping',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size', batch_range)]),
dim_range=OrderedDict([('batch_size', ranges['bb_range'])]),
)
return_dict['slot_mapping'] = slot_mapping
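Because the Mamba inputs are now built against `num_profiles` optimization profiles instead of a single batch range, every dimension range, including constant ones such as the conv kernel size, has to be repeated once per profile. A small self-contained illustration with made-up sizes:
```python
from collections import OrderedDict

# Illustrative numbers only: a Mamba-style conv state with d_conv=4, d_inner=2048.
num_profiles = 3                        # e.g. produced by get_profiles_ranges
d_conv, d_inner = 4, 2048
bb_range = [[1, 4, 8]] * num_profiles   # one [min, opt, max] triple per profile

# TensorRT expects one range entry per optimization profile for *every*
# dimension, including the ones that never change, hence the `* num_profiles`.
conv_state_dim_range = OrderedDict([
    ('batch_size', bb_range),
    ('kernel_size', [d_conv - 1] * num_profiles),
    ('dim_size', [d_inner] * num_profiles),
])
assert all(len(v) == num_profiles for v in conv_state_dim_range.values())
```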

View File

@ -1,43 +1,80 @@
from pathlib import Path
import numpy as np
import torch
from tensorrt_llm import logger
from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import MedusaLM
from tensorrt_llm.models.convert_utils import split
def get_tllm_linear_weight(weight,
prefix,
bias=None,
use_weight_only=False,
plugin_weight_only_quant_type=torch.int8,
postfix='weight'):
results = {}
if use_weight_only:
v = weight.t().contiguous().cpu()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[prefix + postfix] = processed_torch_weights
results[prefix + 'per_channel_scale'] = torch_weight_scales
else:
results[prefix + postfix] = weight.contiguous()
if bias is not None:
results[prefix + 'bias'] = bias
return results
def load_medusa_hf(medusa_path: str,
trt_llm_medusa: MedusaLM,
num_medusa_heads: int,
num_medusa_layers: int,
mapping=Mapping(),
dtype='float32'):
dtype='float32',
use_weight_only=False,
plugin_weight_only_quant_type=None):
logger.info("Loading Medusa heads' weights ...")
is_ckpt_safetensors = False
ckpt_file = Path(medusa_path) / "medusa_lm_head.pt"
state_dict = torch.load(ckpt_file, map_location="cpu")
if not ckpt_file.exists():
ckpt_file = Path(medusa_path) / "medusa_lm_head.safetensors"
is_ckpt_safetensors = True
if is_ckpt_safetensors:
logger.info("Safetensors Found ...")
from safetensors.torch import load_file
state_dict = load_file(ckpt_file)
else:
state_dict = torch.load(ckpt_file, map_location="cpu")
torch_dtype = str_dtype_to_torch(dtype)
for h in range(trt_llm_medusa.num_medusa_heads):
for l in range(trt_llm_medusa.num_medusa_layers):
w = state_dict[f"{h}.{l}.linear.weight"].clone()
w = torch_to_numpy(w.to(torch_dtype).detach().cpu())
trt_llm_medusa.medusa_heads[h].medusa_layers[
l].linear.weight.value = np.ascontiguousarray(
split(w, mapping.tp_size, mapping.tp_rank))
if trt_llm_medusa.medusa_heads[h].medusa_layers[
l].linear.bias is not None:
# print(f"Setting bias for {h} {l}")
b = state_dict[f"{h}.{l}.linear.bias"].clone()
b = torch_to_numpy(b.to(torch_dtype).detach().cpu())
trt_llm_medusa.medusa_heads[h].medusa_layers[
l].linear.bias.value = np.ascontiguousarray(
np.split(b, mapping.tp_size,
axis=0)[mapping.tp_rank].copy())
lm = state_dict[f"{h}.{trt_llm_medusa.num_medusa_layers}.weight"].clone(
) # LM Head
lm = torch_to_numpy(lm.to(torch_dtype).detach().cpu())
trt_llm_medusa.medusa_heads[
h].lm_head.weight.value = np.ascontiguousarray(
split(lm, mapping.tp_size, mapping.tp_rank))
return
weights = {}
for h in range(num_medusa_heads):
for l in range(num_medusa_layers):
w = state_dict[f"{h}.{l}.linear.weight"].clone().to(torch_dtype)
split_v = split(w, mapping.tp_size, mapping.tp_rank)
weights.update(
get_tllm_linear_weight(
split_v, f'medusa_heads.{h}.medusa_layers.{l}.linear.',
None, use_weight_only, plugin_weight_only_quant_type))
b = state_dict[f"{h}.{l}.linear.bias"].clone().to(torch_dtype)
weights['medusa_heads.{}.medusa_layers.{}.linear.bias'.format(
h, l)] = split(b, mapping.tp_size, mapping.tp_rank)
lm = state_dict[f"{h}.{num_medusa_layers}.weight"].clone().to(
torch_dtype) # LM Head
weights['medusa_heads.{}.lm_head.weight'.format(h)] = split(
lm, mapping.tp_size, mapping.tp_rank)
return weights
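After the refactor `load_medusa_hf` returns a plain name-to-tensor dictionary instead of writing into a `MedusaLM` instance, leaving serialization to the caller. A hedged sketch of what that per-rank dictionary looks like and how it could be saved; the tensor shapes below are placeholders, only the key naming follows the code above:
```python
import torch
from safetensors.torch import save_file

# Stand-in for the dict returned by load_medusa_hf(...): keys follow the
# TRT-LLM checkpoint convention shown above (head 0, layer 0, TP rank 0).
# Hidden size 4096 and vocab size 32000 are placeholder values.
weights = {
    "medusa_heads.0.medusa_layers.0.linear.weight": torch.zeros(4096, 4096, dtype=torch.float16),
    "medusa_heads.0.medusa_layers.0.linear.bias": torch.zeros(4096, dtype=torch.float16),
    "medusa_heads.0.lm_head.weight": torch.zeros(32000, 4096, dtype=torch.float16),
}
save_file(weights, "rank0.safetensors")
```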

View File

@ -27,6 +27,10 @@ from ..mapping import Mapping
from ..module import Module, ModuleList
from ..parameter import Parameter
from ..quantization import QuantMode
from ..quantization.layers import (WeightOnlyGroupwiseQuantLinear,
WeightOnlyGroupwiseQuantRowLinear,
WeightOnlyQuantLinear,
WeightOnlyQuantRowLinear)
from ..quantization.mode import W8A8_SQ_PLUGIN_LIST, QuantAlgo
from ..top_model_mixin import TopModelMixin
from .convert_utils import weight_only_quantize_dict
@ -841,7 +845,8 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel:
continue
qkv_params = get_init_params(layer.qkv, ColumnLinear)
qkv_params["bias"] = qkv_params["bias"] is not None
qkv_params["strict_dtype"] = qkv_params["strict_dtype"] is not None
qkv_params["strict_dtype"] = qkv_params.get(
"strict_dtype") is not None
q = ColumnLinear(
**{
**qkv_params,
@ -866,20 +871,34 @@ def unfuse_qkv_gemm(model: PretrainedModel) -> PretrainedModel:
q = quantize(q, model.config.quantization)
k = quantize(k, model.config.quantization)
v = quantize(v, model.config.quantization)
out_features = q.out_features + k.out_features + v.out_features
if isinstance(layer.qkv, (
WeightOnlyQuantLinear,
WeightOnlyQuantRowLinear,
WeightOnlyGroupwiseQuantLinear,
WeightOnlyGroupwiseQuantRowLinear,
)):
out_dim = 1
else:
out_dim = 0
if layer.qkv.weight.is_inited():
qkv_weight = layer.qkv.weight.raw_value
weights = np.split(qkv_weight, [
q.out_features,
q.out_features + k.out_features,
])
qkv_weight.shape[out_dim] * q.out_features // out_features,
qkv_weight.shape[out_dim] *
(q.out_features + k.out_features) // out_features,
],
axis=out_dim)
for gemm, weight in zip([q, k, v], weights):
gemm.weight.value = weight
if layer.qkv.bias is not None and layer.qkv.bias.is_inited():
qkv_bias = layer.qkv.bias.raw_value
biases = np.split(qkv_bias, [
q.out_features,
q.out_features + k.out_features,
])
qkv_bias.shape[out_dim] * q.out_features // out_features,
qkv_bias.shape[out_dim] *
(q.out_features + k.out_features) // out_features,
],
axis=out_dim)
for gemm, bias in zip([q, k, v], biases):
gemm.bias.value = bias
for name, parameter in layer.qkv._parameters.items():
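Weight-only quantized linear layers keep their weights transposed (output features on axis 1), so the fused QKV tensor must be split along the right axis and proportionally to each projection's width rather than at fixed offsets. A standalone NumPy illustration with made-up GQA sizes:
```python
import numpy as np

# Made-up sizes: hidden 1024, query width 1024, two KV projections of width 128.
q_out, k_out, v_out = 1024, 128, 128
out_features = q_out + k_out + v_out
hidden = 1024

for out_dim, fused_shape in ((0, (out_features, hidden)),   # plain ColumnLinear layout
                             (1, (hidden, out_features))):  # weight-only layout (transposed)
    fused = np.zeros(fused_shape, dtype=np.float16)
    # Split points are scaled by the fused tensor's size along out_dim so the
    # same formula works for both layouts (and for packed quantized storage).
    q_w, k_w, v_w = np.split(fused, [
        fused.shape[out_dim] * q_out // out_features,
        fused.shape[out_dim] * (q_out + k_out) // out_features,
    ], axis=out_dim)
    print(out_dim, q_w.shape, k_w.shape, v_w.shape)
```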

View File

@ -1,9 +1,12 @@
import torch
from tensorrt_llm.quantization import QuantAlgo
from ..._utils import str_dtype_to_torch
from .split_weights import shuffle_qkv_weights, split_weights_tp
def convert_hf_weights(hf_model, dtype, **kwargs):
def convert_hf_weights(hf_model, dtype, config, small_variant, args, rank):
torch_dtype = str_dtype_to_torch(dtype)
hf_state_dict = hf_model.state_dict()
weights = {}
@ -15,13 +18,16 @@ def convert_hf_weights(hf_model, dtype, **kwargs):
key = key.replace("model.layers.", "transformer.layers.")
#Attention
key = key.replace("self_attn.", "attention.")
key = key.replace("query_key_value.", "qkv.") # small
key = key.replace("Wqkv.weight", "qkv.weight")
key = key.replace("qkv_proj.", "qkv.") #128k
#MLP
key = key.replace("mlp.fc1.", "mlp.fc.")
key = key.replace("mlp.fc2.", "mlp.proj.")
key = key.replace("mlp.gate_up_proj.", "mlp.fc.")
key = key.replace("mlp.up_proj.", "mlp.gate.") #128k
key = key.replace(
"mlp.up_proj.",
"mlp.fc." if small_variant else "mlp.gate.") #128k
key = key.replace("mlp.down_proj.", "mlp.proj.") #128k
key = key.replace("mlp.gate_proj.", "mlp.fc.") #128k
key = key.replace("o_proj.", "dense.") #128k
@ -53,27 +59,67 @@ def convert_hf_weights(hf_model, dtype, **kwargs):
key = key.replace("q_proj.weight", "qkv.weight")
elif "k_proj" in key or "v_proj" in key:
continue
weights[key] = value.to(torch_dtype).cpu()
if small_variant:
weights['lm_head.weight'] = weights[
'transformer.vocab_embedding.weight'].clone()
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
for key, value in weights.items():
if "qkv." in key:
weights[key] = shuffle_qkv_weights(weights[key], config)
weights = split_weights_tp(config, weights, args, rank, torch_dtype)
return weights
def convert_hf_config(hf_config, dtype, **kwargs):
def convert_small_hf_config(hf_config):
return {
'architecture': "Phi3SmallForCausalLM",
'rotary_base': hf_config.rope_embedding_base,
'gegelu_limit': hf_config.gegelu_limit,
'mup_attn_multiplier': hf_config.mup_attn_multiplier,
'mup_embedding_multiplier': hf_config.mup_embedding_multiplier,
'mup_use_scaling': hf_config.mup_use_scaling,
'mup_width_multiplier': hf_config.mup_width_multiplier,
'blocksparse_block_size': hf_config.blocksparse_block_size,
'blocksparse_homo_head_pattern':
hf_config.blocksparse_homo_head_pattern,
'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks,
'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride,
'dense_attention_every_n_layers':
hf_config.dense_attention_every_n_layers,
}
def convert_hf_config(hf_config, dtype, args):
config = {
'architecture': "Phi3ForCausalLM",
'dtype': dtype,
'num_hidden_layers': hf_config.num_hidden_layers,
'num_attention_heads': hf_config.num_attention_heads,
'num_key_value_heads': hf_config.num_key_value_heads,
'rope_theta': hf_config.rope_theta,
'hidden_size': hf_config.hidden_size,
'intermediate_size': hf_config.intermediate_size,
'vocab_size': hf_config.vocab_size,
'max_position_embeddings': hf_config.max_position_embeddings,
'hidden_act': hf_config.hidden_act,
'share_embedding_table': False,
'norm_epsilon': hf_config.rms_norm_eps,
}
small_variant = hf_config.architectures[0] == "Phi3SmallForCausalLM"
if small_variant:
config.update(convert_small_hf_config(hf_config))
else:
config.update({
'rotary_base': hf_config.rope_theta,
'norm_epsilon': hf_config.rms_norm_eps,
})
# Long-context variants
if hf_config.max_position_embeddings >= 128000:
config.update({
'original_max_position_embeddings':
@ -83,6 +129,31 @@ def convert_hf_config(hf_config, dtype, **kwargs):
'longrope_scaling_long_factors':
hf_config.rope_scaling["long_factor"]
})
if small_variant:
config.update({
'longrope_long_mscale':
hf_config.rope_scaling["long_mscale"],
'longrope_short_mscale':
hf_config.rope_scaling["short_mscale"]
})
if config["hidden_act"] == "silu":
config["hidden_act"] = "swiglu"
# Tensor parallelism and weight-only quantization
if args is not None:
config.update({
'mapping': {
'world_size': args.tp_size * args.pp_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
}
})
if args.use_weight_only and args.weight_only_precision == 'int8':
config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}})
elif args.use_weight_only and args.weight_only_precision == 'int4':
config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}})
return config
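Most of the conversion above is a string-rename pass from HuggingFace parameter names to TRT-LLM ones, with one rule that differs between the small and mini/medium variants. A condensed, runnable excerpt covering only a subset of the replacements shown above:
```python
# Condensed sketch of the HF -> TRT-LLM key renaming above, applied to a couple
# of representative Phi-3 parameter names (the full mapping handles more cases).
def remap_key(key: str, small_variant: bool) -> str:
    key = key.replace("model.layers.", "transformer.layers.")
    key = key.replace("self_attn.", "attention.")
    key = key.replace("query_key_value.", "qkv.")        # Phi-3-small
    key = key.replace("qkv_proj.", "qkv.")               # Phi-3-mini/medium
    key = key.replace("mlp.gate_up_proj.", "mlp.fc.")
    key = key.replace("mlp.up_proj.", "mlp.fc." if small_variant else "mlp.gate.")
    key = key.replace("mlp.down_proj.", "mlp.proj.")
    key = key.replace("o_proj.", "dense.")
    return key

print(remap_key("model.layers.0.self_attn.qkv_proj.weight", small_variant=False))
# -> transformer.layers.0.attention.qkv.weight
print(remap_key("model.layers.3.mlp.down_proj.weight", small_variant=False))
# -> transformer.layers.3.mlp.proj.weight
```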

View File

@ -1,15 +1,20 @@
import json
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Optional
import numpy as np
import safetensors
from transformers import AutoModelForCausalLM
from ..._utils import pad_vocab_size
from ...functional import PositionEmbeddingType, Tensor
from ...layers import (MLP, Attention, AttentionMaskType, Embedding,
ParallelLMHead, RmsNorm)
from ...layers import (MLP, Attention, AttentionMaskType, BlockSparseAttnParams,
Embedding, LayerNorm, ParallelLMHead, RmsNorm)
from ...module import Module
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig, save_checkpoint)
PretrainedConfig)
from .convert import convert_hf_config, convert_hf_weights
@ -22,28 +27,65 @@ class Phi3DecoderLayer(Module):
tp_group = config.mapping.tp_group
tp_size = config.mapping.tp_size
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
attention_mask_type = AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams()
q_scaling = 1.0
self.gegelu_limit = None
self.small_variant = config.architecture == "Phi3SmallForCausalLM"
if self.small_variant:
self.gegelu_limit = config.gegelu_limit
# MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size))
# We achieve this using q_scaling = sqrt(attention_head_size)
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size**.5
block_sparse = (
(layer_idx + 1) % config.dense_attention_every_n_layers) != 0
attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams(
config.blocksparse_block_size,
config.blocksparse_homo_head_pattern,
config.blocksparse_num_local_blocks,
config.blocksparse_vertical_stride)
self.input_layernorm = LayerNorm(
normalized_shape=config.hidden_size, dtype=config.dtype)
self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
else:
self.input_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.post_layernorm = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
position_embedding_type = PositionEmbeddingType.rope_gpt_neox
rope_scaling_short_factors = 1.0
rope_scaling_long_factors = 1.0
rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0
rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0
original_max_position_embeddings = config.max_position_embeddings
if hasattr(config, "longrope_scaling_short_factors"):
rope_scaling_short_factors = np.asarray(
config.longrope_scaling_short_factors).astype(np.float32)
rope_scaling_long_factors = np.asarray(
config.longrope_scaling_long_factors).astype(np.float32)
original_max_position_embeddings = config.original_max_position_embeddings
position_embedding_type = PositionEmbeddingType.long_rope
if self.small_variant:
rope_scaling_short_mscale = config.longrope_short_mscale
rope_scaling_long_mscale = config.longrope_long_mscale
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
@ -53,15 +95,18 @@ class Phi3DecoderLayer(Module):
rotary_embedding_base=config.rotary_base,
max_position_embeddings=config.max_position_embeddings,
dtype=config.dtype,
attention_mask_type=AttentionMaskType.causal,
bias=False,
attention_mask_type=attention_mask_type,
bias=self.small_variant,
q_scaling=q_scaling,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
rope_scaling_short_factors=rope_scaling_short_factors,
rope_scaling_long_factors=rope_scaling_long_factors,
rope_scaling_short_mscale=rope_scaling_short_mscale,
rope_scaling_long_mscale=rope_scaling_long_mscale,
original_max_position_embeddings=original_max_position_embeddings,
)
block_sparse_params=block_sparse_attn_params)
self.mlp = MLP(hidden_size=config.hidden_size,
ffn_hidden_size=config.intermediate_size,
@ -70,7 +115,7 @@ class Phi3DecoderLayer(Module):
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
bias=False)
bias=self.small_variant)
def forward(
self,
@ -88,7 +133,7 @@ class Phi3DecoderLayer(Module):
use_cache=use_cache,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
norm_before_bmm1=True,
norm_before_bmm1=not self.small_variant,
)
if use_cache:
@ -96,7 +141,8 @@ class Phi3DecoderLayer(Module):
post_attention_input = hidden_states + attention_output
post_attention_output = self.post_layernorm(post_attention_input)
feed_forward_hidden_states = self.mlp(post_attention_output, )
feed_forward_hidden_states = self.mlp(post_attention_output,
gegelu_limit=self.gegelu_limit)
hidden_states = post_attention_input + feed_forward_hidden_states
if use_cache:
return (hidden_states, presents)
@ -112,9 +158,15 @@ class Phi3Model(Module):
dtype=config.dtype)
self.layers = DecoderLayerList(Phi3DecoderLayer, config)
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
self.small_variant = config.architecture == "Phi3SmallForCausalLM"
if self.small_variant:
self.ln_f = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mup_embedding_multiplier = config.mup_embedding_multiplier
else:
self.ln_f = RmsNorm(normalized_shape=config.hidden_size,
eps=config.norm_epsilon,
dtype=config.dtype)
def forward(
self,
@ -132,6 +184,9 @@ class Phi3Model(Module):
] if prompt_embedding_table is not None else []
hidden_states = self.vocab_embedding(input_ids, *args)
if self.small_variant and self.mup_embedding_multiplier > 0.0:
hidden_states = hidden_states * self.mup_embedding_multiplier
hidden_states = self.layers(
hidden_states,
use_cache=use_cache,
@ -152,7 +207,6 @@ class Phi3Model(Module):
class Phi3ForCausalLM(DecoderModelForCausalLM):
def __init__(self, config: PretrainedConfig):
self.check_config(config)
transformer = Phi3Model(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
@ -167,25 +221,48 @@ class Phi3ForCausalLM(DecoderModelForCausalLM):
super().__init__(config, transformer, lm_head)
def check_config(self, config):
config.set_if_not_exist('rotary_base', 10000.0)
@classmethod
def convert_hf_checkpoint(cls,
hf_model_dir: str,
dtype: Optional[str] = "float16",
output_dir: Optional[str] = None,
**kwargs):
args=None):
'''
Convert Huggingface checkpoint to TRT-LLM checkpoint
'''
hf_model = AutoModelForCausalLM.from_pretrained(hf_model_dir,
torch_dtype="auto",
trust_remote_code=True)
config = convert_hf_config(hf_model.config, dtype=dtype, **kwargs)
weights = convert_hf_weights(hf_model, dtype=dtype, **kwargs)
config = convert_hf_config(hf_model.config, dtype, args)
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)
if output_dir:
save_checkpoint(output_dir, config=config, weights=weights)
small_variant = config['architecture'] == "Phi3SmallForCausalLM"
return {"weights": weights, "config": config}
def covert_and_save(rank):
weights = convert_hf_weights(hf_model, dtype, config, small_variant,
args, rank)
safetensors.torch.save_file(
weights, os.path.join(output_dir, f'rank{rank}.safetensors'))
world_size = args.tp_size * args.pp_size
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."
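For the small variant the layer sets `q_scaling = sqrt(attention_head_size)` so that the effective attention norm factor becomes `attention_head_size` rather than the usual `sqrt(attention_head_size)`, which is what MuP expects. A two-line check of that arithmetic with an illustrative head size:
```python
import math

# Illustrative head size; attention divides QK^T by its norm factor.
attention_head_size = 96.0
q_scaling = attention_head_size ** 0.5

# The usual scaled-dot-product factor is sqrt(head_size); multiplying it by
# q_scaling = sqrt(head_size) yields head_size, the MuP norm factor.
effective_norm_factor = math.sqrt(attention_head_size) * q_scaling
assert math.isclose(effective_norm_factor, attention_head_size)
print(effective_norm_factor)  # 96.0
```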

View File

@ -1,14 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,257 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
import numpy as np
import safetensors
from transformers import AutoModelForCausalLM
from ...._utils import pad_vocab_size
from ....functional import PositionEmbeddingType, Tensor
from ....layers import (MLP, Attention, AttentionMaskType,
BlockSparseAttnParams, Embedding, LayerNorm,
ParallelLMHead)
from ....module import Module
from ...modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
from .convert import convert_hf_config, convert_hf_weights
class Phi3SmallDecoderLayer(Module):
def __init__(self, config: PretrainedConfig, layer_idx: int):
super().__init__()
self.config = config
self.layer_idx = layer_idx
tp_group = config.mapping.tp_group
tp_size = config.mapping.tp_size
self.gegelu_limit = config.gegelu_limit
self.input_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
# MuP uses norm_factor=attention_head_size (rather than sqrt(attention_head_size))
# We achieve this using q_scaling = sqrt(attention_head_size)
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
attention_head_size = hidden_size / num_attention_heads
q_scaling = attention_head_size**.5
block_sparse = (
(layer_idx + 1) % config.dense_attention_every_n_layers) != 0
attention_mask_type = AttentionMaskType.blocksparse if block_sparse else AttentionMaskType.causal
block_sparse_attn_params = BlockSparseAttnParams(
config.blocksparse_block_size, config.blocksparse_homo_head_pattern,
config.blocksparse_num_local_blocks,
config.blocksparse_vertical_stride)
layers_range = config.mapping.pp_layers(config.num_hidden_layers)
local_layer_idx = layer_idx - layers_range[0]
position_embedding_type = PositionEmbeddingType.rope_gpt_neox
original_max_position_embeddings = config.max_position_embeddings
rope_scaling_short_factors, rope_scaling_long_factors = 1.0, 1.0
rope_scaling_short_mscale, rope_scaling_long_mscale = 1.0, 1.0
if hasattr(config, "longrope_scaling_short_factors"):
rope_scaling_short_factors = np.asarray(
config.longrope_scaling_short_factors).astype(np.float32)
rope_scaling_long_factors = np.asarray(
config.longrope_scaling_long_factors).astype(np.float32)
rope_scaling_short_mscale = config.longrope_short_mscale
rope_scaling_long_mscale = config.longrope_long_mscale
position_embedding_type = PositionEmbeddingType.long_rope
original_max_position_embeddings = config.original_max_position_embeddings
self.attention = Attention(
local_layer_idx=local_layer_idx,
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
num_kv_heads=config.num_kv_heads,
position_embedding_type=position_embedding_type,
rotary_embedding_base=config.rotary_embedding_base,
max_position_embeddings=config.max_position_embeddings,
original_max_position_embeddings=original_max_position_embeddings,
dtype=config.dtype,
attention_mask_type=attention_mask_type,
bias=True,
q_scaling=q_scaling,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode,
rope_scaling_short_factors=rope_scaling_short_factors,
rope_scaling_long_factors=rope_scaling_long_factors,
rope_scaling_short_mscale=rope_scaling_short_mscale,
rope_scaling_long_mscale=rope_scaling_long_mscale,
block_sparse_params=block_sparse_attn_params)
self.post_layernorm = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mlp = MLP(hidden_size=config.hidden_size,
ffn_hidden_size=config.intermediate_size,
hidden_act=config.hidden_act,
dtype=config.dtype,
tp_group=tp_group,
tp_size=tp_size,
quant_mode=config.quant_mode)
def forward(
self,
hidden_states: Tensor,
attention_mask=None,
use_cache=False,
kv_cache_params=None,
attention_params=None,
):
residual = hidden_states
input_layernorm_output = self.input_layernorm(hidden_states)
# Self attention
attention_output = self.attention(
input_layernorm_output,
attention_mask=attention_mask,
use_cache=use_cache,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
)
if use_cache:
attention_output, presents = attention_output
hidden_states = residual + attention_output
# Fully connected
residual = hidden_states
hidden_states = self.post_layernorm(hidden_states)
hidden_states = self.mlp(hidden_states, gegelu_limit=self.gegelu_limit)
hidden_states = residual + hidden_states
if use_cache:
return (hidden_states, presents)
return hidden_states
class Phi3SmallModel(Module):
def __init__(self, config: PretrainedConfig):
super().__init__()
self.vocab_embedding = Embedding(num_embeddings=config.vocab_size,
embedding_dim=config.hidden_size,
dtype=config.dtype)
self.layers = DecoderLayerList(Phi3SmallDecoderLayer, config)
self.ln_f = LayerNorm(normalized_shape=config.hidden_size,
dtype=config.dtype)
self.mup_embedding_multiplier = config.mup_embedding_multiplier
def forward(
self,
input_ids: Tensor,
position_ids=None,
use_cache=False,
attention_mask=None,
kv_cache_params=None,
attention_params=None,
prompt_embedding_table=None,
prompt_tasks=None,
prompt_vocab_size=None,
):
args = [prompt_embedding_table, prompt_tasks, prompt_vocab_size
] if prompt_embedding_table is not None else []
hidden_states = self.vocab_embedding(input_ids, *args)
if self.mup_embedding_multiplier is not None and self.mup_embedding_multiplier > 0.0:
hidden_states = hidden_states * self.mup_embedding_multiplier
hidden_states = self.layers(
hidden_states,
use_cache=use_cache,
attention_mask=attention_mask,
kv_cache_params=kv_cache_params,
attention_params=attention_params,
)
if use_cache:
hidden_states, presents = hidden_states
hidden_states = self.ln_f(hidden_states)
if use_cache:
return (hidden_states, tuple(presents))
return hidden_states
class Phi3SmallForCausalLM(DecoderModelForCausalLM):
def __init__(self, config: PretrainedConfig):
transformer = Phi3SmallModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
lm_head = ParallelLMHead(config.hidden_size,
vocab_size_padded,
bias=False,
dtype=config.dtype,
tp_group=config.mapping.tp_group,
tp_size=config.mapping.tp_size,
gather_output=True)
super().__init__(config, transformer, lm_head)
@classmethod
def convert_hf_checkpoint(cls, model_dir, dtype, output_dir, args=None):
'''
Convert Huggingface checkpoint to TRT-LLM checkpoint
'''
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype="auto",
trust_remote_code=True)
config = convert_hf_config(hf_model.config, dtype, args)
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)
def covert_and_save(rank):
weights = convert_hf_weights(hf_model, config, args, rank)
safetensors.torch.save_file(
weights, os.path.join(output_dir, f'rank{rank}.safetensors'))
world_size = args.tp_size * args.pp_size
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
try:
future.result()
except Exception as e:
traceback.print_exc()
exceptions.append(e)
assert len(
exceptions
) == 0, "Checkpoint conversion failed, please check error log."

View File

@ -15,10 +15,6 @@
import torch
from tensorrt_llm.quantization import QuantAlgo
from ...._utils import str_dtype_to_torch
def shuffle_qkv_weights(weights, config):
# Input weights are organized as
@ -29,8 +25,7 @@ def shuffle_qkv_weights(weights, config):
# (q00, q01, ..., qnm), (k0, k1, .., kn), (v0, v1, .., vn)
num_heads = config['num_attention_heads']
num_kv_heads = config['num_kv_heads'] if 'num_kv_heads' in config.keys(
) else config['num_key_value_heads']
num_kv_heads = config['num_key_value_heads']
num_q_per_kv = num_heads // num_kv_heads
hidden_size = config['hidden_size']
@ -152,7 +147,7 @@ def get_tllm_linear_weight(weight,
def split_weights_tp(config, weights, args, rank, dtype):
num_heads = config['num_attention_heads']
num_kv_heads = config['num_kv_heads']
num_kv_heads = config['num_key_value_heads']
hidden_size = config['hidden_size']
mha_mode = num_heads == num_kv_heads
@ -228,96 +223,3 @@ def split_weights_tp(config, weights, args, rank, dtype):
dim=0)
return weights
def convert_hf_weights(hf_model, config, args, rank):
torch_dtype = str_dtype_to_torch(args.dtype)
hf_state_dict = hf_model.state_dict()
weights = {}
# replace key name
for key, value in hf_state_dict.items():
# Decoder Layers
if "model.layers." in key:
key = key.replace("model.layers.", "transformer.layers.")
key = key.replace("self_attn.", "attention.")
key = key.replace("query_key_value.", "qkv.")
key = key.replace("mlp.up_proj.", "mlp.fc.")
key = key.replace("mlp.down_proj.", "mlp.proj.")
key = key.replace("post_attention_layernorm.", "post_layernorm.")
# Embedding
key = key.replace("model.embed_tokens.weight",
"transformer.vocab_embedding.weight")
# Final Layer norm
key = key.replace("model.final_layernorm.", "transformer.ln_f.")
weights[key] = value.to(torch_dtype).cpu()
weights['lm_head.weight'] = weights[
'transformer.vocab_embedding.weight'].clone()
# Transform QKV weights from custom Phi3Small format to TRT-LLM format
for key, value in weights.items():
if "qkv." in key:
weights[key] = shuffle_qkv_weights(weights[key], config)
weights = split_weights_tp(config, weights, args, rank, torch_dtype)
return weights
def convert_hf_config(hf_config, dtype, args):
config = {
'architecture': 'Phi3SmallForCausalLM',
'dtype': dtype,
'num_hidden_layers': hf_config.num_hidden_layers,
'num_attention_heads': hf_config.num_attention_heads,
'num_kv_heads': hf_config.num_key_value_heads,
'rotary_embedding_base': hf_config.rope_embedding_base,
'hidden_size': hf_config.hidden_size,
'intermediate_size': hf_config.intermediate_size,
'vocab_size': hf_config.vocab_size,
'max_position_embeddings': hf_config.max_position_embeddings,
'hidden_act': hf_config.hidden_act,
'share_embedding_table': False,
'gegelu_limit': hf_config.gegelu_limit,
'mup_attn_multiplier': hf_config.mup_attn_multiplier,
'mup_embedding_multiplier': hf_config.mup_embedding_multiplier,
'mup_use_scaling': hf_config.mup_use_scaling,
'mup_width_multiplier': hf_config.mup_width_multiplier,
'blocksparse_block_size': hf_config.blocksparse_block_size,
'blocksparse_homo_head_pattern':
hf_config.blocksparse_homo_head_pattern,
'blocksparse_num_local_blocks': hf_config.blocksparse_num_local_blocks,
'blocksparse_vertical_stride': hf_config.blocksparse_vert_stride,
'dense_attention_every_n_layers':
hf_config.dense_attention_every_n_layers,
}
if args is not None:
config.update({
'mapping': {
'world_size': args.tp_size * args.pp_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
}
})
if args.use_weight_only and args.weight_only_precision == 'int8':
config.update({'quantization': {'quant_algo': QuantAlgo.W8A16}})
elif args.use_weight_only and args.weight_only_precision == 'int4':
config.update({'quantization': {'quant_algo': QuantAlgo.W4A16}})
if hf_config.max_position_embeddings >= 128000:
config.update({
'original_max_position_embeddings':
hf_config.original_max_position_embeddings,
'longrope_scaling_short_factors':
hf_config.rope_scaling["short_factor"],
'longrope_scaling_long_factors':
hf_config.rope_scaling["long_factor"],
'longrope_long_mscale':
hf_config.rope_scaling["long_mscale"],
'longrope_short_mscale':
hf_config.rope_scaling["short_mscale"]
})
return config

View File

@ -398,9 +398,9 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
max_batch_size,
max_input_len,
max_seq_len,
max_num_tokens,
use_cache,
max_beam_width: int = 1,
max_num_tokens: int = None,
opt_num_tokens: int = None,
opt_batch_size: int = 0,
prompt_embedding_table_size: int = 0,
@ -434,44 +434,20 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
self.gather_context_logits = gather_context_logits
mapping = self.config.mapping
default_range = GenerationMixin.default_range
batch_range = default_range(max_batch_size)
bbd_range = [
batch_range[i] * ((max_draft_len + 1) if i != 0 else 1)
for i in range(len(batch_range))
]
inlen_range_cxt = default_range(max_input_len)
inlen_range_gen = [1, 1, max_draft_len + 1]
# basic inputs
enable_ctx_gen_opt_profiles = GenerationMixin.has_ctx_gen_opt_profiles(
use_gpt_attention_plugin, use_gemm_plugin, remove_input_padding,
paged_kv_cache)
if max_num_tokens is None:
max_num_tokens = max(
max_input_len * max_batch_size,
max_beam_width * (max_draft_len + 1) * max_batch_size)
if enable_ctx_gen_opt_profiles:
num_profiles = 2
inlen_range = [inlen_range_cxt, inlen_range_gen]
num_tokens_range_ctx = default_range(max_num_tokens)
num_tokens_range_gen = default_range(
max_batch_size * (max_draft_len + 1) * max_beam_width)
num_tokens_range = [num_tokens_range_ctx, num_tokens_range_gen]
position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
else:
max_bs_x_max_bw = max_batch_size * max_beam_width
if opt_num_tokens is None:
opt_num_tokens = max_bs_x_max_bw
if multiple_profiles:
num_tokens_range = GenerationMixin.split_num_tokens_range(
max_num_tokens)
else:
num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
num_profiles = len(num_tokens_range)
inlen_range = [[1, 1, max_input_len]] * num_profiles
position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
bb_range = [batch_range] * num_profiles
num_profiles, ranges = GenerationMixin.get_profiles_ranges(
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
max_input_len=max_input_len,
max_num_tokens=max_num_tokens,
max_draft_len=max_draft_len,
opt_batch_size=opt_batch_size,
opt_num_tokens=opt_num_tokens,
enable_ctx_gen_opt_profiles=enable_ctx_gen_opt_profiles,
multiple_profiles=multiple_profiles)
if remove_input_padding:
assert use_mamba_conv1d_plugin, "mamba_conv1d_plugin is needed to support remove_input_padding"
@ -479,14 +455,14 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('num_tokens', num_tokens_range),
('num_tokens', ranges['num_tokens_range']),
]))
position_ids = Tensor(name='position_ids',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('position_ids_num_tokens_range',
num_tokens_range),
ranges['num_tokens_range']),
]))
else:
input_ids = Tensor(name='input_ids',
@ -494,16 +470,17 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size_beam_width',
[batch_range] * num_profiles),
('input_len', inlen_range),
ranges['bb_range']),
('input_len', ranges['inlen_range']),
]))
position_ids = Tensor(name='position_ids',
dtype=trt.int32,
shape=[-1, -1],
dim_range=OrderedDict([
('batch_size_beam_width', bb_range),
('batch_size_beam_width',
ranges['bb_range']),
('position_ids_inlen_range',
position_ids_inlen_range),
ranges['position_ids_inlen_range']),
]))
if use_custom_all_reduce and mapping.tp_size > 1:
current_all_reduce_helper().set_workspace_tensor(
@ -559,7 +536,8 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
name='host_request_types',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size_beam_width', bb_range)]),
dim_range=OrderedDict([('batch_size_beam_width',
ranges['bb_range'])]),
)
last_token_ids = Tensor(
@ -567,7 +545,7 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([
('batch_size_last_token_ids', [bbd_range] * num_profiles),
('batch_size_last_token_ids', ranges['bbd_range']),
]),
)
last_token_ids_for_logits = None
@ -581,7 +559,8 @@ class RecurrentGemmaForCausalLM(PretrainedModel):
name='host_context_lengths',
dtype=trt.int32,
shape=[-1],
dim_range=OrderedDict([('batch_size_beam_width', bb_range)]),
dim_range=OrderedDict([('batch_size_beam_width',
ranges['bb_range'])]),
)
else:
host_context_lengths = None
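Note: for reference, a standalone sketch of the range bookkeeping that GenerationMixin.get_profiles_ranges is assumed to consolidate, reconstructed from the inline code removed above. default_range is approximated here as [1, (max + 1) // 2, max] and the multiple-profiles branch is omitted, so the real helper may differ in detail:
def default_range(max_val):
    # Assumed stand-in for GenerationMixin.default_range; the real helper may
    # pick a different "opt" point.
    return [1, (max_val + 1) // 2, max_val]

def build_profile_ranges(max_batch_size, max_beam_width, max_input_len,
                         max_num_tokens=None, max_draft_len=0,
                         opt_num_tokens=None,
                         enable_ctx_gen_opt_profiles=False):
    batch_range = default_range(max_batch_size)
    inlen_range_cxt = default_range(max_input_len)
    inlen_range_gen = [1, 1, max_draft_len + 1]
    if max_num_tokens is None:
        max_num_tokens = max(max_input_len * max_batch_size,
                             max_beam_width * (max_draft_len + 1) * max_batch_size)
    if enable_ctx_gen_opt_profiles:
        # One profile for the context phase, one for the generation phase.
        num_profiles = 2
        inlen_range = [inlen_range_cxt, inlen_range_gen]
        num_tokens_range = [
            default_range(max_num_tokens),
            default_range(max_batch_size * (max_draft_len + 1) * max_beam_width),
        ]
        position_ids_inlen_range = [inlen_range_cxt, [1, 1, 1]]
    else:
        # Single packed profile (the multiple-profiles branch is omitted here).
        if opt_num_tokens is None:
            opt_num_tokens = max_batch_size * max_beam_width
        num_profiles = 1
        num_tokens_range = [[1, opt_num_tokens, max_num_tokens]]
        inlen_range = [[1, 1, max_input_len]] * num_profiles
        position_ids_inlen_range = [[1, 1, max_input_len]] * num_profiles
    bb_range = [batch_range] * num_profiles
    bbd_range = [[v * ((max_draft_len + 1) if i != 0 else 1)
                  for i, v in enumerate(batch_range)]] * num_profiles
    return num_profiles, {
        'num_tokens_range': num_tokens_range,
        'inlen_range': inlen_range,
        'position_ids_inlen_range': position_ids_inlen_range,
        'bb_range': bb_range,
        'bbd_range': bbd_range,
    }

print(build_profile_ranges(max_batch_size=8, max_beam_width=1, max_input_len=512))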

View File

@ -883,7 +883,6 @@ class FP8Linear(Linear):
assert lora_runtime_params is None or default_net(
).plugin_config.lora_plugin == self.dtype
lora_hidden_state = x if lora_runtime_params is not None else None
if default_net().strongly_typed:
assert is_same_dtype(
x.dtype,
@ -894,8 +893,13 @@ class FP8Linear(Linear):
activation_scaling_factor = cast(activation_scaling_factor, self.dtype)
if x.dtype != trt.fp8:
quantized_out = quantize(x, activation_scaling_factor, 'fp8')
lora_hidden_state = x if lora_runtime_params is not None else None
else:
quantized_out = x
# TODO: add fp8 LoRA support
lora_hidden_state = dequantize(
x, activation_scaling_factor, -1,
self.dtype) if lora_runtime_params is not None else None
weights_scaling_factor = constant(
self.weights_scaling_factor.raw_value.copy())
@ -956,14 +960,18 @@ class FP8RowLinear(RowLinear):
assert lora_runtime_params is None or default_net(
).plugin_config.lora_plugin == self.dtype
lora_hidden_state = x if lora_runtime_params is not None else None
activation_scaling_factor = constant(
self.activation_scaling_factor.raw_value.copy())
activation_scaling_factor = cast(activation_scaling_factor, self.dtype)
if x.dtype != trt.fp8:
quantized_out = quantize(x, activation_scaling_factor, 'fp8')
lora_hidden_state = x if lora_runtime_params is not None else None
else:
quantized_out = x
# TODO: add fp8 LoRA support
lora_hidden_state = dequantize(
x, activation_scaling_factor, -1,
self.dtype) if lora_runtime_params is not None else None
weights_scaling_factor = constant(
self.weights_scaling_factor.raw_value.copy())
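Note: the FP8Linear/FP8RowLinear change above makes the LoRA path see a non-FP8 hidden state even when the layer input is already quantized. In the layer itself, quantize/dequantize are TRT-LLM graph operations; the torch snippet below only mirrors the decision logic (assumes torch >= 2.1 for the float8_e4m3fn dtype, and that dequantization is value = fp8 * scaling factor):
import torch

def lora_input_for_fp8_linear(x, scale, lora_enabled):
    # Torch stand-in for the branch above: LoRA needs a high-precision hidden
    # state, so an already-FP8 input is dequantized before the LoRA plugin.
    if not lora_enabled:
        return None
    if x.dtype != torch.float8_e4m3fn:
        return x                           # input is still high precision: use as-is
    return x.to(torch.float16) * scale     # dequantize: value = fp8 * scaling factor

x_fp16 = torch.randn(2, 4, dtype=torch.float16)
scale = torch.tensor(0.5, dtype=torch.float16)
x_fp8 = (x_fp16 / scale).to(torch.float8_e4m3fn)
print(lora_input_for_fp8_linear(x_fp16, scale, True).dtype)  # torch.float16
print(lora_input_for_fp8_linear(x_fp8, scale, True).dtype)   # torch.float16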

View File

@ -57,7 +57,10 @@ def quantize_layers(
if preprocess_init_params is not None:
preprocess_init_params(init_params, name, module)
quant_layer = quant_cls(**init_params)
setattr(parent, module_name, quant_layer)
if parent is not None:
setattr(parent, module_name, quant_layer)
else:
model = quant_layer
setattr(model, 'quant_mode', quant_config.quant_mode)
return model
@ -78,7 +81,7 @@ def weight_only_quantize(model, quant_config: QuantConfig):
module_name = name.rsplit('.', 1)[-1]
init_params["transb"] = module_name == "lm_head"
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@ -102,7 +105,7 @@ def weight_only_groupwise_quantize(model, quant_config: QuantConfig):
init_params[
"use_w4a8_awq"] = quant_config.quant_algo == QuantAlgo.W4A8_AWQ
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@ -120,7 +123,7 @@ def smooth_quantize_ootb(
RowLinear: Int8SmoothQuantRowLinear,
}
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,
@ -138,7 +141,7 @@ def smooth_quantize_plugin(model, quant_mode):
}
for name, layer, parent in model.named_modules_with_parent():
layer_name = name.rsplit('.', 1)[-1]
if layer_name in ['ln_f']:
if layer_name in ['ln_f', 'ln_embed']:
continue
quant_cls = None
@ -156,7 +159,10 @@ def smooth_quantize_plugin(model, quant_mode):
init_params[
"num_attention_heads"] = layer.num_attention_heads * layer.tp_size
quant_layer = quant_cls(**init_params)
setattr(parent, layer_name, quant_layer)
if parent is not None:
setattr(parent, layer_name, quant_layer)
else:
model = quant_layer
setattr(model, 'quant_mode', quant_mode)
return model
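Note: returning the model from quantize_layers/smooth_quantize_plugin matters when the module being swapped has no parent, i.e. the root module itself is replaced; setattr on a parent cannot express that, so the callers below now rebind with model = quantize_layers(...). A toy illustration of the same pattern (helper and class names here are stand-ins, not the repo's API):
import torch.nn as nn

class QuantLinearStub(nn.Linear):
    """Toy stand-in for a quantized Linear replacement."""

def named_modules_with_parent(model):
    # Minimal analogue of the named_modules_with_parent() iterator used above:
    # yields (qualified name, module, parent), with parent=None for the root.
    yield "", model, None
    for parent_name, parent in model.named_modules():
        for child_name, child in parent.named_children():
            full_name = f"{parent_name}.{child_name}" if parent_name else child_name
            yield full_name, child, parent

def quantize_linears(model):
    # Snapshot the triples first so replacements don't disturb the traversal.
    for name, module, parent in list(named_modules_with_parent(model)):
        if type(module) is nn.Linear:
            quant = QuantLinearStub(module.in_features, module.out_features)
            if parent is not None:
                setattr(parent, name.rsplit(".", 1)[-1], quant)
            else:
                model = quant  # the root module itself was replaced
    return model  # hence callers must rebind: model = quantize_linears(model)

print(type(quantize_linears(nn.Linear(4, 4))).__name__)                    # QuantLinearStub
print(type(quantize_linears(nn.Sequential(nn.Linear(4, 4)))[0]).__name__)  # QuantLinearStub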
@ -178,7 +184,7 @@ def fp8_quantize(model, quant_config: QuantConfig):
RowLinear: FP8RowLinear,
}
quantize_layers(
model = quantize_layers(
model,
quant_config,
quant_map,

View File

@ -31,10 +31,12 @@ import numpy as np
import safetensors
import torch
from datasets import load_dataset
from safetensors.torch import load_file, save_file
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
from ..logger import logger
from ..mapping import Mapping
from .mode import QuantAlgo
EMPTY_CFG = {
@ -122,7 +124,8 @@ MODEL_NAME_PATTERN_MAP = {
"Gemma": "gemma",
"MixtralForCausalLM": "llama",
"ArcticForCausalLM": "llama",
"Phi3SmallForCausalLM": "phi",
"Phi3SmallForCausalLM": "phi3small",
"Phi3ForCausalLM": "phi3",
}
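Note: splitting "phi" into distinct "phi3small" and "phi3" entries lets the quantizer pick the right ModelOpt model type per HF architecture. A simplified sketch of how such a pattern map is typically consulted (the lookup helper below is illustrative; the actual function in quantize_by_modelopt.py may differ):
MODEL_NAME_PATTERN_MAP = {
    "MixtralForCausalLM": "llama",
    "ArcticForCausalLM": "llama",
    "Phi3SmallForCausalLM": "phi3small",
    "Phi3ForCausalLM": "phi3",
}

def get_model_type(model):
    # Match the model's class name against the pattern keys.
    for pattern, model_type in MODEL_NAME_PATTERN_MAP.items():
        if pattern in type(model).__name__:
            return model_type
    return None

class Phi3SmallForCausalLM:  # stand-in for the HF model class
    pass

print(get_model_type(Phi3SmallForCausalLM()))  # phi3small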
@ -263,10 +266,95 @@ def quantize_model(model, quant_cfg, calib_dataloader=None):
return model
def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat,
kv_cache_dtype, calib_size, batch_size,
calib_max_seq_length, awq_block_size, output_dir,
tp_size, pp_size, seed, tokenizer_max_seq_length):
def combine_medusa_weight(tp_size, pp_size, base_model_output_dir,
num_medusa_heads, num_medusa_layers, max_draft_len,
medusa_hidden_act, medusa_model_dir,
quant_medusa_head):
with open(f"{medusa_model_dir}/config.json", "r") as fp:
medusa_config = json.load(fp)
num_medusa_heads_from_config = medusa_config.get('medusa_num_heads',
num_medusa_heads)
num_medusa_layers = medusa_config.get('medusa_num_layers',
num_medusa_layers)
if num_medusa_heads is None:
num_medusa_heads = num_medusa_heads_from_config
assert max_draft_len > 0, "should have max_draft_len > 0"
world_size = tp_size * pp_size
# Process each rank in turn
for rank in range(world_size):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=tp_size,
pp_size=pp_size)
# 1. Load medusa weight for each rank
from tensorrt_llm.models.medusa.weight import load_medusa_hf
medusa_weights = load_medusa_hf(medusa_path=medusa_model_dir,
num_medusa_heads=num_medusa_heads,
num_medusa_layers=num_medusa_layers,
mapping=mapping,
dtype="float16")
# 2. Load base model safetensors (after quant)
base_model_weights = load_file(
f"{base_model_output_dir}/rank{rank}.safetensors")
# 3. Combine and save weight
base_model_weights.update(medusa_weights)
save_file(base_model_weights,
f"{base_model_output_dir}/rank{rank}.safetensors")
# 4. Add medusa config into config.json
with open(f"{base_model_output_dir}/config.json", 'r') as f:
base_model_config = json.load(f)
f.close()
with open(f"{base_model_output_dir}/config.json", 'w') as f:
base_model_config['architecture'] = "MedusaForCausalLM"
base_model_config['quantization']['exclude_modules'] = [
'lm_head',
'*router',
'*vocab_embedding',
'*position_embedding',
'*block_embedding',
]
if not quant_medusa_head:
base_model_config['quantization']['exclude_modules'].append(
'*medusa_heads*')
base_model_config['max_draft_len'] = max_draft_len
base_model_config['num_medusa_heads'] = num_medusa_heads
base_model_config['num_medusa_layers'] = num_medusa_layers
json.dump(base_model_config, f, indent=4)
torch.cuda.empty_cache()
print("Combine medusa heads' weight, done.")
def quantize_and_export(*,
model_dir,
device,
calib_dataset,
dtype,
qformat,
kv_cache_dtype,
calib_size,
batch_size,
calib_max_seq_length,
awq_block_size,
output_dir,
tp_size,
pp_size,
seed,
tokenizer_max_seq_length,
num_medusa_heads=None,
num_medusa_layers=None,
max_draft_len=None,
medusa_hidden_act=None,
medusa_model_dir=None,
quant_medusa_head=None):
'''
Load the model from model_dir, call Modelopt to quantize it, and then export
the quantized model as a TRT-LLM checkpoint.
@ -419,24 +507,16 @@ def quantize_and_export(*, model_dir, device, calib_dataset, dtype, qformat,
with open(f"{export_path}/config.json", "w") as f:
json.dump(tensorrt_llm_config, f, indent=4)
if model_type == 'phi':
with open(f"{export_path}/config.json", "r") as f:
tensorrt_llm_config = json.load(f)
phi_config = AutoConfig.from_pretrained(model_dir,
trust_remote_code=True)
from ..models.phi3.phi3small.convert import \
convert_hf_config as phi_config_converter
phi_config = phi_config_converter(phi_config, dtype, None)
for key, value in phi_config.items():
tensorrt_llm_config[key] = value
with open(f"{export_path}/config.json", "w") as f:
json.dump(tensorrt_llm_config, f, indent=4)
torch.cuda.empty_cache()  # otherwise torch keeps holding GPU memory, leaving less free memory for later steps such as engine build
# Workaround for combining the medusa head weights
# TODO: move this integration into modelopt to avoid redundant reading and writing
if medusa_model_dir is not None:
combine_medusa_weight(tp_size, pp_size, export_path,
num_medusa_heads, num_medusa_layers,
max_draft_len, medusa_hidden_act,
medusa_model_dir, quant_medusa_head)
end_time = time.time()
print(
"Quantized model exported to {} \nTotal time used {:.2f} s.".format(

View File

@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.11.0.dev2024061800"
__version__ = "0.11.0.dev2024062500"

View File

@ -84,10 +84,12 @@ class TestMamba(unittest.TestCase):
hf_config, hf_path, hf_mamba, load_mode, dtype)
with net_guard(network):
network.set_named_parameters(tensorrt_llm_mamba.named_parameters())
inputs = tensorrt_llm_mamba.prepare_inputs(batch_size,
input_len,
input_len + output_len,
use_cache=False)
inputs = tensorrt_llm_mamba.prepare_inputs(
batch_size,
input_len,
input_len + output_len,
max_num_tokens=batch_size * input_len,
use_cache=False)
# Prepare
tensorrt_llm_mamba(**inputs)
return network

View File

@ -80,7 +80,7 @@ test_gptq() {
python convert_checkpoint.py --model_dir ${MODEL} \
--output_dir ./tllm_checkpoint/2gpu_gptq \
--dtype float16 \
--modelopt_quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
--quant_ckpt_path /home/scratch.trt_llm_data/llm-models/int4-quantized-gptq-awq/llama-7b-4bit-gs128.safetensors \
--use_weight_only \
--weight_only_precision int4_gptq \
--per_group \