diff --git a/README.md b/README.md index b5247c457f..7deeac5ee7 100644 --- a/README.md +++ b/README.md @@ -8,7 +8,6 @@ TensorRT-LLM [![python](https://img.shields.io/badge/python-3.10.12-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-12.2-green)](https://developer.nvidia.com/cuda-downloads) [![trt](https://img.shields.io/badge/TRT-9.1-green)](https://developer.nvidia.com/tensorrt) -[![version](https://img.shields.io/badge/release-0.5.0-green)](./setup.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](./LICENSE) [Architecture](./docs/source/architecture.md)   |   [Results](./docs/source/performance.md)   |   [Examples](./examples/)   |   [Documentation](./docs/source/) @@ -173,13 +172,13 @@ Lovelace architectures. Certain limitations may, however, apply. Various numerical precisions are supported in TensorRT-LLM. The support for some of those numerical features require specific architectures: -| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 | -| :--------------------------- | :---- | :---- | :---- | :--- | :--- | :--- | -| Volta (SM70) | Y | Y | N | N | Y | Y | -| Turing (SM75) | Y | Y | N | N | Y | Y | -| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y | -| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y | -| Hopper (SM90) | Y | Y | Y | Y | Y | Y | +| | FP32 | FP16 | BF16 | FP8 | INT8 | INT4 | +| :------------------ | :--- | :--- | :--- | :--- | :--- | :--- | +| Volta (SM70) | Y | Y | N | N | Y | Y | +| Turing (SM75) | Y | Y | N | N | Y | Y | +| Ampere (SM80, SM86) | Y | Y | Y | N | Y | Y | +| Ada-Lovelace (SM89) | Y | Y | Y | Y | Y | Y | +| Hopper (SM90) | Y | Y | Y | Y | Y | Y | In this release of TensorRT-LLM, the support for FP8 and quantized data types (INT8 or INT4) is not implemented for all the models. 
See the @@ -217,8 +216,7 @@ The list of supported models is: * [Bert](examples/bert) * [Blip2](examples/blip2) * [BLOOM](examples/bloom) -* [ChatGLM-6B](examples/chatglm6b) -* [ChatGLM2-6B](examples/chatglm2-6b/) +* [ChatGLM](examples/chatglm), including ChatGLM-6B, ChatGLM2-6B, ChatGLM2-6B-32k, ChatGLM3-6B, ChatGLM3-6B-32k * [Falcon](examples/falcon) * [GPT](examples/gpt) * [GPT-J](examples/gptj) @@ -230,6 +228,7 @@ The list of supported models is: * [OPT](examples/opt) * [SantaCoder](examples/gpt) * [StarCoder](examples/gpt) +* [InternLM](examples/internlm) ## Performance diff --git a/benchmarks/cpp/gptSessionBenchmark.cpp b/benchmarks/cpp/gptSessionBenchmark.cpp index 1b44c55c34..aff5bf0e3c 100644 --- a/benchmarks/cpp/gptSessionBenchmark.cpp +++ b/benchmarks/cpp/gptSessionBenchmark.cpp @@ -18,12 +18,12 @@ #include "tensorrt_llm/plugins/api/tllmPlugin.h" #include "tensorrt_llm/runtime/gptJsonConfig.h" #include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/memoryCounters.h" #include "tensorrt_llm/runtime/tllmLogger.h" #include #include #include -#include #include #include @@ -39,14 +39,22 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con std::shared_ptr const& logger, int warmUp, int numRuns, int duration, GptSession::Config& sessionConfig, bool cudaGraphMode) { - auto const json = GptJsonConfig::parse(dataPath / "config.json"); + + std::string modelNameHyphen = modelName; + std::filesystem::path jsonFileName = dataPath / "config.json"; + if (tc::strStartsWith(modelName, "chatglm")) + { + std::replace(modelNameHyphen.begin(), modelNameHyphen.end(), '_', '-'); + jsonFileName = dataPath / (modelNameHyphen + std::string("-config.json")); + } + auto const json = GptJsonConfig::parse(jsonFileName); auto const modelConfig = json.getModelConfig(); auto const inputPacked = modelConfig.usePackedInput(); SizeType deviceCount{0}; TLLM_CUDA_CHECK(cudaGetDeviceCount(&deviceCount)); auto const worldConfig = WorldConfig::mpi(*logger, deviceCount, json.getTensorParallelism(), json.getPipelineParallelism()); - auto const enginePath = dataPath / json.engineFilename(worldConfig, modelName); + auto const enginePath = dataPath / json.engineFilename(worldConfig, modelNameHyphen); auto const dtype = modelConfig.getDataType(); auto const useHalf = (dtype == nvinfer1::DataType::kHALF); @@ -78,10 +86,15 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con auto constexpr endId = 50256; auto constexpr padId = 50256; + auto& memoryCounter = MemoryCounters::getInstance(); + TLLM_LOG_INFO(memoryCounter.toString()); + for (auto const batchSize : batchSizes) { try { + TLLM_LOG_INFO(memoryCounter.toString()); + std::vector inputLenghtsHost(batchSize, maxInputLength); auto inputLenghts = bufferManager.copyFrom(inputLenghtsHost, ITensor::makeShape({batchSize}), MemoryType::kGPU); @@ -99,6 +112,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con inputIds = bufferManager.copyFrom( inputsHost, ITensor::makeShape({batchSize, maxInputLength}), MemoryType::kGPU); } + + TLLM_LOG_INFO(memoryCounter.toString()); + GenerationInput generationInput{ endId, padId, std::move(inputIds), std::move(inputLenghts), inputPacked}; @@ -107,6 +123,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32)}; + TLLM_LOG_INFO(memoryCounter.toString()); + for 
(auto r = 0; r < warmUp; ++r) { SizeType numSteps = 0; @@ -118,6 +136,8 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con } cudaDeviceSynchronize(); + TLLM_LOG_INFO(memoryCounter.toString()); + int iterIdx = 0; float curDuration = 0; while (iterIdx < numRuns || curDuration / 1000 < duration) @@ -134,6 +154,9 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con iterIdx += 1; curDuration += std::chrono::duration(end - start).count(); } + + TLLM_LOG_INFO(memoryCounter.toString()); + printf("Benchmarking done. Iteration: %d, duration: %.2f sec.\n", iterIdx, curDuration / 1000); if (worldConfig.getRank() == 0) @@ -159,7 +182,7 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con // We can ignore the OOM exception and continue the rest of the benchmark if (worldConfig.getRank() == 0) { - printf("%s", e.what()); + TLLM_LOG_EXCEPTION(e); printf( "[BENCHMARK] batch_size %d input_length %d output_length %d latency(ms) N/A tokensPerSec N/A\n", batchSize, maxInputLength, maxNewTokens); @@ -167,6 +190,7 @@ void benchmarkGptSession(std::string const& modelName, std::filesystem::path con continue; } } + TLLM_LOG_INFO(memoryCounter.toString()); } } @@ -200,8 +224,8 @@ int main(int argc, char* argv[]) options.add_options()("duration", "Minimal duration of iterations to measure in seconds.", cxxopts::value()->default_value("60")); - options.add_options()( - "num_micro_batches", "Number of micro batches if enabling pipeline parallelism.", cxxopts::value()); + options.add_options()("ctx_micro_batch_size", "Batch size for context phase.", cxxopts::value()); + options.add_options()("gen_micro_batch_size", "Batch size for generation phase.", cxxopts::value()); options.add_options()("max_tokens_in_paged_kvcache", "Max tokens in paged K-V Cache.", cxxopts::value()); options.add_options()( "kv_cache_free_gpu_mem_fraction", "K-V Cache Free Gpu Mem Fraction.", cxxopts::value()); @@ -281,10 +305,15 @@ int main(int argc, char* argv[]) } GptSession::Config sessionConfig{0, 0, 0}; - // Argument: Number of micro batches - if (result.count("num_micro_batches")) + // Argument: Batch size for context phase + if (result.count("ctx_micro_batch_size")) { - sessionConfig.numMicroBatches = result["num_micro_batches"].as(); + sessionConfig.ctxMicroBatchSize = result["ctx_micro_batch_size"].as(); + } + // Argument: Batch size for generation phase + if (result.count("gen_micro_batch_size")) + { + sessionConfig.genMicroBatchSize = result["gen_micro_batch_size"].as(); } // Argument: Max tokens in paged K-V Cache if (result.count("max_tokens_in_paged_kvcache")) diff --git a/benchmarks/python/allowed_configs.py b/benchmarks/python/allowed_configs.py index 922f955a71..5868456318 100644 --- a/benchmarks/python/allowed_configs.py +++ b/benchmarks/python/allowed_configs.py @@ -48,6 +48,7 @@ class BuildConfig(BaseModel, extra=Extra.allow): # default value to be None, not 0 or 1 to prevent misuse rotary_pct: Optional[float] = None bias: bool = True + quantization: Optional[str] = None class ModelConfig(BaseModel): @@ -121,7 +122,7 @@ _allowed_configs = { max_input_len=512, max_output_len=200, builder_opt=None, - use_smooth_quant=True, + quantization="int8_sq_per_tensor", )), "gpt_350m_sq_per_token_channel": ModelConfig(name="gpt_350m_sq_per_token_channel", @@ -138,9 +139,7 @@ _allowed_configs = { max_input_len=512, max_output_len=200, builder_opt=None, - use_smooth_quant=True, - per_token=True, - per_channel=True, + 
quantization="int8_sq_per_token_channel", )), "gpt-next_2b": ModelConfig(name="gpt-next_2b", @@ -318,7 +317,7 @@ _allowed_configs = { max_input_len=512, max_output_len=200, builder_opt=None, - use_smooth_quant=True)), + quantization="int8_sq_per_tensor")), "gptj_6b": ModelConfig(name="gptj_6b", family="gptj", @@ -354,7 +353,7 @@ _allowed_configs = { builder_opt=None, )), "chatglm_6b": - ModelConfig(name="chatglm_6b", + ModelConfig(name="chatglm-6b", family="chatglm", benchmark_type="gpt", build_config=BuildConfig( @@ -371,7 +370,7 @@ _allowed_configs = { remove_input_padding=False, )), "chatglm2_6b": - ModelConfig(name="chatglm2_6b", + ModelConfig(name="chatglm2-6b", family="chatglm2", benchmark_type="gpt", build_config=BuildConfig( @@ -387,6 +386,23 @@ _allowed_configs = { builder_opt=None, remove_input_padding=False, )), + "chatglm3_6b": + ModelConfig(name="chatglm3-6b", + family="chatglm3", + benchmark_type="gpt", + build_config=BuildConfig( + num_layers=28, + num_heads=32, + hidden_size=4096, + vocab_size=65024, + hidden_act='swiglu', + n_positions=2048, + max_batch_size=256, + max_input_len=512, + max_output_len=200, + builder_opt=None, + remove_input_padding=False, + )), "bloom_560m": ModelConfig(name="bloom_560m", family="bloom", diff --git a/benchmarks/python/benchmark.py b/benchmarks/python/benchmark.py index aeff8b67ce..f13f35a950 100644 --- a/benchmarks/python/benchmark.py +++ b/benchmarks/python/benchmark.py @@ -18,15 +18,11 @@ from multiprocessing import Process, Queue from time import time import torch -from allowed_configs import get_allowed_models -from bert_benchmark import BERTBenchmark -from gpt_benchmark import GPTBenchmark from mem_monitor import mem_monitor -from tensorrt_llm.logger import logger - def parse_arguments(): + from allowed_configs import get_allowed_models parser = argparse.ArgumentParser( description='Benchmark TensorRT-LLM models.') parser.add_argument('-m', @@ -172,18 +168,7 @@ def parse_arguments(): help= 'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.' ) - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - help='Use FP8 Linear layer for LMHead, Attention QKV/Dense, and MLP.') - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV' - ) + parser.add_argument('--csv', default=False, action="store_true", @@ -199,11 +184,38 @@ def parse_arguments(): help= 'Use latency-optimized all-reduce for tensor parallelism. Gives better performance with NVLink.' ) + parser.add_argument( + '--strongly_typed', + default=False, + action='store_true', + help= + 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' + ) + parser.add_argument( + '--quantization', + type=str, + default=None, + choices=[ + 'fp8', 'fp8_gemm', 'fp8_kv_cache', 'int8_sq_per_tensor', + 'int8_sq_per_token_channel', 'int8_weight_only', 'int4_weight_only', + 'int4_weight_only_awq', 'int4_weight_only_gptq' + ], + help="Optimize the model with specified quantization recipe") return parser.parse_args() def main(args): + # We import tensorrt_llm here because MPI is initialized when + # tensorrt_llm is imported, but mpi4py does not work well with + # the start method `spawn` of Python multiprocessing, + # so we set the start method first, then initialize MPI. 
+ from allowed_configs import get_allowed_models + from bert_benchmark import BERTBenchmark + from gpt_benchmark import GPTBenchmark + + from tensorrt_llm.logger import logger + logger.set_level(args.log_level) # Batch size @@ -235,10 +247,10 @@ def main(args): args.max_output_len, args.max_batch_size, force_num_layer_1=args.force_num_layer_1, - enable_fp8=args.enable_fp8, - fp8_kv_cache=args.fp8_kv_cache, enable_cuda_graph=args.enable_cuda_graph, - enable_custom_all_reduce=args.enable_custom_all_reduce) + enable_custom_all_reduce=args.enable_custom_all_reduce, + strongly_typed=args.strongly_typed, + quantization=args.quantization) elif args.model in get_allowed_models(benchmark_type="bert"): benchmarker = BERTBenchmark(args.engine_dir, args.model, @@ -273,8 +285,8 @@ def main(args): # Launch a subprocess to monitor memory usage q1 = Queue() # q1 is used for sending signal to subprocess q2 = Queue() # q2 is used for receiving results from subprocess - p = Process(target=mem_monitor, args=(q1, q2)) - p.start() + mem_monitor_process = Process(target=mem_monitor, args=(q1, q2)) + mem_monitor_process.start() iter_idx = 0 try: @@ -301,14 +313,14 @@ def main(args): except Exception as e: print("Found exception during benchmarking", e.with_traceback()) - p.kill() + mem_monitor_process.kill() raise e logger.debug("Sending signal to mem monitor process, start") q1.put(1) logger.debug("Sending signal to mem monitor process, done") peak_gpu_used = q2.get() logger.debug("Get peak gpu memory usage from mem monitor process, done") - p.join() + mem_monitor_process.join() logger.debug("Memory monitor process joined") latency = round(sum(latencies) / iter_idx, 3) diff --git a/benchmarks/python/gpt_benchmark.py b/benchmarks/python/gpt_benchmark.py index 94f60d0dcc..88ec9f7443 100644 --- a/benchmarks/python/gpt_benchmark.py +++ b/benchmarks/python/gpt_benchmark.py @@ -24,8 +24,7 @@ import tensorrt_llm from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.layers import PositionEmbeddingType -from tensorrt_llm.models import (fp8_quantize, smooth_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -61,6 +60,7 @@ class GPTBenchmark(BaseBenchmark): self.fuse_bias = True self.cuda_graph_mode = kwargs.get('enable_cuda_graph', False) + self.strongly_typed = kwargs.get('strongly_typed', False) self.enable_custom_all_reduce = enable_custom_all_reduce if engine_dir is not None: @@ -73,12 +73,9 @@ class GPTBenchmark(BaseBenchmark): # Build engine self.world_size = tensorrt_llm.mpi_world_size() self.apply_query_key_layer_scaling = False - self.use_smooth_quant = False - # this attribute is not stored in allowed_config - self.enable_fp8 = kwargs.get('enable_fp8', False) - self.fp8_kv_cache = kwargs.get('fp8_kv_cache', False) self.use_weight_only = False + self.per_group = False self.weight_only_precision = 'int8' self.per_token = False self.per_channel = False @@ -95,12 +92,17 @@ class GPTBenchmark(BaseBenchmark): self.use_rmsnorm_plugin = False self.use_lookup_plugin = non_mha_plg_dtype self.enable_context_fmha = use_mha_plugin - self.quant_mode = QuantMode(0) + self.remove_input_padding = use_non_mha_plugin for key, value in get_build_config(model_name).items(): setattr(self, key, value) + if self.quantization is None: + self.quantization = kwargs.get('quantization', None) + + 
self.set_quantization() + # Override the n_position/max_input_len/max_output_len/max_batch_size to value from cmd line if that's specified. if n_positions is not None: assert isinstance( @@ -126,20 +128,6 @@ class GPTBenchmark(BaseBenchmark): self.num_kv_heads = self.num_heads if kwargs.get('force_num_layer_1', False): self.num_layers = 1 - - if self.use_smooth_quant: - self.quant_mode = QuantMode.use_smooth_quant( - self.per_token, self.per_channel) - elif self.use_weight_only: - self.quant_mode = QuantMode.use_weight_only( - self.weight_only_precision == 'int4') - - if self.enable_fp8: - self.quant_mode = self.quant_mode.set_fp8_qdq() - - if self.fp8_kv_cache: - self.quant_mode = self.quant_mode.set_fp8_kv_cache() - engine_buffer = self.build() assert engine_buffer is not None @@ -155,16 +143,25 @@ class GPTBenchmark(BaseBenchmark): quant_mode=self.quant_mode, use_custom_all_reduce=self.enable_custom_all_reduce, ) - if model_name == 'chatglm_6b': + if model_name == 'chatglm-6b': self.sampling_config = tensorrt_llm.runtime.SamplingConfig( end_id=130005, pad_id=3, num_beams=num_beams, top_k=top_k, top_p=top_p) - self.decoder = tensorrt_llm.runtime.ChatGLM6BHeadModelGenerationSession( + self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession( model_config, engine_buffer, self.runtime_mapping) - elif model_name == 'chatglm2_6b': + elif model_name == 'chatglm2-6b': + self.sampling_config = tensorrt_llm.runtime.SamplingConfig( + end_id=2, + pad_id=0, + num_beams=num_beams, + top_k=top_k, + top_p=top_p) + self.decoder = tensorrt_llm.runtime.GenerationSession( + model_config, engine_buffer, self.runtime_mapping) + elif model_name == 'chatglm3-6b': self.sampling_config = tensorrt_llm.runtime.SamplingConfig( end_id=2, pad_id=0, @@ -212,6 +209,75 @@ class GPTBenchmark(BaseBenchmark): self.decoder.setup(batch_size, inlen, outlen, beam_width=self.num_beams) return (input_ids, input_lengths) + def set_quantization(self): + self.quant_mode = QuantMode(0) + + if self.quantization == "fp8": + self.strongly_typed = True + self.quant_mode = self.quant_mode.set_fp8_qdq() + self.quant_mode = self.quant_mode.set_fp8_kv_cache() + + elif self.quantization == "fp8_gemm": + self.strongly_typed = True + self.quant_mode = self.quant_mode.set_fp8_qdq() + + elif self.quantization == "fp8_kv_cache": + self.strongly_typed = True + self.quant_mode = self.quant_mode.set_fp8_kv_cache() + + elif self.quantization == "int8_sq_per_tensor": + self.use_smooth_quant = True + self.quant_mode = QuantMode.use_smooth_quant( + self.per_token, self.per_channel) + + elif self.quantization == "int8_sq_per_token_channel": + self.use_smooth_quant = True + self.per_token = True + self.per_channel = True + self.quant_mode = QuantMode.use_smooth_quant( + self.per_token, self.per_channel) + + elif self.quantization == "int8_weight_only": + self.use_smooth_quant = False + self.use_weight_only = True + self.weight_only_precision = 'int8' + self.quant_mode = QuantMode.use_weight_only(False) + + elif self.quantization == "int4_weight_only": + self.use_weight_only = True + self.weight_only_precision = 'int4' + self.quant_mode = QuantMode.use_weight_only(True) + + elif self.quantization == "int4_weight_only_awq": + self.use_weight_only = True + self.per_group = True + self.weight_only_precision = 'int4_awq' + self.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + + elif self.quantization == 
"int4_weight_only_gptq": + self.use_weight_only = True + self.per_group = True + self.weight_only_precision = 'int4_gptq' + self.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + + elif self.quantization == None: + pass + + else: + raise Exception(f'{0} is invalid config: {self.quantization}') + def build(self): builder = Builder() builder_config = builder.create_builder_config( @@ -232,10 +298,10 @@ class GPTBenchmark(BaseBenchmark): max_input_len=self.max_input_len, max_output_len=self.max_output_len, int8=self.quant_mode.has_act_and_weight_quant(), - fp8=self.quant_mode.has_fp8_qdq(), quant_mode=self.quant_mode, use_refit=self.refit, - opt_level=self.builder_opt) + opt_level=self.builder_opt, + strongly_typed=self.strongly_typed) engine_name = get_engine_name(self.model_name, self.dtype, self.world_size, self.runtime_rank) @@ -322,7 +388,7 @@ class GPTBenchmark(BaseBenchmark): apply_query_key_layer_scaling=builder_config. apply_query_key_layer_scaling) elif family == "chatglm": - tensorrt_llm_model = tensorrt_llm.models.ChatGLM6BHeadModel( + tensorrt_llm_model = tensorrt_llm.models.ChatGLMHeadModel( num_layers=self.num_layers, num_heads=self.num_heads, hidden_size=self.hidden_size, @@ -335,9 +401,10 @@ class GPTBenchmark(BaseBenchmark): tp_size=self.world_size), # TP only apply_query_key_layer_scaling=builder_config. apply_query_key_layer_scaling, - quant_mode=self.quant_mode) + quant_mode=self.quant_mode, + model_version="1") elif family == "chatglm2": - tensorrt_llm_model = tensorrt_llm.models.ChatGLM2_6BHeadModel( + tensorrt_llm_model = tensorrt_llm.models.ChatGLMHeadModel( num_layers=self.num_layers, num_heads=self.num_heads, hidden_size=self.hidden_size, @@ -350,7 +417,24 @@ class GPTBenchmark(BaseBenchmark): tp_size=self.world_size), # TP only apply_query_key_layer_scaling=builder_config. apply_query_key_layer_scaling, - quant_mode=self.quant_mode) + quant_mode=self.quant_mode, + model_version="2") + elif family == "chatglm3": + tensorrt_llm_model = tensorrt_llm.models.ChatGLMHeadModel( + num_layers=self.num_layers, + num_heads=self.num_heads, + hidden_size=self.hidden_size, + vocab_size=self.vocab_size, + hidden_act=self.hidden_act, + max_position_embeddings=self.n_positions, + dtype=kv_dtype, + mapping=tensorrt_llm.Mapping( + world_size=self.world_size, + tp_size=self.world_size), # TP only + apply_query_key_layer_scaling=builder_config. 
+ apply_query_key_layer_scaling, + quant_mode=self.quant_mode, + model_version="3") elif family == "bloom": tensorrt_llm_model = tensorrt_llm.models.BloomForCausalLM( num_layers=self.num_layers, @@ -362,6 +446,7 @@ class GPTBenchmark(BaseBenchmark): mapping=tensorrt_llm.Mapping( world_size=self.world_size, tp_size=self.world_size), # TP only + quant_mode=self.quant_mode, use_parallel_embedding=(self.model_name == 'bloom_176b')) elif family == "falcon": tensorrt_llm_model = tensorrt_llm.models.FalconForCausalLM( @@ -381,27 +466,34 @@ class GPTBenchmark(BaseBenchmark): else: raise Exception(f'Unexpected model: {self.model_name}') - if self.use_smooth_quant: - tensorrt_llm_model = smooth_quantize(tensorrt_llm_model, - self.quant_mode) - elif self.use_weight_only and self.weight_only_precision == 'int8': - tensorrt_llm_model = weight_only_quantize( - tensorrt_llm_model, QuantMode.use_weight_only()) - elif self.use_weight_only and self.weight_only_precision == 'int4': - tensorrt_llm_model = weight_only_quantize( - tensorrt_llm_model, - QuantMode.use_weight_only(use_int4_weights=True)) - elif self.enable_fp8 or self.fp8_kv_cache: - tensorrt_llm_model = fp8_quantize(tensorrt_llm_model, - self.quant_mode) + quant_kwargs = {} + if family == "llama" and self.use_weight_only: + if self.weight_only_precision == 'int4_awq': + quant_kwargs = { + "group_size": 128, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": [], + } + elif self.weight_only_precision == 'int4_gptq': + quant_kwargs = { + "group_size": 128, + "zero": True, + "pre_quant_scale": False, + } + tensorrt_llm_model = quantize_model(tensorrt_llm_model, self.quant_mode, + **quant_kwargs) # Module -> Network network = builder.create_network() network.trt_network.name = engine_name + + not_fp8_quantization = self.quantization is None or "fp8" not in self.quantization + if self.use_gpt_attention_plugin: network.plugin_config.set_gpt_attention_plugin( dtype=self.use_gpt_attention_plugin) - if self.use_gemm_plugin: + if self.use_gemm_plugin and not_fp8_quantization: network.plugin_config.set_gemm_plugin(dtype=self.use_gemm_plugin) if self.use_layernorm_plugin: network.plugin_config.set_layernorm_plugin( diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index d706f6e816..ad30374a11 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -27,10 +27,14 @@ project(tensorrt_llm LANGUAGES CXX) # Build options option(BUILD_PYT "Build in PyTorch TorchScript class mode" ON) +option(BUILD_PYBIND "Build Python bindings for C++ runtime and batch manager" + OFF) option(BUILD_TESTS "Build Google tests" ON) option(BUILD_BENCHMARKS "Build benchmarks" ON) option(NVTX_DISABLE "Disable all NVTX features" ON) option(WARNING_IS_ERROR "Treat all warnings as errors" OFF) +option(FAST_BUILD "Skip compiling some kernels to accelerate compiling" OFF) +option(FAST_MATH "Compiling in fast math mode" OFF) if(NVTX_DISABLE) add_compile_definitions("NVTX_DISABLE") @@ -73,6 +77,11 @@ else() message(STATUS "Not building benchmarks") endif() +if(FAST_BUILD) + add_compile_definitions("FAST_BUILD") + message(WARNING "Skip some kernels to accelerate compilation") +endif() + # Determine CUDA version before enabling the language extension check_language(CUDA) if(CMAKE_CUDA_COMPILER) @@ -229,6 +238,10 @@ endif() set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-extended-lambda") set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +if(FAST_MATH) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + message("CMAKE_CUDA_FLAGS: 
${CMAKE_CUDA_FLAGS}") +endif() set(COMMON_HEADER_DIRS ${PROJECT_SOURCE_DIR} ${CUDAToolkit_INCLUDE_DIR}) message(STATUS "COMMON_HEADER_DIRS: ${COMMON_HEADER_DIRS}") @@ -333,3 +346,11 @@ if(BUILD_BENCHMARKS) add_subdirectory(${TRT_LLM_ROOT_DIR}/benchmarks/cpp ${CMAKE_BINARY_DIR}/benchmarks) endif() + +# Measure the compile time +option(MEASURE_BUILD_TIME "Measure the build time of each module" OFF) +if(MEASURE_BUILD_TIME) + set_property(GLOBAL PROPERTY RULE_LAUNCH_COMPILE "${CMAKE_COMMAND} -E time") + set_property(GLOBAL PROPERTY RULE_LAUNCH_CUSTOM "${CMAKE_COMMAND} -E time") + set_property(GLOBAL PROPERTY RULE_LAUNCH_LINK "${CMAKE_COMMAND} -E time") +endif() diff --git a/cpp/include/tensorrt_llm/batch_manager/GptManager.h b/cpp/include/tensorrt_llm/batch_manager/GptManager.h index 5eedb86551..89d7994453 100644 --- a/cpp/include/tensorrt_llm/batch_manager/GptManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/GptManager.h @@ -46,6 +46,7 @@ class GptManager public: using SizeType = tensorrt_llm::runtime::SizeType; using RequestList = std::list>; + using TensorPtr = runtime::ITensor::SharedPtr; GptManager(std::filesystem::path const& trtEnginePath, TrtGptModelType modelType, int32_t maxBeamWidth, batch_scheduler::SchedulerPolicy schedulerPolicy, GetInferenceRequestsCallback getInferenceRequestsCb, @@ -108,6 +109,9 @@ private: inline static const std::string kBeamWidthTensorName_ = "beam_width"; inline static const std::string kEndIdTensorName_ = "end_id"; inline static const std::string kPadIdTensorName_ = "pad_id"; + inline static const std::string kBadWordsListTensorName_ = "bad_words_list"; + inline static const std::string kStopWordsListTensorName_ = "stop_words_list"; + inline static const std::string kEmbeddingBiasTensorName_ = "embedding_bias"; inline static const std::string kTemperatureTensorName_ = "temperature"; inline static const std::string kRuntimeTopKTensorName_ = "runtime_top_k"; inline static const std::string kRuntimeTopPTensorName_ = "runtime_top_p"; @@ -116,6 +120,8 @@ private: inline static const std::string kMinLengthTensorName_ = "min_length"; inline static const std::string kPresencePenaltyTensorName_ = "presence_penalty"; inline static const std::string kRandomSeedTensorName_ = "random_seed"; + inline static const std::string kPromptEmbeddingTableName_ = "prompt_embedding_table"; + inline static const std::string kPromptVocabSizeName_ = "prompt_vocab_size"; inline static const std::string kOutputIdsTensorName_ = "output_ids"; inline static const std::string kSequenceLengthTensorName_ = "sequence_length"; diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 3e967aa42d..690d337ffb 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -33,6 +33,16 @@ namespace tensorrt_llm::batch_manager::kv_cache_manager { +using SizeType = tensorrt_llm::runtime::SizeType; + +struct KvCacheStats +{ + SizeType maxNumBlocks; + SizeType freeNumBlocks; + SizeType usedNumBlocks; + SizeType toksPerBlock; +}; + // Basic building block of a paged KV cache - a single // cache block. This class just holds metadata, no pointers // since it is reused across all layers. 
@@ -231,6 +241,17 @@ public: return mBlockManager.getNumFreeBlocks(); } + [[nodiscard]] KvCacheStats getKvCacheStats() const + { + KvCacheStats kvCacheStats; + kvCacheStats.maxNumBlocks = getMaxNumBlocks(); + kvCacheStats.freeNumBlocks = getNumFreeBlocks(); + kvCacheStats.usedNumBlocks = getUsedNumBlocks(); + kvCacheStats.toksPerBlock = getTokensPerBlock(); + + return kvCacheStats; + } + // Volume of [2, numKvHeads, tokensPerBlock, sizePerHead] [[nodiscard]] SizeType getBlockSize() const { diff --git a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h index 2703e24f5d..c577151f5a 100644 --- a/cpp/include/tensorrt_llm/batch_manager/llmRequest.h +++ b/cpp/include/tensorrt_llm/batch_manager/llmRequest.h @@ -16,6 +16,8 @@ #pragma once +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/iTensor.h" #include "tensorrt_llm/runtime/samplingConfig.h" #include @@ -41,10 +43,14 @@ public: using TokenIdType = runtime::TokenIdType; using RequestIdType = std::uint64_t; using BeamTokens = std::vector>; + using TensorPtr = runtime::ITensor::SharedPtr; LlmRequest(RequestIdType requestId, SizeType maxNewTokens, std::shared_ptr> input_tokens, runtime::SamplingConfig samplingConfig, bool isStreaming, std::optional endId = std::nullopt, - std::optional padId = std::nullopt) + std::optional padId = std::nullopt, std::optional embeddingBias = std::nullopt, + std::optional badWordsList = std::nullopt, std::optional stopWordsList = std::nullopt, + std::optional promptEmbeddingTable = std::nullopt, + std::optional promptVocabSize = std::nullopt) : mRequestId(requestId) , mPromptLen(input_tokens->size()) , mMaxNewTokens(maxNewTokens) @@ -54,10 +60,25 @@ public: , mEndId(endId) , mPadId(padId) , mBatchSlot(-1) + , mEmbeddingBias(embeddingBias) + , mBadWordsList(badWordsList) + , mStopWordsList(stopWordsList) + , mPromptEmbeddingTable(promptEmbeddingTable) + , mPromptVocabSize(promptVocabSize) { mMaxSentTokenPos = mPromptLen - 1; // Scatter the input tokens to other beam mTokens = std::make_shared(mSamplingConfig.beamWidth, *input_tokens); + + if ((mPromptEmbeddingTable.has_value() && !mPromptVocabSize.has_value()) + || (!mPromptEmbeddingTable.has_value() && mPromptVocabSize.has_value())) + { + std::string errStr + = "Prompt embedding table and prompt vocab size tensors must both be provided for requests with prompt " + "tuning enabled."; + TLLM_LOG_ERROR(errStr); + throw std::runtime_error(errStr); + } } /// @brief Get total number of tokens for this req (prompt + generated) @@ -104,6 +125,14 @@ public: return getMaxBeamNumTokens() - mPromptLen; } + /// @brief Add new generated tokens to the vector of tokens + /// @param token The token to add + /// @param beam The beam to which to add the new token + void addNewToken(TokenIdType token, SizeType beam) + { + mTokens->at(beam).push_back(token); + } + /// @brief Add new generated tokens to the vector of tokens /// @param beamTokens A vector containing the tokens to add for each beam index /// beamTokens is expected to be of size beamWidth @@ -174,6 +203,46 @@ public: mMaxSentTokenPos = pos; } + std::optional getPromptEmbeddingTable() const + { + return mPromptEmbeddingTable; + } + + void movePromptEmbeddingTableToGpu(runtime::BufferManager const& manager) + { + if (!mPromptEmbeddingTable.has_value() + || mPromptEmbeddingTable.value()->getMemoryType() == runtime::MemoryType::kGPU) + { + return; + } + else + { + TensorPtr gpuPromptEmbeddingTable + = 
manager.copyFrom(*mPromptEmbeddingTable.value(), runtime::MemoryType::kGPU); + mPromptEmbeddingTable = gpuPromptEmbeddingTable; + } + } + + std::optional getPromptVocabSize() const + { + return mPromptVocabSize; + } + + std::optional getEmbeddingBias() const + { + return mEmbeddingBias; + } + + std::optional getBadWordsList() const + { + return mBadWordsList; + } + + std::optional getStopWordsList() const + { + return mStopWordsList; + } + RequestIdType mRequestId; SizeType mPromptLen; SizeType mMaxNewTokens; @@ -188,6 +257,13 @@ public: private: std::shared_ptr mTokens; SizeType mMaxSentTokenPos; + + std::optional mEmbeddingBias; + std::optional mBadWordsList; + std::optional mStopWordsList; + + std::optional mPromptEmbeddingTable; + std::optional mPromptVocabSize; }; } // namespace tensorrt_llm::batch_manager diff --git a/cpp/include/tensorrt_llm/runtime/generationInput.h b/cpp/include/tensorrt_llm/runtime/generationInput.h index 3343587c55..840bc247a8 100644 --- a/cpp/include/tensorrt_llm/runtime/generationInput.h +++ b/cpp/include/tensorrt_llm/runtime/generationInput.h @@ -19,6 +19,7 @@ #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/runtime/common.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/promptTuningParams.h" #include #include @@ -26,18 +27,20 @@ namespace tensorrt_llm::runtime { -class GenerationInput +template +class GenericGenerationInput { public: - using TensorPtr = ITensor::SharedPtr; + using TensorPtr = TTensor; - explicit GenerationInput( + explicit GenericGenerationInput( SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) : endId{endId} , padId{padId} , ids{std::move(ids)} , lengths{std::move(lengths)} , packed{packed} + , maxNewTokens(std::nullopt) { TLLM_CHECK_WITH_INFO(static_cast(this->ids), "Invalid ids tensor"); TLLM_CHECK_WITH_INFO(static_cast(this->lengths), "Invalid lengths tensor"); @@ -55,6 +58,22 @@ public: TensorPtr badWordsList; // [2, badWordsLength] or [batchSize, 2, badWordsLength], on gpu TensorPtr stopWordsList; // [batchSize, 2, stopWordsLength], on gpu std::optional maxNewTokens; // max number of tokens to generate + + // Ptuning parameters + PromptTuningParams promptTuningParams; // See promptTuningParams.h for expected shapes +}; + +class GenerationInput : public GenericGenerationInput +{ +public: + using Base = GenericGenerationInput; + using TensorPtr = Base::TensorPtr; + + explicit GenerationInput( + SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) + : GenericGenerationInput(endId, padId, std::move(ids), std::move(lengths), packed) + { + } }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/generationOutput.h b/cpp/include/tensorrt_llm/runtime/generationOutput.h index c601066c06..33b7d7272e 100644 --- a/cpp/include/tensorrt_llm/runtime/generationOutput.h +++ b/cpp/include/tensorrt_llm/runtime/generationOutput.h @@ -26,14 +26,14 @@ namespace tensorrt_llm::runtime { -class GenerationOutput +template +class GenericGenerationOutput { public: - using TensorPtr = ITensor::SharedPtr; - + using TensorPtr = TTensor; using Callback = std::function; - explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) + explicit GenericGenerationOutput(TensorPtr ids, TensorPtr lengths) : ids{std::move(ids)} , lengths{std::move(lengths)} { @@ -53,4 +53,16 @@ public: Callback onTokenGenerated; }; +class GenerationOutput : public GenericGenerationOutput +{ +public: + using Base = 
GenericGenerationOutput; + using TensorPtr = Base::TensorPtr; + + explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) + : GenericGenerationOutput(std::move(ids), std::move(lengths)) + { + } +}; + } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h index c49b55e291..325a724662 100644 --- a/cpp/include/tensorrt_llm/runtime/gptModelConfig.h +++ b/cpp/include/tensorrt_llm/runtime/gptModelConfig.h @@ -29,7 +29,7 @@ public: enum class ModelVariant : std::int32_t { kGpt = 0, - kGlm = 1, // https://github.com/THUDM/GLM + kGlm = 1, // https://github.com/THUDM/GLM and https://github.com/THUDM/ChatGLM-6B }; constexpr explicit GptModelConfig( @@ -52,6 +52,7 @@ public: , mComputeContextLogits(false) , mModelVariant(ModelVariant::kGpt) , mUseCustomAllReduce(false) + , mMaxPromptEmbeddingTableSize(0) { } @@ -196,6 +197,21 @@ public: mMaxNumTokens = maxNumTokens; } + [[nodiscard]] bool constexpr usePromptTuning() const noexcept + { + return mMaxPromptEmbeddingTableSize > 0; + } + + [[nodiscard]] SizeType constexpr getMaxPromptEmbeddingTableSize() const noexcept + { + return mMaxPromptEmbeddingTableSize; + } + + void constexpr setMaxPromptEmbeddingTableSize(SizeType maxPromptEmbeddingTableSize) noexcept + { + mMaxPromptEmbeddingTableSize = maxPromptEmbeddingTableSize; + } + [[nodiscard]] bool constexpr computeContextLogits() const noexcept { return mComputeContextLogits; @@ -246,6 +262,8 @@ private: bool mComputeContextLogits; ModelVariant mModelVariant; bool mUseCustomAllReduce; + + SizeType mMaxPromptEmbeddingTableSize; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/include/tensorrt_llm/runtime/gptSession.h b/cpp/include/tensorrt_llm/runtime/gptSession.h index dc603e5d21..fc490e3d95 100644 --- a/cpp/include/tensorrt_llm/runtime/gptSession.h +++ b/cpp/include/tensorrt_llm/runtime/gptSession.h @@ -53,10 +53,11 @@ namespace utils std::vector loadEngine(std::string const& enginePath); } -class TllmRuntime; +class IpcMemory; class IStatefulGptDecoder; class NcclCommunicator; class RuntimeBuffers; +class TllmRuntime; class GptSession { @@ -85,7 +86,8 @@ public: bool decoderPerRequest{false}; bool cudaGraphMode{false}; KvCacheConfig kvCacheConfig{}; - std::optional numMicroBatches = std::nullopt; + std::optional ctxMicroBatchSize = std::nullopt; + std::optional genMicroBatchSize = std::nullopt; }; GptSession(Config const& sessionConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig, @@ -136,7 +138,7 @@ private: void setup(Config const& sessionConfig); - void createContexts(SizeType numMicroBatches, bool useCudaGraphs); + void createContexts(SizeType numBatchesCtx, SizeType numBatchesGen, bool useCudaGraphs); void createBuffers(SizeType numMicroBatches); void createDecoders(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, nvinfer1::DataType logitsType, bool decoderPerRequest, SizeType numMicroBatches); @@ -144,6 +146,12 @@ private: SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength, KvCacheConfig const& config); void createCustomAllReduceWorkspace(SizeType batchSize, SizeType beamWidth, SizeType maxSequenceLength); + void executeContextStep(std::vector const& microBatches, + std::vector const& microBatchOffsets, KvCacheManager const* kvCacheManager); + SizeType executeGenerationStep(SizeType step, std::vector const& microBatches, + std::vector const& microBatchOffsets, KvCacheManager* kvCacheManager, + std::vector& microBatchesFinished); + 
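In this hunk, GptSession::Config drops the single numMicroBatches field in favour of separate ctxMicroBatchSize and genMicroBatchSize, mirroring the executeContextStep/executeGenerationStep split declared just above. A hedged sketch of how a caller might fill them — placeholder values, and the three positional Config fields are assumed to mean max batch size, max beam width and max sequence length:

```cpp
// Illustrative only: choosing the two new micro-batch sizes on
// GptSession::Config. Both are std::optional and may simply be left unset,
// in which case the session derives its own chunking.
#include "tensorrt_llm/runtime/gptSession.h"

tensorrt_llm::runtime::GptSession::Config makeSessionConfig(int maxBatchSize)
{
    // Positional fields assumed: {maxBatchSize, maxBeamWidth, maxSequenceLength}.
    tensorrt_llm::runtime::GptSession::Config config{maxBatchSize, /*beamWidth*/ 1, /*maxSeqLen*/ 2048};
    // Context (prefill) and generation phases can now be chunked independently,
    // replacing the previous single numMicroBatches knob.
    config.ctxMicroBatchSize = maxBatchSize / 4;
    config.genMicroBatchSize = maxBatchSize / 2;
    return config;
}
```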
//! @brief Execute decoder on last PP rank, receive decoder output on other PP ranks. void decoderStepAsync(SizeType decoderStep, SizeType microBatchId); @@ -156,11 +164,11 @@ private: void kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, SizeType firstBatchIdx); - ITensor::SharedPtr initNewTokens( - GenerationInput const& inputs, SamplingConfig const& samplingConfig, SizeType microBatchId); + //! @brief Populate outputIds and return reference to newTokens tensor + ITensor::SharedPtr initDecoder(ITensor& outputIds, GenerationInput const& inputs, + SamplingConfig const& samplingConfig, SizeType microBatchId) const; - std::function createOnTokenGeneratedCallback( - GenerationOutput& outputs, SizeType numMicroBatches); + std::function createOnTokenGeneratedCallback(GenerationOutput& outputs); class CudaGraphExecutor { @@ -196,6 +204,45 @@ private: cudaGraphExec_t mInstance; }; + class MicroBatchConfig + { + public: + MicroBatchConfig() + : numCtxBatches{1} + , numGenBatches{1} + , ctxBatchSize{0} + , genBatchSize{0} + { + } + + explicit MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, + std::optional genMicroBatchSize, std::optional ctxMicroBatchSize); + + constexpr SizeType numCtxPerGen() const + { + return numCtxBatches / numGenBatches; + } + + //! @details First 2 * numGenBatches contexts are for generation phase, next numCtxBatches are for context + //! phase. Use numCtxPerGen() contexts for the context batches of each generation batch. + constexpr SizeType getCtxContextId(SizeType generationBatchId, SizeType contextBatchId) const + { + return 2 * numGenBatches + generationBatchId * numCtxPerGen() + contextBatchId; + } + + //! @details First 2 * numGenBatches contexts are for generation phase, flip-flop between 2 of them for each + //! generation batch. 
+ constexpr SizeType getGenContextId(SizeType flipFlopId, SizeType generationBatchId) const + { + return flipFlopId * numGenBatches + generationBatchId; + } + + SizeType numCtxBatches; + SizeType numGenBatches; + SizeType ctxBatchSize; + SizeType genBatchSize; + }; + friend class batch_manager::TrtGptModelV1; private: @@ -206,13 +253,17 @@ private: std::shared_ptr mCommStream; CudaEvent mCommEvent{}; + // tensor parallelism with custom allreduce plugin + ITensor::SharedPtr mCommPtrs; + std::vector> mIpcMemoryHandles; + SizeType mDecoderMaxSequenceLength{}; LoggerPtr mLogger; std::shared_ptr mRuntime; std::shared_ptr mKvCacheManager; - SizeType mNumMicroBatches; + MicroBatchConfig mMicroBatchConfig; // for each micro batch std::vector> mDecoders; std::vector> mBuffers; diff --git a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h index 270058a6de..667c1e58f1 100644 --- a/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h +++ b/cpp/include/tensorrt_llm/runtime/iGptDecoderBatch.h @@ -35,9 +35,10 @@ namespace decoder_batch class Request { public: - using TensorPtr = std::shared_ptr; + using ConstTensorPtr = std::shared_ptr; + using TensorPtr = std::shared_ptr; - explicit Request(TensorPtr ids, std::optional maxNewTokens = std::nullopt, + explicit Request(ConstTensorPtr ids, std::optional maxNewTokens = std::nullopt, std::optional endId = std::nullopt, std::optional padId = std::nullopt) : ids{std::move(ids)} , maxNewTokens{maxNewTokens} @@ -46,7 +47,7 @@ public: } // mandatory parameters - TensorPtr ids; // [inputSeqLen], the input sequence of token ids, on gpu + ConstTensorPtr ids; // [inputSeqLen], the input sequence of token ids, on gpu // optional parameters std::optional maxNewTokens; // maximum number of tokens to generate for this request diff --git a/cpp/include/tensorrt_llm/runtime/iTensor.h b/cpp/include/tensorrt_llm/runtime/iTensor.h index 931d58a361..a5847b4ef2 100644 --- a/cpp/include/tensorrt_llm/runtime/iTensor.h +++ b/cpp/include/tensorrt_llm/runtime/iTensor.h @@ -114,6 +114,25 @@ public: return newDims; } + //! + //! \brief Add a *unit* dimension to `shape` at the specified position. + //! + //! \param shape The shape to unsqueeze. + //! \param dim The dimension where unit dimension should be added. + //! \return A new shape with the added unit dimension. + //! + static Shape unsqueeze(Shape const& shape, SizeType dim) + { + TLLM_CHECK_WITH_INFO(dim <= shape.nbDims && dim >= 0, + common::fmtstr("Invalid dim %d, tensor has %d dimensions", dim, shape.nbDims)); + + Shape newDims{shape.nbDims + 1}; + std::copy(shape.d, shape.d + dim, newDims.d); + newDims.d[dim] = 1; + std::copy(shape.d + dim, shape.d + shape.nbDims, newDims.d + dim + 1); + return newDims; + } + //! //! \brief Removes the given *unit* dimensions from this tensor. //! @@ -122,6 +141,14 @@ public: reshape(squeeze(getShape(), dim)); } + //! + //! \brief Adds a *unit* dimension at the specified position + //! + void unsqueeze(SizeType dim) + { + reshape(unsqueeze(getShape(), dim)); + } + //! //! \brief Creates a sliced view on the underlying `tensor`. The view will have the same data type as `tensor`. //! 
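The ITensor::unsqueeze helper added above is pure shape bookkeeping: it copies the existing dimensions and splices a unit dimension in at position dim; the member overload then calls reshape() with the new shape. A minimal standalone restatement of that index arithmetic — plain std::vector instead of nvinfer1::Dims, and the function name is mine:

```cpp
// Illustrative only: the shape arithmetic behind ITensor::unsqueeze, restated
// so it compiles without the TensorRT-LLM headers.
// insertUnitDim({4, 6}, 1) -> {4, 1, 6}; dim may range from 0 to dims.size().
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<std::int64_t> insertUnitDim(std::vector<std::int64_t> dims, std::size_t dim)
{
    assert(dim <= dims.size() && "dim must lie in [0, nbDims]");
    dims.insert(dims.begin() + static_cast<std::ptrdiff_t>(dim), 1);
    return dims;
}
```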
diff --git a/cpp/include/tensorrt_llm/runtime/memoryCounters.h b/cpp/include/tensorrt_llm/runtime/memoryCounters.h index 503591301d..ff68a80a95 100644 --- a/cpp/include/tensorrt_llm/runtime/memoryCounters.h +++ b/cpp/include/tensorrt_llm/runtime/memoryCounters.h @@ -127,6 +127,8 @@ public: static std::string bytesToString(DiffType bytes, int precision = 2); + std::string toString() const; + private: SizeType mGpu{}, mCpu{}, mPinned{}; DiffType mGpuDiff{}, mCpuDiff{}, mPinnedDiff{}; diff --git a/cpp/include/tensorrt_llm/runtime/promptTuningParams.h b/cpp/include/tensorrt_llm/runtime/promptTuningParams.h new file mode 100644 index 0000000000..3690165f53 --- /dev/null +++ b/cpp/include/tensorrt_llm/runtime/promptTuningParams.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/tllmBuffers.h" + +#include +#include + +namespace tensorrt_llm::runtime +{ + +template +class GenericPromptTuningParams +{ +public: + using TensorPtr = TTensor; + using SizeType = tensorrt_llm::runtime::SizeType; + + explicit GenericPromptTuningParams( + TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr()) + : embeddingTable{std::move(embeddingTable)} + , tasks{std::move(tasks)} + , vocabSize{std::move(vocabSize)} {}; + + // The prompt embedding table + TensorPtr embeddingTable; // [numTasks * taskVocabSize, hidden_dim], on gpu + // In GenerationInput, tasks expected shape is [batchSize] + // For context requests with non-packed inputs, expected shape is [batchSize, 1] + // For generation requests with non-packed inputs, expected shape is [batchSize*beamWidth] for generation requests. 
+ // For packed inputs, expected shape is [1, packedLength] (note that ifb currently doesn't support non-packed + // inputs) + TensorPtr tasks; + TensorPtr vocabSize; // [1], on gpu + + std::vector + promptTuningEnabled; // [batchSize] vector of bool that indicates which requests in a batch have ptuning enabled +}; + +class PromptTuningParams : public GenericPromptTuningParams +{ +public: + using TensorPtr = ITensor::SharedPtr; + using SizeType = GenericPromptTuningParams::SizeType; + + explicit PromptTuningParams( + TensorPtr embeddingTable = nullptr, TensorPtr tasks = nullptr, TensorPtr vocabSize = nullptr) + : GenericPromptTuningParams(std::move(embeddingTable), std::move(tasks), std::move(vocabSize)) + { + } + + // Fill the tasks tensor for the batch using the provided tasksHost + // Function assumes that the first numContextRequests requests in the batch are context requests + void fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, const SizeType numContextRequests, + const std::vector& reqBeamWidths, const std::vector& reqPromptLengths, + BufferManager& manager, bool packedInput); +}; + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index d17d0cf415..2b37960333 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -70,29 +70,40 @@ if(NOT WIN32) # Linux endif() else() # Windows # AMD64, IA64, ARM64, EM64T, X86 - set(BATCH_MANAGER_TARGET_ARCH "${CMAKE_SYSTEM_PROCESSOR}-WINDOWS") - string(TOLOWER ${BATCH_MANAGER_TARGET_ARCH} ${BATCH_MANAGER_TARGET_ARCH}) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "AMD64") + set(BATCH_MANAGER_TARGET_ARCH "x86_64-windows-msvc") + else() + message( + FATAL_ERROR + "The system processor type is unsupported: ${CMAKE_SYSTEM_PROCESSOR}") + endif() endif() if(BUILD_BATCH_MANAGER) add_subdirectory(batch_manager) else() add_library(${BATCH_MANAGER_TARGET} STATIC IMPORTED) - execute_process( - COMMAND ${Python3_EXECUTABLE} "-c" - "import torch; print(torch.compiled_with_cxx11_abi(),end='');" - RESULT_VARIABLE _PYTHON_SUCCESS - OUTPUT_VARIABLE USE_CXX11_ABI) + if(NOT WIN32) # Linux + execute_process( + COMMAND ${Python3_EXECUTABLE} "-c" + "import torch; print(torch.compiled_with_cxx11_abi(),end='');" + RESULT_VARIABLE _PYTHON_SUCCESS + OUTPUT_VARIABLE USE_CXX11_ABI) - message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") + message(STATUS "USE_CXX11_ABI: ${USE_CXX11_ABI}") - if(USE_CXX11_ABI) + if(USE_CXX11_ABI) + set(BATCH_MANAGER_LIB_LOC + "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a" + ) + else() + set(BATCH_MANAGER_LIB_LOC + "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a" + ) + endif() + else() # Windows set(BATCH_MANAGER_LIB_LOC - "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.a" - ) - else() - set(BATCH_MANAGER_LIB_LOC - "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/libtensorrt_llm_batch_manager_static.pre_cxx11.a" + "${CMAKE_CURRENT_SOURCE_DIR}/batch_manager/${BATCH_MANAGER_TARGET_ARCH}/tensorrt_llm_batch_manager_static.lib" ) endif() set_property(TARGET ${BATCH_MANAGER_TARGET} PROPERTY IMPORTED_LOCATION @@ -132,7 +143,7 @@ set_target_properties( CXX_EXTENSIONS "NO") if(NOT MSVC) # Unix-like compilers - set(ALLOW_UNDEFINED_FLAG "-Wl, --no-undefined") + set(UNDEFINED_FLAG "-Wl,--no-undefined") else() # MSVC set(UNDEFINED_FLAG "") 
endif() @@ -158,4 +169,8 @@ if(BUILD_PYT) add_subdirectory(thop) endif() +if(BUILD_PYBIND) + add_subdirectory(pybind) +endif() + add_subdirectory(plugins) diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 789ccaf67a..6131fa3c33 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:422df71fccde81a55049fb61996d0b88bbaf1f18866b63c8e73c36b772c2df46 -size 1508332 +oid sha256:f591dd181613b14f7ded3ba3e167d14073564254bc46db8c4bd9636d6d896b16 +size 1611436 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 65af1acc24..138428aad7 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0013625bc6b18255f44d6ab38e8ea0bceda6452bddf9df3cf832ad106fc2058d -size 1516676 +oid sha256:21d17a9fa736d033ad77270a0fbcdd09c27dfab3f871d92a5ffa0cb744fa48fd +size 1623126 diff --git a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt index 0d5881fb58..8b007588a9 100644 --- a/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt +++ b/cpp/tensorrt_llm/batch_manager/aarch64-linux-gnu/version.txt @@ -1,3 +1,3 @@ -bda56cf4ad2242be25115ddecd23e7df libtensorrt_llm_batch_manager_static.a -12d7c8e5b4a018dfd9043fa7db979b5a libtensorrt_llm_batch_manager_static.pre_cxx11.a -7e492cc1057b1091f62d69df81547cb071729e5d commit +e1dc326c0c45864b9e7963b4d92d322f libtensorrt_llm_batch_manager_static.a +d2e9d76efe6b4173270aa6b494dfe59c libtensorrt_llm_batch_manager_static.pre_cxx11.a +07363ea7a6fdd6eeedc1670dedeeaedff7f9a848 commit diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a index 59f7f8d234..f30db7d141 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5a207480594cb228b7264f28af85b0a820046f64379f11fd7389c701ca5497d -size 1421186 +oid sha256:3fe444bf079ce35262b932302806b372ccb677182969e3bba45698343e5e350f +size 1523444 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a index 9b57e350db..130b4932c5 100644 --- a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/libtensorrt_llm_batch_manager_static.pre_cxx11.a @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80e06e15b9e29ba80c036ba6604a2ce286acb294eddb50015bad53cfdeba4534 -size 1423958 +oid 
sha256:99641389fdf26f6324b7465df0b61b74946787a6a147d145de23b444261e6e5f +size 1524188 diff --git a/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt new file mode 100644 index 0000000000..7bf2950986 --- /dev/null +++ b/cpp/tensorrt_llm/batch_manager/x86_64-linux-gnu/version.txt @@ -0,0 +1,2 @@ +b10b0e00d0132b04969d779af45d73d0 libtensorrt_llm_batch_manager_static.a +3ad06255afdaa8450c133d1d1bc486c4 libtensorrt_llm_batch_manager_static.pre_cxx11.a diff --git a/cpp/tensorrt_llm/common/assert.cpp b/cpp/tensorrt_llm/common/assert.cpp new file mode 100755 index 0000000000..2f3f780313 --- /dev/null +++ b/cpp/tensorrt_llm/common/assert.cpp @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "assert.h" + +bool CHECK_DEBUG_ENABLED = false; + +namespace +{ + +#if !defined(_MSC_VER) +__attribute__((constructor)) +#endif +void initOnLoad() +{ + auto constexpr kDebugEnabled = "TRT_LLM_DEBUG_MODE"; + auto const debugEnabled = std::getenv(kDebugEnabled); + if (debugEnabled && debugEnabled[0] == '1') + { + CHECK_DEBUG_ENABLED = true; + } +} +} // namespace diff --git a/cpp/tensorrt_llm/common/assert.h b/cpp/tensorrt_llm/common/assert.h index 1c4bca699b..8d6daa5cee 100644 --- a/cpp/tensorrt_llm/common/assert.h +++ b/cpp/tensorrt_llm/common/assert.h @@ -30,6 +30,8 @@ namespace tensorrt_llm::common } // namespace tensorrt_llm::common +extern bool CHECK_DEBUG_ENABLED; + #if defined(_WIN32) #define TLLM_LIKELY(x) (__assume((x) == 1), (x)) #else @@ -50,6 +52,26 @@ namespace tensorrt_llm::common : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, info); \ } while (0) +#define TLLM_CHECK_DEBUG(val) \ + do \ + { \ + if (CHECK_DEBUG_ENABLED) \ + { \ + TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, #val); \ + } \ + } while (0) + +#define TLLM_CHECK_DEBUG_WITH_INFO(val, info) \ + do \ + { \ + if (CHECK_DEBUG_ENABLED) \ + { \ + TLLM_LIKELY(static_cast(val)) ? ((void) 0) \ + : tensorrt_llm::common::throwRuntimeError(__FILE__, __LINE__, info); \ + } \ + } while (0) + #define TLLM_THROW(...) 
\ do \ { \ diff --git a/cpp/tensorrt_llm/common/cudaUtils.h b/cpp/tensorrt_llm/common/cudaUtils.h index 4393039ea6..2669a06de8 100644 --- a/cpp/tensorrt_llm/common/cudaUtils.h +++ b/cpp/tensorrt_llm/common/cudaUtils.h @@ -390,6 +390,17 @@ void print2dToScreen(const T* result, const int r, const int c, const int stride print2dToStream(result, r, c, stride, stdout); } +template +void print2dToFile(std::string fname, const T* result, const int r, const int c, const int stride) +{ + FILE* fp = fopen(fname.c_str(), "wt"); + if (fp != nullptr) + { + print2dToStream(result, r, c, stride, fp); + fclose(fp); + } +} + inline void print_float_(float x) { printf("%7.3f ", x); diff --git a/cpp/tensorrt_llm/common/quantization.h b/cpp/tensorrt_llm/common/quantization.h index 4ea5fc8327..0e4f8e9f55 100644 --- a/cpp/tensorrt_llm/common/quantization.h +++ b/cpp/tensorrt_llm/common/quantization.h @@ -201,7 +201,7 @@ public: return quantMode; } - constexpr QuantMode operator+(const QuantMode& other) noexcept + constexpr QuantMode operator+(const QuantMode& other) const noexcept { return QuantMode(mValue | other.mValue); } @@ -211,7 +211,7 @@ public: return *this = *this + other; } - constexpr QuantMode operator-(const QuantMode& other) noexcept + constexpr QuantMode operator-(const QuantMode& other) const noexcept { return QuantMode(mValue & ~other.mValue); } diff --git a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh index f609874ed0..1e924fb1ca 100644 --- a/cpp/tensorrt_llm/common/reduceKernelUtils.cuh +++ b/cpp/tensorrt_llm/common/reduceKernelUtils.cuh @@ -296,6 +296,11 @@ struct TopK __device__ __forceinline__ void insert(T elem, int elem_id) { + if (elem_id < 0) + { + return; + } + if (elem > u[MAX_K - 1] || (p[MAX_K - 1] == -1) || ((elem == u[MAX_K - 1]) && (elem_id < p[MAX_K - 1]))) // if (elem > u[MAX_K-1] || ((elem == u[MAX_K-1]) && (elem_id < p[MAX_K-1]))) { diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu index 49e4e4be90..5ca0f47ac1 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.cu @@ -171,10 +171,17 @@ template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, const int max_seq_len, cudaStream_t stream) { - if (bias != nullptr || temperatures != nullptr || vocab_size != vocab_size_padded) + +#define ALL_OF(p_, sz_, dt_, v_) (std::all_of(p_, p_ + sz_, [&](dt_ b) { return b == v_; })) + + if (bias != nullptr + || (temperatures != nullptr + && !ALL_OF(std::begin(h_temperatures) + ite * local_batch_size, local_batch_size, float, 1.0f)) + || vocab_size != vocab_size_padded) { dim3 block(512); if (std::is_same::value && vocab_size % 2 == 0 && vocab_size_padded % 2 == 0) @@ -199,14 +206,19 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in size_t smem_size = (sizeof(T) * 
max_seq_len + 31) / 32 * 32 + sizeof(int) * max_seq_len; dim3 block(256); dim3 grid(beam_width * local_batch_size); - if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative) + float default_value = getDefaultPenaltyValue(repetition_penalty_type); + if (repetition_penalty_type == RepetitionPenaltyType::Multiplicative + && !ALL_OF(std::begin(h_repetition_penalties) + ite * local_batch_size, local_batch_size, float, + default_value)) { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, repetition_penalties, max_seq_len); sync_check_cuda_error(); } - else if (repetition_penalty_type == RepetitionPenaltyType::Additive) + else if (repetition_penalty_type == RepetitionPenaltyType::Additive + && !ALL_OF(std::begin(h_repetition_penalties) + ite * local_batch_size, local_batch_size, float, + default_value)) { apply_repetition_penalty<<>>(logits, batch_size, beam_width, vocab_size, vocab_size_padded, output_ids_ptr, parent_ids_ptr, input_lengths, sequence_lengths, @@ -224,18 +236,22 @@ void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const in apply_min_length_penalty<<>>( logits, min_lengths, end_ids, sequence_lengths, input_lengths, beam_width, vocab_size_padded); sync_check_cuda_error(); + +#undef ALL_OF } template void invokeAddBiasApplyPenalties(float* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const float* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, int max_seq_len, cudaStream_t stream); template void invokeAddBiasApplyPenalties(half* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const half* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, int max_seq_len, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h index 07818b261d..4ff57c69da 100644 --- a/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h +++ b/cpp/tensorrt_llm/kernels/beamSearchPenaltyKernels.h @@ -28,7 +28,8 @@ template void invokeAddBiasApplyPenalties(T* logits, const int** output_ids_ptr, const int** parent_ids_ptr, const int* input_lengths, const int* sequence_lengths, const T* bias, const int ite, const int local_batch_size, const int batch_size, const int beam_width, const int vocab_size, const int vocab_size_padded, const int* end_ids, - const float* temperatures, const float* repetition_penalties, const RepetitionPenaltyType 
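// The penalty hunks above pass host-side copies of the per-request temperatures
// and repetition penalties so a kernel launch can be skipped when every value in
// the micro-batch is the neutral default. A hedged sketch of that host check,
// with illustrative names:
#include <algorithm>
#include <cstddef>
#include <vector>

// True when every per-request value in this micro-batch equals the neutral
// default, in which case the corresponding penalty kernel launch can be skipped.
bool allDefaultInMicroBatch(std::vector<float> const& hostValues, int ite, int localBatchSize, float defaultValue)
{
    auto const first = hostValues.begin() + static_cast<std::ptrdiff_t>(ite) * localBatchSize;
    return std::all_of(first, first + localBatchSize, [defaultValue](float v) { return v == defaultValue; });
}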
repetition_penalty_type, + const float* temperatures, const std::vector& h_temperatures, const float* repetition_penalties, + const std::vector& h_repetition_penalties, const RepetitionPenaltyType repetition_penalty_type, const int* min_lengths, int max_seq_len, cudaStream_t stream); } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu index 89d08b4419..a6acfd1688 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention.cu @@ -47,16 +47,20 @@ void multihead_attention_( switch (params.hidden_size_per_head) { case 32: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; - case 48: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 64: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; + case 128: + mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); + break; + case 256: + mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); + break; +#ifndef FAST_BUILD // skip mmha 48, 80, 96, 112, 144, 160, 192 and 224 for fast build + case 48: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 80: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 96: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; case 112: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; - case 128: - mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); - break; case 144: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; @@ -69,9 +73,7 @@ void multihead_attention_( case 224: mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); break; - case 256: - mmha::mmha_launch_kernel(params, kv_cache_buffer, stream); - break; +#endif // FAST_BUILD default: TLLM_THROW("unsupported head_size"); } } diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu index f4d9fdb4da..9ccc70152a 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 112; namespace mmha { +#ifndef FAST_BUILD // skip mmha_112 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu index d5e786018f..a83d80ab07 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 112; namespace mmha { +#ifndef FAST_BUILD // skip mmha_112 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu index 
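// The dispatch hunk above keeps the common head sizes (32/64/128/256) in every
// build and compiles the rarely used ones only when FAST_BUILD is not defined,
// trading kernel coverage for build time. A simplified sketch of the pattern;
// the launch calls are placeholders, not the real kernel launchers:
#include <stdexcept>

void dispatchByHeadSize(int headSize)
{
    switch (headSize)
    {
    case 64: /* launch 64-wide kernel */ break;
    case 128: /* launch 128-wide kernel */ break;
    case 256: /* launch 256-wide kernel */ break;
#ifndef FAST_BUILD // skip the rare sizes in fast builds
    case 48: /* launch 48-wide kernel */ break;
    case 80: /* launch 80-wide kernel */ break;
    case 96: /* launch 96-wide kernel */ break;
#endif
    default: throw std::runtime_error("unsupported head_size");
    }
}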
ec6264bef6..04dd8beec4 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention112_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 112; namespace mmha { +#ifndef FAST_BUILD // skip mmha_112 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu index 53b0603e95..7a7bfce529 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 144; namespace mmha { +#ifndef FAST_BUILD // skip mmha_144 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu index af281bc36b..784814d5bc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 144; namespace mmha { +#ifndef FAST_BUILD // skip mmha_144 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu index bc67ddcc9f..fa3c1763e5 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention144_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 144; namespace mmha { +#ifndef FAST_BUILD // skip mmha_144 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu index 365daad49b..899a52ae60 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_bf16.cu @@ -29,8 +29,10 @@ auto constexpr kSizePerHead = 160; namespace mmha { +#ifndef FAST_BUILD // skip mmha_160 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) +#endif // ENABLE_BF16 #endif } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu index a8ae5fc91b..83b038cd88 100644 --- 
a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 160; namespace mmha { +#ifndef FAST_BUILD // skip mmha_160 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu index d2811d9c88..7cd1b5f00c 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention160_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 160; namespace mmha { +#ifndef FAST_BUILD // skip mmha_160 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu index 314ea1feab..0e8c8b2dde 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 192; namespace mmha { +#ifndef FAST_BUILD // skip mmha_192 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu index f3df3bc4fd..9f36f82b3f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 192; namespace mmha { +#ifndef FAST_BUILD // skip mmha_192 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu index c7fe1874c0..11dc61cd74 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention192_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 192; namespace mmha { +#ifndef FAST_BUILD // skip mmha_192 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu index c8fc0179c6..174605c808 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 224; namespace mmha { +#ifndef FAST_BUILD // skip mmha_224 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu index c22677e513..04c116fb17 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 224; namespace mmha { +#ifndef FAST_BUILD // skip mmha_224 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu index b20cd5420b..4bbf980190 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention224_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 224; namespace mmha { +#ifndef FAST_BUILD // skip mmha_224 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu index 8cdeb7ea2b..17516ec639 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 48; namespace mmha { +#ifndef FAST_BUILD // skip mmha_48 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu index 8311b6a152..cc4201dd6d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 48; namespace mmha { +#ifndef FAST_BUILD // skip mmha_48 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu index d7b90c15bd..4b0cf08c6c 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention48_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 48; namespace mmha { +#ifndef FAST_BUILD // skip mmha_48 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu index f9f386b3e4..3e02da34bc 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 80; namespace mmha { +#ifndef FAST_BUILD // skip mmha_80 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu index 2759d38aea..4bbe57249e 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 80; namespace mmha { +#ifndef FAST_BUILD // skip mmha_80 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu index 7891ebefe8..7eae60d985 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention80_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 80; namespace mmha { +#ifndef FAST_BUILD // skip mmha_80 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu index 80bbd43f68..1d28e415e9 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_bf16.cu @@ -29,9 +29,11 @@ auto constexpr kSizePerHead = 96; namespace mmha { +#ifndef FAST_BUILD // skip mmha_96 for fast build #ifdef ENABLE_BF16 INSTANTIATE_MMHA_LAUNCHERS(__nv_bfloat16, kSizePerHead) -#endif +#endif // ENABLE_BF16 +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu index a1d7c1fddf..771b644d1d 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu +++ 
b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_float.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 96; namespace mmha { +#ifndef FAST_BUILD // skip mmha_96 for fast build INSTANTIATE_MMHA_LAUNCHERS(float, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu index be94d17088..40060ac728 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttention96_half.cu @@ -29,7 +29,9 @@ auto constexpr kSizePerHead = 96; namespace mmha { +#ifndef FAST_BUILD // skip mmha_96 for fast build INSTANTIATE_MMHA_LAUNCHERS(uint16_t, kSizePerHead) +#endif // FAST_BUILD } // namespace mmha diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h index db2de627a4..ab7db6ff23 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderMaskedMultiheadAttentionTemplate.h @@ -2152,7 +2152,6 @@ __global__ void masked_multihead_attention_kernel( const int normlization_loop_end = MULTI_BLOCK_FLAG ? timesteps_per_block : tlength; for (int ti = tidx; ti <= normlization_loop_end; ti += THREADS_PER_BLOCK) { - const int time_now = MULTI_BLOCK_FLAG ? ti + c_tile_times_timesteps_per_block : ti; if (!MULTI_BLOCK_FLAG) @@ -2308,8 +2307,11 @@ __global__ void masked_multihead_attention_kernel( } } + // Get the c_tile_id that handles the current timestep. + const int ctile_idx = tlength / timesteps_per_block; + // One group of threads computes the product(s) for the current timestep. - if (vo == tlength % V_PER_ITER && is_valid_vi && (!MULTI_BLOCK_FLAG || (c_tile == gridDim.z - 1))) + if (vo == tlength % V_PER_ITER && is_valid_vi && (!MULTI_BLOCK_FLAG || (c_tile == ctile_idx))) { const int tokenIdx = tlength; const int inBlockIdx = kvCacheBuffer.getKVLocalIdx(tokenIdx, hi_kv, Dh, vi); @@ -2396,7 +2398,6 @@ __global__ void masked_multihead_attention_kernel( } #endif // MMHA_USE_FP32_ACCUM_FOR_LOGITS } - // Make sure we can start writing to shared memory. __syncthreads(); @@ -2428,7 +2429,7 @@ __global__ void masked_multihead_attention_kernel( } const auto bhi = tensorrt_llm::common::flat_index2(batch_beam_idx, hi, num_heads); - const auto bhi_seq_len_tile = bhi * params.max_seq_len_tile; + const auto bhi_seq_len_tile = bhi * params.seq_len_tile; // Output the final values. if (vo == 0 && (Dh == Dh_MAX || vi < Dh)) { @@ -2499,9 +2500,7 @@ __global__ void masked_multihead_attention_kernel( float final_max = -FLT_MAX; float thread_partial_max = -FLT_MAX; - if (tidx < gridDim.z) - thread_partial_max = params.partial_max[bhi_seq_len_tile + tidx]; - // final_max = fmaxf(final_max, thread_partial_max); + thread_partial_max = params.partial_max[bhi_seq_len_tile + min(tidx, gridDim.x - 1)]; // Make sure we can start writing to shared memory. __syncthreads(); @@ -2548,34 +2547,29 @@ __global__ void masked_multihead_attention_kernel( // Shared memory to store partial outputs for each oi. -> size: gridDim.z * Dh * 4 Bytes. Reuse qk_smem. 
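// The multi-block reduction hunks above replace "load unconditionally, then zero
// out-of-range lanes" with a bound check applied to the load itself, so a lane
// never reads a partial result it does not own. A plain-C++ sketch of that idea
// (the real code operates on CUDA threads, not a loop):
#include <vector>

// Lanes past the number of valid partials contribute a neutral zero instead of
// reading outside the buffer; the guard is applied to the load itself.
float reducePartials(std::vector<float> const& partials, int numLanes)
{
    float acc = 0.0f;
    for (int lane = 0; lane < numLanes; ++lane)
    {
        bool const withinBound = lane < static_cast<int>(partials.size());
        float const value = withinBound ? partials[lane] : 0.0f; // guarded load
        acc += value;
    }
    return acc;
}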
T* out_oi_smem = reinterpret_cast(smem_); - // Number of threads to utilize: THREADS_PER_VALUE * gridDim.z (THREADS_PER_VALUE for vectorized output - // and gridDim.z for all the partial outputs) - int threads_boundary = THREADS_PER_VALUE * gridDim.z; // should be smaller than THREADS_PER_BLOCK - assert(threads_boundary <= THREADS_PER_BLOCK); - const auto o_idx = chunk_index(tidx); // The partial output region this thread takes care of const auto oo = o_idx.x; // The hidden dimensions computed by this particular thread. (refer to vi) const auto oi = o_idx.y; + // Within the bound. + const bool within_bound = oo < gridDim.z; + // Load partial output int thread_partial_out_offset = oo * params.batch_size * num_heads * params.hidden_size_per_head; // Load partial max (different to thread_partial_max since the threadIdx rule changes here) - float thread_partial_max_for_out = params.partial_max[bhi_seq_len_tile + oo]; + float thread_partial_max_for_out = within_bound ? params.partial_max[bhi_seq_len_tile + oo] : final_max; // Load the partial outputs. - V_vec_k thread_partial_out - = *reinterpret_cast(¶ms.partial_out[thread_partial_out_offset + bhi * Dh + oi]); - - if (tidx >= threads_boundary) - { - zero(thread_partial_out); - } + V_vec_k zero_k; + zero(zero_k); + V_vec_k thread_partial_out = within_bound + ? *reinterpret_cast(¶ms.partial_out[thread_partial_out_offset + bhi * Dh + oi]) + : zero_k; Tk factor_compute; convert_from_float(&factor_compute, __expf(thread_partial_max_for_out - final_max)); - thread_partial_out = mul(factor_compute, thread_partial_out); // Make sure we can start writing to shared memory. @@ -2620,7 +2614,6 @@ __global__ void masked_multihead_attention_kernel( convert_from_float(&inv_sum_compute, inv_sum); thread_partial_out = mul(inv_sum_compute, thread_partial_out); - *reinterpret_cast(¶ms.out[bhi * Dh + oi]) = thread_partial_out; } diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu index 94c901b92c..1ba07ca0ba 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels.cu @@ -52,18 +52,22 @@ void invokeTopkSoftMax(const T* log_probs, const T* bias, const bool* finished, switch (log_beam_width) { // 0 < beam_width <= 4 - case 0: // 1, 2 - case 1: // 3, 4 + case 0: // 1, 2 + case 1: // 3, 4 CASE_K(4) - case 2: // 4 < beam_width <= 8 + case 2: // 4 < beam_width <= 8 CASE_K(8) - case 3: // 9 < beam_width <= 16 +#ifndef FAST_BUILD // For fast build, skip case 3, 4, 5 + case 3: // 9 < beam_width <= 16 CASE_K(16) - case 4: // 16 < beam_width <= 32 + case 4: // 16 < beam_width <= 32 CASE_K(32) - case 5: // 32 < beam_width <= 64 + case 5: // 32 < beam_width <= 64 CASE_K(64) - default: throw std::runtime_error(fmtstr("Topk kernel of beam search does not support beam_width=%d", beam_width)); +#endif // FAST_BUILD + default: + throw std::runtime_error( + fmtstr("%s:%d Topk kernel of beam search does not support beam_width=%d", __FILE__, __LINE__, beam_width)); } } diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu index d1a0aa72a0..f77f9f1e4f 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels16.cu @@ -20,9 +20,9 @@ namespace 
tensorrt_llm { namespace kernels { - +#ifndef FAST_BUILD // skip beam_width between [?, 16] for fast build INSTANTIATE_BEAMSEARCH_K(float, 16); INSTANTIATE_BEAMSEARCH_K(half, 16); - +#endif // FAST_BUILD } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu index e6f44a126e..6b1dbd5a10 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels32.cu @@ -21,8 +21,10 @@ namespace tensorrt_llm namespace kernels { +#ifndef FAST_BUILD // skip beam_width between [?, 32] for fast build INSTANTIATE_BEAMSEARCH_K(float, 32); INSTANTIATE_BEAMSEARCH_K(half, 32); +#endif // FAST_BUILD } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu index 7c424d5503..e4c70ee4f4 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernels64.cu @@ -21,8 +21,10 @@ namespace tensorrt_llm namespace kernels { +#ifndef FAST_BUILD // skip beam_width between [?, 64] for fast build INSTANTIATE_BEAMSEARCH_K(float, 64); INSTANTIATE_BEAMSEARCH_K(half, 64); +#endif // FAST_BUILD } // namespace kernels } // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h index fec129aa86..fe4ae9307e 100644 --- a/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h +++ b/cpp/tensorrt_llm/kernels/onlineSoftmaxBeamsearchKernels/onlineSoftmaxBeamsearchKernelsTemplate.h @@ -181,16 +181,11 @@ __launch_bounds__(THREADBLOCK_SIZE) __global__ for (int i = 0; i < MAX_K; ++i) { - if (beam_hyps.num_beams != nullptr && x[total.p[i]] % vocab_size == beam_hyps.end_ids[vector_id]) + if (i < K && beam_hyps.num_beams != nullptr && x[total.p[i]] % vocab_size == beam_hyps.end_ids[vector_id]) { // if beam_token does not belong to top num_beams tokens, it should not // be added. Refer from // https://github.com/huggingface/transformers/blob/v4.24.0/src/transformers/generation_beam_search.py#L257 - if (i >= K) - { - // do nothing - } - else { const float normed_score = (float) total.u[i]; const int num_beam = beam_hyps.num_beams[global_batch_idx]; diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu index 3286af8de8..3b68220468 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.cu @@ -274,7 +274,11 @@ __global__ void topKStage2Sampling(const int* __restrict topKTmpIdBuf, T* topKTm randNum = randNum - expLogit; if (randNum <= 0.0f || i == k - 1) { - ids[batchId][sequenceLengths[batchId]] = topKTmpIdBuf[batchId * stride + s_id[i]] % vocabSize; + int idx = s_id[i]; + // If s_id is -1 here we force output token to the last from vocabulary to get vivid indicator of smth + // going wrong for the debug + auto outputId = idx != -1 ? 
topKTmpIdBuf[batchId * stride + idx] % vocabSize : vocabSize - 1; + ids[batchId][sequenceLengths[batchId]] = outputId; if (cumLogProbs != nullptr || outputLogProbs != nullptr) { float logProb = logf(expLogit); diff --git a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h index 3223fd9b0e..a251760428 100644 --- a/cpp/tensorrt_llm/kernels/samplingTopKKernels.h +++ b/cpp/tensorrt_llm/kernels/samplingTopKKernels.h @@ -27,6 +27,7 @@ namespace kernels //! Computes sequenceLength, finished state, cumLogProbs inplace. //! Sampling per request can be controlled using skipDecode, topPs and topKs parameters. //! Function sets workspaceSize and exits early if workspace is nullptr. +//! If logits are Nan, we set output token to be the last in the vocabulary. //! //! \param workspace pointer to the workspace. Has to be pre-allocated by caller. Function does not take ownership of the //! buffer. diff --git a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu index 850d43a7b8..13df179342 100644 --- a/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu +++ b/cpp/tensorrt_llm/layers/baseBeamSearchLayer.cu @@ -190,7 +190,8 @@ void BaseBeamSearchLayer::forward(BeamSearchOutputParams& outputs, ForwardPar invokeAddBiasApplyPenalties(logits.getPtr(), output_ids_ptr.template getPtr(), outputs.parent_ids_ptr.template getPtr(), input_lengths, sequence_length, embedding_bias, ite, local_batch_size, batch_size, beam_width, vocab_size_, vocab_size_padded_, end_ids, temperature_buf_, - repetition_penalty_buf_, mRepetitionPenaltyType, min_lengths_buf_, max_seq_len, stream_); + mTemperature, repetition_penalty_buf_, mRepetitionPenalty, mRepetitionPenaltyType, min_lengths_buf_, + max_seq_len, stream_); sync_check_cuda_error(); invokeSoftMax(outputs, params); diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt new file mode 100644 index 0000000000..209dab837a --- /dev/null +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -0,0 +1,41 @@ +set(TRTLLM_PYBIND_MODULE bindings) +set(TRTLLM_PYBIND_MODULE + ${TRTLLM_PYBIND_MODULE} + PARENT_SCOPE) + +if(NOT BUILD_PYT) + message( + FATAL_ERROR + "Python bindings for C++ runtime require PyTorch. 
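// The sampling hunk above guards against an invalid top-k slot (for example when
// NaN logits prevented any candidate from being inserted) by emitting the last
// vocabulary id as an easy-to-spot sentinel instead of indexing with -1. A short
// sketch with illustrative names:
#include <vector>

int pickOutputToken(std::vector<int> const& topKCandidateIds, int slot, int vocabSize)
{
    // slot == -1 signals that no valid candidate exists; fall back to the sentinel.
    return slot >= 0 ? topKCandidateIds[slot] % vocabSize : vocabSize - 1;
}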
Please enable BUILD_PYT" + ) +endif() + +execute_process( + COMMAND ${Python3_EXECUTABLE} "-c" + "import pybind11 as pb11; print(pb11.get_cmake_dir(),end='');" + RESULT_VARIABLE PYBIND_CMAKE_DIR_RET + OUTPUT_VARIABLE PYBIND_CMAKE_DIR) + +if(PYBIND_CMAKE_DIR_RET MATCHES 0) + list(APPEND CMAKE_PREFIX_PATH "${PYBIND_CMAKE_DIR}") +else() + message(ERROR "pybind11 CMake directory not found.") +endif() + +find_package(pybind11 REQUIRED) + +set(SRCS bindings.cpp runtime/generationInput.cpp runtime/generationOutput.cpp) + +pybind11_add_module(${TRTLLM_PYBIND_MODULE} ${SRCS}) + +set_property(TARGET ${TRTLLM_PYBIND_MODULE} PROPERTY POSITION_INDEPENDENT_CODE + ON) + +target_link_directories(${TRTLLM_PYBIND_MODULE} PUBLIC + "${TORCH_INSTALL_PREFIX}/lib") +target_link_libraries( + ${TRTLLM_PYBIND_MODULE} + PUBLIC ${STATIC_TARGET} ${Python3_LIBRARIES} ${TORCH_LIBRARIES} torch_python + ${UNDEFINED_FLAG}) +target_compile_definitions(${TRTLLM_PYBIND_MODULE} + PUBLIC TRTLLM_PYBIND_MODULE=${TRTLLM_PYBIND_MODULE}) diff --git a/cpp/tensorrt_llm/pybind/bindings.cpp b/cpp/tensorrt_llm/pybind/bindings.cpp new file mode 100644 index 0000000000..a1060d8976 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/bindings.cpp @@ -0,0 +1,250 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "runtime/generationInput.h" +#include "runtime/generationOutput.h" + +#include "tensorrt_llm/batch_manager/kvCacheConfig.h" +#include "tensorrt_llm/common/quantization.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/gptJsonConfig.h" +#include "tensorrt_llm/runtime/gptSession.h" +#include "tensorrt_llm/runtime/samplingConfig.h" + +namespace py = pybind11; +namespace tb = tensorrt_llm::batch_manager; +namespace tc = tensorrt_llm::common; +namespace tr = tensorrt_llm::runtime; +namespace tpr = tensorrt_llm::pybind::runtime; + +#if not defined(TRTLLM_PYBIND_MODULE) +#error "TRTLLM_PYBIND_MODULE must be defined" +#endif + +PYBIND11_MODULE(TRTLLM_PYBIND_MODULE, m) +{ + m.doc() = "TensorRT-LLM Python bindings for C++ runtime"; + + py::class_(m, "PromptTuningParams") + .def(py::init(), + py::arg("embedding_table") = py::none(), py::arg("tasks") = py::none(), py::arg("vocab_size") = py::none()) + .def_readwrite("embedding_table", &tpr::PromptTuningParams::embeddingTable) + .def_readwrite("tasks", &tpr::PromptTuningParams::tasks) + .def_readwrite("vocab_size", &tpr::PromptTuningParams::vocabSize) + .def_readwrite("prompt_tuning_enabled", &tpr::PromptTuningParams::promptTuningEnabled); + + py::class_(m, "GenerationInput") + .def(py::init(), + py::arg("end_id"), py::arg("pad_id"), py::arg("ids"), py::arg("lengths"), py::arg("packed") = false) + .def_readwrite("end_id", &tpr::GenerationInput::endId) + .def_readwrite("pad_id", &tpr::GenerationInput::padId) + .def_readwrite("ids", &tpr::GenerationInput::ids) + .def_readwrite("lengths", &tpr::GenerationInput::lengths) + .def_readwrite("packed", &tpr::GenerationInput::packed) + .def_readwrite("embedding_bias", &tpr::GenerationInput::embeddingBiasOpt) + .def_readwrite("bad_words_list", &tpr::GenerationInput::badWordsList) + .def_readwrite("stop_words_list", &tpr::GenerationInput::stopWordsList) + .def_readwrite("max_new_tokens", &tpr::GenerationInput::maxNewTokens) + .def_readwrite("prompt_tuning_params", &tpr::GenerationInput::promptTuningParams); + + py::class_(m, "GenerationOutput") + .def(py::init(), py::arg("ids"), + py::arg("lengths")) + .def_readwrite("ids", &tpr::GenerationOutput::ids) + .def_readwrite("lengths", &tpr::GenerationOutput::lengths) + .def_readwrite("log_probs", &tpr::GenerationOutput::logProbs) + .def_readwrite("context_logits", &tpr::GenerationOutput::contextLogits); + + py::class_(m, "KvCacheConfig") + .def(py::init, std::optional>(), py::arg("max_tokens") = py::none(), + py::arg("free_gpu_memory_fraction") = py::none()) + .def_readwrite("max_tokens", &tb::kv_cache_manager::KvCacheConfig::maxTokens) + .def_readwrite("free_gpu_memory_fraction", &tb::kv_cache_manager::KvCacheConfig::freeGpuMemoryFraction); + + py::class_(m, "GptSessionConfig") + .def(py::init(), py::arg("max_batch_size"), py::arg("max_beam_width"), + py::arg("max_sequence_length")) + .def_readwrite("max_batch_size", &tr::GptSession::Config::maxBatchSize) + .def_readwrite("max_beam_width", &tr::GptSession::Config::maxBeamWidth) + .def_readwrite("max_sequence_length", &tr::GptSession::Config::maxSequenceLength) + .def_readwrite("decoder_per_request", &tr::GptSession::Config::decoderPerRequest) + .def_readwrite("cuda_graph_mode", &tr::GptSession::Config::cudaGraphMode) + .def_readwrite("ctx_micro_batch_size", &tr::GptSession::Config::ctxMicroBatchSize) + .def_readwrite("gen_micro_batch_size", &tr::GptSession::Config::genMicroBatchSize) + .def_readwrite("kv_cache_config", 
&tr::GptSession::Config::kvCacheConfig); + + py::enum_(m, "DataType") + .value("FLOAT", nvinfer1::DataType::kFLOAT) + .value("HALF", nvinfer1::DataType::kHALF) + .value("INT8", nvinfer1::DataType::kINT8) + .value("INT32", nvinfer1::DataType::kINT32) + .value("BOOL", nvinfer1::DataType::kBOOL) + .value("UINT8", nvinfer1::DataType::kUINT8) + .value("FP8", nvinfer1::DataType::kFP8) + .value("BF16", nvinfer1::DataType::kBF16) + .value("INT64", nvinfer1::DataType::kINT64) + .export_values(); + + py::enum_(m, "GptModelVariant") + .value("GPT", tr::GptModelConfig::ModelVariant::kGpt) + .value("GLM", tr::GptModelConfig::ModelVariant::kGlm); + + py::class_(m, "QuantMode") + .def_static("none", &tc::QuantMode::none) + .def_static("int4_weights", &tc::QuantMode::int4Weights) + .def_static("int8_weights", &tc::QuantMode::int8Weights) + .def_static("activations", &tc::QuantMode::activations) + .def_static("per_channel_scaling", &tc::QuantMode::perChannelScaling) + .def_static("per_token_scaling", &tc::QuantMode::perTokenScaling) + .def_static("per_group_scaling", &tc::QuantMode::perGroupScaling) + .def_static("int8_kv_cache", &tc::QuantMode::int8KvCache) + .def_static("fp8_kv_cache", &tc::QuantMode::fp8KvCache) + .def_static("fp8_qdq", &tc::QuantMode::fp8Qdq) + .def_property_readonly("value", &tc::QuantMode::value) + .def("is_set", &tc::QuantMode::isSet, py::arg("mode")) + .def_property_readonly("has_int4_weights", &tc::QuantMode::hasInt4Weights) + .def_property_readonly("has_int8_weights", &tc::QuantMode::hasInt8Weights) + .def_property_readonly("has_activations", &tc::QuantMode::hasActivations) + .def_property_readonly("has_per_channel_scaling", &tc::QuantMode::hasPerChannelScaling) + .def_property_readonly("has_per_token_scaling", &tc::QuantMode::hasPerTokenScaling) + .def_property_readonly("has_per_group_scaling", &tc::QuantMode::hasPerGroupScaling) + .def_property_readonly("has_static_activation_scaling", &tc::QuantMode::hasStaticActivationScaling) + .def_property_readonly("has_int8_kv_cache", &tc::QuantMode::hasInt8KvCache) + .def_property_readonly("has_fp8_kv_cache", &tc::QuantMode::hasFp8KvCache) + .def_property_readonly("has_fp8_qdq", &tc::QuantMode::hasFp8Qdq) + .def_property_readonly("has_kv_cache_quant", &tc::QuantMode::hasKvCacheQuant) + .def_static("from_description", &tc::QuantMode::fromDescription, py::arg("quantize_weights") = false, + py::arg("quantize_activations") = false, py::arg("per_token") = false, py::arg("per_channel") = false, + py::arg("use_int4_weights") = false, py::arg("use_int8_kv_cache") = false, + py::arg("use_fp8_kv_kache") = false, py::arg("use_fp8_qdq") = false) + .def(py::self + py::self) + .def(py::self += py::self) + .def(py::self - py::self) + .def(py::self -= py::self) + .def(py::self == py::self) + .def(py::self != py::self); + + py::class_(m, "GptModelConfig") + .def(py::init(), + py::arg("vocab_size"), py::arg("num_layers"), py::arg("num_heads"), py::arg("hidden_size"), + py::arg("data_type")) + .def_property_readonly("vocab_size", &tr::GptModelConfig::getVocabSize) + .def("vocab_size_padded", &tr::GptModelConfig::getVocabSizePadded, py::arg("world_size")) + .def("num_layers", &tr::GptModelConfig::getNbLayers, py::arg("pipeline_parallelism") = 1) + .def_property_readonly("num_heads", &tr::GptModelConfig::getNbHeads) + .def_property_readonly("hidden_size", &tr::GptModelConfig::getHiddenSize) + .def_property_readonly("size_per_head", &tr::GptModelConfig::getSizePerHead) + .def_property_readonly("data_type", &tr::GptModelConfig::getDataType) + 
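// The bindings above follow the standard pybind11 class_/def_readwrite pattern.
// A self-contained sketch of that pattern using a hypothetical stand-in type
// (DemoSamplingConfig) rather than the real runtime classes:
#include <pybind11/pybind11.h>

namespace py = pybind11;

struct DemoSamplingConfig // illustrative stand-in, not the real sampling config
{
    int beamWidth{1};
    float temperature{1.0f};
};

// Builds a Python extension module named demo_bindings exposing the struct.
PYBIND11_MODULE(demo_bindings, m)
{
    py::class_<DemoSamplingConfig>(m, "SamplingConfig")
        .def(py::init<>())
        .def_readwrite("beam_width", &DemoSamplingConfig::beamWidth)
        .def_readwrite("temperature", &DemoSamplingConfig::temperature);
}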
.def_property("num_kv_heads", &tr::GptModelConfig::getNbKvHeads, &tr::GptModelConfig::setNbKvHeads) + .def_property("use_gpt_attention_plugin", + py::overload_cast<>(&tr::GptModelConfig::useGptAttentionPlugin, py::const_), + py::overload_cast(&tr::GptModelConfig::useGptAttentionPlugin)) + .def_property("use_packed_input", py::overload_cast<>(&tr::GptModelConfig::usePackedInput, py::const_), + py::overload_cast(&tr::GptModelConfig::usePackedInput)) + .def_property("use_paged_kv_cache", py::overload_cast<>(&tr::GptModelConfig::usePagedKvCache, py::const_), + py::overload_cast(&tr::GptModelConfig::usePagedKvCache)) + .def_property( + "tokens_per_block", &tr::GptModelConfig::getTokensPerBlock, &tr::GptModelConfig::setTokensPerBlock) + .def_property("quant_mode", &tr::GptModelConfig::getQuantMode, &tr::GptModelConfig::setQuantMode) + .def_property_readonly("supports_inflight_batching", &tr::GptModelConfig::supportsInflightBatching) + .def_property("max_batch_size", &tr::GptModelConfig::getMaxBatchSize, &tr::GptModelConfig::setMaxBatchSize) + .def_property("max_input_len", &tr::GptModelConfig::getMaxInputLen, &tr::GptModelConfig::setMaxInputLen) + .def_property("max_output_len", &tr::GptModelConfig::getMaxOutputLen, &tr::GptModelConfig::setMaxOutputLen) + .def_property("max_num_tokens", &tr::GptModelConfig::getMaxNumTokens, &tr::GptModelConfig::setMaxNumTokens) + .def_property("compute_context_logits", + py::overload_cast<>(&tr::GptModelConfig::computeContextLogits, py::const_), + py::overload_cast(&tr::GptModelConfig::computeContextLogits)) + .def_property("model_variant", &tr::GptModelConfig::getModelVariant, &tr::GptModelConfig::setModelVariant) + .def_property("use_custom_all_reduce", py::overload_cast<>(&tr::GptModelConfig::useCustomAllReduce, py::const_), + py::overload_cast(&tr::GptModelConfig::useCustomAllReduce)); + + py::class_(m, "WorldConfig") + .def(py::init(), py::arg("tensor_parallelism") = 1, + py::arg("pipeline_parallelism") = 1, py::arg("rank") = 0, + py::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode) + .def_property_readonly("size", &tr::WorldConfig::getSize) + .def_property_readonly("tensor_parallelism", &tr::WorldConfig::getTensorParallelism) + .def_property_readonly("pipeline_parallelism", &tr::WorldConfig::getPipelineParallelism) + .def_property_readonly("is_tensor_parallel", &tr::WorldConfig::isTensorParallel) + .def_property_readonly("is_pipeline_parallel", &tr::WorldConfig::isPipelineParallel) + .def_property_readonly("rank", &tr::WorldConfig::getRank) + .def_property_readonly("gpus_per_node", &tr::WorldConfig::getGpusPerNode) + .def_property_readonly("device", &tr::WorldConfig::getDevice) + .def_property_readonly("pipeline_parallel_rank", &tr::WorldConfig::getPipelineParallelRank) + .def_property_readonly("tensor_parallel_rank", &tr::WorldConfig::getTensorParallelRank) + .def_static("mpi", + py::overload_cast, std::optional>( + &tr::WorldConfig::mpi), + py::arg("gpus_per_node") = tr::WorldConfig::kDefaultGpusPerNode, py::arg("tensor_parallelism") = py::none(), + py::arg("pipeline_parallelism") = py::none()); + + py::class_(m, "SamplingConfig") + .def(py::init(), py::arg("beam_width") = 1) + .def_readwrite("beam_width", &tr::SamplingConfig::beamWidth) + .def_readwrite("temperature", &tr::SamplingConfig::temperature) + .def_readwrite("min_length", &tr::SamplingConfig::minLength) + .def_readwrite("repetition_penalty", &tr::SamplingConfig::repetitionPenalty) + .def_readwrite("presence_penalty", &tr::SamplingConfig::presencePenalty) + .def_readwrite("top_k", 
&tr::SamplingConfig::topK) + .def_readwrite("top_p", &tr::SamplingConfig::topP) + .def_readwrite("random_seed", &tr::SamplingConfig::randomSeed) + .def_readwrite("top_p_decay", &tr::SamplingConfig::topPDecay) + .def_readwrite("top_p_min", &tr::SamplingConfig::topPMin) + .def_readwrite("top_p_reset_ids", &tr::SamplingConfig::topPResetIds) + .def_readwrite("beam_search_diversity_rate", &tr::SamplingConfig::beamSearchDiversityRate) + .def_readwrite("length_penalty", &tr::SamplingConfig::lengthPenalty); + + py::class_(m, "GptJsonConfig") + .def(py::init(), py::arg("name"), + py::arg("precision"), py::arg("tensor_parallelism"), py::arg("pipeline_parallelism"), + py::arg("model_config")) + .def_static("parse", py::overload_cast(&tr::GptJsonConfig::parse), py::arg("json")) + .def_static( + "parse_file", [](std::string const& file) { return tr::GptJsonConfig::parse(std::filesystem::path(file)); }, + py::arg("file")) + .def_property_readonly("model_config", &tr::GptJsonConfig::getModelConfig) + .def_property_readonly("name", &tr::GptJsonConfig::getName) + .def_property_readonly("precision", &tr::GptJsonConfig::getPrecision) + .def_property_readonly("tensor_parallelism", &tr::GptJsonConfig::getTensorParallelism) + .def_property_readonly("pipeline_parallelism", &tr::GptJsonConfig::getPipelineParallelism) + .def_property_readonly("world_size", &tr::GptJsonConfig::getWorldSize) + .def("engine_filename", + py::overload_cast( + &tr::GptJsonConfig::engineFilename, py::const_), + py::arg("world_config"), py::arg("model")) + .def("engine_filename", + py::overload_cast(&tr::GptJsonConfig::engineFilename, py::const_), + py::arg("world_config")); + + py::class_(m, "GptSession") + .def(py::init(), py::arg("config"), + py::arg("model_config"), py::arg("world_config"), py::arg("engine_file")) + .def_property_readonly("model_config", &tr::GptSession::getModelConfig) + .def_property_readonly("world_config", &tr::GptSession::getWorldConfig) + .def_property_readonly("device", &tr::GptSession::getDevice) + .def( + "generate", + [](tr::GptSession& self, tpr::GenerationOutput& outputs, tpr::GenerationInput const& inputs, + tr::SamplingConfig const& samplingConfig) + { self.generate(*outputs.toTrtLlm(), *inputs.toTrtLlm(), samplingConfig); }, + py::arg("outputs"), py::arg("inputs"), py::arg("sampling_config")); +} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp b/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp new file mode 100644 index 0000000000..bef4ee167f --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationInput.cpp @@ -0,0 +1,54 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "generationInput.h" + +#include "tensorrt_llm/runtime/generationInput.h" +#include "tensorrt_llm/runtime/torchView.h" + +namespace tr = tensorrt_llm::runtime; + +using namespace tensorrt_llm::pybind::runtime; + +std::shared_ptr PromptTuningParams::toTrtLlm() const +{ + auto ptt = std::make_shared(); + if (embeddingTable) + ptt->embeddingTable = tr::TorchView::of(embeddingTable.value()); + if (tasks) + ptt->tasks = tr::TorchView::of(tasks.value()); + if (vocabSize) + ptt->vocabSize = tr::TorchView::of(vocabSize.value()); + ptt->promptTuningEnabled = promptTuningEnabled; + return ptt; +} + +std::shared_ptr GenerationInput::toTrtLlm() const +{ + auto input = std::make_shared( + endId, padId, tr::TorchView::of(ids.value()), tr::TorchView::of(lengths.value()), packed); + if (embeddingBiasOpt) + input->embeddingBiasOpt = tr::TorchView::of(embeddingBiasOpt.value()); + if (badWordsList) + input->badWordsList = tr::TorchView::of(badWordsList.value()); + if (stopWordsList) + input->stopWordsList = tr::TorchView::of(stopWordsList.value()); + input->maxNewTokens = maxNewTokens; + input->promptTuningParams = *promptTuningParams.toTrtLlm(); + return input; + + return input; +} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationInput.h b/cpp/tensorrt_llm/pybind/runtime/generationInput.h new file mode 100644 index 0000000000..d975dba2ff --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationInput.h @@ -0,0 +1,66 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#pragma once + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/runtime/common.h" +#include "tensorrt_llm/runtime/generationInput.h" + +#include + +#include +#include +#include + +namespace tensorrt_llm::pybind::runtime +{ + +using SizeType = tensorrt_llm::runtime::SizeType; + +class PromptTuningParams : public tensorrt_llm::runtime::GenericPromptTuningParams> +{ +public: + using Base = tensorrt_llm::runtime::GenericPromptTuningParams>; + using TensorPtr = Base::TensorPtr; + using SizeType = Base::SizeType; + + explicit PromptTuningParams( + TensorPtr embeddingTable = TensorPtr(), TensorPtr tasks = TensorPtr(), TensorPtr vocabSize = TensorPtr()) + : GenericPromptTuningParams(std::move(embeddingTable), std::move(tasks), std::move(vocabSize)) + { + } + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; + +class GenerationInput + : public tensorrt_llm::runtime::GenericGenerationInput, PromptTuningParams> +{ +public: + using Base = tensorrt_llm::runtime::GenericGenerationInput, PromptTuningParams>; + using TensorPtr = Base::TensorPtr; + + explicit GenerationInput( + SizeType const endId, SizeType const padId, TensorPtr ids, TensorPtr lengths, bool packed = false) + : GenericGenerationInput(endId, padId, std::move(ids), std::move(lengths), packed) + { + } + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; +} // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp b/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp new file mode 100644 index 0000000000..e6d97b9833 --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationOutput.cpp @@ -0,0 +1,39 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "generationOutput.h" + +#include "tensorrt_llm/runtime/torchView.h" + +namespace tr = tensorrt_llm::runtime; + +using namespace tensorrt_llm::pybind::runtime; + +std::shared_ptr GenerationOutput::toTrtLlm() const +{ + auto output + = std::make_shared(tr::TorchView::of(ids.value()), tr::TorchView::of(lengths.value())); + if (logProbs) + { + output->logProbs = tr::TorchView::of(logProbs.value()); + } + if (contextLogits) + { + output->contextLogits = tr::TorchView::of(contextLogits.value()); + } + // TODO(mseznec): add support for onTokenGenerated + return output; +} diff --git a/cpp/tensorrt_llm/pybind/runtime/generationOutput.h b/cpp/tensorrt_llm/pybind/runtime/generationOutput.h new file mode 100644 index 0000000000..ce34d7fc1c --- /dev/null +++ b/cpp/tensorrt_llm/pybind/runtime/generationOutput.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "tensorrt_llm/runtime/generationOutput.h" + +#include +#include + +namespace tensorrt_llm::pybind::runtime +{ + +class GenerationOutput : public tensorrt_llm::runtime::GenericGenerationOutput> +{ +public: + using Base = tensorrt_llm::runtime::GenericGenerationOutput>; + using TensorPtr = Base::TensorPtr; + + explicit GenerationOutput(TensorPtr ids, TensorPtr lengths) + : GenericGenerationOutput(std::move(ids), std::move(lengths)) + { + } + + [[nodiscard]] std::shared_ptr toTrtLlm() const; +}; + +} // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/runtime/CMakeLists.txt b/cpp/tensorrt_llm/runtime/CMakeLists.txt index 00ae3c80d2..6f59553341 100644 --- a/cpp/tensorrt_llm/runtime/CMakeLists.txt +++ b/cpp/tensorrt_llm/runtime/CMakeLists.txt @@ -17,6 +17,7 @@ include(FetchContent) set(SRCS utils/numpyUtils.cpp utils/sessionUtils.cpp + utils/debugUtils.cu bufferManager.cpp decodingOutput.cpp gptDecoder.cpp @@ -28,6 +29,7 @@ set(SRCS ipcUtils.cpp memoryCounters.cpp ncclCommunicator.cpp + promptTuningParams.cpp runtimeBuffers.cpp runtimeKernels.cu statefulGptDecoder.cpp diff --git a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp index ad4588a4fa..ec2dc1b027 100644 --- a/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp +++ b/cpp/tensorrt_llm/runtime/gptDecoderBatch.cpp @@ -212,9 +212,28 @@ void GptDecoderBatch::newRequest( TensorPtr endIdTensorPtr{ITensor::slice(constPointerCast(dJointInput.endIds), batchIdx, localBatchSize)}; kernels::invokeFill(*endIdTensorPtr, endId, *stream); dInput = std::make_unique(inputLength, localBatchSize, dJointInput.logits, endIdTensorPtr); - dInput->embeddingBias = request.embeddingBias; - dInput->badWordsList = request.badWordsList; - dInput->stopWordsList = request.stopWordsList; + + // Here, we need to add leading 1 dimension since decoderInput expects batchSize as leading dim + // and decoder_batch::Request doesn't have batch dimension + if (request.embeddingBias) + { + TensorPtr biasView = ITensor::view(request.embeddingBias); + biasView->unsqueeze(0); + dInput->embeddingBias = biasView; + } + if (request.badWordsList) + { + TensorPtr badWordsView = ITensor::view(request.badWordsList); + badWordsView->unsqueeze(0); + dInput->badWordsList = badWordsView; + } + if (request.stopWordsList) + { + TensorPtr stopWordsView = ITensor::view(request.stopWordsList); + stopWordsView->unsqueeze(0); + dInput->stopWordsList = stopWordsView; + } + TensorPtr sequenceLimitLength{ ITensor::slice(constPointerCast(dJointInput.sequenceLimitLength), batchIdx, localBatchSize)}; kernels::invokeFill(*sequenceLimitLength, inputLength + maxNewTokens, *stream); @@ -437,10 +456,20 @@ void GptDecoderBatch::newBatch(GenerationInput const& inputs, SamplingConfig con inputView = ITensor::slice(inputs.ids, batchIdx, 1); inputView->reshape(inputShape); } - auto request = decoder_batch::Request{inputView, std::nullopt, inputs.endId, inputs.padId}; - request.embeddingBias = inputs.embeddingBiasOpt; - request.badWordsList = inputs.badWordsList; - request.stopWordsList = inputs.stopWordsList; 
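// The newRequest hunk above wraps each per-request tensor in a view and
// unsqueezes a leading dimension so its shape matches the batched decoder
// inputs. A plain stand-in sketch of that shape change, without the runtime
// tensor API:
#include <cstdint>
#include <vector>

std::vector<std::int64_t> withLeadingBatchDim(std::vector<std::int64_t> shape)
{
    shape.insert(shape.begin(), 1); // [d0, d1, ...] -> [1, d0, d1, ...]
    return shape;
}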
+ auto request = decoder_batch::Request{inputView, inputs.maxNewTokens, inputs.endId, inputs.padId}; + + if (inputs.embeddingBiasOpt) + { + TLLM_THROW("newBatch doesn't support embeddingBias yet."); + } + if (inputs.badWordsList) + { + TLLM_THROW("newBatch doesn't support badWordsList yet."); + } + if (inputs.stopWordsList) + { + TLLM_THROW("newBatch doesn't support stopWordsList yet."); + } newRequest(batchIdx, request, extractSamplingConfig(samplingConfig, batchIdx)); } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); diff --git a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp index 1f0169f7ce..3860e4b611 100644 --- a/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp +++ b/cpp/tensorrt_llm/runtime/gptJsonConfig.cpp @@ -38,9 +38,10 @@ FieldType parseJsonFieldOr(Json const& json, std::string_view name, FieldType de { value = json.at(name).template get(); } - catch (nlohmann::json::out_of_range&) + catch (nlohmann::json::out_of_range& e) { - // std::cerr << e.what() << '\n'; + TLLM_LOG_WARNING("Parameter %s cannot be read from json:", std::string(name).c_str()); + TLLM_LOG_WARNING(e.what()); } return value; } @@ -102,6 +103,8 @@ GptJsonConfig parseJson(InputType&& i) auto const maxInputLen = parseJsonFieldOr(builderConfig, "max_input_len", 0); auto const maxOutputLen = parseJsonFieldOr(builderConfig, "max_output_len", 0); auto const maxNumTokens = parseJsonFieldOptional(builderConfig, "max_num_tokens"); + auto const maxPromptEmbeddingTableSize + = parseJsonFieldOr(builderConfig, "max_prompt_embedding_table_size", 0); auto const computeContextLogits = parseJsonFieldOr(builderConfig, "gather_all_token_logits", false); @@ -127,11 +130,12 @@ GptJsonConfig parseJson(InputType&& i) modelConfig.setMaxInputLen(maxInputLen); modelConfig.setMaxOutputLen(maxOutputLen); modelConfig.setMaxNumTokens(maxNumTokens); + modelConfig.setMaxPromptEmbeddingTableSize(maxPromptEmbeddingTableSize); if (name == std::string("chatglm-6b")) { modelConfig.setModelVariant(GptModelConfig::ModelVariant::kGlm); - // kGlm is only for ChatGLM-6B, not for ChatGLM2-6B + // kGlm is only for ChatGLM-6B and Glm-10B } return GptJsonConfig{name, precision, tensorParallelism, pipelineParallelism, modelConfig}; diff --git a/cpp/tensorrt_llm/runtime/gptSession.cpp b/cpp/tensorrt_llm/runtime/gptSession.cpp index d4cb14aaf2..68c70d377a 100644 --- a/cpp/tensorrt_llm/runtime/gptSession.cpp +++ b/cpp/tensorrt_llm/runtime/gptSession.cpp @@ -21,6 +21,7 @@ #include "iBuffer.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" +#include "tensorrt_llm/common/stringUtils.h" #include "tensorrt_llm/kernels/decodingKernels.h" #include "tensorrt_llm/runtime/gptDecoderBatch.h" #include "tensorrt_llm/runtime/ipcUtils.h" @@ -48,7 +49,6 @@ GptSession::GptSession(Config const& sessionConfig, GptModelConfig const& modelC , mDevice{utils::initDevice(worldConfig)} , mLogger{logger ? std::move(logger) : std::make_shared()} , mRuntime{std::make_shared(engineBuffer, engineSize, *mLogger)} - , mNumMicroBatches{worldConfig.getPipelineParallelism()} , mDecoders{} , mBuffers{} , mCudaGraphInstances{} @@ -59,6 +59,9 @@ GptSession::GptSession(Config const& sessionConfig, GptModelConfig const& modelC mCommStream = std::make_shared(); } + TLLM_CHECK_WITH_INFO(!(mModelConfig.usePromptTuning() && !mModelConfig.useGptAttentionPlugin()), + "Prompt tuning is only enabled with GPT attention plugin."); + // TODO compare expected and runtime tensor names? 
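    // Illustrative restatement of the check above: it encodes the implication
    // "usePromptTuning() implies useGptAttentionPlugin()". A hypothetical caller that builds its own
    // GptModelConfig could apply the equivalent guard before constructing a session, e.g.:
    //
    //     if (modelConfig.usePromptTuning() && !modelConfig.useGptAttentionPlugin())
    //         TLLM_THROW("Prompt tuning requires the GPT attention plugin to be enabled.");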
setup(sessionConfig); @@ -74,7 +77,7 @@ BufferManager& GptSession::getBufferManager() const return mRuntime->getBufferManager(); } -void GptSession::createContexts(SizeType numMicroBatches, bool useCudaGraphs) +void GptSession::createContexts(SizeType numCtxBatches, SizeType numGenBatches, bool useCudaGraphs) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); mRuntime->clearContexts(); @@ -82,31 +85,22 @@ void GptSession::createContexts(SizeType numMicroBatches, bool useCudaGraphs) if (useCudaGraphs) { // Instantiate multiple graph instances for flip-flopping - mCudaGraphInstances.resize(2 * numMicroBatches); + mCudaGraphInstances.resize(2 * numGenBatches); } auto const numProfiles = mRuntime->getNbProfiles(); TLLM_CHECK_WITH_INFO( numProfiles == 1 || numProfiles == 2, "GPT only expects one optimization profile or two optimization profiles"); - if (numProfiles == 2) - { - auto constexpr ctxContextId = 0; - auto constexpr genContextId = 1; - // Instantiate 2 contexts for flip-flopping - for (auto i = 0; i < 2 * numMicroBatches; ++i) - mRuntime->addContext(genContextId); - // Instantiate 1 context for context phase - for (auto i = 0; i < numMicroBatches; ++i) - mRuntime->addContext(ctxContextId); - } - else - { - auto constexpr contextId = 0; - // Instantiate 2 contexts for flip-flopping - for (auto i = 0; i < 2 * numMicroBatches; ++i) - mRuntime->addContext(contextId); - } + auto constexpr ctxContextId = 0; + auto const genContextId = static_cast(numProfiles == 2); + // Instantiate 2 contexts for flip-flopping + for (auto i = 0; i < 2 * numGenBatches; ++i) + mRuntime->addContext(genContextId); + // Instantiate 1 context for context phase + for (auto i = 0; i < numCtxBatches; ++i) + mRuntime->addContext(ctxContextId); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } @@ -184,11 +178,48 @@ void GptSession::createCustomAllReduceWorkspace( { setPeerAccess(mWorldConfig, true); + mIpcMemoryHandles.clear(); + const std::size_t bufferSize = static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength + * mModelConfig.getHiddenSize() * mWorldConfig.getTensorParallelism() * sizeof(float); + mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, bufferSize)); + mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); + mIpcMemoryHandles.emplace_back(std::make_shared(mWorldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); + auto& manager = mRuntime->getBufferManager(); - for (const auto& buffer : mBuffers) + mCommPtrs = manager.cpu( + ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * mWorldConfig.getTensorParallelism()}), + nvinfer1::DataType::kINT64); + const auto commPtrsData = bufferCast(*mCommPtrs); + + for (size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) { - buffer->createCustomAllReduceWorkspace( - maxBatchSize, maxBeamWidth, maxSequenceLength, mModelConfig.getHiddenSize(), mWorldConfig, manager); + const auto& memCommPtrs = mIpcMemoryHandles[memIdx]->getCommPtrsTensor(); + for (SizeType tpIdx = 0; tpIdx < mWorldConfig.getTensorParallelism(); tpIdx++) + { + commPtrsData[memIdx * mWorldConfig.getTensorParallelism() + tpIdx] = memCommPtrs[tpIdx]; + } + } +} + +GptSession::MicroBatchConfig::MicroBatchConfig(SizeType maxBatchSize, SizeType pipelineParallelism, + std::optional genMicroBatchSize, std::optional ctxMicroBatchSize) +{ + if (genMicroBatchSize || ctxMicroBatchSize) + { + genBatchSize = genMicroBatchSize.value_or(maxBatchSize); + TLLM_CHECK(genBatchSize <= maxBatchSize); + ctxBatchSize = 
ctxMicroBatchSize.value_or(genBatchSize); + TLLM_CHECK_WITH_INFO(genBatchSize % ctxBatchSize == 0, + tc::fmtstr( + "Generation batch size (%d) must be divisible by context batch size (%d)", genBatchSize, ctxBatchSize) + .c_str()); + numGenBatches = tc::ceilDiv(maxBatchSize, genBatchSize); + numCtxBatches = numGenBatches * (genBatchSize / ctxBatchSize); + } + else + { + numCtxBatches = numGenBatches = pipelineParallelism; + ctxBatchSize = genBatchSize = tc::ceilDiv(maxBatchSize, numGenBatches); } } @@ -202,12 +233,12 @@ void GptSession::setup(Config const& sessionConfig) auto const maxBeamWidth = sessionConfig.maxBeamWidth; auto const maxSequenceLength = sessionConfig.maxSequenceLength; - if (sessionConfig.numMicroBatches) - mNumMicroBatches = sessionConfig.numMicroBatches.value(); - createContexts(mNumMicroBatches, sessionConfig.cudaGraphMode); - createBuffers(mNumMicroBatches); + mMicroBatchConfig = MicroBatchConfig(maxBatchSize, mWorldConfig.getPipelineParallelism(), + sessionConfig.genMicroBatchSize, sessionConfig.ctxMicroBatchSize); + + createContexts(mMicroBatchConfig.numCtxBatches, mMicroBatchConfig.numGenBatches, sessionConfig.cudaGraphMode); + createBuffers(mMicroBatchConfig.numGenBatches); - auto const microBatchSize = tc::ceilDiv(maxBatchSize, mNumMicroBatches); // Store this param related to decoder buffer size and kv cache manager to check against // the input shape with the params given in generate(). // gptDecoderBatch does not resize buffers, but allows smaller batchSize and beamWidth. @@ -222,28 +253,29 @@ void GptSession::setup(Config const& sessionConfig) if (mWorldConfig.isLastPipelineParallelRank()) { auto const logitsType = mRuntime->getEngine().getTensorDataType("logits"); - createDecoders(microBatchSize, maxBeamWidth, maxSequenceLength, logitsType, sessionConfig.decoderPerRequest, - mNumMicroBatches); + createDecoders(mMicroBatchConfig.genBatchSize, maxBeamWidth, maxSequenceLength, logitsType, + sessionConfig.decoderPerRequest, mMicroBatchConfig.numGenBatches); } - if (mWorldConfig.isPipelineParallel() || mNumMicroBatches > 1) + if (mWorldConfig.isPipelineParallel() || mMicroBatchConfig.numGenBatches > 1) { mReceivedEvents.clear(); - for (SizeType i = 0; i < mNumMicroBatches; ++i) + for (SizeType i = 0; i < mMicroBatchConfig.numGenBatches; ++i) mReceivedEvents.emplace_back(); } if (mWorldConfig.isTensorParallel() && mModelConfig.useCustomAllReduce()) { - createCustomAllReduceWorkspace(microBatchSize, maxBeamWidth, maxSequenceLength); + createCustomAllReduceWorkspace(mMicroBatchConfig.genBatchSize, maxBeamWidth, maxSequenceLength); } - // we don't know maxInputLength and maxNewTokens yet and ignore those for pre-allocation - auto const generationConfig - = RuntimeBuffers::GenerationConfig{microBatchSize, maxBeamWidth, 0, 0, maxSequenceLength}; - for (auto& buffers : mBuffers) - buffers->reshape(generationConfig, mModelConfig, mWorldConfig); + { + // we don't know maxInputLength yet and ignore it for pre-allocation + buffers->generationConfig + = RuntimeBuffers::GenerationConfig{mMicroBatchConfig.genBatchSize, maxBeamWidth, 0, maxSequenceLength}; + buffers->reshape(mModelConfig, mWorldConfig); + } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } @@ -263,8 +295,8 @@ void GptSession::kvCacheAddSequences(SizeType beamWidth, SizeType microBatchId, } } -ITensor::SharedPtr GptSession::initNewTokens( - GenerationInput const& inputs, SamplingConfig const& samplingConfig, SizeType microBatchId) +ITensor::SharedPtr GptSession::initDecoder(ITensor& outputIds, GenerationInput 
const& inputs, + SamplingConfig const& samplingConfig, SizeType microBatchId) const { if (mWorldConfig.isLastPipelineParallelRank()) { @@ -274,9 +306,29 @@ ITensor::SharedPtr GptSession::initNewTokens( } else if (mWorldConfig.isFirstPipelineParallelRank()) { + auto& manager = mRuntime->getBufferManager(); + auto const& stream = mRuntime->getStreamPtr(); + + auto const inputLengths = inputs.lengths; + auto const batchSize = static_cast(inputLengths->getSize()); + + auto const inputLengthsHost = manager.copyFrom(*inputLengths, MemoryType::kCPU); + auto const* inputLengthsData = bufferCast(*inputLengthsHost); + SizeType const maxInputLength = *std::max_element(inputLengthsData, inputLengthsData + inputLengths->getSize()); + + ITensor::SharedPtr inputOffsets = manager.emptyTensor(MemoryType::kGPU, TRTDataType::value); + if (inputs.packed) + { + inputOffsets->reshape(ITensor::makeShape({batchSize + 1})); + manager.setZero(*inputOffsets); + kernels::invokeInclusiveSum(*ITensor::slice(inputOffsets, 1), *inputLengths, manager, *stream); + } + + kernels::initOutputIds(outputIds, *inputs.ids, *inputLengths, *inputOffsets, inputs.padId, inputs.endId, + maxInputLength, inputs.packed, *stream); + auto const beamWidth = samplingConfig.beamWidth; - auto const batchSize = static_cast(inputs.lengths->getSize()); - return mRuntime->getBufferManager().gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32); + return manager.gpu(ITensor::makeShape({batchSize, beamWidth}), nvinfer1::DataType::kINT32); } else { @@ -286,32 +338,34 @@ ITensor::SharedPtr GptSession::initNewTokens( namespace { -std::vector splitInputs( - GenerationInput const& inputs, SizeType numMicroBatches, BufferManager& manager) +std::tuple, std::vector, std::vector> splitInputIds( + GenerationInput const& inputs, SizeType microBatchSize, BufferManager& manager) { - std::vector inputBatches; auto const numRequests = inputs.lengths->getShape().d[0]; - auto const microBatchSize = tc::ceilDiv(numRequests, numMicroBatches); + std::vector inputIds; + std::vector inputLengths; + std::vector microBatchOffsets(1, 0); if (inputs.packed) { - auto contextLengthsHost = manager.copyFrom(*inputs.lengths, MemoryType::kCPU); + auto const contextLengthsHost = manager.copyFrom(*inputs.lengths, MemoryType::kCPU); ITensor::SharedPtr inputIdsView = ITensor::view(inputs.ids); inputIdsView->squeeze(0); - auto contextLengthsRange = BufferRange(*contextLengthsHost); + auto const contextLengthsRange = BufferRange(*contextLengthsHost); auto tokensBegin = 0; for (auto offset = 0; offset < numRequests; offset += microBatchSize) { - auto batchSize = std::min(microBatchSize, numRequests - offset); - auto numTokens = std::accumulate( + auto const batchSize = std::min(microBatchSize, numRequests - offset); + auto const numTokens = std::accumulate( contextLengthsRange.begin() + offset, contextLengthsRange.begin() + offset + batchSize, 0); ITensor::SharedPtr batchInputs = ITensor::slice(inputIdsView, tokensBegin, numTokens); batchInputs->reshape(ITensor::makeShape({1, numTokens})); - inputBatches.emplace_back(inputs.endId, inputs.padId, batchInputs, - ITensor::slice(inputs.lengths, offset, batchSize), inputs.packed); + inputIds.emplace_back(std::move(batchInputs)); + inputLengths.emplace_back(ITensor::slice(inputs.lengths, offset, batchSize)); + microBatchOffsets.emplace_back(offset + batchSize); tokensBegin += numTokens; } @@ -320,24 +374,66 @@ std::vector splitInputs( { for (auto offset = 0; offset < numRequests; offset += microBatchSize) { - auto 
batchSize = std::min(microBatchSize, numRequests - offset);
-            inputBatches.emplace_back(inputs.endId, inputs.padId, ITensor::slice(inputs.ids, offset, batchSize),
-                ITensor::slice(inputs.lengths, offset, batchSize), inputs.packed);
+            auto const batchSize = std::min(microBatchSize, numRequests - offset);
+
+            inputIds.emplace_back(ITensor::slice(inputs.ids, offset, batchSize));
+            inputLengths.emplace_back(ITensor::slice(inputs.lengths, offset, batchSize));
+            microBatchOffsets.emplace_back(offset + batchSize);
         }
     }
 
-    for (auto& batch : inputBatches)
+    return {inputIds, inputLengths, microBatchOffsets};
+}
+
+std::vector splitInputs(GenerationInput const& inputs, SizeType microBatchSize, BufferManager& manager)
+{
+    TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__);
+    auto [inputIds, inputLengths, microBatchOffsets] = splitInputIds(inputs, microBatchSize, manager);
+
+    std::vector inputBatches;
+    for (std::size_t batchId = 0; batchId < inputIds.size(); ++batchId)
     {
+        inputBatches.emplace_back(
+            inputs.endId, inputs.padId, std::move(inputIds[batchId]), std::move(inputLengths[batchId]), inputs.packed);
+    }
+
+    for (std::size_t batchId = 0; batchId < inputBatches.size(); ++batchId)
+    {
+        auto& batch = inputBatches[batchId];
+        auto const offset = microBatchOffsets[batchId];
+        auto const batchSize = microBatchOffsets[batchId + 1] - offset;
+
         if (inputs.embeddingBiasOpt)
             batch.embeddingBiasOpt = inputs.embeddingBiasOpt;
         if (inputs.badWordsList)
-            batch.badWordsList = inputs.badWordsList;
+        {
+            auto const& shape = inputs.badWordsList->getShape();
+            if (shape.nbDims == 2)
+            {
+                batch.badWordsList = inputs.badWordsList;
+            }
+            else
+            {
+                assert(shape.nbDims == 3);
+                batch.badWordsList = ITensor::slice(inputs.badWordsList, offset, batchSize);
+            }
+        }
         if (inputs.stopWordsList)
-            batch.stopWordsList = inputs.stopWordsList;
+        {
+            batch.stopWordsList = ITensor::slice(inputs.stopWordsList, offset, batchSize);
+        }
         if (inputs.maxNewTokens)
             batch.maxNewTokens = inputs.maxNewTokens;
+
+        if (inputs.promptTuningParams.embeddingTable)
+            batch.promptTuningParams.embeddingTable = inputs.promptTuningParams.embeddingTable;
+        if (inputs.promptTuningParams.tasks)
+            batch.promptTuningParams.tasks = ITensor::slice(inputs.promptTuningParams.tasks, offset, batchSize);
+        if (inputs.promptTuningParams.vocabSize)
+            batch.promptTuningParams.vocabSize = inputs.promptTuningParams.vocabSize;
     }
+    TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
     return inputBatches;
 }
 
@@ -381,40 +477,33 @@ void GptSession::generate(
         outputs.contextLogits->reshape(ITensor::makeShape({batchSize, maxInputLength, vocabSizePadded}));
     }
 
-    auto const numMicroBatches = std::min(batchSize, mNumMicroBatches);
-    if (numMicroBatches == 1)
+    if (batchSize <= mMicroBatchConfig.genBatchSize)
     {
         std::vector microBatches{inputs};
         generateBatched(outputs, microBatches, samplingConfig);
     }
     else
     {
-        auto const microBatches = splitInputs(inputs, numMicroBatches, manager);
+        auto const microBatches = splitInputs(inputs, mMicroBatchConfig.genBatchSize, manager);
         generateBatched(outputs, microBatches, samplingConfig);
     }
     TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__);
 }
 
-std::function GptSession::createOnTokenGeneratedCallback(
-    GenerationOutput& outputs, SizeType numMicroBatches)
+std::function GptSession::createOnTokenGeneratedCallback(GenerationOutput& outputs)
 {
     if (outputs.onTokenGenerated && mWorldConfig.isFirstPipelineParallelRank())
    {
-        ITensor::SharedPtr outputIds{mWorldConfig.isPipelineParallel() || mNumMicroBatches > 1
+        ITensor::SharedPtr
outputIds{mWorldConfig.isPipelineParallel() || mMicroBatchConfig.numGenBatches > 1 ? outputs.ids : mDecoders.front()->getOutputIds()}; - auto const lastMicroBatchId = numMicroBatches - 1; - return [onTokenGenerated = outputs.onTokenGenerated, outputIds = std::move(outputIds), lastMicroBatchId]( - SizeType microBatchId, SizeType step, bool finished) - { - if (microBatchId == lastMicroBatchId) - onTokenGenerated(outputIds, step, finished); - }; + return [onTokenGenerated = outputs.onTokenGenerated, outputIds = std::move(outputIds)]( + SizeType step, bool finished) { onTokenGenerated(outputIds, step, finished); }; } else { - return [](SizeType microBatchId, SizeType step, bool finished) {}; + return [](SizeType step, bool finished) {}; } } @@ -426,52 +515,50 @@ void GptSession::generateBatched( auto& manager = mRuntime->getBufferManager(); auto const numMicroBatches = static_cast(microBatches.size()); TLLM_CHECK(numMicroBatches > 0); - TLLM_CHECK(numMicroBatches <= mNumMicroBatches); + TLLM_CHECK(numMicroBatches <= mMicroBatchConfig.numGenBatches); SizeType const beamWidth{samplingConfig.beamWidth}; // Initialize and reshape buffers - std::vector generationConfigs; for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { auto const& microBatchInputs = microBatches.at(microBatchId); auto& buffers = *mBuffers.at(microBatchId); - TLLM_CHECK_WITH_INFO(buffers.allocated, "Buffers not allocated, please call setup first!"); - buffers.initContextLengths(microBatchInputs.lengths, manager); - generationConfigs.emplace_back( - RuntimeBuffers::GenerationConfig::fromInput(*microBatchInputs.ids, *buffers.contextLengthsHost, - microBatchInputs.packed, beamWidth, mDecoderMaxSequenceLength, microBatchInputs.maxNewTokens)); - buffers.reshape(generationConfigs.back(), mModelConfig, mWorldConfig); + buffers.initFromInput(*microBatchInputs.ids, microBatchInputs.lengths, microBatchInputs.packed, beamWidth, + mDecoderMaxSequenceLength, manager); + buffers.reshape(mModelConfig, mWorldConfig); + buffers.reset(manager); } - auto minMaxNewTokens = std::numeric_limits::max(); std::vector microBatchOffsets(1, 0); microBatchOffsets.reserve(numMicroBatches + 1); for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { - auto const& generationConfig = generationConfigs.at(microBatchId); - minMaxNewTokens = std::min(minMaxNewTokens, generationConfig.maxNewTokens); + auto const& generationConfig = mBuffers.at(microBatchId)->generationConfig; microBatchOffsets.emplace_back(microBatchOffsets.back() + generationConfig.batchSize); } for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { auto& buffers = *mBuffers.at(microBatchId); - auto const& generationConfig = generationConfigs.at(microBatchId); auto const batchOffset = microBatchOffsets.at(microBatchId); kvCacheAddSequences(beamWidth, microBatchId, batchOffset); auto const& microBatchInputs = microBatches.at(microBatchId); - buffers.newTokens = initNewTokens(microBatchInputs, samplingConfig, microBatchId); - auto const microBatchSize = generationConfig.batchSize; + auto const microBatchSize = buffers.generationConfig.batchSize; buffers.outputIds = ITensor::slice(outputs.ids, batchOffset, microBatchSize); buffers.outputLengths = ITensor::slice(outputs.lengths, batchOffset, microBatchSize); + buffers.newTokens = initDecoder(*buffers.outputIds, microBatchInputs, samplingConfig, microBatchId); if (mWorldConfig.isLastPipelineParallelRank() && mModelConfig.computeContextLogits()) { buffers.logits = 
ITensor::slice(outputs.contextLogits, batchOffset, microBatchSize); } + if (mModelConfig.usePromptTuning()) + { + buffers.promptTuningParams = microBatchInputs.promptTuningParams; + } } // Prepare the onTokenGenerated callback - auto const onTokenGenerated = createOnTokenGeneratedCallback(outputs, numMicroBatches); + auto const onTokenGenerated = createOnTokenGeneratedCallback(outputs); if (useCudaGraphs()) { @@ -483,101 +570,25 @@ void GptSession::generateBatched( auto kvCacheManager = mModelConfig.usePagedKvCache() ? mKvCacheManager.get() : nullptr; - std::vector inputBuffers(numMicroBatches * 2); - std::vector outputBuffers(numMicroBatches * 2); + executeContextStep(microBatches, microBatchOffsets, kvCacheManager); + std::vector microBatchesFinished(numMicroBatches, false); - auto notFinished = [µBatchesFinished]() - { return std::any_of(microBatchesFinished.begin(), microBatchesFinished.end(), [](bool x) { return !x; }); }; - - for (SizeType step = 0; step < minMaxNewTokens && notFinished(); ++step) + SizeType numBatchesFinished{0}; + SizeType step{0}; + while (numBatchesFinished < numMicroBatches) { - auto const flipFlopId = step % 2; - for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) - { - if (microBatchesFinished.at(microBatchId)) - continue; + ++step; + numBatchesFinished + += executeGenerationStep(step, microBatches, microBatchOffsets, kvCacheManager, microBatchesFinished); - auto& buffers = *mBuffers.at(microBatchId); - auto& generationConfig = generationConfigs.at(microBatchId); - - auto const contextId = flipFlopId * numMicroBatches + microBatchId; - auto& inputBuffer = inputBuffers[contextId]; - auto& outputBuffer = outputBuffers[contextId]; - - if (step == 0) - { - SizeType const contextIdForContextPhase - = (mRuntime->getNbProfiles() == 2 ? 
2 * mNumMicroBatches : 0) + microBatchId; - - auto const& microBatchInputs = microBatches.at(microBatchId); - buffers.prepareContextStep(microBatchInputs.ids, microBatchInputs.padId, manager, kvCacheManager, - microBatchOffsets.at(microBatchId), generationConfig, mModelConfig, mWorldConfig); - buffers.getRuntimeBuffers( - inputBuffer, outputBuffer, step, microBatchInputs.ids, mModelConfig, mWorldConfig); - mRuntime->setInputTensors(contextIdForContextPhase, inputBuffer); - mRuntime->setOutputTensors(contextIdForContextPhase, outputBuffer); - - TLLM_CHECK_WITH_INFO( - mRuntime->executeContext(contextIdForContextPhase), "Executing TRT engine in context step failed!"); - sync_check_cuda_error(); - - buffers.postContextStep(manager, generationConfig, mModelConfig, mWorldConfig); - sync_check_cuda_error(); - } - else - { - auto nextInputIds = buffers.prepareNextStep(step - 1, manager, kvCacheManager, - microBatchOffsets.at(microBatchId), generationConfig, mModelConfig, mWorldConfig); - buffers.getRuntimeBuffers(inputBuffer, outputBuffer, step, nextInputIds, mModelConfig, mWorldConfig); - mRuntime->setInputTensors(contextId, inputBuffer); - mRuntime->setOutputTensors(contextId, outputBuffer); - - if (useCudaGraphs()) - { - mCudaGraphInstances.at(contextId).prepareNextGraph(*mRuntime, contextId); - } - - // check decoder result of previous iteration - auto const microBatchSize = generationConfig.batchSize; - auto const shouldStop = shouldStopSync(microBatchSize, beamWidth, microBatchId); - onTokenGenerated(microBatchId, step - 1, shouldStop); - - if (shouldStop) - { - mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, "GPT decoding finished early"); - microBatchesFinished.at(microBatchId) = true; - continue; - } - - if (useCudaGraphs()) - { - auto& cudaGraphInstance = mCudaGraphInstances.at(contextId); - TLLM_CHECK(cudaGraphInstance.hasInstance()); - cudaGraphInstance.launch(mRuntime->getStream()); - } - else - { - TLLM_CHECK_WITH_INFO(mRuntime->executeContext(contextId), - tc::fmtstr("Executing TRT engine in step %d failed!", step)); - } - sync_check_cuda_error(); - } - - std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput); - - auto const maxInputLength = generationConfigs.at(microBatchId).maxInputLength; - auto const decoderStep = maxInputLength + step; - decoderStepAsync(decoderStep, microBatchId); - } + onTokenGenerated(step - 1, numBatchesFinished == numMicroBatches); } // Collect the results for the last step for (auto microBatchId = 0; microBatchId < numMicroBatches; ++microBatchId) { - auto const& generationConfig = generationConfigs.at(microBatchId); + auto const& generationConfig = mBuffers.at(microBatchId)->generationConfig; auto const microBatchSize = generationConfig.batchSize; - auto const shouldStop = shouldStopSync(microBatchSize, beamWidth, microBatchId); - onTokenGenerated(microBatchId, minMaxNewTokens - 1, shouldStop); auto const firstBatchIdx = microBatchOffsets.at(microBatchId); if (mModelConfig.usePagedKvCache()) @@ -594,10 +605,129 @@ void GptSession::generateBatched( else if (!mWorldConfig.isPipelineParallel()) manager.copy(*mDecoders.at(microBatchId)->getOutputIds(), *mBuffers.at(microBatchId)->outputIds); } + manager.getStream().synchronize(); TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } +void GptSession::executeContextStep(std::vector const& generationBatches, + std::vector const& generationBatchOffsets, KvCacheManager const* kvCacheManager) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& manager = 
mRuntime->getBufferManager(); + + auto const numGenerationBatches = static_cast(generationBatches.size()); + auto constexpr step = 0; + for (auto generationBatchId = 0; generationBatchId < numGenerationBatches; ++generationBatchId) + { + auto const& generationBatchInputs = generationBatches.at(generationBatchId); + auto& generationBuffers = *mBuffers.at(generationBatchId); + + auto const contextBatchSize = mMicroBatchConfig.ctxBatchSize; + auto [inputIds, inputLengths, contextBatchOffsets] + = splitInputIds(generationBatchInputs, contextBatchSize, manager); + auto contextBuffers = generationBuffers.split(contextBatchSize, mModelConfig, mWorldConfig); + TLLM_CHECK(inputIds.size() == contextBuffers.size()); + auto const numContextBatches = static_cast(contextBuffers.size()); + + for (auto contextBatchId = 0; contextBatchId < numContextBatches; ++contextBatchId) + { + auto batchOffset = generationBatchOffsets.at(generationBatchId) + contextBatchOffsets.at(contextBatchId); + auto& buffers = contextBuffers.at(contextBatchId); + auto& inputBuffer = buffers.inputBuffers[0]; + auto& outputBuffer = buffers.outputBuffers[0]; + + auto const contextId = mMicroBatchConfig.getCtxContextId(generationBatchId, contextBatchId); + + buffers.prepareContextStep(inputIds.at(contextBatchId), generationBatchInputs.padId, manager, + kvCacheManager, batchOffset, mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers( + inputBuffer, outputBuffer, step, inputIds.at(contextBatchId), mCommPtrs, mModelConfig, mWorldConfig); + mRuntime->setInputTensors(contextId, inputBuffer); + mRuntime->setOutputTensors(contextId, outputBuffer); + + TLLM_CHECK_WITH_INFO(mRuntime->executeContext(contextId), "Executing TRT engine in context step failed!"); + sync_check_cuda_error(); + } + + generationBuffers.postContextStep(contextBuffers, manager, mModelConfig, mWorldConfig); + sync_check_cuda_error(); + + std::swap(generationBuffers.cacheIndirectionDecoderInput, generationBuffers.cacheIndirectionDecoderOutput); + + auto const decoderStep = generationBuffers.generationConfig.maxInputLength + step; + decoderStepAsync(decoderStep, generationBatchId); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +SizeType GptSession::executeGenerationStep(SizeType step, std::vector const& microBatches, + std::vector const& microBatchOffsets, KvCacheManager* kvCacheManager, + std::vector& microBatchesFinished) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto& manager = mRuntime->getBufferManager(); + + auto const numMicroBatches = static_cast(microBatches.size()); + SizeType numBatchesFinished{0}; + + auto const flipFlopId = step % 2; + for (auto generationBatchId = 0; generationBatchId < numMicroBatches; ++generationBatchId) + { + if (microBatchesFinished.at(generationBatchId)) + continue; + + auto& buffers = *mBuffers.at(generationBatchId); + auto const& generationConfig = buffers.generationConfig; + + auto const contextId = mMicroBatchConfig.getGenContextId(flipFlopId, generationBatchId); + auto& inputBuffer = buffers.inputBuffers[flipFlopId]; + auto& outputBuffer = buffers.outputBuffers[flipFlopId]; + + auto nextInputIds = buffers.prepareNextStep( + step - 1, manager, kvCacheManager, microBatchOffsets.at(generationBatchId), mModelConfig, mWorldConfig); + buffers.getRuntimeBuffers(inputBuffer, outputBuffer, step, nextInputIds, mCommPtrs, mModelConfig, mWorldConfig); + mRuntime->setInputTensors(contextId, inputBuffer); + mRuntime->setOutputTensors(contextId, outputBuffer); + + if (useCudaGraphs()) + { + 
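            // Background for this block: createContexts() sizes mCudaGraphInstances to
            // 2 * numGenBatches, so every generation micro-batch owns a pair of graph instances and
            // flipFlopId = step % 2 selects one of the pair each step. Assuming getGenContextId()
            // keeps the previous flipFlopId * numGenBatches + generationBatchId layout, two
            // micro-batches would alternate between instances {0, 2} and {1, 3} on even/odd steps.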
mCudaGraphInstances.at(contextId).prepareNextGraph(*mRuntime, contextId); + } + + // check decoder result of previous iteration + if (shouldStopSync(generationConfig.batchSize, generationConfig.beamWidth, generationBatchId)) + { + mLogger->log(nvinfer1::ILogger::Severity::kVERBOSE, + tc::fmtstr("GPT decoding finished for step %d and microBatchId %d", step, generationBatchId).c_str()); + microBatchesFinished.at(generationBatchId) = true; + numBatchesFinished += 1; + continue; + } + + if (useCudaGraphs()) + { + auto& cudaGraphInstance = mCudaGraphInstances.at(contextId); + TLLM_CHECK(cudaGraphInstance.hasInstance()); + cudaGraphInstance.launch(mRuntime->getStream()); + } + else + { + TLLM_CHECK_WITH_INFO( + mRuntime->executeContext(contextId), tc::fmtstr("Executing TRT engine in step %d failed!", step)); + } + sync_check_cuda_error(); + + std::swap(buffers.cacheIndirectionDecoderInput, buffers.cacheIndirectionDecoderOutput); + + auto const decoderStep = generationConfig.maxInputLength + step; + decoderStepAsync(decoderStep, generationBatchId); + } + + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + return numBatchesFinished; +} + void GptSession::decoderStepAsync(SizeType decoderStep, SizeType microBatchId) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -662,7 +792,7 @@ void GptSession::decoderStepAsync(SizeType decoderStep, SizeType microBatchId) mCommStream->record(mReceivedEvents.at(microBatchId).get()); } - if (!mWorldConfig.isPipelineParallel() && mNumMicroBatches > 1) + if (!mWorldConfig.isPipelineParallel() && mMicroBatchConfig.numGenBatches > 1) { updateOutputIds(outputIds, newTokens, decoderStep, stream); stream.record(mReceivedEvents.at(microBatchId).get()); @@ -684,7 +814,7 @@ bool GptSession::shouldStopSync(SizeType batchSize, SizeType beamWidth, SizeType decoder.forwardSync(); nbFinished = *bufferCast(*decoder.getNbFinished()); - if (!mWorldConfig.isPipelineParallel() && mNumMicroBatches > 1) + if (!mWorldConfig.isPipelineParallel() && mMicroBatchConfig.numGenBatches > 1) { // ensure outputIds have been updated mReceivedEvents.at(microBatchId).synchronize(); diff --git a/cpp/tensorrt_llm/runtime/memoryCounters.cpp b/cpp/tensorrt_llm/runtime/memoryCounters.cpp index 7f741ab60a..48e6d3a277 100644 --- a/cpp/tensorrt_llm/runtime/memoryCounters.cpp +++ b/cpp/tensorrt_llm/runtime/memoryCounters.cpp @@ -57,6 +57,12 @@ std::string MemoryCounters::bytesToString(DiffType bytes, int precision) return doubleBytesToString(static_cast(bytes), precision); } +std::string MemoryCounters::toString() const +{ + return tensorrt_llm::common::fmtstr("[MemUsage] GPU %s, CPU %s, Pinned %s", bytesToString(this->getGpu()).c_str(), + bytesToString(this->getCpu()).c_str(), bytesToString(this->getPinned()).c_str()); +} + void MemoryCounters::allocate(MemoryType memoryType, MemoryCounters::SizeType size) { switch (memoryType) diff --git a/cpp/tensorrt_llm/runtime/promptTuningParams.cpp b/cpp/tensorrt_llm/runtime/promptTuningParams.cpp new file mode 100644 index 0000000000..60074ad025 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/promptTuningParams.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/runtime/promptTuningParams.h" + +namespace tensorrt_llm::runtime +{ + +void PromptTuningParams::fillTasksTensor(TensorPtr tasksHost, const SizeType batchSize, + const SizeType numContextRequests, const std::vector& reqBeamWidths, + const std::vector& reqPromptLengths, BufferManager& manager, bool packedInput) +{ + auto const& tasksHostShape = tasksHost->getShape(); + TLLM_CHECK_WITH_INFO(tasksHostShape.nbDims == 1, "tasksHost expected to have dimension [batchSize]"); + TLLM_CHECK_WITH_INFO(tasksHostShape.d[0] == batchSize, "tasksHost expected to have dimension [batchSize]"); + + auto const tasksHostPtr = bufferCast(*tasksHost); + + bool validInput = packedInput || numContextRequests == batchSize || numContextRequests == 0; + TLLM_CHECK_WITH_INFO(validInput, + "fillTasksTensor function with packed inputs must be called with only context requests or only generation " + "requests."); + + bool validShapes = (static_cast(reqBeamWidths.size()) == batchSize + && static_cast(reqPromptLengths.size()) == numContextRequests + && static_cast(promptTuningEnabled.size()) == batchSize); + TLLM_CHECK_WITH_INFO(validShapes, + "Invalid inputs to fillTasksTensor function. reqBeamWidths and reqPtuningEnabled size must be batchSize and " + "propmtLenghts size must be numContextRequests"); + + SizeType totalInputSize = 0; + std::vector promptTasksHost; + for (SizeType bid = 0; bid < batchSize; bid++) + { + SizeType taskId = promptTuningEnabled[bid] ? 
tasksHostPtr[bid] : 0; + if (packedInput) + { + if (bid < numContextRequests) + { + totalInputSize += reqPromptLengths[bid]; + promptTasksHost.insert(promptTasksHost.end(), reqPromptLengths[bid], taskId); + } + else + { + for (SizeType beam = 0; beam < reqBeamWidths[bid]; ++beam) + { + promptTasksHost.insert(promptTasksHost.end(), 1, taskId); + totalInputSize++; + } + } + } + else + { + if (bid < numContextRequests) + { + promptTasksHost.push_back(taskId); + ++totalInputSize; + } + else + { + promptTasksHost.insert(promptTasksHost.end(), reqBeamWidths[bid], taskId); + totalInputSize += reqBeamWidths[bid]; + } + } + } + + if (packedInput) + { + tasks = manager.copyFrom( + promptTasksHost, runtime::ITensor::makeShape({1, totalInputSize}), runtime::MemoryType::kGPU); + } + else + { + tasks = manager.copyFrom( + promptTasksHost, runtime::ITensor::makeShape({totalInputSize, 1}), runtime::MemoryType::kGPU); + } +} + +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp index 31b64a20df..6d7ab16d72 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.cpp @@ -16,7 +16,6 @@ #include "tensorrt_llm/runtime/runtimeBuffers.h" -#include "ipcUtils.h" #include "tensorrt_llm/batch_manager/kvCacheManager.h" #include "tensorrt_llm/common/stlUtils.h" #include "tensorrt_llm/runtime/runtimeKernels.h" @@ -30,8 +29,7 @@ using namespace tensorrt_llm::runtime; namespace tc = tensorrt_llm::common; RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITensor const& inputIds, - ITensor const& inputLengthsHost, bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength, - std::optional const& maxNewTokensOpt) + ITensor const& inputLengthsHost, bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const batchSize = static_cast(inputLengthsHost.getSize()); @@ -54,13 +52,12 @@ RuntimeBuffers::GenerationConfig RuntimeBuffers::GenerationConfig::fromInput(ITe maxInputLength = inputShape.d[1]; } - auto const maxNewTokens = maxNewTokensOpt.value_or(maxSequenceLength - maxInputLength); - TLLM_CHECK_WITH_INFO(1 <= maxNewTokens && maxNewTokens <= maxSequenceLength - maxInputLength, + TLLM_CHECK_WITH_INFO(maxInputLength < maxSequenceLength, "Max input length is equal to or larger that maxSequenceLength given in setup. 
No new tokens can be " "generated."); TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); - return GenerationConfig{batchSize, beamWidth, maxInputLength, maxNewTokens, maxSequenceLength, inputLengthSum}; + return GenerationConfig{batchSize, beamWidth, maxInputLength, maxSequenceLength, inputLengthSum}; } void RuntimeBuffers::clear() @@ -91,6 +88,16 @@ void RuntimeBuffers::clear() TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } +void RuntimeBuffers::clearTensorMaps() +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + for (auto& buffer : inputBuffers) + buffer.clear(); + for (auto& buffer : outputBuffers) + buffer.clear(); + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -171,41 +178,19 @@ void RuntimeBuffers::create(TllmRuntime& runtime, GptModelConfig const& modelCon TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::initContextLengths(TensorPtr const& inputLengths, BufferManager& manager) +void RuntimeBuffers::initFromInput(ITensor const& inputIds, TensorPtr const& inputLengths, bool inputPacked, + SizeType beamWidth, SizeType maxSequenceLength, BufferManager& manager) { contextLengthsDevice = inputLengths; contextLengthsHost->reshape(inputLengths->getShape()); manager.copy(*contextLengthsDevice, *contextLengthsHost); manager.getStream().synchronize(); // wait for context lengths to be copied to host + + generationConfig = RuntimeBuffers::GenerationConfig::fromInput( + inputIds, *contextLengthsHost, inputPacked, beamWidth, maxSequenceLength); } -void RuntimeBuffers::createCustomAllReduceWorkspace(SizeType maxBatchSize, SizeType maxBeamWidth, - SizeType maxSequenceLength, SizeType hiddenSize, WorldConfig const& worldConfig, BufferManager& manager) -{ - mIpcMemoryHandles.clear(); - const std::size_t bufferSize = static_cast(maxBatchSize) * maxBeamWidth * maxSequenceLength - * hiddenSize * worldConfig.getTensorParallelism() * sizeof(float); - mIpcMemoryHandles.emplace_back(std::make_shared(worldConfig, bufferSize)); - mIpcMemoryHandles.emplace_back(std::make_shared(worldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); - mIpcMemoryHandles.emplace_back(std::make_shared(worldConfig, IpcMemory::FLAGS_SIZE * sizeof(int32_t))); - - commPtrs = manager.cpu( - ITensor::makeShape({static_cast(mIpcMemoryHandles.size()) * worldConfig.getTensorParallelism()}), - nvinfer1::DataType::kINT64); - const auto commPtrsData = bufferCast(*commPtrs); - - for (size_t memIdx = 0; memIdx < mIpcMemoryHandles.size(); memIdx++) - { - const auto& memCommPtrs = mIpcMemoryHandles[memIdx]->getCommPtrsTensor(); - for (SizeType tpIdx = 0; tpIdx < worldConfig.getTensorParallelism(); tpIdx++) - { - commPtrsData[memIdx * worldConfig.getTensorParallelism() + tpIdx] = memCommPtrs[tpIdx]; - } - } -} - -void RuntimeBuffers::reshape( - GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +void RuntimeBuffers::reshape(GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -222,8 +207,10 @@ void RuntimeBuffers::reshape( lastTokenIds->reshape(ITensor::makeShape({batchSize})); - auto kvCacheShape + auto kvCacheReserve = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), maxSeqLength, modelConfig.getSizePerHead()}); + auto kvCacheShape + = ITensor::makeShape({batchSize, 2, modelConfig.getNbKvHeads(), 
maxInputLength, modelConfig.getSizePerHead()}); if (modelConfig.usePagedKvCache()) { auto const localNbLayers = modelConfig.getNbLayers(worldConfig.getPipelineParallelism()); @@ -240,7 +227,7 @@ void RuntimeBuffers::reshape( } else { - utils::reshapeBufferVector(presentKeysVals, kvCacheShape); + utils::reshapeBufferVector(presentKeysVals, kvCacheReserve); } if (modelConfig.useGptAttentionPlugin()) @@ -250,7 +237,10 @@ void RuntimeBuffers::reshape( } else { - utils::reshapeBufferVector(presentKeysValsAlt, kvCacheShape); + utils::reshapeBufferVector(presentKeysValsAlt, kvCacheReserve); + // present KV cache tensors will be reshaped by shape inference. + // reshape to the required shape here to make context batch slicing work correctly. + utils::reshapeBufferVector(presentKeysVals, kvCacheShape); } auto const cacheIndirShape = ITensor::makeShape({batchSize, beamWidth, maxSeqLength}); @@ -260,9 +250,9 @@ void RuntimeBuffers::reshape( if (worldConfig.isPipelineParallel()) { // reserve max size - auto const maxNumTokens = std::max(batchSize * beamWidth, batchSize * maxInputLength); + auto const maxNumTokens = std::max(beamWidth, maxInputLength); auto const hiddenSize = modelConfig.getHiddenSize() * worldConfig.getTensorParallelism(); - auto const hiddenStatesShape = ITensor::makeShape({1, maxNumTokens, hiddenSize}); + auto const hiddenStatesShape = ITensor::makeShape({batchSize, maxNumTokens, hiddenSize}); hiddenStates->reshape(hiddenStatesShape); } @@ -270,8 +260,104 @@ void RuntimeBuffers::reshape( TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::gatherLastTokenLogits(BufferManager& manager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +void RuntimeBuffers::reset(BufferManager& manager) +{ + clearTensorMaps(); + manager.setZero(*cacheIndirectionDecoderInput); + manager.setZero(*cacheIndirectionDecoderOutput); +} + +std::vector RuntimeBuffers::split( + SizeType contextBatchSize, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + + std::vector bufferSlices; + auto const generationBatchSize = generationConfig.batchSize; + bufferSlices.reserve(tc::ceilDiv(generationBatchSize, contextBatchSize)); + if (contextBatchSize >= generationBatchSize) + { + bufferSlices.emplace_back(*this); + } + else + { + for (auto offset = 0; offset < generationBatchSize; offset += contextBatchSize) + { + auto const batchSize = std::min(contextBatchSize, generationBatchSize - offset); + auto& buffers = bufferSlices.emplace_back(); + buffers.generationConfig = generationConfig; + buffers.generationConfig.batchSize = batchSize; + + buffers.contextLengthsHost = ITensor::slice(contextLengthsHost, offset, batchSize); + buffers.contextLengthsDevice = ITensor::slice(contextLengthsDevice, offset, batchSize); + + if (worldConfig.isLastPipelineParallelRank() && !modelConfig.computeContextLogits()) + { + buffers.logits = ITensor::slice(logits, offset, batchSize); + } + + buffers.lastTokenIds = ITensor::slice(lastTokenIds, offset, batchSize); + + if (modelConfig.usePagedKvCache()) + { + auto const& realCacheBlockPointersShape = kvCacheBlockPointersHost->getShape(); + auto const localNbLayers = realCacheBlockPointersShape.d[0]; + auto const maxBlocksPerSeq = realCacheBlockPointersShape.d[3]; + + // enable slicing by moving generationBatchSize to first dim + auto const fakeCacheBlockPointersShape + = ITensor::makeShape({generationBatchSize, localNbLayers, 2, maxBlocksPerSeq}); 
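                // The "fake" shape above only moves the batch extent into dim 0 so that the views
                // created below can be sliced per context batch with ITensor::slice; each slice is
                // then reshaped back to the regular {localNbLayers, batchSize, 2, maxBlocksPerSeq}
                // layout.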
+ TensorPtr kvCacheBlockPointersHostView{ + ITensor::view(kvCacheBlockPointersHost, fakeCacheBlockPointersShape)}; + TensorPtr kvCacheBlockPointersDeviceView{ + ITensor::view(kvCacheBlockPointersDevice, fakeCacheBlockPointersShape)}; + + // slice and reshape to correct shape + auto const cacheBlockPointersShape = ITensor::makeShape({localNbLayers, batchSize, 2, maxBlocksPerSeq}); + buffers.kvCacheBlockPointersHost = ITensor::slice(kvCacheBlockPointersHostView, offset, batchSize); + buffers.kvCacheBlockPointersHost->reshape(cacheBlockPointersShape); + buffers.kvCacheBlockPointersDevice = ITensor::slice(kvCacheBlockPointersDeviceView, offset, batchSize); + buffers.kvCacheBlockPointersDevice->reshape(cacheBlockPointersShape); + } + else + { + buffers.presentKeysVals = utils::sliceBufferVector(presentKeysVals, offset, batchSize); + } + + if (modelConfig.useGptAttentionPlugin()) + { + buffers.pastKeyValueLengths = ITensor::slice(pastKeyValueLengths, offset, batchSize); + buffers.requestTypes = ITensor::slice(requestTypes, offset, batchSize); + } + else + { + buffers.presentKeysValsAlt = utils::sliceBufferVector(presentKeysValsAlt, offset, batchSize); + } + + if (worldConfig.isPipelineParallel()) + { + buffers.hiddenStates = ITensor::slice(hiddenStates, offset, batchSize); + } + + buffers.cacheIndirectionDecoderOutput = ITensor::slice(cacheIndirectionDecoderOutput, offset, batchSize); + + if (modelConfig.usePromptTuning()) + { + auto const& ptuningEnabled = promptTuningParams.promptTuningEnabled; + buffers.promptTuningParams.promptTuningEnabled + = std::vector(ptuningEnabled.begin() + offset, ptuningEnabled.begin() + offset + batchSize); + + buffers.promptTuningParams.tasks = ITensor::slice(promptTuningParams.tasks, offset, batchSize); + } + } + } + + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); + return bufferSlices; +} + +void RuntimeBuffers::gatherLastTokenLogits( + BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); TLLM_CHECK_WITH_INFO(modelConfig.computeContextLogits(), @@ -294,8 +380,29 @@ void RuntimeBuffers::gatherLastTokenLogits(BufferManager& manager, GenerationCon TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::tile(BufferManager& manager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) +void RuntimeBuffers::copyAttentionMasks(std::vector const& contextBatches, BufferManager& manager) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + auto const batchSize = generationConfig.batchSize; + auto const maxInputLength = generationConfig.maxInputLength; + + // TODO(rkobus) include tiling + attentionMask = manager.gpu(ITensor::makeShape({batchSize, maxInputLength}), nvinfer1::DataType::kINT32); + + auto const numContextBatches = static_cast(contextBatches.size()); + auto offset = 0; + for (auto contextBatchId = 0; contextBatchId < numContextBatches; ++contextBatchId) + { + auto& buffers = contextBatches.at(contextBatchId); + auto contextBatchSize = buffers.generationConfig.batchSize; + auto attentionMaskSlice = ITensor::slice(attentionMask, offset, contextBatchSize); + manager.copy(*buffers.attentionMask, *attentionMaskSlice); + offset += contextBatchSize; + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + +void RuntimeBuffers::tile(BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto const beamWidth = 
generationConfig.beamWidth; @@ -333,7 +440,7 @@ void RuntimeBuffers::tile(BufferManager& manager, GenerationConfig const& genera TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, +void RuntimeBuffers::postContextStep(std::vector const& contextBuffers, BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -346,15 +453,22 @@ void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig co auto hostRequestTypes = bufferCast(*requestTypes); std::fill_n(hostRequestTypes, requestTypes->getSize(), 1); } + else + { + copyAttentionMasks(contextBuffers, manager); + } + + // TODO(rkobus) handle this more gracefully + positionIds = manager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32); if (modelConfig.computeContextLogits()) { - gatherLastTokenLogits(manager, generationConfig, modelConfig, worldConfig); + gatherLastTokenLogits(manager, modelConfig, worldConfig); } if (beamWidth > 1) { - tile(manager, generationConfig, modelConfig, worldConfig); + tile(manager, modelConfig, worldConfig); } // use output lengths after context step @@ -371,12 +485,25 @@ void RuntimeBuffers::postContextStep(BufferManager& manager, GenerationConfig co kvCacheBlockPointersHost->reshape(cacheBlockPointersShape); kvCacheBlockPointersDevice->reshape(cacheBlockPointersShape); } + + if (modelConfig.usePromptTuning()) + { + std::vector reqBeamWidths(batchSize, beamWidth); + //// Note: reqPromptLenghts won't be used + std::vector reqPromptLengths; + // Copy the generationInput tasks to host + promptTuningTasksHost = manager.copyFrom(*promptTuningParams.tasks, MemoryType::kPINNED); + // Update the promptTuningParams tasks tensor + promptTuningParams.fillTasksTensor(promptTuningTasksHost, batchSize, 0, reqBeamWidths, reqPromptLengths, + manager, modelConfig.usePackedInput()); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType const padId, BufferManager& manager, - KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) + KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); @@ -391,12 +518,10 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c auto pastKeyValueLengthsPtr = bufferCast(*pastKeyValueLengths); TLLM_CHECK(pastKeyValueLengths->getSize() == static_cast(batchSize)); std::fill_n(pastKeyValueLengthsPtr, batchSize, 0); - if (modelConfig.useGptAttentionPlugin()) - { - auto RequestTypesPtr = bufferCast(*requestTypes); - TLLM_CHECK(requestTypes->getSize() == static_cast(batchSize)); - std::fill_n(RequestTypesPtr, batchSize, 0); - } + + auto RequestTypesPtr = bufferCast(*requestTypes); + TLLM_CHECK(requestTypes->getSize() == static_cast(batchSize)); + std::fill_n(RequestTypesPtr, batchSize, 0); auto const& inputShape = inputIds->getShape(); auto const contextLengthsHostPtr = bufferCast(*contextLengthsHost); @@ -417,10 +542,19 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c } else if (modelVariant == GptModelConfig::ModelVariant::kGlm) { - auto const positionIdsVec = 
getPositionIdsContextPhaseGlm( - batchSize, maxInputLength, contextLengthsHostPtr, modelConfig.useGptAttentionPlugin()); - auto const positionIdsShape = ITensor::makeShape({batchSize, 2, maxInputLength}); - positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + auto const positionIdsVec = getPositionIdsContextPhaseGlm(batchSize, maxInputLength, contextLengthsHostPtr, + modelConfig.useGptAttentionPlugin(), modelConfig.usePackedInput()); + if (modelConfig.usePackedInput()) + { + int num_tokens = (int) positionIdsVec.size() / 2; + auto const positionIdsShape = ITensor::makeShape({1, 2, num_tokens}); + positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } + else + { + auto const positionIdsShape = ITensor::makeShape({batchSize, 2, maxInputLength}); + positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } } else { @@ -433,6 +567,23 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c auto const hiddenStatesShape = ITensor::makeShape({inputShape.d[0], inputShape.d[1], hiddenSize}); hiddenStates->reshape(hiddenStatesShape); } + + if (modelConfig.usePromptTuning()) + { + std::vector reqBeamWidths(batchSize, 1); + std::vector reqPromptLengths; + for (SizeType i = 0; i < batchSize; ++i) + { + reqPromptLengths.push_back(contextLengthsHostPtr[i]); + } + + // Copy the generationInput tasks to host + promptTuningTasksHost = manager.copyFrom(*promptTuningParams.tasks, MemoryType::kPINNED); + + // Update the tasks tensor + promptTuningParams.fillTasksTensor(promptTuningTasksHost, batchSize, batchSize, reqBeamWidths, + reqPromptLengths, manager, modelConfig.usePackedInput()); + } } else { @@ -470,14 +621,12 @@ void RuntimeBuffers::prepareContextStep(TensorPtr const& inputIds, TokenIdType c manager.copy(*contextLengthsDevice, *lastTokenIds); } - manager.setZero(*cacheIndirectionDecoderInput); - manager.setZero(*cacheIndirectionDecoderOutput); TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, BufferManager& manager, - KvCacheManager* kvCacheManager, SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) + KvCacheManager* kvCacheManager, SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); auto& stream = manager.getStream(); @@ -519,10 +668,18 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, B } else if (modelVariant == GptModelConfig::ModelVariant::kGlm) { - auto const positionIdsVec = getPositionIdsGenerationPhaseGlm( - batchSize, beamWidth, step, contextLengthsHostPtr, modelConfig.useGptAttentionPlugin()); - auto const positionIdsShape = ITensor::makeShape({batchSize * beamWidth, 2, 1}); - positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + auto const positionIdsVec = getPositionIdsGenerationPhaseGlm(batchSize, beamWidth, step, + contextLengthsHostPtr, modelConfig.useGptAttentionPlugin(), modelConfig.usePackedInput()); + if (modelConfig.usePackedInput()) + { + auto const positionIdsShape = ITensor::makeShape({1, 2, batchSize * beamWidth}); + positionIds = manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } + else + { + auto const positionIdsShape = ITensor::makeShape({batchSize * beamWidth, 2, 1}); + positionIds = 
manager.copyFrom(positionIdsVec, positionIdsShape, MemoryType::kGPU); + } } else { @@ -538,7 +695,7 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, B } else { - auto const shape = attentionMask->getShape(); + auto const& shape = attentionMask->getShape(); auto const nbInputs = shape.d[0]; auto const oldLength = shape.d[1]; auto const newLength = oldLength + 1; @@ -583,13 +740,13 @@ RuntimeBuffers::TensorPtr RuntimeBuffers::prepareNextStep(SizeType const step, B { kernels::invokeInclusiveSum(*lastTokenIds, *lastTokenIds, manager, stream); } - TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); return nextInputIds; } void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType const step, - TensorPtr const& inputIds, GptModelConfig const& modelConfig, WorldConfig const& worldConfig) const + TensorPtr const& inputIds, TensorPtr const& commPtrs, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) const { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); inputBuffers.clear(); @@ -676,49 +833,110 @@ void RuntimeBuffers::getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outpu { inputBuffers.insert_or_assign("all_reduce_workspace", commPtrs); } + + if (modelConfig.usePromptTuning()) + { + inputBuffers.insert_or_assign("prompt_embedding_table", promptTuningParams.embeddingTable); + inputBuffers.insert_or_assign("tasks", promptTuningParams.tasks); + inputBuffers.insert_or_assign("prompt_vocab_size", promptTuningParams.vocabSize); + } TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -std::vector RuntimeBuffers::getPositionIdsContextPhaseGlm( - SizeType batchSize, SizeType maxInputLength, SizeType const* pInputLengths, bool useGptAttentionPlugin) +std::vector RuntimeBuffers::getPositionIdsContextPhaseGlm(const SizeType& batchSize, + const SizeType& maxInputLength, const SizeType* pInputLengths, bool useGptAttentionPlugin, bool usePackedInput) { TLLM_CHECK(pInputLengths != nullptr); - auto const size = batchSize * 2 * maxInputLength; - std::vector positionIdsVec(size, 0); - - for (SizeType b = 0; b < batchSize; ++b) + std::vector positionIdsVec(1, 0); + if (useGptAttentionPlugin) { - auto* pIdB = positionIdsVec.data() + b * 2 * maxInputLength; - auto const length = pInputLengths[b]; - std::iota(pIdB, pIdB + length, 0); - - pIdB[length - 1] = length - 2; - pIdB[length - 1 + maxInputLength] = 1; - } - - return positionIdsVec; -} - -std::vector RuntimeBuffers::getPositionIdsGenerationPhaseGlm( - SizeType batchSize, SizeType beamSize, SizeType step, SizeType const* pInputLengths, bool useGptAttentionPlugin) -{ - TLLM_CHECK(pInputLengths != nullptr); - - auto const size = batchSize * beamSize * 2; - std::vector positionIdsVec(size, 0); - - for (SizeType b = 0; b < batchSize; ++b) - { - auto* pIdB = positionIdsVec.data() + b * beamSize * 2; - auto const length = pInputLengths[b * beamSize]; - - for (SizeType bm = 0; bm < beamSize; ++bm) + if (usePackedInput) { - pIdB[bm * 2 + 0] = length - 2; - pIdB[bm * 2 + 1] = step + 2; + std::vector pInputLengthsAcc = std::vector(batchSize + 1, 0); + for (int i = 0; i < batchSize; ++i) + { + pInputLengthsAcc[i + 1] = pInputLengthsAcc[i] + pInputLengths[i]; + } + + auto const size = 1 * 2 * pInputLengthsAcc[batchSize]; + positionIdsVec.resize(size, 0); + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + pInputLengthsAcc[b]; + auto const length = pInputLengths[b]; + std::iota(pIdB, pIdB + length, 0); + + pIdB[length - 1] = length - 2; + pIdB[length - 1 
+ pInputLengthsAcc[batchSize]] = 1; + } + } + else + { + auto const size = batchSize * 2 * maxInputLength; + positionIdsVec.resize(size, 0); + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + b * 2 * maxInputLength; + auto const length = pInputLengths[b]; + std::iota(pIdB, pIdB + length, 0); + + pIdB[length - 1] = length - 2; + pIdB[length - 1 + maxInputLength] = 1; + } } } + else + { + TLLM_THROW("Unsupported model without GPT Attention Plugin"); + } + + return positionIdsVec; +} + +std::vector RuntimeBuffers::getPositionIdsGenerationPhaseGlm(const SizeType& batchSize, + const SizeType& beamSize, const SizeType& step, const SizeType* pInputLengths, bool useGptAttentionPlugin, + bool usePackedInput) +{ + TLLM_CHECK(pInputLengths != nullptr); + + auto const size = 2 * batchSize * beamSize; + std::vector positionIdsVec(size, 0); + if (useGptAttentionPlugin) + { + if (usePackedInput) + { + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + b * beamSize * 2; + auto const length = pInputLengths[b * beamSize]; + + for (SizeType bm = 0; bm < beamSize; ++bm) + { + pIdB[bm * 2 + 0] = length - 2; + pIdB[bm * 2 + 1] = step + 2; + } + } + } + else + { + for (SizeType b = 0; b < batchSize; ++b) + { + auto* pIdB = positionIdsVec.data() + b * beamSize * 2; + auto const length = pInputLengths[b * beamSize]; + + for (SizeType bm = 0; bm < beamSize; ++bm) + { + pIdB[bm * 2 + 0] = length - 2; + pIdB[bm * 2 + 1] = step + 2; + } + } + } + } + else + { + TLLM_THROW("Unsupported model without GPT Attention Plugin"); + } return positionIdsVec; } diff --git a/cpp/tensorrt_llm/runtime/runtimeBuffers.h b/cpp/tensorrt_llm/runtime/runtimeBuffers.h index 96b31143d4..72b59ef364 100644 --- a/cpp/tensorrt_llm/runtime/runtimeBuffers.h +++ b/cpp/tensorrt_llm/runtime/runtimeBuffers.h @@ -19,8 +19,12 @@ #include "tensorrt_llm/runtime/bufferManager.h" #include "tensorrt_llm/runtime/gptModelConfig.h" #include "tensorrt_llm/runtime/iTensor.h" +#include "tensorrt_llm/runtime/promptTuningParams.h" #include "tensorrt_llm/runtime/worldConfig.h" +#include +#include + namespace tensorrt_llm::batch_manager::kv_cache_manager { class KVCacheManager; @@ -28,7 +32,6 @@ class KVCacheManager; namespace tensorrt_llm::runtime { -class IpcMemory; class TllmRuntime; class RuntimeBuffers @@ -40,11 +43,39 @@ protected: public: using TensorMap = StringPtrMap; + class GenerationConfig + { + public: + GenerationConfig() = default; + + explicit GenerationConfig(SizeType batchSize, SizeType beamWidth, SizeType maxInputLength, + SizeType maxSeqLength, SizeType inputLengthSum = SizeType(0)) + : batchSize{batchSize} + , beamWidth{beamWidth} + , maxInputLength{maxInputLength} + , maxSeqLength{maxSeqLength} + , inputLengthSum{inputLengthSum} + { + } + + SizeType batchSize{}; + SizeType beamWidth{}; + SizeType maxInputLength{}; + SizeType maxSeqLength{}; + SizeType inputLengthSum{}; // Initialized only if inputPacked is set to true in fromInput. 
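As a reading aid for the getPositionIdsContextPhaseGlm rewrite above: with packed input, the helper lays out two rows of position ids over the concatenated tokens, using a prefix sum of the per-sequence lengths to locate each sequence's offset. A minimal standalone sketch of that arithmetic, using plain std::vector<int32_t> in place of the runtime's SizeType buffers (the function name and types here are illustrative, not the patch's API):

```cpp
#include <cstdint>
#include <numeric>
#include <vector>

// Sketch of the ChatGLM context-phase position ids for packed input.
// Row 0 holds per-token positions (the last token repeats length - 2);
// row 1 is all zeros except a 1 on each sequence's last token.
// Assumes every length >= 2, as the original helper does.
std::vector<int32_t> glmPackedContextPositionIds(std::vector<int32_t> const& inputLengths)
{
    std::vector<int32_t> offsets(inputLengths.size() + 1, 0);
    for (std::size_t i = 0; i < inputLengths.size(); ++i)
    {
        offsets[i + 1] = offsets[i] + inputLengths[i]; // prefix sum over sequence lengths
    }

    auto const totalTokens = offsets.back();
    std::vector<int32_t> positionIds(2 * totalTokens, 0);

    for (std::size_t b = 0; b < inputLengths.size(); ++b)
    {
        auto* row0 = positionIds.data() + offsets[b];
        auto const length = inputLengths[b];
        std::iota(row0, row0 + length, 0);
        row0[length - 1] = length - 2;
        row0[length - 1 + totalTokens] = 1; // writes into row 1 of the flattened layout
    }
    return positionIds;
}
```

For lengths {3, 5} this produces row 0 = [0 1 1 | 0 1 2 3 3] and row 1 = [0 0 1 | 0 0 0 0 1], which matches the non-packed branch once the per-sequence padding is dropped.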
+ + static GenerationConfig fromInput(ITensor const& inputIds, ITensor const& inputLengths, bool inputPacked, + SizeType beamWidth, SizeType maxSequenceLength); + }; + +public: + GenerationConfig generationConfig{}; + std::array inputBuffers{}; + std::array outputBuffers{}; + // general TensorPtr contextLengthsHost; TensorPtr contextLengthsDevice; - TensorPtr inputOffsets; // helper for packed input - TensorPtr kvCacheBlockPointersHost; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] // engine TensorPtr logits; @@ -57,6 +88,7 @@ public: std::vector presentKeysVals; std::vector presentKeysValsAlt; // without attention plugin + TensorPtr kvCacheBlockPointersHost; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] TensorPtr kvCacheBlockPointersDevice; // [numLayers, batchSize * beamWidth, 2, maxBlocksPerSeq * 2] // References to tmp buffers @@ -74,82 +106,58 @@ public: // pipeline parallelism TensorPtr hiddenStates; - // tensor parallelism - TensorPtr commPtrs; + // Prompt tuning + PromptTuningParams promptTuningParams; + TensorPtr promptTuningTasksHost; // Tensor to hold tasks on host bool allocated{false}; -private: - std::vector> mIpcMemoryHandles; - -public: - class GenerationConfig - { - public: - GenerationConfig() = default; - - GenerationConfig(SizeType batchSize, SizeType beamWidth, SizeType maxInputLength, SizeType maxNewTokens, - SizeType maxSeqLength, SizeType inputLengthSum = SizeType(0)) - : batchSize{batchSize} - , beamWidth{beamWidth} - , maxInputLength{maxInputLength} - , maxNewTokens{maxNewTokens} - , maxSeqLength{maxSeqLength} - , inputLengthSum{inputLengthSum} - { - } - - SizeType batchSize{}; - SizeType beamWidth{}; - SizeType maxInputLength{}; - SizeType maxNewTokens{}; - SizeType maxSeqLength{}; - SizeType inputLengthSum{}; // Initialized only if inputPacked is set to true in fromInput. - - static RuntimeBuffers::GenerationConfig fromInput(ITensor const& inputIds, ITensor const& inputLengths, - bool const inputPacked, SizeType const beamWidth, SizeType const maxSequenceLength, - std::optional const& maxNewTokensOpt); - }; - public: void clear(); + void clearTensorMaps(); void create(TllmRuntime& runtime, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void initContextLengths(TensorPtr const& inputLengths, BufferManager& manager); + void initFromInput(ITensor const& inputIds, TensorPtr const& inputLengths, bool inputPacked, SizeType beamWidth, + SizeType maxSequenceLength, BufferManager& manager); - void reshape( - GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + //! 
\brief Reshape buffers based on current GenerationConfig + void reshape(GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void postContextStep(BufferManager& manager, GenerationConfig const& generationConfig, + void reset(BufferManager& manager); + + std::vector split( + SizeType contextBatchSize, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + + void postContextStep(std::vector const& contextBuffers, BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); void prepareContextStep(TensorPtr const& inputIds, TokenIdType padId, BufferManager& manager, - KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - TensorPtr prepareNextStep(SizeType step, BufferManager& manager, KvCacheManager* kvCacheManager, - SizeType firstBatchSlotIdx, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, + KvCacheManager const* kvCacheManager, SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + TensorPtr prepareNextStep(SizeType step, BufferManager& manager, KvCacheManager* kvCacheManager, + SizeType firstBatchSlotIdx, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - void getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType step, TensorPtr const& inputIds, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig) const; - - void createCustomAllReduceWorkspace(SizeType maxBatchSize, SizeType maxBeamWidth, SizeType maxSequenceLength, - SizeType hiddenSize, WorldConfig const& worldConfig, BufferManager& manager); + void getRuntimeBuffers(TensorMap& inputBuffers, TensorMap& outputBuffers, SizeType const step, + TensorPtr const& inputIds, TensorPtr const& commPtrs, GptModelConfig const& modelConfig, + WorldConfig const& worldConfig) const; private: - void gatherLastTokenLogits(BufferManager& manager, GenerationConfig const& generationConfig, - GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + void gatherLastTokenLogits( + BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); + + void copyAttentionMasks(std::vector const& contextBatches, BufferManager& manager); // Some tensors are properly tiled, some are just reshaped. 
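The split / postContextStep pair declared above lets the session run the context phase in micro-batches of at most contextBatchSize requests. The actual implementation slices every runtime buffer; the batching arithmetic underneath reduces to offset/size pairs over the batch dimension, sketched below with a hypothetical name (makeContextMicroBatches is not part of the patch):

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical helper: chop `batchSize` requests into context micro-batches of at
// most `contextBatchSize`, expressed as (offset, size) pairs over the batch dimension.
std::vector<std::pair<int, int>> makeContextMicroBatches(int batchSize, int contextBatchSize)
{
    std::vector<std::pair<int, int>> slots;
    for (int offset = 0; offset < batchSize; offset += contextBatchSize)
    {
        slots.emplace_back(offset, std::min(contextBatchSize, batchSize - offset));
    }
    return slots;
}
```

Each (offset, size) pair corresponds to an ITensor::slice(buffer, offset, size) view, which is also the pattern the sliceBufferVector helper added to sessionUtils later in this patch applies across whole vectors of buffers.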
- void tile(BufferManager& manager, GenerationConfig const& generationConfig, GptModelConfig const& modelConfig, - WorldConfig const& worldConfig); + void tile(BufferManager& manager, GptModelConfig const& modelConfig, WorldConfig const& worldConfig); - static std::vector getPositionIdsContextPhaseGlm( - SizeType batchSize, SizeType maxInputLength, SizeType const* pInputLengths, bool useGptAttentionPlugin); + static std::vector getPositionIdsContextPhaseGlm(const SizeType& batchSize, + const SizeType& maxInputLength, const SizeType* pInputLengths, const bool useGptAttentionPlugin, + const bool usePackedInput); - static std::vector getPositionIdsGenerationPhaseGlm(SizeType batchSize, SizeType beamSize, SizeType step, - SizeType const* pInputLengths, bool useGptAttentionPlugin); + static std::vector getPositionIdsGenerationPhaseGlm(const SizeType& batchSize, const SizeType& beamSize, + const SizeType& step, const SizeType* pInputLengths, const bool useGptAttentionPlugin, + const bool usePackedInput); }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.cu b/cpp/tensorrt_llm/runtime/runtimeKernels.cu index 068fba246d..45da61f20f 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.cu +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.cu @@ -747,6 +747,24 @@ void invokeCopyPackedInputToOutput(ITensor& outputIds, ITensor const& inputIds, maxInputLength, maxSeqLength); } +void initOutputIds(ITensor& outputIds, ITensor const& inputIds, ITensor const& inputLengths, + ITensor const& inputOffsets, TokenIdType const padId, TokenIdType const endId, SizeType const maxInputLength, + bool const inputPacked, CudaStream const& stream) +{ + TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); + kernels::invokeFill(outputIds, endId, stream); + + if (inputPacked) + { + kernels::invokeCopyPackedInputToOutput(outputIds, inputIds, inputOffsets, maxInputLength, padId, stream); + } + else + { + kernels::invokeCopyInputToOutput(outputIds, inputIds, inputLengths, padId, stream); + } + TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); +} + namespace { template diff --git a/cpp/tensorrt_llm/runtime/runtimeKernels.h b/cpp/tensorrt_llm/runtime/runtimeKernels.h index de1af30274..8b08d68ca0 100644 --- a/cpp/tensorrt_llm/runtime/runtimeKernels.h +++ b/cpp/tensorrt_llm/runtime/runtimeKernels.h @@ -68,6 +68,10 @@ void invokeCopyInputToOutput( void invokeCopyPackedInputToOutput(ITensor& outputIds, ITensor const& inputIds, ITensor const& inputOffsets, SizeType maxInputLength, SizeType padId, CudaStream const& stream); +void initOutputIds(ITensor& outputIds, ITensor const& inputIds, ITensor const& inputLengths, + ITensor const& inputOffsets, TokenIdType padId, TokenIdType endId, SizeType maxInputLength, bool inputPacked, + CudaStream const& stream); + void scatterTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream); void tileTensor(ITensor& output, ITensor const& input, SizeType beamWidth, CudaStream const& stream); diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp index 0117a37008..fb438ab934 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.cpp @@ -114,32 +114,10 @@ void StatefulGptDecoder::reshapeBuffers(SizeType batchSize, SizeType beamWidth, dOutput.beamHypotheses.release(); } - mMaxNewTokens = 0; mNbSteps = 0; TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); } -namespace -{ -void initOutputIds(TensorPtr const& outputIds, TensorPtr 
const& inputIds, TensorPtr const& inputLengths, - TensorPtr const& inputOffsets, SizeType const padId, SizeType const endId, SizeType const maxInputLength, - bool const inputPacked, CudaStream const& stream) -{ - TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); - kernels::invokeFill(*outputIds, endId, stream); - - if (inputPacked) - { - kernels::invokeCopyPackedInputToOutput(*outputIds, *inputIds, *inputOffsets, maxInputLength, padId, stream); - } - else - { - kernels::invokeCopyInputToOutput(*outputIds, *inputIds, *inputLengths, padId, stream); - } - TLLM_LOG_DEBUG("%s stop", __PRETTY_FUNCTION__); -} -} // namespace - void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig const& samplingConfig) { TLLM_LOG_DEBUG("%s start", __PRETTY_FUNCTION__); @@ -174,11 +152,6 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig kernels::invokeInclusiveSum(*ITensor::slice(inputOffsets, 1), *inputLengths, manager, *stream); } - mMaxNewTokens = inputs.maxNewTokens.value_or(mMaxSequenceLength - maxInputLength); - TLLM_CHECK_WITH_INFO(maxInputLength + mMaxNewTokens <= mMaxSequenceLength, - tc::fmtstr("Input length (%d) + max new tokens (%d) must be less than max sequence length (%d).", - maxInputLength, mMaxNewTokens, mMaxSequenceLength)); - TLLM_CHECK(inputIds->getDataType() == TRTDataType::value); auto const endId = inputs.endId; auto const padId = inputs.padId; @@ -191,9 +164,21 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig dInput.embeddingBias = inputs.embeddingBiasOpt; dInput.badWordsList = inputs.badWordsList; dInput.stopWordsList = inputs.stopWordsList; - kernels::invokeFill(const_cast(*dInput.sequenceLimitLength), mMaxSequenceLength, *stream); auto inputLengthsView = ITensor::view(dInput.lengths, ITensor::makeShape({batchSize * beamWidth})); kernels::tileTensor(const_cast(*inputLengthsView), *inputLengths, beamWidth, *stream); + if (inputs.maxNewTokens) + { + auto const maxNewTokens = inputs.maxNewTokens.value(); + TLLM_CHECK_WITH_INFO(maxInputLength + maxNewTokens <= mMaxSequenceLength, + tc::fmtstr("Input length (%d) + max new tokens (%d) must be less than max sequence length (%d).", + maxInputLength, maxNewTokens, mMaxSequenceLength)); + manager.copy(*inputLengths, const_cast(*dInput.sequenceLimitLength)); + kernels::invokeAdd(const_cast(*dInput.sequenceLimitLength), maxNewTokens, *stream); + } + else + { + kernels::invokeFill(const_cast(*dInput.sequenceLimitLength), mMaxSequenceLength, *stream); + } // output auto& dOutput = *mDecodingOutput; @@ -227,8 +212,8 @@ void StatefulGptDecoder::newBatch(GenerationInput const& inputs, SamplingConfig } // copy the request ids into dOutput.ids (with tiling) - initOutputIds( - dOutput.ids, inputIds, inputLengths, inputOffsets, padId, endId, maxInputLength, inputs.packed, *stream); + kernels::initOutputIds( + *dOutput.ids, *inputIds, *inputLengths, *inputOffsets, padId, endId, maxInputLength, inputs.packed, *stream); // remaining mNbSteps = 0; diff --git a/cpp/tensorrt_llm/runtime/statefulGptDecoder.h b/cpp/tensorrt_llm/runtime/statefulGptDecoder.h index 141d5b9a81..0276518ce5 100644 --- a/cpp/tensorrt_llm/runtime/statefulGptDecoder.h +++ b/cpp/tensorrt_llm/runtime/statefulGptDecoder.h @@ -90,6 +90,5 @@ private: SizeType mNbSteps; SizeType mMaxSequenceLength{}; - SizeType mMaxNewTokens; }; } // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/utils/debugUtils.cu b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu new file mode 100644 index 
0000000000..8b34ec6612 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/utils/debugUtils.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "debugUtils.h" + +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/memoryUtils.h" + +namespace +{ + +__global__ void checkTensorNanKernel(const float* data, std::size_t size, int* foundNan) +{ + auto tidx = blockIdx.x * blockDim.x + threadIdx.x; + + int32_t found = 0; + + for (auto idx = tidx; idx < size; idx += blockDim.x * gridDim.x) + { + auto value = data[idx]; + if (isnan(value)) + { + found = 1; + break; + } + } + atomicCAS(foundNan, 0, found); +} +} // namespace + +using namespace tensorrt_llm::runtime; +namespace tc = tensorrt_llm::common; + +namespace tensorrt_llm::runtime::utils +{ + +void invokeCheckTensorNanKernel(const float* data, std::size_t size, int* foundNan, cudaStream_t stream) +{ + constexpr uint32_t kThreadsPerCta = 256; + checkTensorNanKernel<<>>(data, size, foundNan); +} + +bool tensorHasNan(const IBuffer& tensor, BufferManager& manager) +{ + auto foundNan = manager.pinned(ITensor::makeShape({1}), nvinfer1::DataType::kINT32); + auto foundNanPtr = bufferCast(*foundNan); + foundNanPtr[0] = 0; + const auto size = tensor.getSize(); + invokeCheckTensorNanKernel(bufferCast(tensor), size, foundNanPtr, manager.getStream().get()); + manager.getStream().synchronize(); + return static_cast(foundNanPtr[0]); +} +} // namespace tensorrt_llm::runtime::utils diff --git a/cpp/tensorrt_llm/runtime/utils/debugUtils.h b/cpp/tensorrt_llm/runtime/utils/debugUtils.h new file mode 100644 index 0000000000..b8fa02e737 --- /dev/null +++ b/cpp/tensorrt_llm/runtime/utils/debugUtils.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
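The debugUtils helper above reduces a float tensor to a single pinned flag via a grid-stride kernel and then synchronizes the stream. A hypothetical call site is sketched below; checkForNans is not part of the patch and only uses the tensorHasNan signature shown above, which assumes the buffer holds floats:

```cpp
#include "tensorrt_llm/runtime/utils/debugUtils.h"

#include <cstdio>

namespace tlr = tensorrt_llm::runtime;

// Debug-only check: report NaNs in a float tensor such as the logits.
// tensorHasNan synchronizes the BufferManager's stream, so keep it behind a debug switch.
bool checkForNans(tlr::IBuffer const& tensor, tlr::BufferManager& manager, char const* name)
{
    bool const hasNan = tlr::utils::tensorHasNan(tensor, manager);
    if (hasNan)
    {
        std::fprintf(stderr, "NaN detected in %s\n", name);
    }
    return hasNan;
}
```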
+ */ +#pragma once + +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/runtimeKernels.h" + +namespace tensorrt_llm::runtime +{ +namespace utils +{ + +bool tensorHasNan(const IBuffer& tensor, BufferManager& manager); + +} +} // namespace tensorrt_llm::runtime diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp index c4144c295a..25455d9f14 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.cpp @@ -89,6 +89,13 @@ void reshapeBufferVector(std::vector& vector, nvinfer1::Dims } } +std::vector sliceBufferVector( + std::vector const& vector, SizeType const offset, SizeType const size) +{ + return transformVector( + vector, [offset, size](auto const& buffer) { return std::shared_ptr{ITensor::slice(buffer, offset, size)}; }); +} + void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, SizeType const indexOffset) { diff --git a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h index 80c18df375..8f9cb36a55 100644 --- a/cpp/tensorrt_llm/runtime/utils/sessionUtils.h +++ b/cpp/tensorrt_llm/runtime/utils/sessionUtils.h @@ -37,6 +37,16 @@ int initDevice(WorldConfig const& worldConfig); std::vector loadEngine(std::string const& enginePath); +template +auto transformVector(TInputContainer const& input, TFunc func) + -> std::vector> +{ + std::vector> output{}; + output.reserve(input.size()); + std::transform(input.begin(), input.end(), std::back_inserter(output), func); + return output; +} + std::vector createBufferVector(TllmRuntime const& runtime, SizeType indexOffset, SizeType numBuffers, std::string const& prefix, MemoryType memType); @@ -45,6 +55,9 @@ std::vector createBufferVector( void reshapeBufferVector(std::vector& vector, nvinfer1::Dims const& shape); +std::vector sliceBufferVector( + std::vector const& vector, SizeType offset, SizeType size); + void insertTensorVector(StringPtrMap& map, std::string const& key, std::vector const& vec, SizeType indexOffset); diff --git a/cpp/tensorrt_llm/thop/CMakeLists.txt b/cpp/tensorrt_llm/thop/CMakeLists.txt index d42520f2d9..234e605aff 100644 --- a/cpp/tensorrt_llm/thop/CMakeLists.txt +++ b/cpp/tensorrt_llm/thop/CMakeLists.txt @@ -21,6 +21,5 @@ target_link_libraries(th_utils PUBLIC ${TORCH_LIBRARIES} ${CUBLAS_LIB} add_library(th_common SHARED dynamicDecodeOp.cpp weightOnlyQuantOp.cpp gatherTreeOp.cpp fp8Op.cpp ncclCommunicatorOp.cpp) set_property(TARGET th_common PROPERTY POSITION_INDEPENDENT_CODE ON) -target_link_libraries( - th_common PRIVATE ${TORCH_LIBRARIES} th_utils ${Python3_LIBRARIES} - ${STATIC_TARGET} ${UNDEFINED_FLAG}) +target_link_libraries(th_common PRIVATE ${TORCH_LIBRARIES} th_utils + ${Python3_LIBRARIES} ${STATIC_TARGET}) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index f773960014..a3fa915c84 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -74,6 +74,7 @@ add_gtest(tllmBuffersTest runtime/tllmBuffersTest.cpp) add_gtest(bufferManagerTest runtime/bufferManagerTest.cpp) add_gtest(runtimeKernelTest runtime/runtimeKernelTest.cpp) add_gtest(samplingTest runtime/samplingTest.cpp) +add_gtest(iTensorTest runtime/iTensorTest.cpp) add_gtest(torchTest runtime/torchTest.cpp) set(SAMPLING_KERNEL_TEST_SRC kernels/sampling/samplingTest.cpp diff --git a/cpp/tests/README.md b/cpp/tests/README.md index 4777c513bb..4d45295d5e 100644 --- a/cpp/tests/README.md +++ b/cpp/tests/README.md @@ -36,7 
+36,7 @@ To build the engines from the top-level directory: PYTHONPATH=examples/gpt:$PYTHONPATH python3 cpp/tests/resources/scripts/build_gpt_engines.py PYTHONPATH=examples/gptj:$PYTHONPATH python3 cpp/tests/resources/scripts/build_gptj_engines.py PYTHONPATH=examples/llama:$PYTHONPATH python3 cpp/tests/resources/scripts/build_llama_engines.py -PYTHONPATH=examples/CHATGLM6B:$PYTHONPATH python3 cpp/tests/resources/scripts/build_chatglm6b_engines.py +PYTHONPATH=examples/chatglm:$PYTHONPATH python3 cpp/tests/resources/scripts/build_chatglm_engines.py ``` It is possible to build engines with tensor and pipeline parallelism for LLaMA using 4 GPUs. @@ -53,8 +53,7 @@ End-to-end tests read inputs and expected outputs from Numpy files located at [c PYTHONPATH=examples/gpt:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_gpt_output.py PYTHONPATH=examples/gptj:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_gptj_output.py PYTHONPATH=examples/llama:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_llama_output.py -PYTHONPATH=examples/chatglm6b:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py -PYTHONPATH=examples/chatglm2-6b:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py +PYTHONPATH=examples/chatglm:$PYTHONPATH python3 cpp/tests/resources/scripts/generate_expected_chatglm_output.py ``` ### Generate data with tensor and pipeline parallelism diff --git a/cpp/tests/resources/scripts/build_chatglm6b_engines.py b/cpp/tests/resources/scripts/build_chatglm6b_engines.py deleted file mode 100755 index 4c20ad9ea1..0000000000 --- a/cpp/tests/resources/scripts/build_chatglm6b_engines.py +++ /dev/null @@ -1,112 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
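Back in sessionUtils above, a generic transformVector template and a sliceBufferVector helper were added; the angle-bracketed template parameters are garbled in this listing, so the standalone reconstruction below is an assumption about the intended shape of the code rather than a copy of the patch:

```cpp
#include <algorithm>
#include <iterator>
#include <type_traits>
#include <vector>

// Generic "map into a vector" helper in the spirit of sessionUtils::transformVector.
template <typename TInputContainer, typename TFunc>
auto transformVector(TInputContainer const& input, TFunc func)
    -> std::vector<std::decay_t<decltype(func(*input.begin()))>>
{
    std::vector<std::decay_t<decltype(func(*input.begin()))>> output;
    output.reserve(input.size());
    std::transform(input.begin(), input.end(), std::back_inserter(output), func);
    return output;
}

int main()
{
    std::vector<int> lengths{3, 5, 2};
    // sliceBufferVector uses the same pattern, with a lambda that calls
    // ITensor::slice(buffer, offset, size) on each buffer in the vector.
    auto const doubled = transformVector(lengths, [](int x) { return 2 * x; });
    return doubled == std::vector<int>{6, 10, 4} ? 0 : 1;
}
```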
- -import argparse as _arg -import os as _os -import pathlib as _pl -import subprocess as _sp -import sys -import typing as _tp -from glob import glob as _glob - -import torch.multiprocessing as _mp - -resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm6b" -sys.path.insert(0, str(resources_dir)) - -engine_target_path = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm6b" - -import build as _ecb - - -def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, world_size, *args): - args = [ - '--log_level=error', - '--model_dir', - str(weight_dir), - '--output_dir', - str(engine_dir), - '--max_batch_size=2', - '--max_beam_width=2', - '--builder_opt=0', - f'--world_size={world_size}', - ] + list(args) - print("Running: " + " ".join(args)) - _ecb.run_build(args) - - -def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: - - command = [str(i) for i in command] - print(f"Running: cd %s && %s" % - (str(cwd or _pl.Path.cwd()), " ".join(command))) - _sp.check_call(command, cwd=cwd, **kwargs) - - -def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): - - # Clone the model directory - hf_dir = resources_dir / "pyTorchModel" - trt_dir = resources_dir / "trtModel" - - run_command( - ["pip", "install", "-r", - str(resources_dir) + "/requirements.txt"], - cwd=resources_dir) - - if not _os.path.exists(hf_dir): - _os.mkdir(hf_dir) - - if len(_glob(str(hf_dir) + "/*")) == 0: - run_command( - [ - "git", - "clone", - "https://huggingface.co/THUDM/chatglm-6b", - hf_dir, - ], - cwd=resources_dir, - ) - - print("\nBuilding engine") - build_engine(hf_dir, trt_dir, world_size, "--dtype", "float16", - "--use_gpt_attention_plugin", "float16", "--use_gemm_plugin", - "float16") - - if not _os.path.exists(str(engine_target_path)): - _os.system(f"mkdir -p {str(engine_target_path)}") - - _os.system(f"cp -r {str(trt_dir) + '/*'} {engine_target_path}") - - print("Done.") - - -if __name__ == "__main__": - parser = _arg.ArgumentParser() - parser.add_argument("--model_cache", - type=str, - help="Directory where models are stored") - - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - - _mp.set_start_method("spawn") - - build_engines(**vars(parser.parse_args())) diff --git a/cpp/tests/resources/scripts/build_chatglm2-6b_engines.py b/cpp/tests/resources/scripts/build_chatglm_engines.py similarity index 64% rename from cpp/tests/resources/scripts/build_chatglm2-6b_engines.py rename to cpp/tests/resources/scripts/build_chatglm_engines.py index 6a45b3e183..f3d50dcdf7 100755 --- a/cpp/tests/resources/scripts/build_chatglm2-6b_engines.py +++ b/cpp/tests/resources/scripts/build_chatglm_engines.py @@ -15,27 +15,31 @@ # limitations under the License. 
import argparse as _arg -import os as _os import pathlib as _pl +import shutil as _shutil import subprocess as _sp import sys import typing as _tp -from glob import glob as _glob +from collections import OrderedDict as _OrderedDict +from pathlib import Path as _Path import torch.multiprocessing as _mp resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm2-6b" + __file__).parent.parent.parent.parent.parent / "examples/chatglm" sys.path.insert(0, str(resources_dir)) engine_target_path = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm2-6b" + __file__).parent.parent / "models/rt_engine/chatglm" import build as _ecb -def build_engine(weight_dir: _pl.Path, engine_dir: _pl.Path, world_size, *args): +def build_engine(model_version: str, weight_dir: _pl.Path, engine_dir: _pl.Path, + world_size, *args): args = [ + '-m', + str(model_version), '--log_level=error', '--model_dir', str(weight_dir), @@ -60,8 +64,14 @@ def run_command(command: _tp.Sequence[str], *, cwd=None, **kwargs) -> None: def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): - # Clone the model directory - hf_dir = resources_dir / "pyTorchModel" + model_name_dict = _OrderedDict([ + ["chatglm-6b", "1"], + ["chatglm2-6b", "2"], + ["chatglm3-6b", "3"], + ]) + hf_dir_list = [ + resources_dir / model_name for model_name in model_name_dict.keys() + ] trt_dir = resources_dir / "trtModel" run_command( @@ -69,29 +79,27 @@ def build_engines(model_cache: _tp.Optional[str] = None, world_size: int = 1): str(resources_dir) + "/requirements.txt"], cwd=resources_dir) - if not _os.path.exists(hf_dir): - _os.mkdir(hf_dir) + # Clone the model directory + for model_name, hf_dir in zip(model_name_dict.keys(), hf_dir_list): + if not _Path(hf_dir).exists(): + run_command( + [ + "git", + "clone", + "https://huggingface.co/THUDM/" + model_name, + ], + cwd=resources_dir, + ) - if len(_glob(str(hf_dir) + "/*")) == 0: - run_command( - [ - "git", - "clone", - "https://huggingface.co/THUDM/chatglm2-6b", - hf_dir, - ], - cwd=resources_dir, - ) + print("\nBuilding engines") + for model, hf_dir in zip(model_name_dict.items(), hf_dir_list): + print("Building %s" % model[0]) + build_engine(model[1], hf_dir, trt_dir, world_size) - print("\nBuilding engine") - build_engine(hf_dir, trt_dir, world_size, "--dtype", "float16", - "--use_gpt_attention_plugin", "float16", "--use_gemm_plugin", - "float16") - - if not _os.path.exists(str(engine_target_path)): - _os.system(f"mkdir -p {str(engine_target_path)}") - - _os.system(f"cp -r {str(trt_dir) + '/*'} {engine_target_path}") + if not _Path(engine_target_path).exists(): + _Path(engine_target_path).mkdir(parents=True, exist_ok=True) + for file in _Path(trt_dir).glob("*"): + _shutil.move(file, engine_target_path) print("Done.") diff --git a/cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py b/cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py deleted file mode 100755 index d3a109c313..0000000000 --- a/cpp/tests/resources/scripts/generate_expected_chatglm2-6b_output.py +++ /dev/null @@ -1,163 +0,0 @@ -#!/usr/bin/env python3 -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import json -import os -import pathlib as _pl -import sys -from pathlib import Path - -import numpy as np -import torch -import transformers - -import tensorrt_llm -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - -resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm2-6b" -sys.path.insert(0, str(resources_dir)) - -from run import parse_arguments # isort:skip - -from build import find_engines # isort:skip - -MODEL_NAME = "chatglm2-6b" - - -def generate(batch_size, beam_width): - - print("generate expected ChatGLM2-6B output BatchSize=%d, BeamWidth=%d" % - (batch_size, beam_width)) - args = parse_arguments() - if batch_size == 1: - args.input_text = args.input_text[:1] - elif batch_size > 2: - args.input_text += args.input_text[0] * (batch_size - 2) - args.beam_width = beam_width - args.tokenizer_dir = resources_dir / "pyTorchModel" - args.engine_dir = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm2-6b" - - tensorrt_llm.logger.set_level(args.log_level) - - config_path = os.path.join(args.engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) - dtype = config['builder_config']['precision'] - end_id = config['builder_config']['eos_token_id'] - pad_id = config['builder_config']['pad_token_id'] - use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size( - ), f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - - runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] - - tokenizer = transformers.AutoTokenizer.from_pretrained( - args.tokenizer_dir, trust_remote_code=True) - input_text = args.input_text - tokenized = tokenizer(input_text, - return_tensors="pt", - padding=True, - return_length=True) - input_ids = tokenized['input_ids'].int().contiguous().cuda() - input_lengths = tokenized['length'].int().contiguous().cuda() - - if use_gpt_attention_plugin: - # when using gpt attention plugin, inputs needs to align at the head - input_ids_padding_right = torch.zeros_like(input_ids) + end_id - for i, sample in enumerate(input_ids): - nPadding = 0 - for token in sample: - if token == pad_id: - nPadding += 1 - else: - break - input_ids_padding_right[ - i, :len(sample[nPadding:])] = sample[nPadding:] - input_ids = input_ids_padding_right - - model_config = ModelConfig( - vocab_size=config['builder_config']['vocab_size'], - num_layers=config['builder_config']['num_layers'], - num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_kv_heads'] // world_size, - 
hidden_size=config['builder_config']['hidden_size'] // world_size, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, - paged_kv_cache=config['builder_config']['paged_kv_cache'], - quant_mode=QuantMode(config['builder_config']['quant_mode']), - dtype=dtype, - ) - - sampling_config = SamplingConfig( - end_id=end_id, - pad_id=pad_id, - num_beams=args.beam_width, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - ) - sampling_config.random_seed = args.random_seed - - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - decoder = GenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, - args.beam_width) - output_ids = decoder.decode(input_ids, input_lengths, sampling_config) - torch.cuda.synchronize() - - data_path = _pl.Path(__file__).parent.parent / "data/chatglm2-6b" - if not os.path.exists(str(data_path)): - os.mkdir(data_path) - nBS, nBM = input_ids.size(0), args.beam_width - np.save( - str(data_path) + "/inputId-BS%d-BM%d.npy" % (nBS, nBM), - input_ids.detach().cpu().numpy()) - outputId = output_ids.detach().cpu().numpy() - - nMaxOutputLength = 0 - for single_output in outputId.reshape(nBS * nBM, -1): - nMaxOutputLength = max(nMaxOutputLength, - np.min(np.where(single_output == end_id))) - np.save( - str(data_path) + "/outputId-BS%d-BM%d.npy" % (nBS, nBM), - outputId[:, :, :(nMaxOutputLength + 1)]) - - -if __name__ == '__main__': - generate(batch_size=1, beam_width=1) - generate(batch_size=2, beam_width=1) - generate(batch_size=1, beam_width=2) - print("Finish!") diff --git a/cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py similarity index 70% rename from cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py rename to cpp/tests/resources/scripts/generate_expected_chatglm_output.py index 523309d990..44c5920eff 100755 --- a/cpp/tests/resources/scripts/generate_expected_chatglm6b_output.py +++ b/cpp/tests/resources/scripts/generate_expected_chatglm_output.py @@ -15,9 +15,8 @@ # limitations under the License. 
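The expected-output scripts above (the removed chatglm2-6b one and, presumably, the consolidated script that follows) re-align tokenized inputs at the head when the GPT attention plugin is used, moving the tokenizer's left padding to the tail. A per-row restatement of that shuffle in C++, for readers skimming the Python (alignRight is illustrative only):

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Move leading pad tokens to the tail of one row, filling the freed slots with endId,
// mirroring the input_ids_padding_right loop in the expected-output scripts.
std::vector<int32_t> alignRight(std::vector<int32_t> const& row, int32_t padId, int32_t endId)
{
    std::size_t nPadding = 0;
    while (nPadding < row.size() && row[nPadding] == padId)
    {
        ++nPadding; // count the left padding added by the tokenizer
    }
    std::vector<int32_t> aligned(row.size(), endId);
    std::copy(row.begin() + nPadding, row.end(), aligned.begin());
    return aligned;
}
```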
import json -import os -import pathlib as _pl import sys +from collections import OrderedDict from pathlib import Path import numpy as np @@ -26,40 +25,45 @@ import transformers import tensorrt_llm from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import (ChatGLM6BHeadModelGenerationSession, +from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig) -resources_dir = _pl.Path( - __file__).parent.parent.parent.parent.parent / "examples/chatglm6b" +resources_dir = Path( + __file__).parent.parent.parent.parent.parent / "examples/chatglm" sys.path.insert(0, str(resources_dir)) from run import parse_arguments # isort:skip from build import find_engines # isort:skip -MODEL_NAME = "chatglm-6b" +def generate(model_name, batch_size, beam_width): -def generate(batch_size, beam_width): + model_name_dict = OrderedDict([ + ["chatglm-6b", "1"], + ["chatglm2-6b", "2"], + ["chatglm3-6b", "3"], + ]) + + print("generate expected %s output BatchSize=%d, BeamWidth=%d" % + (model_name, batch_size, beam_width)) - print("generate expected ChatGLM-6B output BatchSize=%d, BeamWidth=%d" % - (batch_size, beam_width)) args = parse_arguments() if batch_size == 1: args.input_text = args.input_text[:1] elif batch_size > 2: args.input_text += args.input_text[0] * (batch_size - 2) + args.model_version = model_name_dict[model_name] args.beam_width = beam_width - args.tokenizer_dir = resources_dir / "pyTorchModel" - args.engine_dir = _pl.Path( - __file__).parent.parent / "models/rt_engine/chatglm6b" + args.tokenizer_dir = resources_dir / model_name + args.engine_dir = Path(__file__).parent.parent / "models/rt_engine/chatglm" tensorrt_llm.logger.set_level(args.log_level) - config_path = os.path.join(args.engine_dir, 'config.json') + config_path = Path(args.engine_dir) / (model_name + '-config.json') with open(config_path, 'r') as f: config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) + assert (config['builder_config']['name'] == model_name) dtype = config['builder_config']['precision'] end_id = config['builder_config']['eos_token_id'] pad_id = config['builder_config']['pad_token_id'] @@ -74,10 +78,13 @@ def generate(batch_size, beam_width): tp_size=world_size) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] + serialize_path = find_engines( + Path(args.engine_dir), + model_name=model_name, + dtype=dtype, + tp_size=world_size, + rank=runtime_rank, + )[0] tokenizer = transformers.AutoTokenizer.from_pretrained( args.tokenizer_dir, trust_remote_code=True) @@ -111,7 +118,7 @@ def generate(batch_size, beam_width): hidden_size=config['builder_config']['hidden_size'] // world_size, gpt_attention_plugin=use_gpt_attention_plugin, remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, + model_name=model_name, paged_kv_cache=config['builder_config']['paged_kv_cache'], quant_mode=QuantMode(config['builder_config']['quant_mode']), dtype=dtype, @@ -129,19 +136,25 @@ def generate(batch_size, beam_width): with open(serialize_path, 'rb') as f: engine_buffer = f.read() - decoder = ChatGLM6BHeadModelGenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) + if model_name == 'chatglm-6b': + decoder = ChatGLMGenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + else: + decoder = GenerationSession( + model_config, + engine_buffer, + 
runtime_mapping, + ) decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, args.beam_width) output_ids = decoder.decode(input_ids, input_lengths, sampling_config) torch.cuda.synchronize() - data_path = _pl.Path(__file__).parent.parent / "data/chatglm6b" - if not os.path.exists(str(data_path)): - os.mkdir(data_path) + data_path = Path(__file__).parent.parent / "data" / model_name + data_path.mkdir(parents=True, exist_ok=True) nBS, nBM = input_ids.size(0), args.beam_width np.save( str(data_path) + "/inputId-BS%d-BM%d.npy" % (nBS, nBM), @@ -150,15 +163,23 @@ def generate(batch_size, beam_width): nMaxOutputLength = 0 for single_output in outputId.reshape(nBS * nBM, -1): - nMaxOutputLength = max(nMaxOutputLength, - np.min(np.where(single_output == end_id))) + if end_id in single_output: + nMaxOutputLength = max(nMaxOutputLength, + np.min(np.where(single_output == end_id))) + else: + nMaxOutputLength = len(single_output) np.save( str(data_path) + "/outputId-BS%d-BM%d.npy" % (nBS, nBM), outputId[:, :, :(nMaxOutputLength + 1)]) if __name__ == '__main__': - generate(batch_size=1, beam_width=1) - generate(batch_size=2, beam_width=1) - generate(batch_size=1, beam_width=2) - print("Finish!") + generate("chatglm-6b", batch_size=1, beam_width=1) + generate("chatglm-6b", batch_size=2, beam_width=1) + generate("chatglm2-6b", batch_size=1, beam_width=1) + generate("chatglm2-6b", batch_size=2, beam_width=1) + generate("chatglm2-6b", batch_size=1, beam_width=2) + generate("chatglm3-6b", batch_size=1, beam_width=1) + generate("chatglm3-6b", batch_size=2, beam_width=1) + generate("chatglm3-6b", batch_size=1, beam_width=2) + print("Done.") diff --git a/cpp/tests/resources/scripts/test_cpp.py b/cpp/tests/resources/scripts/test_cpp.py index be52da27c7..7e553ad21d 100755 --- a/cpp/tests/resources/scripts/test_cpp.py +++ b/cpp/tests/resources/scripts/test_cpp.py @@ -88,8 +88,7 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, model_cache: _tp.Optional[str] = None, skip_gptj=False, skip_llama=False, - skip_chatglm6b=False, - skip_chatglm2_6b=False, + skip_chatglm=False, only_fp8=False, only_multi_gpu=False, trt_root: _tp.Optional[str] = None) -> None: @@ -117,15 +116,13 @@ def run_tests(cuda_architectures: _tp.Optional[str] = None, model_cache=model_cache, skip_gptj=skip_gptj, skip_llama=skip_llama, - skip_chatglm6b=skip_chatglm6b, - skip_chatglm2_6b=skip_chatglm2_6b, + skip_chatglm=skip_chatglm, only_fp8=only_fp8) run_google_tests(build_dir=build_dir, skip_gptj=skip_gptj, skip_llama=skip_llama, - skip_chatglm6b=skip_chatglm6b, - skip_chatglm2_6b=skip_chatglm2_6b, + skip_chatglm=skip_chatglm, only_fp8=only_fp8) run_benchmarks(python_exe=python_exe, @@ -147,8 +144,7 @@ def prepare_all_model_tests(python_exe: str, model_cache: _tp.Optional[str] = None, skip_gptj=False, skip_llama=False, - skip_chatglm6b=False, - skip_chatglm2_6b=False, + skip_chatglm=False, only_fp8=False): model_cache_arg = ["--model_cache", model_cache] if model_cache else [] only_fp8_arg = ["--only_fp8"] if only_fp8 else [] @@ -178,21 +174,13 @@ def prepare_all_model_tests(python_exe: str, else: _log.info("Skipping Lllama tests") - if not skip_chatglm6b: - prepare_model_tests(model_name="chatglm6b", + if not skip_chatglm: + prepare_model_tests(model_name="chatglm", python_exe=python_exe, root_dir=root_dir, resources_dir=resources_dir) else: - _log.info("Skipping ChatGLM6B tests") - - if not skip_chatglm2_6b: - prepare_model_tests(model_name="chatglm2-6b", - python_exe=python_exe, - root_dir=root_dir, - 
resources_dir=resources_dir) - else: - _log.info("Skipping ChatGLM2-6B tests") + _log.info("Skipping ChatGLM tests") def prepare_multi_gpu_model_tests(python_exe: str, @@ -231,13 +219,17 @@ def prepare_model_tests(model_name: str, str(scripts_dir / f"generate_expected_{model_name}_output.py") ] + only_fp8_arg + only_multi_gpu_arg if only_multi_gpu_arg: - generate_expected_output = ["mpirun", "-n", "4" - ] + generate_expected_output + generate_expected_output = [ + "mpirun", + "-n", + "4", + "--allow-run-as-root", + ] + generate_expected_output run_command(generate_expected_output, cwd=root_dir, env=model_env) -def run_google_tests(build_dir: _pl.Path, skip_gptj, skip_llama, skip_chatglm6b, - skip_chatglm2_6b, only_fp8): +def run_google_tests(build_dir: _pl.Path, skip_gptj, skip_llama, skip_chatglm, + only_fp8): make_google_tests = [ "cmake", "--build", ".", "--config", "Release", "-j", "--target", "google-tests" @@ -245,16 +237,14 @@ def run_google_tests(build_dir: _pl.Path, skip_gptj, skip_llama, skip_chatglm6b, run_command(make_google_tests, cwd=build_dir) cpp_env = {**_os.environ} - ctest = ["ctest", "--output-on-failure", "--output-junit", "report.xml"] + ctest = ["ctest", "--output-on-failure", "--output-junit", "results.xml"] excluded_tests = [] if skip_gptj: excluded_tests.append(".*Gptj.*") if skip_llama: excluded_tests.append(".*Llama.*") - if skip_chatglm6b: - excluded_tests.append(".*Glm6.*") - if skip_chatglm2_6b: - excluded_tests.append(".*Glm2_6.*") + if skip_chatglm: + excluded_tests.append(".*ChatGlm.*") if only_fp8: ctest.extend(["-R", ".*FP8.*"]) else: @@ -274,7 +264,8 @@ def run_multi_gpu_tests(build_dir: _pl.Path): tests_dir = build_dir / "tests" cpp_env = {**_os.environ} session_test = [ - "mpirun", "-n", "4", "gptSessionTest", "--gtest_filter=*TP*:*PP*" + "mpirun", "-n", "4", "--allow-run-as-root", "gptSessionTest", + "--gtest_filter=*TP*:*PP*" ] run_command(session_test, cwd=tests_dir, env=cpp_env) @@ -358,12 +349,9 @@ if __name__ == "__main__": parser.add_argument("--skip_llama", action="store_true", help="Skip the tests for Llama") - parser.add_argument("--skip_chatglm6b", + parser.add_argument("--skip_chatglm", action="store_true", - help="Skip the tests for ChatGLM6B") - parser.add_argument("--skip_chatglm2_6b", - action="store_true", - help="Skip the tests for ChatGLM2-6B") + help="Skip the tests for ChatGLM") parser.add_argument( "--only_fp8", action="store_true", diff --git a/cpp/tests/runtime/gptSessionTest.cpp b/cpp/tests/runtime/gptSessionTest.cpp index ac1a679ad7..01b53a3894 100644 --- a/cpp/tests/runtime/gptSessionTest.cpp +++ b/cpp/tests/runtime/gptSessionTest.cpp @@ -148,6 +148,12 @@ public: int mTPSize; bool mRandomEndId; }; + +struct MicroBatchSizes +{ + std::optional ctxMicroBatchSize{std::nullopt}; + std::optional genMicroBatchSize{std::nullopt}; +}; } // namespace class SessionTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) @@ -183,7 +189,7 @@ void verifyModelConfig(GptModelConfig const& modelConfig, ModelSpec const& model void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, ModelIds const modelIds, SizeType beamWidth, std::initializer_list const& batchSizes, fs::path const& resultsFile, - std::shared_ptr const& logger, bool cudaGraphMode, SizeType numMicroBatches) + std::shared_ptr const& logger, bool cudaGraphMode, MicroBatchSizes microBatchSizes) { auto manager = BufferManager(std::make_shared()); @@ -275,7 +281,8 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, 
Model auto const maxBatchSize = *std::max_element(batchSizes.begin(), batchSizes.end()); GptSession::Config sessionConfig{maxBatchSize, beamWidth, maxSeqLength}; sessionConfig.decoderPerRequest = modelSpec.mDecoderPerRequest; - sessionConfig.numMicroBatches = numMicroBatches; + sessionConfig.ctxMicroBatchSize = microBatchSizes.ctxMicroBatchSize; + sessionConfig.genMicroBatchSize = microBatchSizes.genMicroBatchSize; sessionConfig.cudaGraphMode = cudaGraphMode; GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; @@ -327,6 +334,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, Model GenerationInput generationInput{ endId, padId, std::move(inputIds), std::move(inputLenghts), modelConfig.usePackedInput()}; + generationInput.maxNewTokens = maxNewTokens; // runtime will allocate memory for output if this tensor is empty GenerationOutput generationOutput{bufferManager.emptyTensor(MemoryType::kGPU, nvinfer1::DataType::kINT32), @@ -338,11 +346,19 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, Model { SizeType numSteps = 0; generationOutput.onTokenGenerated - = [&numSteps, &modelSpec, maxNewTokens]([[maybe_unused]] GenerationOutput::TensorPtr const& outputIds, - [[maybe_unused]] SizeType step, bool finished) + = [&numSteps, &modelSpec, maxNewTokens]( + [[maybe_unused]] GenerationOutput::TensorPtr const& outputIds, SizeType step, bool finished) { + // check that we execute the callback in each step + EXPECT_EQ(step, numSteps); ++numSteps; - EXPECT_TRUE(!finished || modelSpec.mRandomEndId || numSteps == maxNewTokens); + if (!modelSpec.mRandomEndId) + { + // check that we only finish after producing `maxNewTokens` tokens + EXPECT_TRUE(!finished || numSteps == maxNewTokens); + } + // check that `finished` is set to true after producing `maxNewTokens` tokens + EXPECT_TRUE(numSteps != maxNewTokens || finished); }; session.generate(generationOutput, generationInput, samplingConfig); @@ -416,7 +432,7 @@ void testGptSession(fs::path const& modelPath, ModelSpec const& modelSpec, Model auto constexpr kBatchSizes = {1, 8}; -using ParamType = std::tuple; +using ParamType = std::tuple; std::string generateTestName(const testing::TestParamInfo& info) { @@ -434,9 +450,11 @@ std::string generateTestName(const testing::TestParamInfo& info) name.append("DecoderBatch"); if (std::get<3>(info.param)) name.append("CudaGraph"); - auto const numMicroBatches = std::get<4>(info.param); - if (numMicroBatches > 1) - name.append("MicroBatch" + std::to_string(numMicroBatches)); + auto const microBatcheSizes = std::get<4>(info.param); + if (microBatcheSizes.ctxMicroBatchSize) + name.append("CBS" + std::to_string(microBatcheSizes.ctxMicroBatchSize.value())); + if (microBatcheSizes.genMicroBatchSize) + name.append("GBS" + std::to_string(microBatcheSizes.genMicroBatchSize.value())); if (modelSpec.mPPSize > 1) name.append("PP" + std::to_string(modelSpec.mPPSize)); if (modelSpec.mTPSize > 1) @@ -458,10 +476,8 @@ TEST_P(ParamTest, Test) auto const modelIds = modelParams.ids; auto const modelSpec = std::get<1>(GetParam()); SizeType const beamWidth{std::get<2>(GetParam())}; - auto const resultsPath - = DATA_PATH / modelDir / ((beamWidth == 1) ? 
"sampling" : "beam_search_" + std::to_string(beamWidth)); - fs::path const resultsFile{resultsPath / modelSpec.mResultsFile}; - auto const numMicroBatches = std::get<4>(GetParam()); + auto const cudaGraphMode = std::get<3>(GetParam()); + auto const microBatchSizes = std::get<4>(GetParam()); if (!modelSpec.mUseGptAttentionPlugin && beamWidth > 1) GTEST_SKIP(); @@ -485,10 +501,12 @@ TEST_P(ParamTest, Test) std::ostringstream gpuSizePath; gpuSizePath << "tp" << modelSpec.mTPSize << "-pp" << modelSpec.mPPSize << "-gpu"; auto const modelPath{ENGINGE_PATH / modelDir / modelSpec.mModelPath / gpuSizePath.str()}; - auto const cudaGraphMode = std::get<3>(GetParam()); + auto const resultsPath + = DATA_PATH / modelDir / ((beamWidth == 1) ? "sampling" : "beam_search_" + std::to_string(beamWidth)); + fs::path const resultsFile{resultsPath / modelSpec.mResultsFile}; testGptSession( - modelPath, modelSpec, modelIds, beamWidth, kBatchSizes, resultsFile, mLogger, cudaGraphMode, numMicroBatches); + modelPath, modelSpec, modelIds, beamWidth, kBatchSizes, resultsFile, mLogger, cudaGraphMode, microBatchSizes); } INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, @@ -535,7 +553,8 @@ INSTANTIATE_TEST_SUITE_P(GptSessionTest, ParamTest, .usePagedKvCache() .useDecoderPerRequest() .useRandomEndId()), - testing::Values(1, 2), testing::Values(false, true), testing::Values(1, 3)), + testing::Values(1, 2), testing::Values(false, true), + testing::Values(MicroBatchSizes(), MicroBatchSizes{3, 3}, MicroBatchSizes{3, 6})), generateTestName); INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, @@ -568,7 +587,7 @@ INSTANTIATE_TEST_SUITE_P(GptjSessionTest, ParamTest, .useDecoderPerRequest() ), - testing::Values(1, 2), testing::Values(false), testing::Values(1)), + testing::Values(1, 2), testing::Values(false), testing::Values(MicroBatchSizes())), generateTestName); INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, @@ -611,7 +630,7 @@ INSTANTIATE_TEST_SUITE_P(LlamaSessionTest, ParamTest, .useTensorParallelism(2) ), - testing::Values(1, 2), testing::Values(false), testing::Values(1)), + testing::Values(1, 2), testing::Values(false), testing::Values(MicroBatchSizes())), generateTestName); class LlamaSessionOnDemandTest : public SessionTest @@ -632,7 +651,8 @@ TEST_F(LlamaSessionOnDemandTest, SamplingFP16WithAttentionPlugin) auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGptSession(modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, 1); + testGptSession( + modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, MicroBatchSizes()); } TEST_F(LlamaSessionOnDemandTest, SamplingFP16AttentionPluginDecoderBatch) @@ -648,28 +668,34 @@ TEST_F(LlamaSessionOnDemandTest, SamplingFP16AttentionPluginDecoderBatch) auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin().usePackedInput().useDecoderPerRequest(); auto const modeIds = ModelIds{2, 2}; - testGptSession(modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, 1); + testGptSession( + modelPath, modelSpec, modeIds, beamWidth, batchSizes, resultsFile, mLogger, false, MicroBatchSizes()); } -class Glm6bSessionTest : public SessionTest +class ChatGlmSessionTest : public SessionTest // for ChatGLM-6B { }; -class Glm2_6bSessionTest : public SessionTest +class ChatGlm2SessionTest : public SessionTest // for ChatGLM2-6B and ChatGLM2-6B-32k { }; -// Engines need to be generated using 
cpp/tests/resources/scripts/build_gpt_engines.py. -// Expected outputs need to be generated using cpp/tests/resources/scripts/generate_expected_gpt_output.py. +class ChatGlm3SessionTest : public SessionTest // for ChatGLM3-6B and ChatGLM3-6B-32k +{ +}; + +// Engines need to be generated using cpp/tests/resources/scripts/build_chatglm_engines.py. +// Expected outputs need to be generated using cpp/tests/resources/scripts/generate_expected_chatglm_output.py. namespace { // TODO: consolidate this function with testGptSession -// Notice: both ChatGLM-6B and ChatGLM2-6B use this function, which are different at GptModelConfig::ModelVariant -void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, ModelSpec const& modelSpec, +// Notice: all ChatGLM models (ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-6B-32k and ChatGLM3-6B-32k) use this +// function The differences are GptModelConfig::ModelVariant +void testChatGlmSession(fs::path const& modelPath, std::string const& modelName, ModelSpec const& modelSpec, ModelIds const modelIds, SizeType beamWidth, std::initializer_list const& batchSizes, - std::shared_ptr const& logger, bool cudaGraphMode, SizeType numMicroBatches) + std::shared_ptr const& logger, bool cudaGraphMode, MicroBatchSizes microBatchSizes) { auto manager = BufferManager(std::make_shared()); @@ -692,7 +718,7 @@ void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, M auto const expectedOutputData = bufferCast(*expectedOutput); ASSERT_TRUE(fs::exists(modelPath)); - auto const json = GptJsonConfig::parse(modelPath / "config.json"); + auto const json = GptJsonConfig::parse(modelPath / (modelName + "-config.json")); auto const modelConfig = json.getModelConfig(); verifyModelConfig(modelConfig, modelSpec); auto const decoderPerRequest = modelSpec.mDecoderPerRequest; @@ -728,9 +754,9 @@ void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, M auto const maxBatchSize = *std::max_element(batchSizes.begin(), batchSizes.end()); GptSession::Config sessionConfig{maxBatchSize, beamWidth, maxSeqLength}; sessionConfig.decoderPerRequest = decoderPerRequest; - sessionConfig.numMicroBatches = numMicroBatches; + sessionConfig.ctxMicroBatchSize = microBatchSizes.ctxMicroBatchSize; + sessionConfig.genMicroBatchSize = microBatchSizes.genMicroBatchSize; sessionConfig.cudaGraphMode = cudaGraphMode; - GptSession session{sessionConfig, modelConfig, worldConfig, enginePath.string(), logger}; EXPECT_EQ(session.getDevice(), worldConfig.getDevice()); // Use bufferManager for copying data to and from the GPU @@ -837,62 +863,74 @@ void testGlm6bSession(fs::path const& modelPath, std::string const& modelName, M } // namespace -TEST_F(Glm6bSessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) +TEST_F(ChatGlmSessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) { - auto const modelName{"chatglm6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelName{"chatglm-6b"}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {1}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{130005, 130005}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm6bSessionTest, SamplingFP16WithGptAttentionPluginBS2BM1) +TEST_F(ChatGlmSessionTest, 
SamplingFP16WithGptAttentionPluginBS2BM1) { - auto const modelName{"chatglm6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelName{"chatglm-6b"}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {2}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{130005, 130005}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm2_6bSessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) +TEST_F(ChatGlm2SessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) { auto const modelName{"chatglm2-6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {1}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm2_6bSessionTest, SamplingFP16WithGptAttentionPluginBS2BM1) +TEST_F(ChatGlm2SessionTest, SamplingFP16WithGptAttentionPluginBS2BM1) { auto const modelName{"chatglm2-6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {2}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } -TEST_F(Glm2_6bSessionTest, SamplingFP16WithGptAttentionPluginBS1BM2) +TEST_F(ChatGlm2SessionTest, SamplingFP16WithGptAttentionPluginBS1BM2) { auto const modelName{"chatglm2-6b"}; - auto const modelPath{ENGINGE_PATH / modelName}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; auto const batchSizes = {1}; auto constexpr dtype = nvinfer1::DataType::kHALF; auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); auto const modeIds = ModelIds{2, 2}; - testGlm6bSession(modelPath, modelName, modelSpec, modeIds, 2, batchSizes, mLogger, false, 1); + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 2, batchSizes, mLogger, false, MicroBatchSizes()); +} + +TEST_F(ChatGlm3SessionTest, SamplingFP16WithGptAttentionPluginBS1BM1) +{ + auto const modelName{"chatglm3-6b"}; + auto const modelPath{ENGINGE_PATH / "chatglm"}; + auto const batchSizes = {1}; + auto constexpr dtype = nvinfer1::DataType::kHALF; + auto const modelSpec = ModelSpec{"", "", dtype}.useGptAttentionPlugin(); + auto const modeIds = ModelIds{2, 2}; + + testChatGlmSession(modelPath, modelName, modelSpec, modeIds, 1, batchSizes, mLogger, false, MicroBatchSizes()); } diff --git a/cpp/tests/runtime/iTensorTest.cpp b/cpp/tests/runtime/iTensorTest.cpp new file mode 100644 index 0000000000..c2be7b5e50 --- /dev/null +++ b/cpp/tests/runtime/iTensorTest.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. All rights reserved. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include "tensorrt_llm/runtime/bufferManager.h" +#include "tensorrt_llm/runtime/iTensor.h" + +using namespace tensorrt_llm::runtime; +using namespace ::testing; + +namespace +{ + +TEST(iTensorTest, UnsqueezeShape) +{ + auto oldShape = ITensor::makeShape({2, 3, 4, 5}); + { + auto shape = ITensor::unsqueeze(oldShape, 0); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 1); + EXPECT_EQ(shape.d[1], 2); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + { + auto shape = ITensor::unsqueeze(oldShape, 1); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 1); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + + { + auto shape = ITensor::unsqueeze(oldShape, 4); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 3); + EXPECT_EQ(shape.d[2], 4); + EXPECT_EQ(shape.d[3], 5); + EXPECT_EQ(shape.d[4], 1); + } + + std::vector invalidDims{-1, 5, 10}; + for (auto invalidDim : invalidDims) + { + try + { + auto shape = ITensor::unsqueeze(oldShape, invalidDim); + FAIL() << "Expected failure"; + } + catch (tensorrt_llm::common::TllmException const& e) + { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid dim")); + } + catch (...) + { + FAIL() << "Expected TllmException"; + } + } +} + +TEST(iTensorTest, UnsqueezeTensor) +{ + auto oldShape = ITensor::makeShape({2, 3, 4, 5}); + BufferManager manager(std::make_shared()); + + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(0); + auto shape = tensor->getShape(); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 1); + EXPECT_EQ(shape.d[1], 2); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(1); + auto shape = tensor->getShape(); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 1); + EXPECT_EQ(shape.d[2], 3); + EXPECT_EQ(shape.d[3], 4); + EXPECT_EQ(shape.d[4], 5); + } + + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(4); + auto shape = tensor->getShape(); + + EXPECT_EQ(shape.nbDims, 5); + EXPECT_EQ(shape.d[0], 2); + EXPECT_EQ(shape.d[1], 3); + EXPECT_EQ(shape.d[2], 4); + EXPECT_EQ(shape.d[3], 5); + EXPECT_EQ(shape.d[4], 1); + } + + std::vector invalidDims{-1, 5, 10}; + for (auto invalidDim : invalidDims) + { + try + { + auto tensor = manager.cpu(oldShape, nvinfer1::DataType::kINT32); + tensor->unsqueeze(invalidDim); + FAIL() << "Expected failure"; + } + catch (tensorrt_llm::common::TllmException const& e) + { + EXPECT_THAT(e.what(), testing::HasSubstr("Invalid dim")); + } + catch (...) 
+ { + FAIL() << "Expected TllmException"; + } + } +} + +} // namespace diff --git a/docs/source/batch_manager.md b/docs/source/batch_manager.md index 7a54f18d25..aac340aef3 100644 --- a/docs/source/batch_manager.md +++ b/docs/source/batch_manager.md @@ -94,17 +94,29 @@ The statistics are packaged as a JSON string. That string contains the following * `Active Request Count`, the number of active requests in batch manager * `Max Request Count`, the max number of requests batch manager can support at a time -When using in-flight batching, the following additional statistics are reported: +When using paged KV cache, following statistics are reported: * `Max KV cache blocks`, the maximum number of KV cache blocks per GPU * `Free KV cache blocks`, number of free KV cache blocks per GPU * `Used KV cache blocks`, number of used KV cache blocks per GPU * `Tokens per KV cache block`, number of tokens per KV cache block * `Scheduled Requests`, number of requests scheduled this iteration + +When using in-flight batching, the following additional statistics are reported per step/iteration: + + * `Scheduled Requests`, number of total requests scheduled * `Context Requests`, number of requests in Context phase - * `Total Context Tokens`, total number of tokens across requests in context phase - * `Generation Requests`, number of requests in Context phase * `Generation Requests`, number of requests in Generation phase - * `MicroBatch ID`, number of requests in Generation phase + * `Total Context Tokens`, total number of tokens across requests in context phase + * `MicroBatch ID`, micro batch ID + +When using V1 batching, the following additional statistics are reported per V1 iteration: + + * `Scheduled Requests`, number of total requests scheduled + * `Context Requests`, number of requests in Context phase + * `Total Generation Tokens`, Total number of tokens generated + * `Total Context Tokens`, total number of tokens across requests in context phase + * `Empty Generation Slots`, total number of padded Slots during generation phase + ### GptManager Design diff --git a/docs/source/gpt_runtime.md b/docs/source/gpt_runtime.md index 997249c3fa..035ed77aec 100644 --- a/docs/source/gpt_runtime.md +++ b/docs/source/gpt_runtime.md @@ -266,7 +266,7 @@ second one contains `[9, 2]` and the third one is composed of tokens `[6, 2, 4, 1]`. In total, there are 9 tokens. That's the length. The shape of the tensor is `[2, 9]`. The first row of the tensor must contain the 9 token IDs and the second row must store the -[exclusive prefix-sum](https://en.wikipedia.org/wiki/Prefix_sum) +[inclusive prefix-sum](https://en.wikipedia.org/wiki/Prefix_sum) of the word lengths as shown on the following diagram: ``` @@ -274,7 +274,7 @@ of the word lengths as shown on the following diagram: | | | | V V V V [ 5, 7, 3, 9, 2, 6, 2, 4, 1] -[ 0, 3, 5, 9, -1, -1, -1, -1, -1] +[ 3, 5, 9, -1, -1, -1, -1, -1, -1] ``` In case all the words are made of a single token, the inner-most dimension of diff --git a/docs/source/precision.md b/docs/source/precision.md index 5877e78076..86d193fe10 100644 --- a/docs/source/precision.md +++ b/docs/source/precision.md @@ -114,23 +114,26 @@ GPT-J and LLaMA. Those examples can be found in This release of TensorRT-LLM contains the following examples: -| Model | FP32 | FP16 | BF16 | FP8 | W8A8 SQ | W8A16 | W4A16 | W4A16 AWQ | W4A16 GPTQ | -| :-------------------------- | :--: | :--: | :--: | :--: | :-----: | :---: | :---: | :-------: | :--------: | -| Baichuan | Y | Y | Y | . | . | Y | Y | . | . 
| -| BERT | Y | Y | Y | . | . | . | . | . | . | -| BLOOM | Y | Y | Y | . | Y | Y | Y | . | . | -| ChatGLM | Y | Y | Y | . | . | . | . | . | . | -| ChatGLM-v2 | Y | Y | Y | . | . | . | . | . | . | -| Falcon | Y | Y | Y | . | . | . | . | . | . | -| GPT | Y | Y | Y | Y | Y | Y | Y | . | . | -| GPT-J | Y | Y | Y | Y | Y | Y | Y | Y | . | -| GPT-NeMo | Y | Y | Y | . | . | . | . | . | . | -| GPT-NeoX | Y | Y | Y | . | . | . | . | . | Y | -| LLaMA | Y | Y | Y | . | Y | Y | Y | Y | Y | -| LLaMA-v2 | Y | Y | Y | Y | Y | Y | Y | Y | Y | -| OPT | Y | Y | Y | . | . | . | . | . | . | -| SantaCoder | Y | Y | Y | . | . | . | . | . | . | -| StarCoder | Y | Y | Y | . | . | . | . | . | . | +| Model | FP32 | FP16 | BF16 | FP8 | W8A8 SQ | W8A16 | W4A16 | W4A16 AWQ | W4A16 GPTQ | +| :--------- | :---: | :---: | :---: | :---: | :-----: | :---: | :---: | :-------: | :--------: | +| Baichuan | Y | Y | Y | . | Y | Y | Y | . | . | +| BERT | Y | Y | Y | . | . | . | . | . | . | +| BLOOM | Y | Y | Y | . | Y | Y | Y | . | . | +| ChatGLM | Y | Y | Y | . | . | . | . | . | . | +| ChatGLM-v2 | Y | Y | Y | . | . | . | . | . | . | +| ChatGLM-v3 | Y | Y | Y | . | . | . | . | . | . | +| Falcon | Y | Y | Y | . | . | . | . | . | . | +| GPT | Y | Y | Y | Y | Y | Y | Y | . | . | +| GPT-J | Y | Y | Y | Y | Y | Y | Y | Y | . | +| GPT-NeMo | Y | Y | Y | . | . | . | . | . | . | +| GPT-NeoX | Y | Y | Y | . | . | . | . | . | Y | +| LLaMA | Y | Y | Y | . | Y | Y | Y | Y | Y | +| LLaMA-v2 | Y | Y | Y | Y | Y | Y | Y | Y | Y | +| OPT | Y | Y | Y | . | . | . | . | . | . | +| SantaCoder | Y | Y | Y | . | . | . | . | . | . | +| StarCoder | Y | Y | Y | . | . | . | . | . | . | +| InternLM | Y | Y | Y | . | Y | Y | Y | . | . | + ## Technical Detail: The `QuantMode` Flags diff --git a/examples/baichuan/README.md b/examples/baichuan/README.md index 82e0b87692..241bb540a3 100644 --- a/examples/baichuan/README.md +++ b/examples/baichuan/README.md @@ -16,6 +16,8 @@ These scripts accept an argument named model_version, whose value should be `v1_ * FP16 * BF16 * INT4 & INT8 Weight-Only + * INT8 KV CACHE + * INT8 Smooth Quant ## Usage @@ -82,6 +84,74 @@ python build.py --model_version v1_13b \ --world_size 2 ``` +#### INT8 weight only + INT8 KV cache +For INT8 KV cache, [`hf_baichuan_convert.py`](./hf_baichuan_convert.py) features a +`--calibrate-kv-cache, -kv` option. Setting `-kv` will calibrate the model, +and then export the scaling factors needed for INT8 KV cache inference. + + +Example: + +```bash +python3 hf_baichuan_convert.py -i baichuan-inc/Baichuan-13B-Chat -o ./tmp/baichuan_v1_13b/int8_kv_cache/ --calibrate-kv-cache -t fp16 +``` + +[`build.py`](./build.py) add new options for the support of INT8 KV cache. + +`--int8_kv_cache` is the command-line option to enable INT8 KV cache. + +In addition, it could be combined with INT8 weight-only quantization, as follows: + +Examples of INT8 weight-only quantization + INT8 KV cache + +```bash +# Build model with both INT8 weight-only and INT8 KV cache enabled +python build.py --model_version v1_13b \ + --bin_model_dir=./tmp/baichuan_v1_13b/int8_kv_cache/1-gpu/ \ + --dtype float16 \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --output_dir ./tmp/baichuan_v1_13b/trt_engines/int8_kv_cache_weight_only/1-gpu \ + --int8_kv_cache \ + --use_weight_only +``` + +#### SmoothQuant + +The SmoothQuant supports all Baichuan model variants. 
Unlike the FP16 build where the HF weights are processed and loaded into the TensorRT-LLM directly, the SmoothQuant needs to load INT8 weights which should be pre-processed before building an engine. + +Example: +```bash +python3 hf_baichuan_convert.py -i baichuan-inc/Baichuan-13B-Chat -o ./tmp/baichuan_v1_13b/sq0.8/ -sq 0.8 --tensor-parallelism 1 --storage-type fp16 +``` + +[`build.py`](./build.py) add new options for the support of INT8 inference of SmoothQuant models. + +`--use_smooth_quant` is the starting point of INT8 inference. By default, it +will run the model in the _per-tensor_ mode. + +Then, you can add any combination of `--per-token` and `--per-channel` to get the corresponding behaviors. + +Examples of build invocations: + +```bash +# Build model for SmoothQuant in the _per_tensor_ mode. +python3 build.py --model_version v1_13b \ + --bin_model_dir=./tmp/baichuan_v1_13b/sq0.8/1-gpu/ \ + --use_smooth_quant \ + --use_gpt_attention_plugin float16 \ + +# Build model for SmoothQuant in the _per_token_ + _per_channel_ mode +python3 build.py --model_version v1_13b \ + --bin_model_dir=./tmp/baichuan_v1_13b/sq0.8/1-gpu/ \ + --use_smooth_quant \ + --use_gpt_attention_plugin float16 \ + --per_token \ + --per_channel +``` + +Note we use `--bin_model_dir` instead of `--model_dir` and `--meta_ckpt_dir` since SmoothQuant model needs INT8 weights and various scales from the binary files. + ### Run To run a TensorRT-LLM Baichuan model using the engines generated by build.py diff --git a/examples/baichuan/build.py b/examples/baichuan/build.py index 78e9eca2de..45c7f483fe 100644 --- a/examples/baichuan/build.py +++ b/examples/baichuan/build.py @@ -15,6 +15,7 @@ import argparse import os import time +from pathlib import Path import onnx import tensorrt as trt @@ -29,12 +30,12 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers.attention import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import BaichuanForCausalLM, weight_only_quantize +from tensorrt_llm.models import BaichuanForCausalLM, quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -from weight import load_from_hf_baichuan # isort:skip +from weight import load_from_hf_baichuan, load_from_binary, parse_bin_config # isort:skip # 2 routines: get_engine_name, serialize_engine # are direct copy from gpt example, TODO: put in utils? @@ -115,9 +116,8 @@ def parse_arguments(): type=int, default=1, help='world size, only support tensor parallelism now') - parser.add_argument('--model_dir', - type=str, - default='baichuan-inc/Baichuan-13B-Chat') + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--bin_model_dir', type=str, default=None) parser.add_argument('--model_version', type=str, default='v1_13b', @@ -180,6 +180,38 @@ def parse_arguments(): default=False, action='store_true') + # Arguments related to the quantization of the model. + parser.add_argument( + '--use_smooth_quant', + default=False, + action="store_true", + help= + 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' + 'See --per_channel and --per_token for finer-grained quantization options.' + ) + parser.add_argument( + '--per_channel', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. 
' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) parser.add_argument( '--use_weight_only', default=False, @@ -222,11 +254,15 @@ def parse_arguments(): args = parser.parse_args() - if args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') - else: - args.quant_mode = QuantMode(0) + assert not ( + args.use_smooth_quant and args.use_weight_only + ), "You cannot enable both SmoothQuant and INT8 weight-only together." + + if not args.remove_input_padding: + if args.use_gpt_attention_plugin: + logger.warning( + f"It is recommended to specify --remove_input_padding when using GPT attention plugin" + ) if args.use_inflight_batching: if not args.use_gpt_attention_plugin: @@ -245,6 +281,18 @@ def parse_arguments(): if args.max_num_tokens is not None: assert args.enable_context_fmha + if args.use_smooth_quant: + args.quant_mode = QuantMode.use_smooth_quant(args.per_token, + args.per_channel) + elif args.use_weight_only: + args.quant_mode = QuantMode.use_weight_only( + args.weight_only_precision == 'int4') + else: + args.quant_mode = QuantMode(0) + + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + if args.model_dir is not None: hf_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True) @@ -259,6 +307,16 @@ def parse_arguments(): args.n_positions = hf_config.model_max_length args.vocab_size = hf_config.vocab_size args.hidden_act = hf_config.hidden_act + elif args.bin_model_dir is not None: + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, _ = parse_bin_config( + Path(args.bin_model_dir) / "config.ini") + args.inter_size = inter_size + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size + args.hidden_act = hidden_act else: # default values are based on v1_13b, change them based on model_version if args.model_version == 'v1_7b': @@ -286,9 +344,6 @@ def parse_arguments(): args.vocab_size = 125696 args.hidden_act = 'silu' - if args.dtype == 'bfloat16': - assert args.use_gemm_plugin, "Please use gemm plugin when dtype is bfloat16" - return args @@ -301,7 +356,10 @@ def build_rank_engine(builder: Builder, @param args: The cmd line arguments. @return: The built engine. 
''' - kv_dtype = str_dtype_to_trt(args.dtype) + dtype = str_dtype_to_trt(args.dtype) + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.world_size) if args.model_version == 'v1_7b' or args.model_version == 'v2_7b': position_embedding_type = PositionEmbeddingType.rope_gpt_neox else: @@ -311,23 +369,19 @@ def build_rank_engine(builder: Builder, tensorrt_llm_baichuan = BaichuanForCausalLM( num_layers=args.n_layer, num_heads=args.n_head, + num_kv_heads=None, hidden_size=args.n_embd, vocab_size=args.vocab_size, hidden_act=args.hidden_act, max_position_embeddings=args.n_positions, position_embedding_type=position_embedding_type, - dtype=kv_dtype, + dtype=dtype, mlp_hidden_size=args.inter_size, - mapping=Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.world_size)) - if args.use_weight_only and args.weight_only_precision == 'int8': - tensorrt_llm_baichuan = weight_only_quantize( - tensorrt_llm_baichuan, QuantMode.use_weight_only()) - elif args.use_weight_only and args.weight_only_precision == 'int4': - tensorrt_llm_baichuan = weight_only_quantize( - tensorrt_llm_baichuan, - QuantMode.use_weight_only(use_int4_weights=True)) + mapping=mapping, + quant_mode=args.quant_mode) + if args.use_smooth_quant or args.use_weight_only: + tensorrt_llm_baichuan = quantize_model(tensorrt_llm_baichuan, + args.quant_mode) if args.model_dir is not None: logger.info( f'Loading HF Baichuan {args.model_version} ... from {args.model_dir}' @@ -351,6 +405,12 @@ def build_rank_engine(builder: Builder, args.world_size, dtype=args.dtype) del hf_baichuan + elif args.bin_model_dir is not None: + load_from_binary(tensorrt_llm_baichuan, + args.bin_model_dir, + mapping, + fp16=(args.dtype == 'float16'), + multi_query_mode=False) # Module -> Network network = builder.create_network() @@ -360,6 +420,12 @@ def build_rank_engine(builder: Builder, dtype=args.use_gpt_attention_plugin) if args.use_gemm_plugin: network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + # Quantization plugins. 
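+    # SmoothQuant runs these GEMMs in INT8, so the matching plugins are enabled
+    # here: the SmoothQuant GEMM plugin, the RMSNorm quantization plugin, and
+    # the quantize-tensor / quantize-per-token plugins that produce the INT8
+    # activations those GEMMs consume.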
+ if args.use_smooth_quant: + network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) + network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) + network.plugin_config.set_quantize_tensor_plugin() + network.plugin_config.set_quantize_per_token_plugin() assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) if args.enable_context_fmha: network.plugin_config.set_context_fmha(ContextFMHAType.enabled) @@ -393,7 +459,7 @@ def build_rank_engine(builder: Builder, v = v.trt_tensor v.name = k network.trt_network.mark_output(v) - v.dtype = kv_dtype + v.dtype = dtype if args.visualize: model_path = os.path.join(args.output_dir, 'test.onnx') to_onnx(network.trt_network, model_path) @@ -407,6 +473,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_baichuan) + return engine @@ -425,6 +494,9 @@ def build(rank, args): # skip other ranks if parallel_build is enabled if args.parallel_build and cur_rank != rank: continue + # NOTE(nkorobov): when only int8 kv cache is used together with paged kv cache no int8 tensors are exposed to TRT + int8_trt_flag = args.quant_mode.has_act_or_weight_quant() or ( + not args.paged_kv_cache and args.quant_mode.has_int8_kv_cache()) builder_config = builder.create_builder_config( name=model_name, precision=args.dtype, @@ -441,7 +513,8 @@ def build(rank, args): max_input_len=args.max_input_len, max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, - int8=args.quant_mode.has_act_or_weight_quant()) + int8=int8_trt_flag, + quant_mode=args.quant_mode) engine_name = get_engine_name(model_name, args.dtype, args.world_size, cur_rank) engine = build_rank_engine(builder, builder_config, engine_name, @@ -454,6 +527,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/baichuan/convert.py b/examples/baichuan/convert.py new file mode 100644 index 0000000000..11dd73d3ca --- /dev/null +++ b/examples/baichuan/convert.py @@ -0,0 +1,295 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Utilities for exporting a model to our custom format. 
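+
+    The exported checkpoint is a set of per-rank binary weight files plus the
+    per-tensor / per-column INT8 scaling factors used for SmoothQuant and INT8
+    KV cache inference; weight.py's load_from_binary reads them back when an
+    engine is built with --bin_model_dir.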
+""" +import numpy as np +import torch + + +def save_val(val, dir, key, tp_num=None): + suffix = "bin" if tp_num is None else f"{tp_num}.bin" + val.tofile(dir / f"model.{key}.{suffix}") + + +def save_split(split_vals, dir, key, i, factor): + for j, val in enumerate(split_vals): + save_val(val, dir, key, i * factor + j) + + +def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): + """ + This function has two purposes: + - compute quantized weights, scaled either per-tensor or per-column + - compute scaling factors + + Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ. + CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W. + CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor. + + Here is the list of what we need (T means per-tensor, C per-column): + - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T) + - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T) + - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C) + - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32) + to quant range (int8) (used for CUBLAS) (T, C) + + Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too, + but then the model would change depending on the number of GPUs used. + + For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it + as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V. + For our GEMM implementation to respect this behavior, we use per-column mode and replicate values along columns. + """ + + # compute weight scaling factors for fp->int8 and int8->fp + if is_qkv and not multi_query_mode: + scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max( + dim=-1, keepdims=True)[0].cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].reshape(3, + -1).cpu().numpy() + elif is_qkv and multi_query_mode: + hidden_dim = weights.shape[0] + local_dim = act_range["w"].shape[0] + kv_dim = (local_dim - hidden_dim) // 2 + scale_w_q = act_range["w"][0:hidden_dim] + scale_w_k = act_range["w"][hidden_dim:hidden_dim + kv_dim] + scale_w_v = act_range["w"][-kv_dim:] + + scale_w_qkv_t = torch.concat([ + scale_w_q.max(dim=0, keepdim=True)[0], + scale_w_k.max(dim=0, keepdim=True)[0], + scale_w_v.max(dim=0, keepdim=True)[0] + ]) + + scale_w_orig_quant_t = 127. / scale_w_qkv_t.cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + else: + scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t + scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c + + # compute the rest of needed scaling factors + scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item()) + scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item()) + scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.) 
+ scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_t) + scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_c) + if is_qkv and not multi_query_mode: + scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, + scale_w_orig_quant_c.shape) + scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, + scale_w_orig_quant_c.shape) + if is_qkv and multi_query_mode: + scale_q_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[0], + scale_w_q.shape) + scale_k_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[1], + scale_w_k.shape) + scale_v_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[2], + scale_w_v.shape) + scale_y_accum_quant_t = np.concatenate( + [scale_q_y_accum_t, scale_k_y_accum_t, scale_v_y_accum_t]) + scale_w_quant_orig_t = np.concatenate([ + np.broadcast_to(scale_w_quant_orig_t[0], scale_w_q.shape), + np.broadcast_to(scale_w_quant_orig_t[1], scale_w_k.shape), + np.broadcast_to(scale_w_quant_orig_t[2], scale_w_v.shape) + ]) + + to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8) + + if is_qkv and multi_query_mode: + scale_w_quant_orig_t_expand = np.ones([weights.shape[-1]]) + scale_w_quant_orig_t_expand[:hidden_dim] = scale_w_quant_orig_t[0] + scale_w_quant_orig_t_expand[hidden_dim:hidden_dim + + kv_dim] = scale_w_quant_orig_t[1] + scale_w_quant_orig_t_expand[-kv_dim:] = scale_w_quant_orig_t[2] + weight_int8 = to_i8(weights * scale_w_quant_orig_t_expand) + else: + weight_int8 = to_i8(weights * scale_w_orig_quant_t) + return { + "weight.int8": weight_int8, + "weight.int8.col": to_i8(weights * scale_w_orig_quant_c), + "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32), + "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32), + "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32), + "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32), + "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32), + "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32), + } + + +def save_multi_query_mode_qkv_int8(val, dir, base_key, saved_key, factor, rank, + local_dim, head_size): + q, k, v = np.split(val, [local_dim, local_dim + head_size], axis=-1) + q_split = np.split(q, factor, axis=-1) + k_split = np.split(k, factor, axis=-1) + v_split = np.split(v, factor, axis=-1) + split_vals = [ + np.concatenate((q_split[ii], k_split[ii], v_split[ii]), axis=-1) + for ii in range(factor) + ] + save_split(split_vals, dir, f"{base_key}.{saved_key}", rank, factor) + + +def write_int8(vals, + dir, + base_key, + split_dim, + i, + factor, + is_qkv=False, + multi_query_mode=False): + saved_keys_once = [ + "scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant", + "scale_y_quant_orig" + ] + + if is_qkv and multi_query_mode: + assert split_dim == -1 + local_dim = vals["weight.int8"].shape[0] + head_size = (vals["weight.int8"].shape[1] - local_dim) // 2 + + save_multi_query_mode_qkv_int8(vals["weight.int8"], dir, base_key, + "weight.int8", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["weight.int8.col"], dir, base_key, + "weight.int8.col", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["scale_w_quant_orig.col"], dir, + base_key, "scale_w_quant_orig.col", + factor, i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_y_accum_quant.col"], dir, + base_key, "scale_y_accum_quant.col", + factor, i, local_dim, head_size) + else: + save_split(np.split(vals["weight.int8"], factor, 
axis=split_dim), dir, + f"{base_key}.weight.int8", i, factor) + save_split(np.split(vals["weight.int8.col"], factor, axis=split_dim), + dir, f"{base_key}.weight.int8.col", i, factor) + + if split_dim == -1: + save_split( + np.split(vals["scale_w_quant_orig.col"], factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig.col", i, factor) + save_split( + np.split(vals["scale_y_accum_quant.col"], + factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant.col", i, factor) + else: + saved_keys_once += [ + "scale_w_quant_orig.col", "scale_y_accum_quant.col" + ] + + if i == 0: + for save_key in saved_keys_once: + save_val(vals[save_key], dir, f"{base_key}.{save_key}") + + +def str_to_np_dtype(type_str): + convert_dict = { + "fp32": np.float32, + "fp16": np.float16, + } + dtype = convert_dict.get(type_str) + if dtype is None: + raise ValueError(f"{type_str} is an invalid storage type") + return dtype + + +def split_and_save_weight(i, saved_dir, factor, key, val, act_range, config): + # The split_factor indicates the number of ranks to implement + # distributed GEMMs. For Tensor Parallelism, each rank/GPU works + # on split_hidden_dim // split_factor channels. + + int8_outputs = config.get("int8_outputs", None) + multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) + + save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + + if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \ + "attention.dense.bias" in key or "post_layernorm.weight" in key or \ + "post_attention_layernorm.bias" in key or "mlp.dense_4h_to_h.bias" in key or \ + "final_layernorm.weight" in key or "final_layernorm.bias" in key: + + # shared weights, only need to convert the weights of rank 0 + if i == 0: + save_val(val, saved_dir, key) + + elif "attention.dense.weight" in key or "mlp.proj.weight" in key: + split_dim = 0 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "mlp.fc.weight" in key or "mlp.gate.weight" in key: + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "attention.query_key_value.weight" in key: + hidden_dim = val.shape[0] + if local_dim is None: + local_dim = val.shape[-1] // 3 + if multi_query_mode: + head_size = (val.shape[-1] - local_dim) // 2 + val = val.reshape(hidden_dim, local_dim + 2 * head_size) + w_q, w_k, w_v = np.split(val, [local_dim, local_dim + head_size], + axis=-1) + w_q_split = np.split(w_q, factor, axis=-1) + w_k_split = np.split(w_k, factor, axis=-1) + w_v_split = np.split(w_v, factor, axis=-1) + split_vals = [ + np.concatenate((w_q_split[ii], w_k_split[ii], w_v_split[ii]), + axis=-1) for ii in range(factor) + ] + split_dim = -1 + else: + val = val.reshape(hidden_dim, 3, local_dim) + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if save_int8: + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + is_qkv=True, + multi_query_mode=multi_query_mode) + 
write_int8(vals_i8, + saved_dir, + base_key, + split_dim, + i, + factor, + is_qkv=True, + multi_query_mode=multi_query_mode) + elif "attention.dense.smoother" in key or "mlp.proj.smoother" in key: + split_vals = np.split(val, factor, axis=0) + save_split(split_vals, saved_dir, key, i, factor) + + else: + print(f"[WARNING] {key} not handled by converter") diff --git a/examples/baichuan/hf_baichuan_convert.py b/examples/baichuan/hf_baichuan_convert.py new file mode 100644 index 0000000000..73c53071f6 --- /dev/null +++ b/examples/baichuan/hf_baichuan_convert.py @@ -0,0 +1,291 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Convert Baichuan models. Use https://huggingface.co/baichuan-inc/Baichuan2-7B-Chat as demo. +''' +import argparse +import configparser +import os +from pathlib import Path + +import torch +import torch.multiprocessing as multiprocessing +from convert import split_and_save_weight, str_to_np_dtype +from smoothquant import (capture_activation_range, smooth_gemm, + smooth_gemm_fc1_gate) +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + + +@torch.no_grad() +def smooth_baichuan_model(model, scales, alpha, baichuan_smoother): + # Smooth the activation and weights with smoother = $\diag{s}$ + for name, module in model.named_modules(): + class_name = module.__class__.__name__ + if not 'Layer' in class_name: + continue + print(f'smoothing module: {name}, class_name: {class_name}') + # qkv_proj + layer_name_qkv = name + ".self_attn.W_pack" + + smoother = smooth_gemm(module.self_attn.W_pack.weight, + scales[layer_name_qkv]["x"], + module.input_layernorm.weight, None, alpha) + + scales[layer_name_qkv]["x"] = scales[layer_name_qkv]["x"] / smoother + scales[layer_name_qkv]["w"] = module.self_attn.W_pack.weight.abs().max( + dim=1)[0] + + # ================================================================= + layer_name = name + ".self_attn.o_proj" + smoother = smooth_gemm(module.self_attn.o_proj.weight, + scales[layer_name]["x"], None, None, alpha) + baichuan_smoother[layer_name] = smoother.float() + + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.self_attn.o_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + fc1_layer_name = name + ".mlp.gate_proj" + gate_layer_name = name + ".mlp.up_proj" + + smoother = smooth_gemm_fc1_gate(module.mlp.gate_proj.weight, + module.mlp.up_proj.weight, + scales[fc1_layer_name]["x"], + module.post_attention_layernorm.weight, + None, alpha) + + scales[fc1_layer_name]["x"] = scales[fc1_layer_name]["x"] / smoother + scales[fc1_layer_name]["w"] = module.mlp.gate_proj.weight.abs().max( + dim=1)[0] + + scales[gate_layer_name]["x"] = scales[gate_layer_name]["x"] / smoother + scales[gate_layer_name]["w"] = module.mlp.up_proj.weight.abs().max( + dim=1)[0] + + # 
================================================================== + layer_name = name + ".mlp.down_proj" + smoother = smooth_gemm(module.mlp.down_proj.weight, + scales[layer_name]["x"], None, None, alpha) + baichuan_smoother[layer_name] = smoother.float() + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.mlp.down_proj.weight.abs().max( + dim=1)[0] + + +def baichuan_to_bin_name(orig_name): + global_bin_weights = { + "model.embed_tokens.weight": 'vocab_embedding.weight', + "model.norm.weight": 'ln_f.weight', + "lm_head.weight": 'lm_head.weight', + } + + if orig_name in global_bin_weights: + return global_bin_weights[orig_name] + + _, _, layer_id, *weight_name = orig_name.split(".") + + layer_id = int(layer_id) + weight_name = ".".join(weight_name) + + per_layer_weights = { + "input_layernorm.weight": "input_layernorm.weight", + "self_attn.W_pack.weight": "attention.query_key_value.weight", + "self_attn.o_proj.weight": "attention.dense.weight", + "mlp.gate_proj.weight": "mlp.fc.weight", + "mlp.down_proj.weight": "mlp.proj.weight", + "mlp.up_proj.weight": "mlp.gate.weight", + "post_attention_layernorm.weight": "post_layernorm.weight", + } + + return f"layers.{layer_id}.{per_layer_weights[weight_name]}" + + +# Baichuan uses nn.Linear for these following ops whose weight matrix is transposed compared to gpt2. +# In order to use the preprocess codes of gpt2, we transpose them firstly. +def transpose_weights(hf_name, param): + weight_to_transpose = [ + "W_pack", "o_proj", "gate_proj", "down_proj", "up_proj" + ] + if any([k in hf_name for k in weight_to_transpose]): + if len(param.shape) == 2: + param = param.transpose(0, 1) + return param + + +def hf_baichuan_converter(args): + infer_tp = args.tensor_parallelism + saved_dir = Path(args.out_dir) / f"{infer_tp}-gpu" + saved_dir.mkdir(parents=True, exist_ok=True) + + model = AutoModelForCausalLM.from_pretrained(args.in_file, + device_map="auto", + trust_remote_code=True) + + act_range = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + baichuan_smoother = {} + + if args.smoothquant is not None or args.calibrate_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + act_range = capture_activation_range( + model, + AutoTokenizer.from_pretrained(args.in_file, + use_fast=False, + trust_remote_code=True)) + if args.smoothquant is not None: + smooth_baichuan_model(model, act_range, args.smoothquant, + baichuan_smoother) + + config = configparser.ConfigParser() + config["baichuan"] = {} + for key in vars(args): + config["baichuan"][key] = f"{vars(args)[key]}" + for k, v in vars(model.config).items(): + config["baichuan"][k] = f"{v}" + config["baichuan"]["weight_data_type"] = args.storage_type + config["baichuan"]["multi_query_mode"] = str(False) + with open(saved_dir / "config.ini", 'w') as configfile: + config.write(configfile) + + storage_type = str_to_np_dtype(args.storage_type) + + global_bin_weights = [ + 'vocab_embedding.weight', 'ln_f.weight', 'lm_head.weight' + ] + + int8_outputs = None + if args.calibrate_kv_cache: + int8_outputs = "kv_cache_only" + if args.smoothquant is not None: + int8_outputs = "all" + + starmap_args = [] + for name, param in model.named_parameters(): + if "weight" not in name and "bias" not in name: + continue + bin_name = baichuan_to_bin_name(name) + + if name.replace(".weight", "") in baichuan_smoother.keys(): + smoother = baichuan_smoother[name.replace(".weight", "")] + smoother = 
smoother.detach().cpu().numpy() + starmap_args.append( + (0, saved_dir, infer_tp, + f"{bin_name}.smoother".replace(".weight", + ""), smoother, None, { + "int8_outputs": + int8_outputs, + "multi_query_mode": False, + "local_dim": None, + })) + + param = transpose_weights(name, param) + + param = param.detach().cpu().numpy().astype(storage_type) + + if bin_name in global_bin_weights: + param.tofile(saved_dir / f"{bin_name}.bin") + elif bin_name.split('.')[-2] == 'query_key_value': + local_dim = None + layer_name_qkv = name.replace(".weight", "") + # Baichuan models use W_pack to transform qkv + # So we can simply use param as qkv weight here + qkv = (0, saved_dir, infer_tp, bin_name, param, + act_range.get(layer_name_qkv), { + "int8_outputs": int8_outputs, + "multi_query_mode": False, + "local_dim": local_dim, + }) + starmap_args.append(qkv) + elif bin_name.split('.')[-2] == 'kv': + continue + else: + starmap_args.append((0, saved_dir, infer_tp, bin_name, param, + act_range.get(name.replace(".weight", "")), { + "int8_outputs": int8_outputs, + "multi_query_mode": False, + "local_dim": None, + })) + + starmap_args = tqdm(starmap_args, desc="saving weights") + if args.processes > 1: + with multiprocessing.Pool(args.processes) as pool: + pool.starmap(split_and_save_weight, starmap_args) + else: + # simpler for debug situations + for starmap_arg in starmap_args: + split_and_save_weight(*starmap_arg) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--out-dir', + '-o', + type=str, + help='file name of output directory', + required=True) + parser.add_argument('--in-file', + '-i', + type=str, + help='file name of input checkpoint file', + required=True) + parser.add_argument('--tensor-parallelism', + '-tp', + type=int, + help='Requested tensor parallelism for inference', + default=1) + parser.add_argument( + "--processes", + "-p", + type=int, + help="How many processes to spawn for conversion (default: 4)", + default=4) + parser.add_argument( + "--calibrate-kv-cache", + "-kv", + action="store_true", + help= + "Generate scaling factors for KV cache. Used for storing KV cache in int8." + ) + parser.add_argument( + "--smoothquant", + "-sq", + type=float, + default=None, + help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)" + " to Smoothquant the model, and output int8 weights." + " A good first try is 0.5. Must be in [0, 1]") + parser.add_argument("--storage-type", + "-t", + type=str, + default="fp32", + choices=["fp32", "fp16"]) + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print("{}: {}".format(key, vars(args)[key])) + print("========================================") + + assert (args.calibrate_kv_cache or args.smoothquant), \ + ("Either INT8 kv cache or SmoothQuant must be enabled for this script. " + "Otherwise you can directly build engines from HuggingFace checkpoints," + " no need to do this bin format conversion. 
") + hf_baichuan_converter(args) diff --git a/examples/baichuan/run.py b/examples/baichuan/run.py index 18d2a5bbf7..6c05bd11ca 100644 --- a/examples/baichuan/run.py +++ b/examples/baichuan/run.py @@ -23,6 +23,7 @@ import torch from transformers import AutoTokenizer import tensorrt_llm +from tensorrt_llm.quantization import QuantMode from tensorrt_llm.runtime import ModelConfig, SamplingConfig from build import get_engine_name # isort:skip @@ -31,6 +32,75 @@ EOS_TOKEN = 2 PAD_TOKEN = 0 +def read_config(config_path: Path): + with open(config_path, 'r') as f: + config = json.load(f) + use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] + remove_input_padding = config['plugin_config']['remove_input_padding'] + dtype = config['builder_config']['precision'] + world_size = config['builder_config']['tensor_parallel'] + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + num_heads = config['builder_config']['num_heads'] // world_size + hidden_size = config['builder_config']['hidden_size'] // world_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + quant_mode = QuantMode(config['builder_config']['quant_mode']) + + model_config = ModelConfig(num_heads=num_heads, + num_kv_heads=num_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode) + + return model_config, world_size, dtype + + +def parse_input(input_text: str, input_file: str, tokenizer, end_id: int, + remove_input_padding: bool): + input_tokens = [] + if input_file is None: + input_tokens.append( + tokenizer.encode(input_text, add_special_tokens=False)) + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_tokens.append(np.array(line, dtype='int32')) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + row = row[row != end_id] + input_tokens.append(row) + else: + print('Input file format not supported.') + raise SystemExit + + input_ids = None + input_lengths = torch.tensor([len(x) for x in input_tokens], + dtype=torch.int32, + device='cuda') + if remove_input_padding: + input_ids = np.concatenate(input_tokens) + input_ids = torch.tensor(input_ids, dtype=torch.int32, + device='cuda').unsqueeze(0) + else: + input_ids = torch.nested.to_padded_tensor( + torch.nested.nested_tensor(input_tokens, dtype=torch.int32), + end_id).cuda() + + return input_ids, input_lengths + + def parse_arguments(): parser = argparse.ArgumentParser() parser.add_argument('--max_output_len', type=int, required=True) @@ -67,6 +137,38 @@ def parse_arguments(): return parser.parse_args() +def print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy): + num_beams = output_ids.size(1) + if output_csv is None and output_npy is None: + for b in range(input_lengths.size(0)): + inputs = output_ids[b][0][:input_lengths[b]].tolist() + input_text = tokenizer.decode(inputs) + print(f'Input: \"{input_text}\"') + for beam in range(num_beams): + output_begin = 
input_lengths[b] + output_end = input_lengths[b] + max_output_len + outputs = output_ids[b][beam][output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print(f'Output: \"{output_text}\"') + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) + + def generate( max_output_len: int, log_level: str = 'error', @@ -81,21 +183,9 @@ def generate( ): tensorrt_llm.logger.set_level(log_level) - config_path = os.path.join(engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] - remove_input_padding = config['plugin_config']['remove_input_padding'] - paged_kv_cache = config['plugin_config']['paged_kv_cache'] - tokens_per_block = config['plugin_config']['tokens_per_block'] - dtype = config['builder_config']['precision'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // world_size - hidden_size = config['builder_config']['hidden_size'] // world_size - vocab_size = config['builder_config']['vocab_size'] - num_layers = config['builder_config']['num_layers'] + engine_dir = Path(engine_dir) + config_path = engine_dir / 'config.json' + model_config, world_size, dtype = read_config(config_path) runtime_rank = tensorrt_llm.mpi_rank() runtime_mapping = tensorrt_llm.Mapping(world_size, @@ -107,17 +197,6 @@ def generate( use_fast=False, trust_remote_code=True) - model_config = ModelConfig(num_heads=num_heads, - num_kv_heads=num_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - remove_input_padding=remove_input_padding, - dtype=dtype) - repetition_penalty = 1.1 temperature = 0.3 top_k = 5 @@ -144,45 +223,9 @@ def generate( engine_buffer, runtime_mapping) - input_tokens = [] - if input_file is None: - input_tokens.append( - tokenizer.encode(input_text, add_special_tokens=False)) - else: - if input_file.endswith('.csv'): - with open(input_file, 'r') as csv_file: - csv_reader = csv.reader(csv_file, delimiter=',') - for line in csv_reader: - input_tokens.append(np.array(line, dtype='int32')) - elif input_file.endswith('.npy'): - inputs = np.load(input_file) - for row in inputs: - row = row[row != EOS_TOKEN] - input_tokens.append(row) - else: - print('Input file format not supported.') - raise SystemExit - - input_ids = None - input_lengths = None - if input_file is None: - input_ids = torch.tensor(input_tokens, dtype=torch.int32, device='cuda') - input_lengths = torch.tensor([input_ids.size(1)], - dtype=torch.int32, - device='cuda') - else: - input_lengths = torch.tensor([len(x) for x in input_tokens], - dtype=torch.int32, - device='cuda') - if remove_input_padding: - input_ids = np.concatenate(input_tokens) - input_ids = 
torch.tensor(input_ids, - dtype=torch.int32, - device='cuda').unsqueeze(0) - else: - input_ids = torch.nested.to_padded_tensor( - torch.nested.nested_tensor(input_tokens, dtype=torch.int32), - EOS_TOKEN).cuda() + input_ids, input_lengths = parse_input(input_text, input_file, tokenizer, + EOS_TOKEN, + model_config.remove_input_padding) max_input_length = torch.max(input_lengths).item() decoder.setup(input_lengths.size(0), @@ -194,41 +237,8 @@ def generate( torch.cuda.synchronize() if runtime_rank == 0: - if output_csv is None and output_npy is None: - for b in range(input_lengths.size(0)): - inputs = input_tokens[b] - input_text = tokenizer.decode(inputs) - print(f'Input: \"{input_text}\"') - if num_beams <= 1: - output_begin = max_input_length - outputs = output_ids[b][0][output_begin:].tolist() - output_text = tokenizer.decode(outputs) - print(f'Output: \"{output_text}\"') - else: - for beam in range(num_beams): - output_begin = input_lengths[b] - output_end = input_lengths[b] + max_output_len - outputs = output_ids[b][beam][ - output_begin:output_end].tolist() - output_text = tokenizer.decode(outputs) - print(f'Output: \"{output_text}\"') - - output_ids = output_ids.reshape((-1, output_ids.size(2))) - - if output_csv is not None: - output_file = Path(output_csv) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = output_ids.tolist() - with open(output_file, 'w') as csv_file: - writer = csv.writer(csv_file, delimiter=',') - writer.writerows(outputs) - - if output_npy is not None: - output_file = Path(output_npy) - output_file.parent.mkdir(exist_ok=True, parents=True) - outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') - np.save(output_file, outputs) - return + print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy) if __name__ == '__main__': diff --git a/examples/chatglm2-6b/smoothquant.py b/examples/baichuan/smoothquant.py similarity index 69% rename from examples/chatglm2-6b/smoothquant.py rename to examples/baichuan/smoothquant.py index 0c8dcaa5d4..4e4145cb4e 100644 --- a/examples/chatglm2-6b/smoothquant.py +++ b/examples/baichuan/smoothquant.py @@ -16,6 +16,7 @@ Utilities for SmoothQuant models ''' +import copy import functools from collections import defaultdict @@ -78,6 +79,45 @@ def smooth_gemm(gemm_weights, return scales +@torch.no_grad() +def smooth_gemm_fc1_gate(fc1_weights, + gate_weights, + act_scales, + layernorm_weights=None, + layernorm_bias=None, + alpha=0.5, + weight_scales=None): + gemm_weights = [] + if not isinstance(fc1_weights, list): + fc1_weights = [fc1_weights] + if not isinstance(gate_weights, list): + gate_weights = [gate_weights] + + for i in range(len(fc1_weights)): + gemm_weight = torch.cat([fc1_weights[i], gate_weights[i]], dim=0) + gemm_weights.append(gemm_weight) + + orig_dtype = gemm_weights[0].dtype + + for gemm in gemm_weights: + # gemm_weights are expected to be transposed + assert gemm.shape[1] == act_scales.numel() + + if weight_scales is None: + weight_scales = torch.cat( + [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights], + dim=0) + weight_scales = weight_scales.max(dim=0)[0] + weight_scales.to(float).clamp(min=1e-5) + scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5) + + apply_smoothing(scales, fc1_weights + gate_weights, layernorm_weights, + layernorm_bias, orig_dtype) + + return scales + + @torch.no_grad() def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): if not isinstance(fcs, list): @@ 
-107,9 +147,12 @@ def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): @torch.no_grad() def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): model.eval() - device = next(model.parameters()).device + next(model.parameters()).device act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + test_token_num = 923 + tokenizer.pad_token = tokenizer.eos_token + def stat_tensor(name, tensor, act_scales, key): hidden_dim = tensor.shape[-1] tensor = tensor.view(-1, hidden_dim).abs().detach() @@ -129,7 +172,7 @@ def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): if act_scales[name]["w"] is None: act_scales[name]["w"] = m.weight.abs().clip(1e-8, - None).max(dim=0)[0] + None).max(dim=1)[0] hooks = [] for name, m in model.named_modules(): @@ -139,14 +182,21 @@ def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): functools.partial(stat_input_hook, name=name))) from datasets import load_dataset - dataset = load_dataset("lambada", split="validation") + dataset_cnn = load_dataset("ccdv/cnn_dailymail", '3.0.0') for i in tqdm(range(num_samples), desc="calibrating model"): - input_ids = tokenizer(dataset[i]["text"], - return_tensors="pt", - max_length=seq_len, - truncation=True).input_ids.to(device) - model(input_ids) + datapoint = dataset_cnn['train'][i:i + 1] + line = copy.copy(datapoint['article']) + line[0] = line[0] + ' TL;DR: ' + line[0] = line[0].strip() + line[0] = line[0].replace(" n't", "n't") + line_encoded = tokenizer(line, + return_tensors="pt", + padding=True, + truncation=True)["input_ids"].type(torch.int64) + line_encoded = line_encoded[:, -test_token_num:] + line_encoded = line_encoded.cuda() + model(line_encoded) for h in hooks: h.remove() diff --git a/examples/baichuan/summarize.py b/examples/baichuan/summarize.py index 27201ac92b..b825123ef3 100644 --- a/examples/baichuan/summarize.py +++ b/examples/baichuan/summarize.py @@ -25,6 +25,7 @@ from transformers import AutoModelForCausalLM, AutoTokenizer import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm.logger import logger +from tensorrt_llm.quantization import QuantMode from build import get_engine_name # isort:skip @@ -35,7 +36,6 @@ def TRTBaichuan(args, config): assert world_size == tensorrt_llm.mpi_world_size(), \ f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - world_size = config['builder_config']['tensor_parallel'] num_heads = config['builder_config']['num_heads'] // world_size hidden_size = config['builder_config']['hidden_size'] // world_size vocab_size = config['builder_config']['vocab_size'] @@ -45,6 +45,7 @@ def TRTBaichuan(args, config): remove_input_padding = config['plugin_config']['remove_input_padding'] paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] + quant_mode = QuantMode(config['builder_config']['quant_mode']) model_config = tensorrt_llm.runtime.ModelConfig( vocab_size=vocab_size, @@ -56,7 +57,8 @@ def TRTBaichuan(args, config): tokens_per_block=tokens_per_block, remove_input_padding=remove_input_padding, paged_kv_cache=paged_kv_cache, - dtype=dtype) + dtype=dtype, + quant_mode=quant_mode) runtime_rank = tensorrt_llm.mpi_rank() runtime_mapping = tensorrt_llm.Mapping(world_size, diff --git a/examples/baichuan/weight.py b/examples/baichuan/weight.py index f777158ff1..7c3bc687c5 100644 --- a/examples/baichuan/weight.py +++ b/examples/baichuan/weight.py @@ -12,13 +12,17 @@ # WITHOUT WARRANTIES OR 
CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import configparser import time +from pathlib import Path import numpy as np import torch import tensorrt_llm from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import BaichuanForCausalLM from tensorrt_llm.quantization import QuantMode @@ -81,7 +85,7 @@ def load_from_hf_baichuan(tensorrt_llm_baichuan, if layer_idx is None: continue idx = int(layer_idx) - if idx >= tensorrt_llm_baichuan._num_layers: + if idx >= tensorrt_llm_baichuan.num_layers: continue if 'input_layernorm.weight' in k: tensorrt_llm_baichuan.layers[ @@ -163,3 +167,332 @@ def load_from_hf_baichuan(tensorrt_llm_baichuan, tok = time.time() t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') + + +def parse_bin_config(ini_file): + baichuan_config = configparser.ConfigParser() + baichuan_config.read(ini_file) + + n_embd = baichuan_config.getint('baichuan', 'hidden_size') + n_head = baichuan_config.getint('baichuan', 'num_attention_heads') + n_kv_head = n_head + n_layer = baichuan_config.getint('baichuan', 'num_hidden_layers') + if baichuan_config.has_option('baichuan', 'max_position_embeddings'): + n_positions = baichuan_config.getint('baichuan', + 'max_position_embeddings') + else: + n_positions = baichuan_config.getint('baichuan', 'model_max_length') + vocab_size = baichuan_config.getint('baichuan', 'vocab_size') + hidden_act = baichuan_config.get('baichuan', 'hidden_act') + inter_size = baichuan_config.getint('baichuan', + 'intermediate_size', + fallback=None) + + if inter_size is None: + inter_size = 4 * n_embd + + return n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head + + +def gen_suffix(rank, use_smooth_quant, quant_per_channel): + suffix = f"{rank}.bin" + if use_smooth_quant: + sq_prefix = "int8." + if quant_per_channel: + sq_prefix += "col." + suffix = sq_prefix + suffix + return suffix + + +def load_from_binary(tensorrt_llm_baichuan: BaichuanForCausalLM, + dir_path, + mapping=Mapping(), + fp16=False, + multi_query_mode=False): + tensorrt_llm.logger.info('Loading weights from binary...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_baichuan, 'quant_mode', QuantMode(0)) + + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head = parse_bin_config( + Path(dir_path) / 'config.ini') + np_dtype = np.float16 if fp16 else np.float32 + + def fromfile(dir_path, name, shape=None, dtype=None): + dtype = np_dtype if dtype is None else dtype + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + def set_smoothquant_scale_factors(module, + pre_scale_weight, + dir_path, + basename, + shape, + per_tok_dyn, + per_channel, + is_qkv=False, + rank=None): + suffix = "bin" + if per_channel: + if rank is not None: + suffix = f"{rank}." + suffix + suffix = "col." 
+ suffix + + col_shape = shape if (per_channel or is_qkv) else [1, 1] + + if per_tok_dyn: + if pre_scale_weight is not None: + pre_scale_weight.value = np.array([1.0], dtype=np.float32) + if is_qkv and not per_channel: + t = fromfile(dir_path, + f"{basename}scale_w_quant_orig.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, f"{basename}scale_w_quant_orig.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + else: + t = fromfile(dir_path, f"{basename}scale_x_orig_quant.bin", [1], + np.float32) + pre_scale_weight.value = t + if is_qkv: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + t = fromfile(dir_path, f"{basename}scale_y_quant_orig.bin", [1, 1], + np.float32) + module.act_scale.value = t + + def set_smoother(module, dir_path, base_name, shape, rank): + suffix = f"{rank}.bin" + t = fromfile(dir_path, f"{base_name}.smoother.{suffix}", shape, + np.float32) + module.smoother.value = t + + # Determine the quantization mode. + quant_mode = getattr(tensorrt_llm_baichuan, "quant_mode", QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + # Do we use SmoothQuant? + use_smooth_quant = quant_mode.has_act_and_weight_quant() + # Do we use quantization per token? + quant_per_token_dyn = quant_mode.has_per_token_dynamic_scaling() + # Do we use quantization per channel? + quant_per_channel = quant_mode.has_per_channel_scaling() + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() + + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() + + # Debug + suffix = gen_suffix(mapping.tp_rank, use_smooth_quant, quant_per_channel) + # The type of weights. + w_type = np_dtype if not use_smooth_quant else np.int8 + + if mapping.is_first_pp_rank(): + tensorrt_llm_baichuan.vocab_embedding.weight.value = (fromfile( + dir_path, 'vocab_embedding.weight.bin', [vocab_size, n_embd])) + + if mapping.is_last_pp_rank(): + tensorrt_llm_baichuan.ln_f.weight.value = (fromfile( + dir_path, 'ln_f.weight.bin')) + # share input embedding + lm_head_weight = fromfile(dir_path, 'lm_head.weight.bin', + [vocab_size, n_embd]) + + if vocab_size % mapping.tp_size != 0: + # padding + vocab_size_padded = tensorrt_llm_baichuan.lm_head.out_features * mapping.tp_size + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + if mapping.is_last_pp_rank(): + tensorrt_llm_baichuan.lm_head.weight.value = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank)) + + layers_range = list( + range(mapping.pp_rank * tensorrt_llm_baichuan.num_layers, + (mapping.pp_rank + 1) * tensorrt_llm_baichuan.num_layers, 1)) + + for i in layers_range: + n_groups = n_head // n_kv_head + c_attn_out_dim = ( + 3 * n_embd // mapping.tp_size) if not multi_query_mode else ( + n_embd // mapping.tp_size + + (n_embd // n_head * n_groups) // mapping.tp_size * 2) + idx = i - mapping.pp_rank * tensorrt_llm_baichuan.num_layers + tensorrt_llm_baichuan.layers[idx].input_layernorm.weight.value = ( + fromfile(dir_path, + 'model.layers.' + str(i) + '.input_layernorm.weight.bin')) + t = fromfile( + dir_path, 'model.layers.' 
+ str(i) + + '.attention.query_key_value.weight.' + suffix, + [n_embd, c_attn_out_dim], w_type) + if t is not None: + dst = tensorrt_llm_baichuan.layers[idx].attention.qkv.weight + if use_smooth_quant: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].attention.qkv, + tensorrt_llm_baichuan.layers[idx].input_layernorm. + scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.attention.query_key_value.', + [1, c_attn_out_dim], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank, + is_qkv=True) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[ + i].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + dst = tensorrt_llm_baichuan.layers[idx].attention.dense.weight + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.attention.dense.weight.' + suffix, + [n_embd // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + dense_scale = getattr(tensorrt_llm_baichuan.layers[idx].attention, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].attention.dense, dense_scale, + dir_path, 'model.layers.' + str(i) + '.attention.dense.', + [1, n_embd], quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_baichuan.layers[idx].attention.dense, + dir_path, + 'model.layers.' + str(i) + '.attention.dense', + [1, n_embd // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[ + i].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + dst = tensorrt_llm_baichuan.layers[idx].post_layernorm.weight + dst.value = fromfile( + dir_path, 'model.layers.' + str(i) + '.post_layernorm.weight.bin') + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.fc.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + + if use_smooth_quant: + tensorrt_llm_baichuan.layers[ + idx].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].mlp.fc, + tensorrt_llm_baichuan.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' 
+ str(i) + '.mlp.fc.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_baichuan.layers[i].mlp.fc.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[i].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_baichuan.layers[ + idx].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.gate.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + if use_smooth_quant: + tensorrt_llm_baichuan.layers[ + idx].mlp.gate.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].mlp.gate, + tensorrt_llm_baichuan.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.mlp.gate.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_baichuan.layers[i].mlp.gate.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[i].mlp.gate.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_baichuan.layers[ + idx].mlp.gate.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.proj.weight.' + suffix, + [inter_size // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + tensorrt_llm_baichuan.layers[ + idx].mlp.proj.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + proj_scale = getattr(tensorrt_llm_baichuan.layers[idx].mlp, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_baichuan.layers[idx].mlp.proj, proj_scale, + dir_path, 'model.layers.' + str(i) + '.mlp.proj.', [1, n_embd], + quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_baichuan.layers[idx].mlp.proj, dir_path, + 'model.layers.' + str(i) + '.mlp.proj', + [1, inter_size // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_baichuan.layers[i].mlp.proj.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_baichuan.layers[i].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_baichuan.layers[idx].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + + if use_int8_kv_cache: + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + tensorrt_llm_baichuan.layers[ + idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_baichuan.layers[ + idx].attention.kv_quant_orig_scale.value = t + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') diff --git a/examples/bloom/build.py b/examples/bloom/build.py index 54362d1449..ab4cffd806 100644 --- a/examples/bloom/build.py +++ b/examples/bloom/build.py @@ -27,7 +27,7 @@ from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import smooth_quantize, weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -345,12 +345,8 @@ def build_rank_engine(builder: Builder, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table, quant_mode=args.quant_mode) - if args.use_smooth_quant: - tensorrt_llm_bloom = smooth_quantize(tensorrt_llm_bloom, - args.quant_mode) - elif args.use_weight_only: - tensorrt_llm_bloom = weight_only_quantize(tensorrt_llm_bloom, - args.quant_mode) + if args.use_weight_only or args.use_smooth_quant: + tensorrt_llm_bloom = quantize_model(tensorrt_llm_bloom, args.quant_mode) if args.model_dir is not None: logger.info(f'Loading HF BLOOM ... from {args.model_dir}') @@ -442,6 +438,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_bloom) + return engine @@ -491,6 +490,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/chatglm2-6b/.gitignore b/examples/chatglm/.gitignore similarity index 57% rename from examples/chatglm2-6b/.gitignore rename to examples/chatglm/.gitignore index baa5534912..979e236242 100644 --- a/examples/chatglm2-6b/.gitignore +++ b/examples/chatglm/.gitignore @@ -1,5 +1,6 @@ __pycache__/ -pyTorchModel/ +chatglm*-6b/ +chatglm*-6b-32k/ trtModel/ dataset/ .vscode/ diff --git a/examples/chatglm/README.md b/examples/chatglm/README.md new file mode 100644 index 0000000000..74042ce268 --- /dev/null +++ b/examples/chatglm/README.md @@ -0,0 +1,144 @@ +# ChatGLM + +This document explains how to build the [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b), [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b) and [ChatGLM3-6B](https://huggingface.co/THUDM/chatglm3-6b), [ChatGLM2-6B-32k](https://huggingface.co/THUDM/chatglm2-6b-32k), [ChatGLM3-6B-32k](https://huggingface.co/THUDM/chatglm3-6b-32k) models using TensorRT-LLM and run on a single GPU, a single node with multiple GPUs or multiple nodes with multiple GPUs. + +## Overview + +The TensorRT-LLM ChatGLM implementation can be found in [`tensorrt_llm/models/chatglm/model.py`](../../tensorrt_llm/models/chatglm/model.py). +The TensorRT-LLM ChatGLM example code is located in [`examples/chatglm`](./). There are 3 main files in that folder: + +* [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the ChatGLM model. +* [`run.py`](./run.py) to run the inference on an input text. +* [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model. 
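+
+As a quick orientation, the three scripts are typically used in that order. The snippet below is a minimal sketch, assuming a ChatGLM3-6B checkpoint has already been downloaded into `chatglm3-6b/` as described in the Usage section below; see that section for the full set of options:
+
+```bash
+# build the TensorRT engine(s) from the HuggingFace checkpoint
+python3 build.py -m 3
+# run inference on the default input texts with the engine just built
+python3 run.py -m 3
+# summarize cnn_dailymail articles to compare performance and accuracy
+python3 summarize.py -m 3
+```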
+
+## Support Matrix
+
+* FP16
+* Weight Only Quantization (int8 / int4)
+* Paged KV cache
+* Remove Input Padding
+* Tensor Parallel
+* Strongly Typed
+
+## Usage
+
+The next section describes how to build the engine and run the inference demo.
+
+### 1. Download repo and weights from HuggingFace Transformers
+
+```bash
+pip install -r requirements.txt
+apt-get update
+apt-get install git-lfs
+rm -rf chatglm*
+
+# clone one or more models we want to build
+git clone https://huggingface.co/THUDM/chatglm-6b
+git clone https://huggingface.co/THUDM/chatglm2-6b
+git clone https://huggingface.co/THUDM/chatglm3-6b
+git clone https://huggingface.co/THUDM/chatglm2-6b-32k
+git clone https://huggingface.co/THUDM/chatglm3-6b-32k
+```
+
+### 2. Build TensorRT engine(s)
+
+* This ChatGLM example in TensorRT-LLM builds TensorRT engine(s) directly from the HF checkpoint (rather than from FT checkpoints, as the GPT example does).
+* If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights.
+* The [`build.py`](./build.py) script requires a single GPU to build the TensorRT engine(s).
+* You can enable parallel builds to accelerate the engine building process if you have more than one GPU (of the same model) in your system.
+* For parallel building, add the `--parallel_build` argument to the build command (this feature cannot take advantage of more than a single node).
+* The number of TensorRT engines depends on the number of GPUs that will be used to run inference.
+* The argument `--model_version` / `-m` is required; it can be one of "1", "2", "3", "2-32k" or "3-32k" for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-6B-32K or ChatGLM3-6B-32K, respectively.
+
+#### Examples of build invocations
+
+```bash
+# Build a default engine of ChatGLM3-6B on a single GPU with FP16, GPT Attention plugin, GEMM plugin, RMS Normalization plugin
+python3 build.py -m 3
+
+# Build an engine on a single GPU with FMHA kernels (see the FMHA section below), other configurations are the same as the default example
+python3 build.py -m 3 --enable_context_fmha  # or --enable_context_fmha_fp32_acc
+
+# Build an engine on a single GPU with int8/int4 Weight-Only quantization, other configurations are the same as the default example
+python3 build.py -m 3 --use_weight_only  # or --use_weight_only --weight_only_precision int4
+
+# Build an engine on a single GPU with paged KV cache and remove_input_padding, other configurations are the same as the default example
+python3 build.py -m 3 --paged_kv_cache --remove_input_padding
+
+# Build an engine on two GPUs, other configurations are the same as the default example
+python3 build.py -m 3 --world_size 2
+
+# Build an engine of ChatGLM-6B on a single GPU, other configurations are the same as the default example
+python3 build.py -m 1
+
+# Build an engine of ChatGLM2-6B on a single GPU, other configurations are the same as the default example
+python3 build.py -m 2
+
+# Build an engine of ChatGLM2-6B-32k on a single GPU, other configurations are the same as the default example
+python3 build.py -m 2-32k
+
+# Build an engine of ChatGLM3-6B-32k on a single GPU, other configurations are the same as the default example
+python3 build.py -m 3-32k
+```
+
+#### Enabled plugins
+
+* Use `--use_gpt_attention_plugin <dtype>` to configure the GPT Attention plugin (defaults to float16)
+* Use `--use_gemm_plugin <dtype>` to configure the GEMM plugin (defaults to float16)
+* Use `--use_layernorm_plugin <dtype>` (for ChatGLM-6B) to configure the LayerNorm plugin (defaults to float16)
+* Use `--use_rmsnorm_plugin <dtype>` (for ChatGLM2-6B and ChatGLM3-6B) to configure the RMSNorm plugin (defaults to float16)
+
+#### Fused MultiHead Attention (FMHA)
+
+* Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable FMHA kernels, which can provide better performance and lower GPU memory occupancy.
+
+* The switch `--use_gpt_attention_plugin float16` must be used when using FMHA.
+
+* `--enable_context_fmha` uses an FP16 accumulator, which might cause low accuracy. In this case, `--enable_context_fmha_fp32_acc` should be used to protect accuracy at the cost of a small performance drop.
+
+#### Weight Only quantization
+
+* Use `--use_weight_only` to enable INT8 Weight-Only quantization, which significantly lowers the latency and memory footprint.
+
+* Furthermore, use `--weight_only_precision int8` or `--weight_only_precision int4` to configure the data type of the weights.
+
+#### In-flight batching and paged KV cache [TODO]
+
+* The engine must be built accordingly if [in-flight batching in C++ runtime](../../docs/in_flight_batching.md) will be used.
+
+* Use `--use_inflight_batching` to enable In-flight Batching.
+
+* The switches `--use_gpt_attention_plugin=float16`, `--paged_kv_cache`, and `--remove_input_padding` will be set when using In-flight Batching.
+
+* It is possible to use `--use_gpt_attention_plugin float32` with In-flight Batching.
+
+* The block size of the paged KV cache can additionally be controlled using `--tokens_per_block=N`.
+
+### 3. Run
+
+#### Single node, single GPU
+
+```bash
+# Run the default engine of ChatGLM3-6B on a single GPU, other model versions are available if built.
+python3 run.py -m 3
+```
+
+#### Single node, multi GPU
+
+```bash
+# Run the Tensor Parallel 2 engine of ChatGLM3-6B on two GPUs, other model versions are available if built.
+mpirun -n 2 python run.py -m 3
+```
+
+* `--allow-run-as-root` might be needed if using `mpirun` as root.
+
+#### Run comparison of performance and accuracy
+
+```bash
+# Run the summarization task with ChatGLM3-6B, other model versions are available if built.
+python3 summarize.py -m 3 +``` + +## Benchmark + +* The TensorRT-LLM ChatGLM benchmark is located in [benchmarks/](../../benchmarks/README.md) diff --git a/examples/chatglm2-6b/build.py b/examples/chatglm/build.py similarity index 83% rename from examples/chatglm2-6b/build.py rename to examples/chatglm/build.py index d2fc9084b0..88a8687cb2 100644 --- a/examples/chatglm2-6b/build.py +++ b/examples/chatglm/build.py @@ -27,14 +27,11 @@ import tensorrt_llm from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (ChatGLM2_6BHeadModel, smooth_quantize, - weight_only_quantize) +from tensorrt_llm.models import ChatGLMHeadModel, quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode -MODEL_NAME = "chatglm2-6b" - def get_engine_name(model, dtype, tp_size, rank): return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) @@ -61,11 +58,20 @@ def serialize_engine(engine, path): def parse_arguments(args): parser = argparse.ArgumentParser() + parser.add_argument( + '--model_version', + '-m', + type=str, + required=True, + choices=["1", "2", "3", "2-32k", "3-32k"], + help= + '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively' + ) parser.add_argument('--world_size', type=int, default=1, help='world size, only support tensor parallelism now') - parser.add_argument('--model_dir', type=str, default="pyTorchModel") + parser.add_argument('--model_dir', type=str, default=None) parser.add_argument('--dtype', type=str, default='float16', @@ -105,6 +111,16 @@ def parse_arguments(args): help= "Activates GEMM plugin. You can specify the plugin dtype or leave blank to use the model dtype." ) + parser.add_argument( + '--use_layernorm_plugin', + nargs='?', + const='float16', + type=str, + default='float16', + choices=['float32', 'float16', 'bfloat16', False], + help= + "Activates layernorm plugin for ChatGLM-6B. You can specify the plugin dtype or leave blank to use the model dtype." + ) parser.add_argument( '--use_rmsnorm_plugin', nargs='?', @@ -113,7 +129,7 @@ def parse_arguments(args): default='float16', choices=['float32', 'float16', 'bfloat16', False], help= - "Activates rmsnorm plugin. You can specify the plugin dtype or leave blank to use the model dtype.", + "Activates rmsnorm plugin for ChatGLM2-6B / ChatGLM3-6B. You can specify the plugin dtype or leave blank to use the model dtype." 
) parser.add_argument('--gather_all_token_logits', action='store_true', @@ -242,13 +258,10 @@ def parse_arguments(args): args = parser.parse_args(args) logger.set_level(args.log_level) - args.apply_query_key_layer_scaling = False # always False in TRT-LLM - args.hidden_act = 'swiglu' - args.multi_block_mode = False - plugins_args = [ 'use_gpt_attention_plugin', 'use_gemm_plugin', + 'use_layernorm_plugin', 'use_rmsnorm_plugin', ] for plugin_arg in plugins_args: @@ -258,30 +271,51 @@ def parse_arguments(args): ) setattr(args, plugin_arg, args.dtype) - assert args.model_dir is not None + if args.model_version == "1": + args.model_name = "chatglm-6b" + elif args.model_version in ["2", "3"]: + args.model_name = "chatglm%s-6b" % args.model_version + else: + args.model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0] + if args.model_dir is None: + args.model_dir = args.model_name with open(Path(args.model_dir) / "config.json", "r") as f: js = json.loads(f.read()) - assert js["_name_or_path"] == "THUDM/" + MODEL_NAME - - args.apply_residual_connection_post_layernorm = js[ - "apply_residual_connection_post_layernorm"] + if args.model_version == "1": + assert args.max_input_len < js["max_sequence_length"] + args.apply_query_key_layer_scaling = False # always False in TRT-LLM args.eos_token_id = js["eos_token_id"] - args.ffn_hidden_size = js["ffn_hidden_size"] args.hidden_size = js["hidden_size"] - args.kv_channels = js["kv_channels"] - args.layernorm_epsilon = js["layernorm_epsilon"] - args.linear_bias = js["add_bias_linear"] - args.multi_query_mode = js["multi_query_attention"] - args.max_seq_length = min(args.max_input_len + args.max_output_len, - js["seq_length"]) - args.num_kv_heads = js["multi_query_group_num"] + args.multi_block_mode = False + args.norm_epsilon = js["layernorm_epsilon"] args.num_heads = js["num_attention_heads"] args.num_layers = js["num_layers"] args.pad_token_id = js["pad_token_id"] - args.qkv_bias = js["add_qkv_bias"] - args.rmsnorm = js["rmsnorm"] args.use_cache = js["use_cache"] - args.vocab_size = js["padded_vocab_size"] + if args.model_version == "1": + args.ffn_hidden_size = js["inner_hidden_size"] + args.hidden_act = 'gelu' + args.linear_bias = True # always True in ChatGLM-6B + args.max_seq_length = min(args.max_input_len + args.max_output_len, + js["max_sequence_length"]) + args.multi_query_mode = False # always False in ChatGLM-6B + args.num_kv_heads = js["num_attention_heads"] + args.qkv_bias = True # always True in ChatGLM-6B + args.vocab_size = js["vocab_size"] + else: + #args.kv_channels = js["kv_channels"] # useless + args.apply_residual_connection_post_layernorm = js[ + "apply_residual_connection_post_layernorm"] + args.ffn_hidden_size = js["ffn_hidden_size"] + args.hidden_act = 'swiglu' + args.linear_bias = js["add_bias_linear"] + args.max_seq_length = min(args.max_input_len + args.max_output_len, + js["seq_length"]) + args.multi_query_mode = js["multi_query_attention"] + args.num_kv_heads = js["multi_query_group_num"] + args.qkv_bias = js["add_qkv_bias"] + args.rmsnorm = js["rmsnorm"] + args.vocab_size = js["padded_vocab_size"] if args.use_inflight_batching: if not args.use_gpt_attention_plugin: @@ -344,13 +378,10 @@ def build_rank_engine(builder: Builder, rank=rank, tp_size=args.world_size, ) - trtllm_model = ChatGLM2_6BHeadModel(args=args) - - if args.use_smooth_quant: - trtllm_model = smooth_quantize(trtllm_model, args.quant_mode) - elif args.use_weight_only: - trtllm_model = weight_only_quantize(trtllm_model, args.quant_mode) + trtllm_model 
= ChatGLMHeadModel(args=args) + if args.use_smooth_quant or args.use_weight_only: + trtllm_model = quantize_model(trtllm_model, args.quant_mode) if args.model_dir is not None: hf_model = transformers.AutoModel.from_pretrained( args.model_dir, trust_remote_code=True).cpu() @@ -359,6 +390,7 @@ def build_rank_engine(builder: Builder, hf_model, mapping=args.mapping, dtype=args.dtype, + model_version=args.model_version, ) del hf_model @@ -370,6 +402,9 @@ def build_rank_engine(builder: Builder, dtype=args.use_gpt_attention_plugin) if args.use_gemm_plugin: network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + if args.use_layernorm_plugin: + network.plugin_config.set_layernorm_plugin( + dtype=args.use_layernorm_plugin) if args.use_rmsnorm_plugin: network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) @@ -420,8 +455,11 @@ def build_rank_engine(builder: Builder, # Network -> Engine engine = builder.build_engine(network, builder_config) if rank == 0: - config_path = args.output_dir / 'config.json' + config_path = args.output_dir / (args.model_name + '-config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, trtllm_model) + return engine @@ -453,25 +491,26 @@ def build(rank, args): hidden_act=args.hidden_act, hidden_size=args.hidden_size, max_batch_size=args.max_batch_size, + max_beam_width=args.max_beam_width, max_input_len=args.max_input_len, max_num_tokens=args.max_output_len + args.max_input_len, max_output_len=args.max_output_len, max_position_embeddings=args.max_seq_length, multi_query_mode=args.multi_query_mode, - name=MODEL_NAME, + name=args.model_name, num_heads=args.num_heads, num_kv_heads=args.num_kv_heads, num_layers=args.num_layers, pad_token_id=args.pad_token_id, paged_kv_cache=args.paged_kv_cache, parallel_build=args.parallel_build, - quant_mode=int(args.quant_mode), + quant_mode=args.quant_mode, remove_input_padding=args.remove_input_padding, vocab_size=args.vocab_size, ) - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.world_size, - cur_rank) + engine_name = get_engine_name(args.model_name, args.dtype, + args.world_size, cur_rank) engine = build_rank_engine(builder, builder_config, engine_name, cur_rank, args) assert engine is not None, f'Failed to build engine for rank {cur_rank}' @@ -483,6 +522,7 @@ def build(rank, args): ) serialize_engine(engine, args.output_dir / engine_name) + del engine if rank == 0: ok = builder.save_timing_cache(builder_config, timing_cache_file) diff --git a/examples/chatglm2-6b/requirements.txt b/examples/chatglm/requirements.txt similarity index 100% rename from examples/chatglm2-6b/requirements.txt rename to examples/chatglm/requirements.txt diff --git a/examples/chatglm6b/run.py b/examples/chatglm/run.py similarity index 53% rename from examples/chatglm6b/run.py rename to examples/chatglm/run.py index ba80c679cc..6eaf71b4b4 100644 --- a/examples/chatglm6b/run.py +++ b/examples/chatglm/run.py @@ -23,16 +23,23 @@ import transformers import tensorrt_llm from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import (ChatGLM6BHeadModelGenerationSession, +from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig) from build import find_engines # isort:skip -MODEL_NAME = "chatglm-6b" - -def parse_arguments(): +def parse_arguments(args=None): parser = argparse.ArgumentParser() + parser.add_argument( + '--model_version', + '-m', + 
type=str, + default="3", + choices=["1", "2", "3", "2-32k", "3-32k"], + help= + '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively' + ) parser.add_argument('--max_output_len', type=int, default=1024) parser.add_argument('--log_level', type=str, default='error') parser.add_argument('--engine_dir', type=str, default='trtModel') @@ -41,7 +48,10 @@ def parse_arguments(): '--input_text', type=str, nargs='*', - default=["Hello", "Could you introduce NVIDIA Corporation for me?"], + default=[ + "What's new between ChatGLM3-6B and ChatGLM2-6B?", + "Could you introduce NVIDIA Corporation for me?", + ], ) parser.add_argument( '--input_tokens', @@ -53,14 +63,14 @@ def parse_arguments(): parser.add_argument( '--tokenizer_dir', type=str, - default='pyTorchModel', + default=None, help='Directory containing the tokenizer model.', ) parser.add_argument('--temperature', type=float, default=1.0) parser.add_argument('--top_k', type=int, default=1) parser.add_argument('--top_p', type=float, default=0.0) parser.add_argument('--random_seed', type=int, default=1) - return parser.parse_args() + return parser.parse_args(args) def process_response(responseList): @@ -87,42 +97,81 @@ if __name__ == '__main__': args = parse_arguments() tensorrt_llm.logger.set_level(args.log_level) - config_path = os.path.join(args.engine_dir, 'config.json') + if args.model_version == "1": + model_name = "chatglm-6b" + elif args.model_version in ["2", "3"]: + model_name = "chatglm%s-6b" % args.model_version + else: + model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0] + + config_path = os.path.join(args.engine_dir, model_name + '-config.json') with open(config_path, 'r') as f: config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) + dtype = config['builder_config']['precision'] end_id = config['builder_config']['eos_token_id'] pad_id = config['builder_config']['pad_token_id'] max_batch_size = config['builder_config']['max_batch_size'] + max_input_len = config['builder_config']['max_input_len'] + max_output_len = config['builder_config']['max_output_len'] + max_beam_width = config['builder_config']['max_beam_width'] + remove_input_padding = config['builder_config']['remove_input_padding'] use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] world_size = config['builder_config']['tensor_parallel'] assert world_size == tensorrt_llm.mpi_world_size( ), f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + if args.max_output_len > max_output_len: + print("Truncate max_output_len as %d" % max_output_len) + max_output_len = min(max_output_len, args.max_output_len) + if args.beam_width > max_beam_width: + print("Truncate beam_width as %d" % max_beam_width) + beam_width = min(max_beam_width, args.beam_width) + runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) + runtime_mapping = tensorrt_llm.Mapping( + world_size, + runtime_rank, + tp_size=world_size, + ) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] + serialize_path = find_engines( + Path(args.engine_dir), + model_name=model_name, + dtype=dtype, + tp_size=world_size, + rank=runtime_rank, + )[0] + if args.tokenizer_dir is None: + args.tokenizer_dir = model_name tokenizer = transformers.AutoTokenizer.from_pretrained( 
args.tokenizer_dir, trust_remote_code=True) input_ids = None input_text = None if args.input_tokens is None: - input_text = args.input_text[:max_batch_size] + input_text = args.input_text + batch_size = len(input_text) + if batch_size > max_batch_size: + print("Truncate batch_size as %d" % max_batch_size) + batch_size = max_batch_size + input_text = input_text[:max_batch_size] tokenized = tokenizer(input_text, return_tensors="pt", padding=True, return_length=True) - input_ids = tokenized['input_ids'].int().contiguous().cuda() - input_lengths = tokenized['length'].int().contiguous().cuda() + input_ids = tokenized['input_ids'].int() + input_lengths = tokenized['length'].int() + max_input_len_real = torch.max(input_lengths) + if max_input_len_real > max_input_len: + print("Truncate input_length as %d" % max_input_len) + input_ids = input_ids[:, :max_input_len] + input_lengths = torch.where(input_lengths > max_input_len, + max_input_len, input_lengths) + else: + max_input_len = max_input_len_real + else: input_ids = [] with open(args.input_tokens) as f_in: @@ -133,7 +182,24 @@ if __name__ == '__main__': input_ids = torch.tensor(input_ids, dtype=torch.int32).cuda().unsqueeze(0) - if use_gpt_attention_plugin: + input_ids_padding = input_ids.clone() + if remove_input_padding: + input_ids_no_padding = torch.zeros(1, + torch.sum(input_lengths), + dtype=torch.int32) + lengths_acc = torch.cumsum( + torch.cat([torch.IntTensor([0]), input_lengths]), + dim=0, + ) + for i in range(len(input_ids)): + input_ids_no_padding[ + 0, lengths_acc[i]:lengths_acc[i + 1]] = torch.IntTensor( + input_ids[i, + max_input_len - input_lengths[i]:max_input_len]) + + input_ids = input_ids_no_padding + + elif use_gpt_attention_plugin: # when using gpt attention plugin, inputs needs to align at the head input_ids_padding_right = torch.zeros_like(input_ids) + end_id for i, sample in enumerate(input_ids): @@ -155,7 +221,7 @@ if __name__ == '__main__': hidden_size=config['builder_config']['hidden_size'] // world_size, gpt_attention_plugin=use_gpt_attention_plugin, remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, + model_name=model_name, paged_kv_cache=config['builder_config']['paged_kv_cache'], quant_mode=QuantMode(config['builder_config']['quant_mode']), dtype=dtype, @@ -164,7 +230,7 @@ if __name__ == '__main__': sampling_config = SamplingConfig( end_id=end_id, pad_id=pad_id, - num_beams=args.beam_width, + num_beams=beam_width, temperature=args.temperature, top_k=args.top_k, top_p=args.top_p, @@ -173,32 +239,49 @@ if __name__ == '__main__': with open(serialize_path, 'rb') as f: engine_buffer = f.read() - decoder = ChatGLM6BHeadModelGenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, - args.beam_width) - output_ids = decoder.decode(input_ids, input_lengths, sampling_config) - torch.cuda.synchronize() - for i in range(len(output_ids.tolist())): - output_beams_list = [ - tokenizer.batch_decode(output_ids[batch_idx, :, - input_lengths[batch_idx]:], - skip_special_tokens=True) - for batch_idx in range(input_ids.size(0)) - ] - output_text = process_response(output_beams_list[i]) - end = torch.where(input_ids[i] == end_id)[0] - inputLength = int(end[0]) if len(end) > 0 else input_ids.shape[1] - print("\nInput %2d ---> len=%d\n%s" % (i, inputLength, input_text[i])) - print("\nOutput %2d --->" % i) - for j, simple_output in enumerate(output_text): - end = torch.where(output_ids[i, j, 
input_lengths[i]:] == end_id)[0] - outputLength = int(end[0]) if len(end) > 0 else args.max_output_len - print(" Beam %2d ---> len=%d\n%s" % - (j, outputLength, simple_output)) + if model_name == "chatglm-6b": + decoder = ChatGLMGenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + else: + decoder = GenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + decoder.setup( + len(input_text), + max_input_len, + max_output_len, + beam_width, + ) + output = decoder.decode( + input_ids.contiguous().cuda(), + input_lengths.contiguous().cuda(), + sampling_config, + output_sequence_lengths=True, + return_dict=True, + ) + torch.cuda.synchronize() + output_ids = output["output_ids"] + output_lengths = output["sequence_lengths"] + + if runtime_rank == 0: + for i in range(batch_size): + print("\nInput %2d ---> len=%d\n%s" % + (i, input_lengths[i], input_text[i])) + print("\nOutput %2d --->" % i) + output_ids__one_batch = output_ids[i, :, input_lengths[i]:] + output_lengths_one_batch = output_lengths[i] + output_token_list = tokenizer.batch_decode(output_ids__one_batch, + skip_special_tokens=True) + output_token_list = process_response(output_token_list) + for j, (length, simple_output) in enumerate( + zip(output_lengths_one_batch, output_token_list)): + print("\n Beam %2d ---> len=%d\n%s" % + (j, length, simple_output)) print("Finished!") diff --git a/examples/chatglm6b/smoothquant.py b/examples/chatglm/smoothquant.py similarity index 96% rename from examples/chatglm6b/smoothquant.py rename to examples/chatglm/smoothquant.py index 0c8dcaa5d4..163592ff2c 100644 --- a/examples/chatglm6b/smoothquant.py +++ b/examples/chatglm/smoothquant.py @@ -105,7 +105,11 @@ def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): @torch.no_grad() -def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): +def capture_activation_range(model, + tokenizer, + dataset, + num_samples=512, + seq_len=512): model.eval() device = next(model.parameters()).device act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) @@ -138,9 +142,6 @@ def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): m.register_forward_hook( functools.partial(stat_input_hook, name=name))) - from datasets import load_dataset - dataset = load_dataset("lambada", split="validation") - for i in tqdm(range(num_samples), desc="calibrating model"): input_ids = tokenizer(dataset[i]["text"], return_tensors="pt", diff --git a/examples/chatglm6b/summarize.py b/examples/chatglm/summarize.py similarity index 90% rename from examples/chatglm6b/summarize.py rename to examples/chatglm/summarize.py index 2e2a42107a..086bc68c33 100644 --- a/examples/chatglm6b/summarize.py +++ b/examples/chatglm/summarize.py @@ -26,16 +26,17 @@ from transformers import AutoModel, AutoTokenizer import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import (ChatGLM6BHeadModelGenerationSession, +from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig) from build import find_engines # isort:skip -MODEL_NAME = "chatglm-6b" +model_name = "" def TRT(args, config): + model_name = config['builder_config']['name'] dtype = config['builder_config']['precision'] world_size = config['builder_config']['tensor_parallel'] assert world_size == tensorrt_llm.mpi_world_size(), \ @@ -45,11 +46,12 @@ def TRT(args, config): remove_input_padding = config['plugin_config']['remove_input_padding'] model_config 
= ModelConfig( - model_name=MODEL_NAME, + model_name=model_name, vocab_size=config['builder_config']['vocab_size'], num_layers=config['builder_config']['num_layers'], num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_heads'] // world_size, + num_kv_heads=max(config['builder_config']['num_kv_heads'] // world_size, + 1), hidden_size=config['builder_config']['hidden_size'] // world_size, gpt_attention_plugin=bool( config['plugin_config']['gpt_attention_plugin']), @@ -66,20 +68,31 @@ def TRT(args, config): tp_size=world_size) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - serialize_path = find_engines(args.engine_dir, - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] + serialize_path = find_engines( + args.engine_dir, + model_name=model_name, + dtype=dtype, + tp_size=world_size, + rank=runtime_rank, + )[0] tensorrt_llm.logger.set_level(args.log_level) with open(serialize_path, 'rb') as f: engine_buffer = f.read() - decoder = ChatGLM6BHeadModelGenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) + + if model_name == 'chatglm-6b': + decoder = ChatGLMGenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) + else: + decoder = GenerationSession( + model_config, + engine_buffer, + runtime_mapping, + ) return decoder @@ -159,7 +172,10 @@ def main(args): line[i], return_tensors='pt', ).type(torch.int32) - input_id = input_id[:, -test_token_num:] + if model_name == 'chatglm-6b': + input_id = input_id[:, -test_token_num:] + else: + input_id = input_id[:, :test_token_num] line_encoded.append(input_id) input_lengths.append(input_id.shape[-1]) @@ -239,7 +255,10 @@ def main(args): line[i], return_tensors='pt', ).type(torch.int64) - input_id = input_id[:, -test_token_num:] + if model_name == 'chatglm-6b': + input_id = input_id[:, -test_token_num:] + else: + input_id = input_id[:, :test_token_num] line_encoded.append(input_id) input_lengths.append(input_id.shape[-1]) @@ -393,7 +412,16 @@ def main(args): if __name__ == '__main__': parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_location', type=str, default='pyTorchModel') + parser.add_argument( + '--model_version', + '-m', + type=str, + required=True, + choices=["1", "2", "3", "2-32k", "3-32k"], + help= + '1, 2, 3, 2-32k, 3-32k for ChatGLM-6B, ChatGLM2-6B, ChatGLM3-6B, ChatGLM2-32k and ChatGLM3-32k respectively' + ) + parser.add_argument('--hf_model_location', type=str, default=None) parser.add_argument( '--tokenizer', default=None, @@ -424,6 +452,15 @@ if __name__ == '__main__': parser.add_argument('--length_penalty', type=float, default=1.0) args = parser.parse_args() + + if args.model_version == "1": + args.model_name = "chatglm-6b" + elif args.model_version in ["2", "3"]: + args.model_name = "chatglm%s-6b" % args.model_version + else: + args.model_name = "chatglm%s-6b-32k" % args.model_version.split("-")[0] + if args.tokenizer == None: - args.tokenizer = args.hf_model_location + args.tokenizer = args.model_name + main(args) diff --git a/examples/chatglm/weight.py b/examples/chatglm/weight.py new file mode 100644 index 0000000000..4961c499ee --- /dev/null +++ b/examples/chatglm/weight.py @@ -0,0 +1,366 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import time + +import torch +import torch.nn.functional as F + +import tensorrt_llm +from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm.quantization import QuantMode + + +def tile_kv_weight_bias(v, kv_num_head, tp_size): + head_size = v.shape[0] // kv_num_head + reps = tp_size // kv_num_head + if v.ndim == 1: + v = v.reshape(kv_num_head, head_size)[:, None, :] + v = v.expand(kv_num_head, reps, head_size).reshape(-1).clone() + else: + hidden_size = v.shape[1] + v = v.reshape(kv_num_head, head_size, hidden_size)[:, None, :, :] + v = v.expand(kv_num_head, reps, head_size, + hidden_size).reshape(-1, hidden_size).clone() + return v + + +def split_qkv(v, tp_size, rank, hidden_size, num_heads, num_kv_heads): + head_size = hidden_size // num_heads + if tp_size == 1: + return v + + assert v.shape[0] == hidden_size + head_size * num_kv_heads * 2 + query = v[:hidden_size] + key = v[hidden_size:hidden_size + head_size * num_kv_heads] + value = v[hidden_size + head_size * num_kv_heads:hidden_size + + head_size * num_kv_heads * 2] + + if num_kv_heads < tp_size: + key = tile_kv_weight_bias(key, num_kv_heads, tp_size) + value = tile_kv_weight_bias(value, num_kv_heads, tp_size) + assert (key.shape[0] % (tp_size * head_size)) == 0 + assert (value.shape[0] % (tp_size * head_size)) == 0 + + q_tmp = torch.chunk(query, tp_size, dim=0)[rank] + k_tmp = torch.chunk(key, tp_size, dim=0)[rank] + v_tmp = torch.chunk(value, tp_size, dim=0)[rank] + return torch.concatenate([q_tmp, k_tmp, v_tmp], dim=0).contiguous() + + +def load_quant_weight(src, value_dst, scale_dst, plugin_weight_only_quant_type): + v = torch.transpose(src, dim0=0, dim1=1).contiguous() + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + v, plugin_weight_only_quant_type) + value_dst.value = torch_to_numpy(processed_torch_weights) + scale_dst.value = torch_to_numpy(torch_weight_scales) + + +def load_from_hf( + trt_model, + hf_model, + mapping=None, + dtype="float32", + model_version="3", + multi_query_mode=False, +): + # [TODO] Merge model_version=="1" and model_version>="2" + tensorrt_llm.logger.info("Loading weights from HF") + tik = time.time() + + torch_type = str_dtype_to_torch(dtype) + quant_mode = getattr(trt_model, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + use_weight_only = quant_mode.is_weight_only() + + hidden_size = hf_model.config.hidden_size + num_heads = hf_model.config.num_attention_heads + + layers_per_pipeline_stage = trt_model.num_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage)) + feed_weight_count = 0 + + if model_version == "1": + num_kv_heads = hf_model.config.num_attention_heads + + if mapping.is_first_pp_rank(): + # Embedding + weight = hf_model.transformer.word_embeddings.weight.to( + torch_type).detach().cpu() + trt_model.embedding.weight.value = 
torch_to_numpy(weight) + feed_weight_count += 1 + if mapping.is_last_pp_rank(): + # Final normalization + weight = hf_model.transformer.final_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.final_norm.weight.value = torch_to_numpy(weight) + bias = hf_model.transformer.final_layernorm.bias.to( + torch_type).detach().cpu() + trt_model.final_norm.bias.value = torch_to_numpy(bias) + feed_weight_count += 2 + + # Final LM + weight = hf_model.lm_head.weight.to(torch_type).detach().cpu() + if weight.shape[0] % mapping.tp_size != 0: + pad_width = trt_model.lm_head.out_features * mapping.tp_size - weight.shape[ + 0] + weight = F.pad(weight, (0, 0, 0, pad_width)) + split_weight = torch.chunk(weight, mapping.tp_size, + dim=0)[mapping.rank] + trt_model.lm_head.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + for layer_idx in range(28): + if layer_idx not in layers_range: + continue + i = int(layer_idx) - mapping.pp_rank * layers_per_pipeline_stage + if i >= trt_model.num_layers: + continue + + # Pre normalization + weight = hf_model.transformer.layers[i].input_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.layers[i].pre_norm.weight.value = torch_to_numpy(weight) + bias = hf_model.transformer.layers[i].input_layernorm.bias.to( + torch_type).detach().cpu() + trt_model.layers[i].pre_norm.bias.value = torch_to_numpy(bias) + feed_weight_count += 2 + + # QKV multiplication weight + weight = hf_model.transformer.layers[ + i].attention.query_key_value.weight.to( + torch_type).detach().cpu() + split_weight = split_qkv(weight, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + dst = trt_model.layers[i].attention.qkv + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # QKV multiplication bias + bias = hf_model.transformer.layers[ + i].attention.query_key_value.bias.to(torch_type).detach().cpu() + split_bias = split_qkv(bias, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + trt_model.layers[i].attention.qkv.bias.value = torch_to_numpy( + split_bias) + feed_weight_count += 1 + + # Dense multiplication weight (no bias) + weight = hf_model.transformer.layers[i].attention.dense.weight.to( + torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].attention.dense + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Post normalization + weight = hf_model.transformer.layers[ + i].post_attention_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.layers[i].post_norm.weight.value = torch_to_numpy(weight) + bias = hf_model.transformer.layers[ + i].post_attention_layernorm.bias.to(torch_type).detach().cpu() + trt_model.layers[i].post_norm.bias.value = torch_to_numpy(bias) + feed_weight_count += 2 + + # Multilayer perceptron h -> 4h (no bias) + weight = hf_model.transformer.layers[i].mlp.dense_h_to_4h.weight.to( + torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=0)[mapping.rank] + dst = trt_model.layers[i].mlp.fc + if use_weight_only: + load_quant_weight( + 
src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Multilayer perceptron 4h -> h (no bias) + weight = hf_model.transformer.layers[i].mlp.dense_4h_to_h.weight.to( + torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].mlp.proj + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + assert feed_weight_count == 4 + trt_model.num_layers * 9, "Some weights not loaded from HF" + + else: + num_kv_heads = hf_model.config.multi_query_group_num + + if mapping.is_first_pp_rank(): + # Embedding + weight = hf_model.transformer.embedding.word_embeddings.weight.to( + torch_type).detach().cpu() + trt_model.embedding.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + if mapping.is_last_pp_rank(): + # Final normalization + weight = hf_model.transformer.encoder.final_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.final_norm.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + + # Final LM + weight = hf_model.transformer.output_layer.weight.to( + torch_type).detach().cpu() + if weight.shape[0] % mapping.tp_size != 0: + pad_width = trt_model.lm_head.out_features * mapping.tp_size - weight.shape[ + 0] + weight = F.pad(weight, (0, 0, 0, pad_width)) + split_weight = torch.chunk(weight, mapping.tp_size, + dim=0)[mapping.rank] + trt_model.lm_head.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + for layer_idx in range(28): + if layer_idx not in layers_range: + continue + i = int(layer_idx) - mapping.pp_rank * layers_per_pipeline_stage + if i >= trt_model.num_layers: + continue + + # Pre normalization + weight = hf_model.transformer.encoder.layers[ + i].input_layernorm.weight.to(torch_type).detach().cpu() + trt_model.layers[i].pre_norm.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + + # QKV multiplication weight + weight = hf_model.transformer.encoder.layers[ + i].self_attention.query_key_value.weight.to( + torch_type).detach().cpu() + split_weight = split_qkv(weight, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + dst = trt_model.layers[i].attention.qkv + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # QKV multiplication bias + bias = hf_model.transformer.encoder.layers[ + i].self_attention.query_key_value.bias.to( + torch_type).detach().cpu() + split_bias = split_qkv(bias, mapping.tp_size, mapping.tp_rank, + hidden_size, num_heads, num_kv_heads) + trt_model.layers[i].attention.qkv.bias.value = torch_to_numpy( + split_bias) + feed_weight_count += 1 + + # Dense multiplication weight (no bias) + weight = hf_model.transformer.encoder.layers[ + i].self_attention.dense.weight.to(torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].attention.dense + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + 
scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Post normalization + weight = hf_model.transformer.encoder.layers[ + i].post_attention_layernorm.weight.to( + torch_type).detach().cpu() + trt_model.layers[i].post_norm.weight.value = torch_to_numpy(weight) + feed_weight_count += 1 + + # Multilayer perceptron h -> 4h (no bias) + weight = hf_model.transformer.encoder.layers[ + i].mlp.dense_h_to_4h.weight.to(torch_type).detach().cpu() + split_weight = torch.chunk(weight, 2 * mapping.tp_size, dim=0) + # swap first and second half weight in columns to adapt trt_llm Swiglu + split_weight = torch.cat( + [ + split_weight[mapping.rank + mapping.tp_size], + split_weight[mapping.rank], + ], + dim=0, + ) + dst = trt_model.layers[i].mlp.fc + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + # Multilayer perceptron 4h -> h (no bias) + weight = hf_model.transformer.encoder.layers[ + i].mlp.dense_4h_to_h.weight.to(torch_type).detach().cpu() + split_weight = torch.chunk(weight, mapping.tp_size, + dim=1)[mapping.rank] + dst = trt_model.layers[i].mlp.proj + if use_weight_only: + load_quant_weight( + src=split_weight, + value_dst=dst.weight, + scale_dst=dst.per_channel_scale, + plugin_weight_only_quant_type=plugin_weight_only_quant_type) + else: + dst.weight.value = torch_to_numpy(split_weight) + feed_weight_count += 1 + + assert feed_weight_count == 3 + trt_model.num_layers * 7, "Some weights not loaded from HF" + + tok = time.time() + + tensorrt_llm.logger.info("Loading weights finish in %.2fs" % (tok - tik)) + return trt_model diff --git a/examples/chatglm2-6b/README.md b/examples/chatglm2-6b/README.md deleted file mode 100644 index 87eba32d88..0000000000 --- a/examples/chatglm2-6b/README.md +++ /dev/null @@ -1,90 +0,0 @@ -# ChatGLM2-6B - -This document explains how to build the [ChatGLM2-6B](https://huggingface.co/THUDM/chatglm2-6b) model using TensorRT-LLM and run on a single GPU. - -## Overview - -The TensorRT-LLM ChatGLM2-6B implementation can be found in [`tensorrt_llm/models/chatglm2_6b/model.py`](../../tensorrt_llm/models/chatglm6b/model.py). -The TensorRT-LLM ChatGLM2-6B example code is located in [`examples/chatglm2-6b`](./). There are 3 main files in that folder: - -* [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the ChatGLM-6B model. -* [`run.py`](./run.py) to run the inference on an input text. -* [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model. - -## Usage - -The next section describe how to build the engine and run the inference demo. - -### 1. Prepare environment and download weights from HuggingFace Transformers - -```bash -apt-get update -apt-get install git-lfs -git clone https://huggingface.co/THUDM/chatglm2-6b pyTorchModel -``` - -### 2. Build TensorRT engine(s) - -+ This ChatGLM2-6B example in TensorRT-LLM builds TensorRT engine(s) using HF checkpoint directly (rather than using FT checkpoints such as GPT example). -+ If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights. 
-+ The [`build.py`](./build.py) script requires a single GPU to build the TensorRT engine(s).
-+ You can enable parallel builds to accelerate the engine building process if you have more than one GPU (of the same model) in your system.
-+ For parallel building, add the `--parallel_build` argument to the build command (this feature cannot take advantage of more than a single node).
-+ The number of TensorRT engines depends on the number of GPUs that will be used to run inference.
-
-#### Examples of build invocations:
-
-```bash
-# Build a single-GPU float16 engine using HF weights.
-# --use_gpt_attention_plugin must be used to deal with inputs of different lengths in one batch.
-# --use_gemm_plugin, --use_layernorm_plugin, --enable_context_fmha, --enable_context_fmha_fp32_acc are used to improve accuracy or performance.
-python3 build.py --dtype float16 \
-                 --use_gpt_attention_plugin float16 \
-                 --use_gemm_plugin float16
-```
-
-#### INT8 Weight Only
-
-+ Enable INT8 weight-only quantization by adding `--use_weight_only`; this significantly lowers the latency and memory footprint.
-
-#### Fused MultiHead Attention (FMHA)
-
-+ Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable FMHA kernels, which can provide better performance and lower GPU memory occupancy.
-
-+ The switch `--use_gpt_attention_plugin float16` must be used when FMHA is enabled.
-
-+ `--enable_context_fmha` uses an FP16 accumulator, which might reduce accuracy. In that case, `--enable_context_fmha_fp32_acc` should be used to protect accuracy at the cost of a small performance drop.
-
-#### In-flight batching and paged KV cache
-
-+ The engine must be built accordingly if [in-flight batching in the C++ runtime](../../docs/in_flight_batching.md) will be used.
-
-+ Use `--use_inflight_batching` to enable In-flight Batching.
-
-+ The switches `--use_gpt_attention_plugin=float16`, `--paged_kv_cache` and `--remove_input_padding` are set automatically when In-flight Batching is enabled.
-
-+ It is possible to use `--use_gpt_attention_plugin float32` with In-flight Batching.
-
-+ The block size of the paged KV cache can additionally be controlled with `--tokens_per_block=N`.
-
-### 3. Run
-
-#### Single node, single GPU
-
-Run the TensorRT-LLM ChatGLM2-6B model on a single GPU:
-
-```bash
-# Run the ChatGLM2-6B model on a single GPU.
-python3 run.py
-```
-
-Run a comparison of performance and accuracy:
-
-```bash
-# Run the summarization task.
-python3 summarize.py
-```
-
-## Benchmark
-
-+ [TODO] The TensorRT-LLM ChatGLM2-6B benchmark is located in [benchmarks/](../../benchmarks/README.md)
diff --git a/examples/chatglm2-6b/run.py b/examples/chatglm2-6b/run.py
deleted file mode 100644
index 319a0b14df..0000000000
--- a/examples/chatglm2-6b/run.py
+++ /dev/null
@@ -1,206 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
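The unified ChatGLM weight loader earlier in this patch relies on a `split_qkv` helper that is not included in the diff. Purely as an illustration of the idea, and not the actual TensorRT-LLM helper, a tensor-parallel split of a fused QKV tensor for grouped-query attention could look roughly like the sketch below; the `[Q; K; V]` row layout, the `head_dim` derivation and the divisibility assumptions are assumptions of this sketch.

```python
import torch


def split_qkv_sketch(qkv, tp_size, tp_rank, hidden_size, num_heads, num_kv_heads):
    # Illustrative sketch only. Assumes the fused tensor stacks Q, K and V along
    # the output dimension, that head_dim = hidden_size // num_heads, and that
    # num_heads and num_kv_heads are both divisible by tp_size (KV-head
    # replication for num_kv_heads < tp_size is not handled here).
    head_dim = hidden_size // num_heads
    q, k, v = torch.split(
        qkv,
        [num_heads * head_dim, num_kv_heads * head_dim, num_kv_heads * head_dim],
        dim=0)
    # Keep only this rank's rows of each component, then re-fuse them.
    q = torch.chunk(q, tp_size, dim=0)[tp_rank]
    k = torch.chunk(k, tp_size, dim=0)[tp_rank]
    v = torch.chunk(v, tp_size, dim=0)[tp_rank]
    return torch.cat([q, k, v], dim=0)
```

The same split would apply to the fused QKV bias, since it shares the output-dimension layout.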
-import argparse -import json -import os -import re -from pathlib import Path - -import torch -import transformers - -import tensorrt_llm -from tensorrt_llm.quantization import QuantMode -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - -from build import find_engines # isort:skip - -MODEL_NAME = "chatglm2-6b" - - -def parse_arguments(): - parser = argparse.ArgumentParser() - parser.add_argument('--max_output_len', type=int, default=1024) - parser.add_argument('--log_level', type=str, default='error') - parser.add_argument('--engine_dir', type=str, default='trtModel') - parser.add_argument('--beam_width', type=int, default=1) - parser.add_argument( - '--input_text', - type=str, - nargs='*', - default=[ - "What's new between ChatGLM2-6B and ChatGLM-6B?", - "Could you introduce NVIDIA Corporation for me?" - ], - ) - parser.add_argument( - '--input_tokens', - type=str, - help= - 'CSV or Numpy file containing tokenized input. Alternative to text input.', - default=None, - ) - parser.add_argument( - '--tokenizer_dir', - type=str, - default='pyTorchModel', - help='Directory containing the tokenizer model.', - ) - parser.add_argument('--temperature', type=float, default=1.0) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--top_p', type=float, default=0.0) - parser.add_argument('--random_seed', type=int, default=1) - return parser.parse_args() - - -def process_response(responseList): - for i, response in enumerate(responseList): - response = response.strip() - punkts = [ - [",", ","], - ["!", "!"], - [":", ":"], - [";", ";"], - ["\?", "?"], - ] - for item in punkts: - response = re.sub(r"([\u4e00-\u9fff])%s" % item[0], - r"\1%s" % item[1], response) - response = re.sub(r"%s([\u4e00-\u9fff])" % item[0], - r"%s\1" % item[1], response) - - responseList[i] = response - return responseList - - -if __name__ == '__main__': - args = parse_arguments() - tensorrt_llm.logger.set_level(args.log_level) - - config_path = os.path.join(args.engine_dir, 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - assert (config['builder_config']['name'] == MODEL_NAME) - dtype = config['builder_config']['precision'] - end_id = config['builder_config']['eos_token_id'] - pad_id = config['builder_config']['pad_token_id'] - max_batch_size = config['builder_config']['max_batch_size'] - use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size( - ), f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - - runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - - serialize_path = find_engines(Path(args.engine_dir), - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] - - tokenizer = transformers.AutoTokenizer.from_pretrained( - args.tokenizer_dir, trust_remote_code=True) - input_ids = None - input_text = None - if args.input_tokens is None: - input_text = args.input_text[:max_batch_size] - tokenized = tokenizer(input_text, - return_tensors="pt", - padding=True, - return_length=True) - input_ids = tokenized['input_ids'].int().contiguous().cuda() - input_lengths = tokenized['length'].int().contiguous().cuda() - else: - input_ids = [] - with open(args.input_tokens) as f_in: - for line in f_in: - for e in line.strip().split(','): - 
input_ids.append(int(e)) - input_text = "" - input_ids = torch.tensor(input_ids, - dtype=torch.int32).cuda().unsqueeze(0) - - if use_gpt_attention_plugin: - # when using gpt attention plugin, inputs needs to align at the head - input_ids_padding_right = torch.zeros_like(input_ids) + end_id - for i, sample in enumerate(input_ids): - nPadding = 0 - for token in sample: - if token == pad_id: - nPadding += 1 - else: - break - input_ids_padding_right[ - i, :len(sample[nPadding:])] = sample[nPadding:] - input_ids = input_ids_padding_right - - model_config = ModelConfig( - vocab_size=config['builder_config']['vocab_size'], - num_layers=config['builder_config']['num_layers'], - num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_kv_heads'] // world_size, - hidden_size=config['builder_config']['hidden_size'] // world_size, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=config['builder_config']['remove_input_padding'], - model_name=MODEL_NAME, - paged_kv_cache=config['builder_config']['paged_kv_cache'], - quant_mode=QuantMode(config['builder_config']['quant_mode']), - dtype=dtype, - ) - - sampling_config = SamplingConfig( - end_id=end_id, - pad_id=pad_id, - num_beams=args.beam_width, - temperature=args.temperature, - top_k=args.top_k, - top_p=args.top_p, - ) - sampling_config.random_seed = args.random_seed - - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - decoder = GenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - decoder.setup(input_ids.size(0), input_ids.size(1), args.max_output_len, - args.beam_width) - output_ids = decoder.decode(input_ids, input_lengths, sampling_config) - torch.cuda.synchronize() - - for i in range(len(output_ids.tolist())): - output_beams_list = [ - tokenizer.batch_decode(output_ids[batch_idx, :, - input_lengths[batch_idx]:], - skip_special_tokens=True) - for batch_idx in range(input_ids.size(0)) - ] - output_text = process_response(output_beams_list[i]) - end = torch.where(input_ids[i] == end_id)[0] - inputLength = int(end[0]) if len(end) > 0 else input_ids.shape[1] - print("\nInput %2d ---> len=%d\n%s" % (i, inputLength, input_text[i])) - print("\nOutput %2d --->" % i) - for j, simple_output in enumerate(output_text): - end = torch.where(output_ids[i, j, input_lengths[i]:] == end_id)[0] - outputLength = int(end[0]) if len(end) > 0 else args.max_output_len - print(" Beam %2d ---> len=%d\n%s" % - (j, outputLength, simple_output)) - - print("Finished!") diff --git a/examples/chatglm2-6b/summarize.py b/examples/chatglm2-6b/summarize.py deleted file mode 100644 index 1bc2f1870c..0000000000 --- a/examples/chatglm2-6b/summarize.py +++ /dev/null @@ -1,428 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
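One detail of the ChatGLM2/3 MLP weights handled both by the removed `examples/chatglm2-6b/weight.py` later in this patch and by the new unified loader is the SwiGLU reordering: the two halves of `mlp.dense_h_to_4h` are swapped before being assigned to the TensorRT-LLM `mlp.fc` weight. A minimal sketch of that reordering under tensor parallelism, mirroring the chunk indices used in the loaders, is shown below; which half is the gate and which is the up projection is left open here.

```python
import torch


def swap_swiglu_halves_sketch(h_to_4h_weight, tp_size, tp_rank):
    # HF stores the two SwiGLU projections stacked along the output dimension.
    # Split into 2 * tp_size row blocks, then concatenate this rank's block from
    # the second half followed by its block from the first half, as done by the
    # loaders in this patch.
    blocks = torch.chunk(h_to_4h_weight, 2 * tp_size, dim=0)
    return torch.cat([blocks[tp_rank + tp_size], blocks[tp_rank]], dim=0)
```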
-import argparse -import copy -import json -from pathlib import Path - -import evaluate -import numpy as np -import torch -from datasets import load_dataset -from transformers import AutoModel, AutoTokenizer - -import tensorrt_llm -import tensorrt_llm.profiler as profiler -from tensorrt_llm.logger import logger -from tensorrt_llm.runtime import GenerationSession, ModelConfig, SamplingConfig - -from build import find_engines # isort:skip - -MODEL_NAME = "chatglm2-6b" - - -def TRT(args, config): - - dtype = config['builder_config']['precision'] - world_size = config['builder_config']['tensor_parallel'] - assert world_size == tensorrt_llm.mpi_world_size(), \ - f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - - world_size = config['builder_config']['tensor_parallel'] - remove_input_padding = config['plugin_config']['remove_input_padding'] - - model_config = ModelConfig( - model_name=MODEL_NAME, - vocab_size=config['builder_config']['vocab_size'], - num_layers=config['builder_config']['num_layers'], - num_heads=config['builder_config']['num_heads'] // world_size, - num_kv_heads=config['builder_config']['num_heads'] // world_size, - hidden_size=config['builder_config']['hidden_size'] // world_size, - gpt_attention_plugin=bool( - config['plugin_config']['gpt_attention_plugin']), - remove_input_padding=remove_input_padding, - tokens_per_block=config['plugin_config']['tokens_per_block'], - paged_kv_cache=config['plugin_config']['paged_kv_cache'], - dtype=dtype, - use_custom_all_reduce=config['plugin_config']['use_custom_all_reduce'], - ) - - runtime_rank = tensorrt_llm.mpi_rank() - runtime_mapping = tensorrt_llm.Mapping(world_size, - runtime_rank, - tp_size=world_size) - torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - - serialize_path = find_engines(args.engine_dir, - dtype=dtype, - tp_size=world_size, - rank=runtime_rank)[0] - - tensorrt_llm.logger.set_level(args.log_level) - - with open(serialize_path, 'rb') as f: - engine_buffer = f.read() - decoder = GenerationSession( - model_config, - engine_buffer, - runtime_mapping, - ) - - return decoder - - -def main(args): - runtime_rank = tensorrt_llm.mpi_rank() - logger.set_level(args.log_level) - - test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 - test_trt_llm = args.test_trt_llm - tokenizer = AutoTokenizer.from_pretrained( - args.tokenizer, - padding_side='left', - trust_remote_code=True, - ) - - if args.eval_type == 'code_completion': - dataset_name = "openai_humaneval" - dataset_revision = None - dataset_input_key = 'prompt' - dataset_output_key = 'canonical_solution' - elif args.eval_type == 'summarize': - dataset_name = "ccdv/cnn_dailymail" - dataset_revision = "3.0.0" - dataset_input_key = 'article' - dataset_output_key = 'highlights' - args.dataset_path.mkdir(parents=True, exist_ok=True) - dataset = load_dataset(dataset_name, - dataset_revision, - cache_dir=args.dataset_path) - - config_path = str(args.engine_dir / 'config.json') - with open(config_path, 'r') as f: - config = json.load(f) - - max_batch_size = args.batch_size - - # runtime parameters - # repetition_penalty = 1 - top_k = args.top_k - output_len = args.output_len - test_token_num = 800 - # top_p = 0.0 - # random_seed = 5 - temperature = 1 - num_beams = args.num_beams - length_penalty = args.length_penalty - - pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] - end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] - - if test_trt_llm: - 
tensorrt_llm_gpt = TRT(args, config) - - if test_hf: - model = AutoModel.from_pretrained( - args.hf_model_location, - trust_remote_code=True, - ) - model.cuda() - if args.data_type == 'fp16': - model.half() - - def eval_tensorrt_llm(datapoint, eval_type='summarize'): - batch_size = len(datapoint) - append_str = ' TL;DR: ' if eval_type == 'summarize' else '' - line = copy.copy(datapoint) - line_encoded = [] - input_lengths = [] - for i in range(batch_size): - line[i] = line[i] + append_str - - line[i] = line[i].strip() - line[i] = line[i].replace(" n't", "n't") - - input_id = tokenizer.encode( - line[i], - return_tensors='pt', - ).type(torch.int32) - input_id = input_id[:, :test_token_num] - - line_encoded.append(input_id) - input_lengths.append(input_id.shape[-1]) - - max_length = max(input_lengths) - - if tensorrt_llm_gpt.remove_input_padding: - line_encoded = [t.to(torch.int32).cuda() for t in line_encoded] - else: - # do padding, should move outside the profiling to prevent the overhead - for i in range(batch_size): - pad_size = max_length - input_lengths[i] - - pad = torch.ones([1, pad_size], dtype=torch.int32) * pad_id - line_encoded[i] = torch.cat( - [line_encoded[i].to(torch.int32), pad], axis=-1) - - line_encoded = torch.cat(line_encoded, axis=0).cuda() - input_lengths = torch.tensor(input_lengths, - dtype=torch.int32).cuda() - - sampling_config = SamplingConfig( - end_id=end_id, - pad_id=pad_id, - top_k=top_k, - num_beams=num_beams, - length_penalty=length_penalty, - ) - - with torch.no_grad(): - tensorrt_llm_gpt.setup(batch_size, - max_context_length=max_length, - max_new_tokens=output_len, - beam_width=num_beams) - - if tensorrt_llm_gpt.remove_input_padding: - output_ids = tensorrt_llm_gpt.decode_batch( - line_encoded, sampling_config) - else: - output_ids = tensorrt_llm_gpt.decode( - line_encoded, - input_lengths, - sampling_config, - ) - - torch.cuda.synchronize() - - # Extract a list of tensors of shape beam_width x output_ids. - if tensorrt_llm_gpt.mapping.is_first_pp_rank(): - output_beams_list = [ - tokenizer.batch_decode(output_ids[batch_idx, :, - input_lengths[batch_idx]:], - skip_special_tokens=True) - for batch_idx in range(batch_size) - ] - return output_beams_list, output_ids[:, :, max_length:].tolist() - return [], [] - - def eval_hf(datapoint, eval_type='summarize'): - batch_size = len(datapoint) - append_str = ' TL;DR: ' if eval_type == 'summarize' else '' - if batch_size > 1: - logger.warning( - f"HF does not support batch_size > 1 to verify correctness due to padding and attention mask. 
Current batch size is {batch_size}" - ) - - line = copy.copy(datapoint) - line_encoded = [] - input_lengths = [] - for i in range(batch_size): - line[i] = line[i] + append_str - - line[i] = line[i].strip() - line[i] = line[i].replace(" n't", "n't") - - input_id = tokenizer.encode( - line[i], - return_tensors='pt', - ).type(torch.int64) - input_id = input_id[:, :test_token_num] - - line_encoded.append(input_id) - input_lengths.append(input_id.shape[-1]) - - max_length = max(input_lengths) - - for i in range(batch_size): - pad_size = max_length - input_lengths[i] - - pad = torch.ones([1, pad_size], dtype=torch.int64) * pad_id - line_encoded[i] = torch.cat([pad, line_encoded[i].to(torch.int64)], - axis=-1) - - line_encoded = torch.cat(line_encoded, axis=0).cuda() - - with torch.no_grad(): - output = model.generate(line_encoded, - max_length=len(line_encoded[0]) + - output_len, - top_k=top_k, - temperature=temperature, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - num_beams=num_beams, - num_return_sequences=num_beams, - early_stopping=True, - length_penalty=length_penalty) - - tokens_list = output[:, len(line_encoded[0]):].tolist() - output = output.reshape([batch_size, num_beams, -1]) - output_lines_list = [ - tokenizer.batch_decode(output[:, i, len(line_encoded[0]):], - skip_special_tokens=True) - for i in range(num_beams) - ] - - return output_lines_list, tokens_list - - if test_trt_llm: - datapoint = dataset['test'][0:1] - output, _ = eval_tensorrt_llm(datapoint[dataset_input_key], - eval_type=args.eval_type) - if runtime_rank == 0: - logger.info( - "---------------------------------------------------------") - logger.info("TensorRT-LLM Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info( - "---------------------------------------------------------") - - if test_hf: - datapoint = dataset['test'][0:1] - output, _ = eval_hf(datapoint[dataset_input_key], - eval_type=args.eval_type) - logger.info("---------------------------------------------------------") - logger.info("HF Generated : ") - logger.info(f" Input : {datapoint[dataset_input_key]}") - logger.info(f"\n Reference : {datapoint[dataset_output_key]}") - logger.info(f"\n Output : {output}") - logger.info("---------------------------------------------------------") - - metric_tensorrt_llm = [evaluate.load("rouge") for _ in range(num_beams)] - metric_hf = [evaluate.load("rouge") for _ in range(num_beams)] - for i in range(num_beams): - metric_tensorrt_llm[i].seed = 0 - metric_hf[i].seed = 0 - - ite_count = 0 - data_point_idx = 0 - while (data_point_idx < len(dataset['test'])) and (ite_count < - args.max_ite): - if runtime_rank == 0: - logger.debug( - f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" - ) - datapoint = dataset['test'][data_point_idx:(data_point_idx + - max_batch_size)] - - if test_trt_llm: - profiler.start('tensorrt_llm') - output_tensorrt_llm, _ = eval_tensorrt_llm( - datapoint[dataset_input_key]) - profiler.stop('tensorrt_llm') - - if test_hf: - profiler.start('hf') - output_hf, _ = eval_hf(datapoint[dataset_input_key]) - profiler.stop('hf') - - if runtime_rank == 0: - if test_trt_llm: - for batch_idx in range(len(output_tensorrt_llm)): - for beam_idx in range(num_beams): - metric_tensorrt_llm[beam_idx].add_batch( - predictions=[ - output_tensorrt_llm[batch_idx][beam_idx] - ], - references=[ - 
datapoint[dataset_output_key][batch_idx] - ]) - if test_hf: - for beam_idx in range(num_beams): - for batch_idx in range(len(output_hf[beam_idx])): - metric_hf[beam_idx].add_batch( - predictions=[output_hf[beam_idx][batch_idx]], - references=[ - datapoint[dataset_output_key][batch_idx] - ]) - - logger.debug('-' * 100) - logger.debug(f"Input : {datapoint[dataset_input_key]}") - if test_trt_llm: - logger.debug(f'TensorRT-LLM Output: {output_tensorrt_llm}') - if test_hf: - logger.debug(f'HF Output: {output_hf}') - logger.debug(f"highlights : {datapoint[dataset_output_key]}") - - data_point_idx += max_batch_size - ite_count += 1 - - if runtime_rank == 0: - if test_trt_llm: - np.random.seed(0) # rouge score use sampling to compute the score - logger.info( - f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' - ) - for beam_idx in range(num_beams): - logger.info(f"TensorRT-LLM beam {beam_idx} result") - computed_metrics_tensorrt_llm = metric_tensorrt_llm[ - beam_idx].compute() - for key in computed_metrics_tensorrt_llm.keys(): - logger.info( - f' {key} : {computed_metrics_tensorrt_llm[key] * 100}') - - if args.check_accuracy and beam_idx == 0: - assert computed_metrics_tensorrt_llm[ - 'rouge1'] * 100 > args.tensorrt_llm_rouge1_threshold - if test_hf: - np.random.seed(0) # rouge score use sampling to compute the score - logger.info( - f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)' - ) - for beam_idx in range(num_beams): - logger.info(f"HF beam {beam_idx} result") - computed_metrics_hf = metric_hf[beam_idx].compute() - for key in computed_metrics_hf.keys(): - logger.info(f' {key} : {computed_metrics_hf[key] * 100}') - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument('--hf_model_location', type=str, default='pyTorchModel') - parser.add_argument( - '--tokenizer', - default=None, - help='tokenizer path; defaults to hf_model_location if left unspecified' - ) - parser.add_argument('--test_hf', action='store_true', default=True) - parser.add_argument('--test_trt_llm', action='store_true', default=True) - parser.add_argument('--data_type', - type=str, - choices=['fp32', 'fp16'], - default='fp16') - parser.add_argument('--dataset_path', type=Path, default='dataset') - parser.add_argument('--log_level', type=str, default='info') - parser.add_argument('--engine_dir', type=Path, default='trtModel') - parser.add_argument('--batch_size', type=int, default=1) - parser.add_argument('--max_ite', type=int, default=20) - parser.add_argument('--output_len', type=int, default=100) - parser.add_argument('--check_accuracy', action='store_true', default=True) - parser.add_argument('--tensorrt_llm_rouge1_threshold', - type=float, - default=15.0) - parser.add_argument('--num_beams', type=int, default=1) - parser.add_argument('--top_k', type=int, default=1) - parser.add_argument('--eval_type', - type=str, - default='summarize', - choices=['summarize', 'code_completion']) - parser.add_argument('--length_penalty', type=float, default=1.0) - - args = parser.parse_args() - if args.tokenizer == None: - args.tokenizer = args.hf_model_location - main(args) diff --git a/examples/chatglm2-6b/weight.py b/examples/chatglm2-6b/weight.py deleted file mode 100644 index 6aa8996790..0000000000 --- a/examples/chatglm2-6b/weight.py +++ /dev/null @@ -1,131 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -import time - -import numpy as np -import torch - -import tensorrt_llm -from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy -from tensorrt_llm.quantization import QuantMode - - -def load_from_hf( - tensorrt_llm_model, - hf_model, - mapping=None, - dtype="float32", - multi_query_mode=False, -): - tensorrt_llm.logger.info("Loading weights from HF ChatGLM2-6B") - tik = time.time() - - quant_mode = getattr(tensorrt_llm_model, 'quant_mode', QuantMode(0)) - if quant_mode.is_int8_weight_only(): - plugin_weight_only_quant_type = torch.int8 - elif quant_mode.is_int4_weight_only(): - plugin_weight_only_quant_type = torch.quint4x2 - use_weight_only = quant_mode.is_weight_only() - - torch_type = str_dtype_to_torch(dtype) - tensorrt_llm_model.embedding.weight.value = torch_to_numpy( - hf_model.transformer.embedding.word_embeddings.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.encoder.final_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.encoder.final_layernorm.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.lm_head.weight.value = torch_to_numpy( - hf_model.transformer.output_layer.weight.to(torch_type).detach().cpu()) - - def load_quant_weight(src, value_dst, scale_dst, - plugin_weight_only_quant_type): - v = np.ascontiguousarray(src.transpose()) - processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( - torch.tensor(v), plugin_weight_only_quant_type) - value_dst.value = torch_to_numpy(processed_torch_weights) - scale_dst.value = torch_to_numpy(torch_weight_scales) - - for i in range(28): - tensorrt_llm_model.encoder.layers[ - i].input_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].input_layernorm.weight. - to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].post_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].post_attention_layernorm. - weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].self_attention.qkv.bias.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention. - query_key_value.bias.to(torch_type).detach().cpu()) - # swap first and second half weight columns to adapt trt_llm Swiglu - h_to_4h_weight = hf_model.transformer.encoder.layers[ - i].mlp.dense_h_to_4h.weight.to(torch_type).detach().cpu() - h_to_4h_weight = torch.split(h_to_4h_weight, - h_to_4h_weight.shape[0] // 2, 0) - h_to_4h_weight = torch_to_numpy(torch.concat(h_to_4h_weight[::-1], 0)) - if use_weight_only: - load_quant_weight( - src=h_to_4h_weight, - value_dst=tensorrt_llm_model.encoder.layers[i].mlp.fc.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].mlp.fc. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.encoder.layers[i].mlp.dense_4h_to_h. 
- weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.encoder.layers[i].mlp.proj.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].mlp.proj. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention. - query_key_value.weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - qkv.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - qkv.per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention.dense. - weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - dense.weight, - scale_dst=tensorrt_llm_model.encoder.layers[i].self_attention. - dense.per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - - else: - tensorrt_llm_model.encoder.layers[ - i].self_attention.qkv.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention. - query_key_value.weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].self_attention.dense.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].self_attention.dense. - weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.encoder.layers[ - i].mlp.fc.weight.value = h_to_4h_weight - tensorrt_llm_model.encoder.layers[ - i].mlp.proj.weight.value = torch_to_numpy( - hf_model.transformer.encoder.layers[i].mlp.dense_4h_to_h. - weight.to(torch_type).detach().cpu()) - - tok = time.time() - tensorrt_llm.logger.info("Loading weights finish in %.2fs" % (tok - tik)) - return tensorrt_llm_model diff --git a/examples/chatglm6b/.gitignore b/examples/chatglm6b/.gitignore deleted file mode 100644 index baa5534912..0000000000 --- a/examples/chatglm6b/.gitignore +++ /dev/null @@ -1,5 +0,0 @@ -__pycache__/ -pyTorchModel/ -trtModel/ -dataset/ -.vscode/ diff --git a/examples/chatglm6b/README.md b/examples/chatglm6b/README.md deleted file mode 100644 index acdecc16ca..0000000000 --- a/examples/chatglm6b/README.md +++ /dev/null @@ -1,85 +0,0 @@ -# ChatGLM-6B - -This document explains how to build the [ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) model using TensorRT-LLM and run on a single GPU - -## Overview - -The TensorRT-LLM ChatGLM-6B implementation can be found in [`tensorrt_llm/models/chatglm6b/model.py`](../../tensorrt_llm/models/chatglm6b/model.py). -The TensorRT-LLM ChatGLM-6B example code is located in [`examples/chatglm6b`](./). There are 3 main files in that folder: - -* [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the ChatGLM-6B model. -* [`run.py`](./run.py) to run the inference on an input text. -* [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model. - -## Usage - -### 1. Prepare environment and download weights from HuggingFace Transformers - -```bash -pip install -r requirements.txt -apt-get update -apt-get install git-lfs -git clone https://huggingface.co/THUDM/chatglm-6b pyTorchModel -``` - -### 2. Build TensorRT engine(s) - -+ This ChatGLM-6B example in TensorRT-LLM builds TensorRT engine(s) using HF checkpoint directly (rather than using FT checkpoints such as GPT example). 
-+ If no checkpoint directory is specified, TensorRT-LLM will build engine(s) using dummy weights.
-+ The [`build.py`](./build.py) script requires a single GPU to build the TensorRT engine(s).
-+ You can enable parallel builds to accelerate the engine building process if you have more than one GPU (of the same model) in your system.
-+ For parallel building, add the `--parallel_build` argument to the build command (this feature cannot take advantage of more than a single node).
-+ The number of TensorRT engines depends on the number of GPUs that will be used to run inference.
-
-#### Examples of build invocations:
-
-```bash
-# Build a single-GPU float16 engine using HF weights.
-# --use_gpt_attention_plugin must be used to deal with inputs of different lengths in one batch.
-# --use_gemm_plugin, --use_layernorm_plugin, --enable_context_fmha, --enable_context_fmha_fp32_acc are used to improve accuracy or performance.
-python3 build.py --dtype float16 \
-                 --use_gpt_attention_plugin float16 \
-                 --use_gemm_plugin float16
-```
-
-#### Fused MultiHead Attention (FMHA)
-
-+ Use `--enable_context_fmha` or `--enable_context_fmha_fp32_acc` to enable FMHA kernels, which can provide better performance and lower GPU memory occupancy.
-
-+ The switch `--use_gpt_attention_plugin float16` must be used when FMHA is enabled.
-
-+ `--enable_context_fmha` uses an FP16 accumulator, which might reduce accuracy. In that case, `--enable_context_fmha_fp32_acc` should be used to protect accuracy at the cost of a small performance drop.
-
-#### In-flight batching and paged KV cache
-
-+ The engine must be built accordingly if [in-flight batching in the C++ runtime](../../docs/in_flight_batching.md) will be used.
-
-+ Use `--use_inflight_batching` to enable In-flight Batching.
-
-+ The switches `--use_gpt_attention_plugin=float16`, `--paged_kv_cache` and `--remove_input_padding` are set automatically when In-flight Batching is enabled.
-
-+ It is possible to use `--use_gpt_attention_plugin float32` with In-flight Batching.
-
-+ The block size of the paged KV cache can additionally be controlled with `--tokens_per_block=N`.
-
-### 3. Run
-
-#### Single node, single GPU
-
-Run the TensorRT-LLM ChatGLM-6B model on a single GPU:
-
-```bash
-# Run the ChatGLM-6B model on a single GPU.
-python3 run.py
-```
-
-Run a comparison of performance and accuracy:
-
-```bash
-# Run the summarization task.
-python3 summarize.py
-```
-
-## Benchmark
-
-+ [TODO] The TensorRT-LLM ChatGLM-6B benchmark is located in [benchmarks/](../../benchmarks/README.md)
diff --git a/examples/chatglm6b/build.py b/examples/chatglm6b/build.py
deleted file mode 100644
index 68301f0131..0000000000
--- a/examples/chatglm6b/build.py
+++ /dev/null
@@ -1,518 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
-# SPDX-License-Identifier: Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
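Because `parse_arguments` in the removed build script below accepts an explicit argument list, the ChatGLM-6B build could also be driven programmatically rather than from the command line. The following usage sketch assumes it is run from `examples/chatglm6b` with the HF checkpoint cloned into `pyTorchModel` as described in the README above; the flag values are simply the combination documented there.

```python
# Hypothetical programmatic invocation of the (removed) examples/chatglm6b/build.py,
# assuming the ChatGLM-6B HF checkpoint is available in ./pyTorchModel.
from build import run_build

run_build([
    "--dtype", "float16",
    "--use_gpt_attention_plugin", "float16",
    "--use_gemm_plugin", "float16",
    # Optionally append "--use_weight_only" for INT8 weight-only quantization.
])
```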
-import argparse -import json -import time -from pathlib import Path -from typing import List - -import torch -import torch.multiprocessing as mp -import transformers -from weight import load_from_hf - -import tensorrt_llm -from tensorrt_llm.builder import Builder -from tensorrt_llm.logger import logger -from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (ChatGLM6BHeadModel, smooth_quantize, - weight_only_quantize) -from tensorrt_llm.network import net_guard -from tensorrt_llm.plugin.plugin import ContextFMHAType -from tensorrt_llm.quantization import QuantMode - -MODEL_NAME = "chatglm-6b" - - -def get_engine_name(model, dtype, tp_size, rank): - return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) - - -def find_engines(dir: Path, - model_name: str = "*", - dtype: str = "*", - tp_size: str = "*", - rank: str = "*") -> List[Path]: - template = f"{model_name}_{dtype}_tp{tp_size}_rank{rank}.engine" - return list(dir.glob(template)) - - -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - with open(path, 'wb') as f: - f.write(bytearray(engine)) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def parse_arguments(args): - parser = argparse.ArgumentParser() - parser.add_argument('--world_size', - type=int, - default=1, - help='world size, only support tensor parallelism now') - parser.add_argument('--model_dir', type=str, default="pyTorchModel") - parser.add_argument('--dtype', - type=str, - default='float16', - choices=['float32', 'float16', 'bfloat16']) - parser.add_argument( - '--timing_cache', - type=str, - default='model.cache', - help= - 'The path of to read timing cache from, will be ignored if the file does not exist' - ) - parser.add_argument( - '--log_level', - type=str, - default='verbose', - choices=['verbose', 'info', 'warning', 'error', 'internal_error']) - parser.add_argument('--max_batch_size', type=int, default=8) - parser.add_argument('--max_input_len', type=int, default=1024) - parser.add_argument('--max_output_len', type=int, default=1024) - parser.add_argument('--max_beam_width', type=int, default=1) - parser.add_argument( - '--use_gpt_attention_plugin', - nargs='?', - const='float16', - default='float16', - choices=['float32', 'float16', 'bfloat16', False], - help= - "Activates attention plugin. You can specify the plugin dtype or leave blank to use the model dtype." - ) - parser.add_argument( - '--use_gemm_plugin', - nargs='?', - const='float16', - type=str, - default='float16', - choices=['float32', 'float16', 'bfloat16', False], - help= - "Activates GEMM plugin. You can specify the plugin dtype or leave blank to use the model dtype." - ) - parser.add_argument( - '--use_layernorm_plugin', - nargs='?', - const='float16', - type=str, - default='float16', - choices=['float32', 'float16', 'bfloat16', False], - help= - "Activates layernorm plugin. 
You can specify the plugin dtype or leave blank to use the model dtype.", - ) - parser.add_argument('--gather_all_token_logits', - action='store_true', - default=False) - parser.add_argument('--parallel_build', default=False, action='store_true') - parser.add_argument('--enable_context_fmha', - default=False, - action='store_true') - parser.add_argument('--enable_context_fmha_fp32_acc', - default=False, - action='store_true') - parser.add_argument('--gpus_per_node', type=int, default=8) - parser.add_argument('--builder_opt', type=int, default=None) - parser.add_argument( - '--output_dir', - type=Path, - default='trtModel', - help= - 'The path to save the serialized engine files, timing cache file and model configs' - ) - parser.add_argument('--remove_input_padding', - default=False, - action='store_true') - parser.add_argument( - '--use_inflight_batching', - action="store_true", - default=False, - help="Activates inflight batching mode of gptAttentionPlugin.") - - # Arguments related to the quantization of the model. - parser.add_argument( - '--use_smooth_quant', - default=False, - action="store_true", - help= - 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' - 'See --per_channel and --per_token for finer-grained quantization options.' - ) - parser.add_argument( - '--use_weight_only', - default=False, - action="store_true", - help='Quantize weights for the various GEMMs to INT4/INT8.' - 'See --weight_only_precision to set the precision') - parser.add_argument( - '--weight_only_precision', - const='int8', - type=str, - nargs='?', - default='int8', - choices=['int8', 'int4'], - help= - 'Define the precision for the weights when using weight-only quantization.' - 'You must also use --use_weight_only for that argument to have an impact.' - ) - parser.add_argument( - '--per_channel', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor for the GEMM\'s result. ' - 'per_channel instead uses a different static scaling factor for each channel. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--per_token', - default=False, - action="store_true", - help= - 'By default, we use a single static scaling factor to scale activations in the int8 range. ' - 'per_token chooses at run time, and for each token, a custom scaling factor. ' - 'The latter is usually more accurate, but a little slower.') - parser.add_argument( - '--int8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' - ) - parser.add_argument( - '--random_seed', - type=int, - default=None, - help= - 'Seed to use when initializing the random number generator for torch.') - parser.add_argument( - '--paged_kv_cache', - action="store_true", - default=False, - help= - 'By default we use contiguous KV cache. By setting this flag you enable paged KV cache' - ) - parser.add_argument('--tokens_per_block', - type=int, - default=64, - help='Number of tokens per block in paged KV cache') - - parser.add_argument( - '--enable_fp8', - default=False, - action='store_true', - ) - parser.add_argument( - '--fp8_kv_cache', - default=False, - action="store_true", - help= - 'By default, we use dtype for KV cache. 
fp8_kv_cache chooses fp8 quantization for KV' - ) - parser.add_argument( - '--max_num_tokens', - type=int, - default=None, - help='Define the max number of tokens supported by the engine') - parser.add_argument( - '--strongly_typed', - default=False, - action="store_true", - help= - 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' - ) - parser.add_argument( - '--use_custom_all_reduce', - action='store_true', - help= - 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') - args = parser.parse_args(args) - logger.set_level(args.log_level) - - args.apply_query_key_layer_scaling = False # always False in TRT-LLM - args.bias = True - args.hidden_act = 'gelu' - args.multi_block_mode = False - args.multi_query_mode = False # always False in ChatGLM-6B - - plugins_args = [ - 'use_gpt_attention_plugin', - 'use_gemm_plugin', - 'use_layernorm_plugin', - ] - - for plugin_arg in plugins_args: - if getattr(args, plugin_arg) is None: - logger.info( - f"{plugin_arg} set, without specifying a value. Using {args.dtype} automatically." - ) - setattr(args, plugin_arg, args.dtype) - - assert args.model_dir is not None - with open(Path(args.model_dir) / "config.json", "r") as f: - js = json.loads(f.read()) - assert js["_name_or_path"] == "THUDM/" + MODEL_NAME - assert args.max_input_len < js["max_sequence_length"] - - args.eos_token_id = js["eos_token_id"] - args.ffn_hidden_size = js["inner_hidden_size"] - args.gmask_token_id = js["gmask_token_id"] - args.hidden_size = js["hidden_size"] - args.layernorm_epsilon = js["layernorm_epsilon"] - args.mask_token_id = js["mask_token_id"] - args.max_seq_length = min(args.max_input_len + args.max_output_len, - js["max_sequence_length"]) - args.num_heads = js["num_attention_heads"] - args.num_kv_heads = js["num_attention_heads"] - args.num_layers = js["num_layers"] - args.pad_token_id = js["pad_token_id"] - args.use_cache = js["use_cache"] - args.vocab_size = js["vocab_size"] - - if args.use_inflight_batching: - if not args.use_gpt_attention_plugin: - args.use_gpt_attention_plugin = 'float16' - logger.info( - f"Using GPT attention plugin for inflight batching mode. Setting to default '{args.use_gpt_attention_plugin}'" - ) - if not args.remove_input_padding: - args.remove_input_padding = True - logger.info( - "Using remove input padding for inflight batching mode.") - if not args.paged_kv_cache: - args.paged_kv_cache = True - logger.info("Using paged KV cache for inflight batching mode.") - - assert not ( - args.use_smooth_quant and args.use_weight_only - ), "You cannot enable both SmoothQuant and INT8 weight-only together." 
- - if args.use_smooth_quant: - args.quant_mode = QuantMode.use_smooth_quant(args.per_token, - args.per_channel) - elif args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') - else: - args.quant_mode = QuantMode(0) - - if args.int8_kv_cache: - args.quant_mode = args.quant_mode.set_int8_kv_cache() - - if args.fp8_kv_cache: - assert ( - args.use_gpt_attention_plugin or args.use_inflight_batching - ), "You have to use GPT attention plugin when fp8 KV cache is set" - args.quant_mode = args.quant_mode.set_fp8_kv_cache() - - if args.enable_fp8: - args.quant_mode = args.quant_mode.set_fp8_qdq() - - if args.max_num_tokens is not None: - assert args.enable_context_fmha - - return args - - -def build_rank_engine(builder: Builder, - builder_config: tensorrt_llm.builder.BuilderConfig, - engine_name, rank, args): - ''' - @brief: Build the engine on the given rank. - @param rank: The rank to build the engine. - @param args: The cmd line arguments. - @return: The built engine. - ''' - - # Initialize Module - args.mapping = Mapping( - world_size=args.world_size, - rank=rank, - tp_size=args.world_size, - ) - trtllm_model = ChatGLM6BHeadModel(args=args) - - if args.use_smooth_quant: - trtllm_model = smooth_quantize(trtllm_model, args.quant_mode) - elif args.use_weight_only: - trtllm_model = weight_only_quantize(trtllm_model, args.quant_mode) - - if args.model_dir is not None: - hf_model = transformers.AutoModel.from_pretrained( - args.model_dir, trust_remote_code=True).cpu() - trtllm_model = load_from_hf( - trtllm_model, - hf_model, - mapping=args.mapping, - dtype=args.dtype, - max_seq_length=args.max_seq_length, - ) - del hf_model - - # Module -> Network - network = builder.create_network() - network.trt_network.name = engine_name - if args.use_gpt_attention_plugin: - network.plugin_config.set_gpt_attention_plugin( - dtype=args.use_gpt_attention_plugin) - if args.use_gemm_plugin: - network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) - if args.use_layernorm_plugin: - network.plugin_config.set_layernorm_plugin( - dtype=args.use_layernorm_plugin) - assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) - if args.enable_context_fmha: - network.plugin_config.set_context_fmha(ContextFMHAType.enabled) - if args.enable_context_fmha_fp32_acc: - network.plugin_config.set_context_fmha( - ContextFMHAType.enabled_with_fp32_acc) - if args.remove_input_padding: - network.plugin_config.enable_remove_input_padding() - if args.paged_kv_cache: - network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) - - # Quantization plugins. 
- if args.use_smooth_quant: - network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) - network.plugin_config.set_layernorm_quantization_plugin( - dtype=args.dtype) - - network.plugin_config.set_quantize_tensor_plugin() - network.plugin_config.set_quantize_per_token_plugin() - elif args.use_weight_only: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.dtype) - - if args.world_size > 1: - network.plugin_config.set_nccl_plugin(args.dtype, - args.use_custom_all_reduce) - - with net_guard(network): - # Prepare - network.set_named_parameters(trtllm_model.named_parameters()) - - # Forward - inputs = trtllm_model.prepare_inputs( - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_new_tokens=args.max_output_len, - use_cache=True, - max_beam_width=args.max_beam_width, - ) - trtllm_model(*inputs) - - tensorrt_llm.graph_rewriting.optimize(network) - - engine = None - - # Network -> Engine - engine = builder.build_engine(network, builder_config) - if rank == 0: - config_path = args.output_dir / 'config.json' - builder.save_config(builder_config, config_path) - return engine - - -def build(rank, args): - torch.cuda.set_device(rank % args.gpus_per_node) - tensorrt_llm.logger.set_level(args.log_level) - args.output_dir.mkdir(parents=True, exist_ok=True) - timing_cache_file = args.output_dir / "model.cache" - timing_cache = timing_cache_file - - builder = Builder() - - for cur_rank in range(args.world_size): - # skip other ranks if parallel_build is enabled - if args.parallel_build and cur_rank != rank: - continue - builder_config = builder.create_builder_config( - precision=args.dtype, - timing_cache=timing_cache, - tensor_parallel=args.world_size, - int8=(args.quant_mode.has_act_or_weight_quant() - or args.quant_mode.has_int8_kv_cache()), - fp8=args.enable_fp8, - strongly_typed=args.strongly_typed, - opt_level=args.builder_opt, - apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - eos_token_id=args.eos_token_id, - gather_all_token_logits=args.gather_all_token_logits, - hidden_act=args.hidden_act, - hidden_size=args.hidden_size, - max_batch_size=args.max_batch_size, - max_input_len=args.max_input_len, - max_num_tokens=args.max_output_len + args.max_input_len, - max_output_len=args.max_output_len, - max_position_embeddings=args.max_seq_length, - multi_query_mode=args.multi_query_mode, - name=MODEL_NAME, - num_heads=args.num_heads, - num_kv_heads=args.num_heads, - num_layers=args.num_layers, - pad_token_id=args.pad_token_id, - paged_kv_cache=args.paged_kv_cache, - parallel_build=args.parallel_build, - quant_mode=int(args.quant_mode), - remove_input_padding=args.remove_input_padding, - vocab_size=args.vocab_size, - ) - - engine_name = get_engine_name(MODEL_NAME, args.dtype, args.world_size, - cur_rank) - engine = build_rank_engine(builder, builder_config, engine_name, - cur_rank, args) - assert engine is not None, f'Failed to build engine for rank {cur_rank}' - - if cur_rank == 0: - # Use in-memory timing cache for multiple builder passes. - if not args.parallel_build: - timing_cache = builder_config.trt_builder_config.get_timing_cache( - ) - - serialize_engine(engine, args.output_dir / engine_name) - - if rank == 0: - ok = builder.save_timing_cache(builder_config, timing_cache_file) - assert ok, "Failed to save timing cache." 
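As a small worked example of the engine naming implemented by `get_engine_name` and `find_engines` in this removed build script (illustrative values only, not part of the patch):

```python
from pathlib import Path

# With MODEL_NAME = "chatglm-6b", dtype = "float16" and world_size = 2, the build
# loop above serializes one engine per rank into the output directory, e.g.
#   chatglm-6b_float16_tp2_rank0.engine
#   chatglm-6b_float16_tp2_rank1.engine
engine_names = [get_engine_name("chatglm-6b", "float16", 2, rank) for rank in range(2)]

# run.py and summarize.py later locate them with the matching glob template.
engine_paths = find_engines(Path("trtModel"),
                            model_name="chatglm-6b",
                            dtype="float16",
                            tp_size=2)
```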
- - -def run_build(args=None): - args = parse_arguments(args) - - if args.random_seed is not None: - torch.manual_seed(args.random_seed) - - logger.set_level(args.log_level) - tik = time.time() - if args.parallel_build and args.world_size > 1 and \ - torch.cuda.device_count() >= args.world_size: - logger.warning( - f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' - ) - mp.spawn(build, nprocs=args.world_size, args=(args, )) - else: - args.parallel_build = False - logger.info('Serially build TensorRT engines.') - build(0, args) - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Total time of building all {args.world_size} engines: {t}') - - -if __name__ == '__main__': - run_build() diff --git a/examples/chatglm6b/requirements.txt b/examples/chatglm6b/requirements.txt deleted file mode 100644 index 140929584e..0000000000 --- a/examples/chatglm6b/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -datasets~=2.14.5 -evaluate -protobuf -rouge_score~=0.1.2 -sentencepiece diff --git a/examples/chatglm6b/weight.py b/examples/chatglm6b/weight.py deleted file mode 100644 index a114be676e..0000000000 --- a/examples/chatglm6b/weight.py +++ /dev/null @@ -1,133 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
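All of the weight-only code paths in this patch, including the removed loader below, funnel INT8/INT4 conversion through `torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix`. The sketch below is only a conceptual stand-in for the per-channel symmetric INT8 case, to make the value/scale split readable; the real op appears to also return the weights preprocessed into the layout the weight-only GEMM plugin expects, which is not reproduced here.

```python
import torch


def symmetric_int8_per_channel_sketch(w: torch.Tensor):
    # Conceptual sketch for a 2D tensor of shape [rows, out_channels]: one scale
    # per output channel (the last axis), scale = max(|w|) / 127, with values
    # rounded and clamped to the symmetric int8 range.
    scales = w.abs().amax(dim=0).clamp_min(1e-8) / 127.0
    q = torch.clamp(torch.round(w / scales), -127, 127).to(torch.int8)
    return q, scales
```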
-import time - -import numpy as np -import torch - -import tensorrt_llm -from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy -from tensorrt_llm.quantization import QuantMode - - -def load_from_hf( - tensorrt_llm_model, - hf_model, - mapping=None, - dtype="float32", - max_seq_length=2048, - multi_query_mode=False, -): - tensorrt_llm.logger.info("Loading weights from HF ChatGLM-6B") - tik = time.time() - - quant_mode = getattr(tensorrt_llm_model, 'quant_mode', QuantMode(0)) - if quant_mode.is_int8_weight_only(): - plugin_weight_only_quant_type = torch.int8 - elif quant_mode.is_int4_weight_only(): - plugin_weight_only_quant_type = torch.quint4x2 - use_weight_only = quant_mode.is_weight_only() - - torch_type = str_dtype_to_torch(dtype) - tensorrt_llm_model.embedding.weight.value = torch_to_numpy( - hf_model.transformer.word_embeddings.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.final_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.final_layernorm.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.final_layernorm.bias.value = torch_to_numpy( - hf_model.transformer.final_layernorm.bias.to(torch_type).detach().cpu()) - tensorrt_llm_model.lm_head.weight.value = torch_to_numpy( - hf_model.lm_head.weight.to(torch_type).detach().cpu()) - - def load_quant_weight(src, value_dst, scale_dst, - plugin_weight_only_quant_type): - v = np.ascontiguousarray(src.transpose()) - processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( - torch.tensor(v), plugin_weight_only_quant_type) - value_dst.value = torch_to_numpy(processed_torch_weights) - scale_dst.value = torch_to_numpy(torch_weight_scales) - - for i in range(28): - tensorrt_llm_model.layers[ - i].input_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].input_layernorm.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[ - i].input_layernorm.bias.value = torch_to_numpy( - hf_model.transformer.layers[i].input_layernorm.bias.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[ - i].post_layernorm.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].post_attention_layernorm.weight. - to(torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].post_layernorm.bias.value = torch_to_numpy( - hf_model.transformer.layers[i].post_attention_layernorm.bias.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].attention.qkv.bias.value = torch_to_numpy( - hf_model.transformer.layers[i].attention.query_key_value.bias.to( - torch_type).detach().cpu()) - if use_weight_only: - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_h_to_4h.weight.to( - torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].mlp.fc.weight, - scale_dst=tensorrt_llm_model.layers[i].mlp.fc.per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_4h_to_h.weight.to( - torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].mlp.proj.weight, - scale_dst=tensorrt_llm_model.layers[i].mlp.proj. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].attention.query_key_value. - weight.to(torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].attention.qkv.weight, - scale_dst=tensorrt_llm_model.layers[i].attention.qkv. 
- per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - load_quant_weight( - src=torch_to_numpy( - hf_model.transformer.layers[i].attention.dense.weight.to( - torch_type).detach().cpu()), - value_dst=tensorrt_llm_model.layers[i].attention.dense.weight, - scale_dst=tensorrt_llm_model.layers[i].attention.dense. - per_channel_scale, - plugin_weight_only_quant_type=plugin_weight_only_quant_type) - - else: - tensorrt_llm_model.layers[ - i].attention.qkv.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].attention.query_key_value. - weight.to(torch_type).detach().cpu()) - tensorrt_llm_model.layers[ - i].attention.dense.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].attention.dense.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].mlp.fc.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_h_to_4h.weight.to( - torch_type).detach().cpu()) - tensorrt_llm_model.layers[i].mlp.proj.weight.value = torch_to_numpy( - hf_model.transformer.layers[i].mlp.dense_4h_to_h.weight.to( - torch_type).detach().cpu()) - - tok = time.time() - tensorrt_llm.logger.info("Loading weights finish in %.2fs" % (tok - tik)) - return tensorrt_llm_model diff --git a/examples/enc_dec/build.py b/examples/enc_dec/build.py index 9668e3c252..ce8a537a51 100644 --- a/examples/enc_dec/build.py +++ b/examples/enc_dec/build.py @@ -286,6 +286,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = args.output_dir / args.component / 'config.json' builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tllm_model) + return engine diff --git a/examples/falcon/build.py b/examples/falcon/build.py index 689711639a..fd3f4159f0 100644 --- a/examples/falcon/build.py +++ b/examples/falcon/build.py @@ -30,7 +30,7 @@ from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import fp8_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -412,9 +412,9 @@ def build_rank_engine(builder: Builder, quant_scales = get_scaling_factors(args.quantized_fp8_model_path, num_layers=args.n_layer, quant_mode=args.quant_mode) - tensorrt_llm_falcon = fp8_quantize(tensorrt_llm_falcon, - quant_mode=args.quant_mode, - quant_scales=quant_scales) + tensorrt_llm_falcon = quantize_model(tensorrt_llm_falcon, + quant_mode=args.quant_mode, + quant_scales=quant_scales) if args.model_dir is not None: logger.info(f'Loading HF Falcon ... 
from {args.model_dir}') tik = time.time() @@ -497,6 +497,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_falcon) + return engine @@ -532,7 +535,6 @@ def build(rank, args): max_input_len=args.max_input_len, max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, - fp8=args.quant_mode.has_fp8_qdq(), quant_mode=args.quant_mode, strongly_typed=args.strongly_typed, opt_level=args.builder_opt) @@ -549,6 +551,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/falcon/requirements.txt b/examples/falcon/requirements.txt index 7f49ed9a00..edaad3be36 100644 --- a/examples/falcon/requirements.txt +++ b/examples/falcon/requirements.txt @@ -1,5 +1,5 @@ transformers>=4.31.0 -datasets~=2.3.2 +datasets~=2.14.5 rouge_score~=0.1.2 sentencepiece~=0.1.99 typing-extensions==4.5.0 diff --git a/examples/gpt/build.py b/examples/gpt/build.py index 37f3009524..36c2e24c97 100644 --- a/examples/gpt/build.py +++ b/examples/gpt/build.py @@ -26,7 +26,7 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import smooth_quantize, weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -433,11 +433,9 @@ def build_rank_engine(builder: Builder, use_parallel_embedding=args.use_parallel_embedding, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table) - if args.use_smooth_quant: - tensorrt_llm_gpt = smooth_quantize(tensorrt_llm_gpt, args.quant_mode) - elif args.use_weight_only: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt, - args.quant_mode) + + if args.use_smooth_quant or args.use_weight_only: + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode) if args.model_dir is not None: gpt_dummy_fp8_scaling_factors = { @@ -534,6 +532,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = args.output_dir / 'config.json' builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) + return engine @@ -576,9 +577,9 @@ def build(rank, args): opt_level=args.builder_opt, multi_query_mode=args.multi_query_mode, strongly_typed=args.strongly_typed, - use_prompt_tuning=args.max_prompt_embedding_table_size > 0, + max_prompt_embedding_table_size=args. 
+ max_prompt_embedding_table_size, gather_all_token_logits=args.gather_all_token_logits, - fp8=args.enable_fp8, quant_mode=args.quant_mode, use_parallel_embedding=args.use_parallel_embedding) @@ -595,6 +596,7 @@ def build(rank, args): ) serialize_engine(engine, args.output_dir / engine_name) + del engine if rank == 0: ok = builder.save_timing_cache(builder_config, timing_cache_file) diff --git a/examples/gpt/nemo_prompt_convert.py b/examples/gpt/nemo_prompt_convert.py index ed18fe4ff6..9b16297470 100755 --- a/examples/gpt/nemo_prompt_convert.py +++ b/examples/gpt/nemo_prompt_convert.py @@ -32,22 +32,30 @@ logging.basicConfig(format=log_format) LOGGER = logging.getLogger(__name__) -def prompt_convert(args, prompt_config, prompt_weights): - prompt_templates = prompt_config["task_templates"] +def prompt_convert(out_file, prompt_config, prompt_weights): + nemo_type = "peft_tuning" if "peft" in prompt_config else "prompt_learning" - actual_task_id = 0 vtokens_embeddings = [] vtokens_len = [] - for task_name_id, prompt_task in enumerate(prompt_templates): - prompt_task_name = prompt_task["taskname"] - LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") - prompt_task_weights = prompt_weights["prompt_table"].get( - f"prompt_table.{prompt_task_name}.prompt_embeddings.weight") - if prompt_task_weights is None: - continue + + if nemo_type == "peft_tuning": + prompt_task_weights = prompt_weights[ + "model.embedding.adapter_layer.ptuning_adapter.inference_table"] vtokens_embeddings.append(prompt_task_weights) vtokens_len.append(prompt_task_weights.shape[0]) - actual_task_id += 1 + else: + prompt_templates = prompt_config["task_templates"] + actual_task_id = 0 + for task_name_id, prompt_task in enumerate(prompt_templates): + prompt_task_name = prompt_task["taskname"] + LOGGER.info(f"Task {actual_task_id}: {prompt_task['taskname']}") + prompt_task_weights = prompt_weights["prompt_table"].get( + f"prompt_table.{prompt_task_name}.prompt_embeddings.weight") + if prompt_task_weights is None: + continue + vtokens_embeddings.append(prompt_task_weights) + vtokens_len.append(prompt_task_weights.shape[0]) + actual_task_id += 1 max_vtoken_len = max(vtokens_len) embedding_dim = vtokens_embeddings[0].shape[1] @@ -59,7 +67,7 @@ def prompt_convert(args, prompt_config, prompt_weights): vtokens_embeddings[i] = padded_table vtokens_embeddings = torch.stack(vtokens_embeddings) - np.save(args.out_file, torch_to_numpy(vtokens_embeddings)) + np.save(out_file, torch_to_numpy(vtokens_embeddings)) def main(args): @@ -84,7 +92,7 @@ def main(args): weight_path, map_location=cpu_map_location, ) - prompt_convert(args, prompt_config, prompt_weights) + prompt_convert(args.out_file, prompt_config, prompt_weights) LOGGER.info("Spent %s (h:m:s) to convert the prompt model", datetime.datetime.now() - start_time) diff --git a/examples/gpt/requirements.txt b/examples/gpt/requirements.txt index 61be4accb8..f46bff3100 100644 --- a/examples/gpt/requirements.txt +++ b/examples/gpt/requirements.txt @@ -1,2 +1,2 @@ -datasets~=2.3.2 +datasets~=2.14.5 rouge_score~=0.1.2 diff --git a/examples/gpt/run.py b/examples/gpt/run.py index 0b5a87f6c6..f94b7acc31 100644 --- a/examples/gpt/run.py +++ b/examples/gpt/run.py @@ -48,27 +48,29 @@ def read_config(config_path: Path): num_layers = config['builder_config']['num_layers'] paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] - use_prompt_tuning = config['builder_config']['use_prompt_tuning'] + 
max_prompt_embedding_table_size = config['builder_config'][ + 'max_prompt_embedding_table_size'] dtype = config['builder_config']['precision'] gather_all_token_logits = config['builder_config'][ 'gather_all_token_logits'] use_custom_all_reduce = config['plugin_config']['use_custom_all_reduce'] quant_mode = QuantMode(config['builder_config']['quant_mode']) - model_config = ModelConfig(num_heads=num_heads, - num_kv_heads=num_kv_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - gpt_attention_plugin=use_gpt_attention_plugin, - remove_input_padding=remove_input_padding, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - use_prompt_tuning=use_prompt_tuning, - dtype=dtype, - quant_mode=quant_mode, - gather_all_token_logits=gather_all_token_logits, - use_custom_all_reduce=use_custom_all_reduce) + model_config = ModelConfig( + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + max_prompt_embedding_table_size=max_prompt_embedding_table_size, + dtype=dtype, + quant_mode=quant_mode, + gather_all_token_logits=gather_all_token_logits, + use_custom_all_reduce=use_custom_all_reduce) dtype = config['builder_config']['precision'] max_input_len = config['builder_config']['max_input_len'] @@ -290,7 +292,7 @@ def generate( max_output_len, beam_width=num_beams) - ptuning_args = [] if not model_config.use_prompt_tuning else ptuning_setup( + ptuning_args = [] if model_config.max_prompt_embedding_table_size == 0 else ptuning_setup( prompt_table, dtype, model_config.hidden_size, tasks, input_ids, input_lengths, model_config.remove_input_padding) diff --git a/examples/gpt/summarize.py b/examples/gpt/summarize.py index 4467f3ce3d..5182e7ab18 100644 --- a/examples/gpt/summarize.py +++ b/examples/gpt/summarize.py @@ -26,6 +26,7 @@ import tensorrt_llm import tensorrt_llm.profiler as profiler from tensorrt_llm.logger import logger from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.tools.ppl import ppl from build import find_engines # isort:skip @@ -48,6 +49,11 @@ def TRTGPT(args, config): num_kv_heads = 1 if multi_query_mode else num_heads paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] + gather_all_token_logits = config['builder_config'].get( + 'gather_all_token_logits', False) + assert not (args.eval_ppl and not gather_all_token_logits), \ + "PPL evaluation requires engine built with gather_all_token_logits enabled" + use_custom_all_reduce = config['plugin_config']['use_custom_all_reduce'] quant_mode = QuantMode(config['builder_config'].get('quant_mode', 0)) @@ -63,6 +69,7 @@ def TRTGPT(args, config): paged_kv_cache=paged_kv_cache, dtype=dtype, quant_mode=quant_mode, + gather_all_token_logits=gather_all_token_logits, use_custom_all_reduce=use_custom_all_reduce, ) @@ -203,27 +210,71 @@ def main(args): beam_width=num_beams) if tensorrt_llm_gpt.remove_input_padding: - output_ids = tensorrt_llm_gpt.decode_batch( - line_encoded, sampling_config) - else: - output_ids = tensorrt_llm_gpt.decode( + outputs = tensorrt_llm_gpt.decode_batch( line_encoded, - input_lengths, sampling_config, - ) - + output_sequence_lengths=True, + return_dict=True) + else: + outputs = tensorrt_llm_gpt.decode(line_encoded, + input_lengths, + sampling_config, + 
output_sequence_lengths=True, + return_dict=True) torch.cuda.synchronize() # Extract a list of tensors of shape beam_width x output_ids. if tensorrt_llm_gpt.mapping.is_first_pp_rank(): + output_ids = outputs['output_ids'] output_beams_list = [ tokenizer.batch_decode(output_ids[batch_idx, :, input_lengths[batch_idx]:], skip_special_tokens=True) for batch_idx in range(batch_size) ] - return output_beams_list, output_ids[:, :, max_length:].tolist() - return [], [] + + ppls = [] + if args.eval_ppl: + seq_lens = outputs['sequence_lengths'] + context_logits = outputs['context_logits'] + if tensorrt_llm_gpt.remove_input_padding: + context_logits = context_logits.flatten(end_dim=1) + seg_points = [0] + np.cumsum(input_lengths).tolist() + context_logits = [ + context_logits[s:e] + for s, e in zip(seg_points[:-1], seg_points[1:]) + ] + else: + context_logits = [ + context_logits[bidx, :input_lengths[bidx]] + for bidx in range(batch_size) + ] + + # Remove the first generation logits which are same to last context logits + # Step dim at 1 + generation_logits = torch.stack( + outputs['generation_logits'][1:], dim=1) + for bidx in range(batch_size): + # [batch, beam, step] + curr_len = seq_lens[bidx, 0] + curr_ctx_len = input_lengths[bidx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[bidx, 0, 1:curr_len] + curr_logits = torch.cat([ + context_logits[bidx], + generation_logits[bidx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + ppls.append(curr_ppl) + logger.debug( + f"TensorRT-LLM PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + + return output_beams_list, output_ids[:, :, + max_length:].tolist(), ppls + return [], [], [] def eval_hf(datapoint, eval_type='summarize'): batch_size = len(datapoint) @@ -264,32 +315,63 @@ def main(args): line_encoded = torch.cat(line_encoded, axis=0).cuda() with torch.no_grad(): - output = model.generate(line_encoded, - max_length=len(line_encoded[0]) + - output_len, - top_k=top_k, - temperature=temperature, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, - num_beams=num_beams, - num_return_sequences=num_beams, - early_stopping=True, - length_penalty=length_penalty) + outputs = model.generate(line_encoded, + max_length=len(line_encoded[0]) + + output_len, + top_k=top_k, + temperature=temperature, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + num_beams=num_beams, + num_return_sequences=num_beams, + early_stopping=True, + length_penalty=length_penalty, + output_scores=True, + return_dict_in_generate=True) + # model.generate cannot return context logits? 
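The `ppl` helper imported from `tensorrt_llm.tools.ppl` above is applied to a `[num_tokens, vocab]` logits tensor paired with the token ids those logits are expected to predict. As an illustrative aside, a minimal, self-contained sketch of such a per-token perplexity computation is shown below; it demonstrates the idea only and is not necessarily the library helper's exact implementation.

```python
# Hedged sketch of per-token perplexity over next-token logits.
# Assumes logits[t] scores the distribution for token_ids[t], i.e. targets are
# already shifted by one, as in the calling code that pairs curr_logits with
# output_ids[..., 1:curr_len].
import torch
import torch.nn.functional as F


def perplexity(logits: torch.Tensor, token_ids: torch.Tensor) -> float:
    nll = F.cross_entropy(logits.float(), token_ids.long(), reduction="mean")
    return torch.exp(nll).item()
```

Under that reading, concatenating the context logits with all but the first generation logits (which duplicate the last context step) yields exactly one logit row per predicted token.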
+ context_outputs = model(line_encoded) - tokens_list = output[:, len(line_encoded[0]):].tolist() - output = output.reshape([batch_size, num_beams, -1]) + output_ids = outputs['sequences'] + tokens_list = output_ids[:, len(line_encoded[0]):].tolist() + output_ids = output_ids.reshape([batch_size, num_beams, -1]) output_lines_list = [ - tokenizer.batch_decode(output[:, i, len(line_encoded[0]):], + tokenizer.batch_decode(output_ids[:, i, len(line_encoded[0]):], skip_special_tokens=True) for i in range(num_beams) ] - return output_lines_list, tokens_list + ppls = [] + if args.eval_ppl and batch_size == 1: + # Only for batch size of 1 + seq_lens = [output_ids.size(-1) for _ in range(batch_size)] + context_logits = context_outputs['logits'] + # Remove the first generation logits which are same to last context logits + generation_logits = torch.stack(outputs['scores'][1:], dim=1) + + ppls = [] + for bidx in range(batch_size): + curr_len = seq_lens[bidx] + curr_ctx_len = input_lengths[bidx] + curr_gen_len = curr_len - curr_ctx_len + + curr_ids = output_ids[bidx, 0, 1:curr_len] + curr_logits = torch.cat([ + context_logits[bidx], + generation_logits[bidx, :curr_gen_len - 1] + ], + dim=0) + curr_ppl = ppl(curr_logits, curr_ids) + ppls.append(curr_ppl) + logger.debug( + f"HF PPL: {curr_ppl:.3f} | Generation length: {curr_gen_len}" + ) + + return output_lines_list, tokens_list, ppls if test_trt_llm: datapoint = dataset['test'][0:1] - output, _ = eval_tensorrt_llm(datapoint[dataset_input_key], - eval_type=args.eval_type) + output, *_ = eval_tensorrt_llm(datapoint[dataset_input_key], + eval_type=args.eval_type) if runtime_rank == 0: logger.info( "---------------------------------------------------------") @@ -302,8 +384,8 @@ def main(args): if test_hf: datapoint = dataset['test'][0:1] - output, _ = eval_hf(datapoint[dataset_input_key], - eval_type=args.eval_type) + output, *_ = eval_hf(datapoint[dataset_input_key], + eval_type=args.eval_type) logger.info("---------------------------------------------------------") logger.info("HF Generated : ") logger.info(f" Input : {datapoint[dataset_input_key]}") @@ -316,6 +398,7 @@ def main(args): for i in range(num_beams): metric_tensorrt_llm[i].seed = 0 metric_hf[i].seed = 0 + ppls_trt_llm, ppls_hf = [], [] ite_count = 0 data_point_idx = 0 @@ -330,13 +413,13 @@ def main(args): if test_trt_llm: profiler.start('tensorrt_llm') - output_tensorrt_llm, _ = eval_tensorrt_llm( + output_tensorrt_llm, _, curr_ppls_trt_llm = eval_tensorrt_llm( datapoint[dataset_input_key]) profiler.stop('tensorrt_llm') if test_hf: profiler.start('hf') - output_hf, _ = eval_hf(datapoint[dataset_input_key]) + output_hf, _, curr_ppls_hf = eval_hf(datapoint[dataset_input_key]) profiler.stop('hf') if runtime_rank == 0: @@ -350,6 +433,7 @@ def main(args): references=[ datapoint[dataset_output_key][batch_idx] ]) + ppls_trt_llm.extend(curr_ppls_trt_llm) if test_hf: for beam_idx in range(num_beams): for batch_idx in range(len(output_hf[beam_idx])): @@ -358,6 +442,7 @@ def main(args): references=[ datapoint[dataset_output_key][batch_idx] ]) + ppls_hf.extend(curr_ppls_hf) logger.debug('-' * 100) logger.debug(f"Input : {datapoint[dataset_input_key]}") @@ -388,6 +473,8 @@ def main(args): if args.check_accuracy and beam_idx == 0: assert computed_metrics_tensorrt_llm['rouge1'].mid[ 2] * 100 > args.tensorrt_llm_rouge1_threshold + if args.eval_ppl: + logger.info(f" Per-token perplexity: {np.mean(ppls_trt_llm)}") if test_hf: np.random.seed(0) # rouge score use sampling to compute the score logger.info( @@ 
-399,6 +486,8 @@ def main(args): for key in computed_metrics_hf.keys(): logger.info( f' {key} : {computed_metrics_hf[key].mid[2]*100}') + if args.eval_ppl and args.batch_size == 1: + logger.info(f" Per-token perplexity: {np.mean(ppls_hf)}") if __name__ == '__main__': @@ -433,6 +522,7 @@ if __name__ == '__main__': default='summarize', choices=['summarize', 'code_completion']) parser.add_argument('--length_penalty', type=float, default=1.0) + parser.add_argument('--eval_ppl', action='store_true') args = parser.parse_args() if args.tokenizer == None: diff --git a/examples/gptj/README.md b/examples/gptj/README.md index ad6fdcf1c9..a66bbcd8b3 100644 --- a/examples/gptj/README.md +++ b/examples/gptj/README.md @@ -14,7 +14,9 @@ code is located in [`examples/gptj`](./). There are three main files in that fol ## Support Matrix * FP16 * FP8 - * INT4 Weight-Only + * INT8 & INT4 per-channel weight-only + * Groupwise quantization (AWQ) + * INT8 KV CACHE (+ AWQ/per-channel weight-only) * FP8 KV CACHE ## Usage @@ -130,6 +132,67 @@ If you find that the default fp16 accumulation (`--enable_context_fmha`) cannot Note `--enable_context_fmha` / `--enable_context_fmha_fp32_acc` has to be used together with `--use_gpt_attention_plugin float16`. +#### INT8 KV cache +INT8 KV cache could be enabled to reduce memory footprint. It will bring more performance gains when batch size gets larger. + +You can get the INT8 scale of KV cache through `hf_gptj_convert.py`: +```bash +# Enable INT8 calibration, and save scales +python hf_gptj_convert.py -i gptj_model -o gptj_int8_model --calibrate-kv-cache -t float16 +``` +Now the FT-format checkpoint with INT8 KV cache scales is saved to `gptj_int8_model/1-gpu`. +You can pass this `gptj_int8_model/1-gpu` directory to `build.py` through the argument called `--ft_model_dir`. + +INT8 KV cache could be combined with either per-channel INT8/INT4 weight-only quantization or per-group INT4 quantization (which is AWQ, actually). + +**INT8 KV cache + per-channel weight-only quantization** + +For example, you can enable INT8 KV cache together with per-channel INT8/INT4 weight-only quantization like the following command. + +**NOTE**: The whole checkpoint together with INT8 KV scales are passed to `--ft_model_dir`. +```bash +# Enable INT8 KV cache together with per-channel INT8 weight-only quantization +python3 build.py --dtype=float16 \ + --log_level=verbose \ + --enable_context_fmha \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --max_batch_size=32 \ + --max_input_len=1919 \ + --max_output_len=128 \ + --remove_input_padding \ + --output_dir=gptj_engine_wo_int8_kv_cache \ + --use_weight_only \ + --weight_only_precision=int8 \ + --int8_kv_cache \ + --ft_model_dir=gptj_ft_model/1-gpu/ +``` + +**INT8 KV cache + AWQ** + +In addition, you can enable INT8 KV cache together with AWQ (per-group INT4 weight-only quantization)like the following command. + +**NOTE**: AWQ checkpoint is passed through `--model_dir`, and the INT8 scales of KV cache is through `--ft_model_dir`. 
+```bash +# Enable INT8 KV cache together with AWQ +python3 build.py --dtype=float16 \ + --log_level=verbose \ + --enable_context_fmha \ + --use_gpt_attention_plugin float16 \ + --use_gemm_plugin float16 \ + --max_batch_size=32 \ + --max_input_len=1919 \ + --max_output_len=128 \ + --remove_input_padding \ + --output_dir=gptj_engine_awq_int8_kv_cache/ \ + --use_weight_only \ + --per_group \ + --weight_only_precision=int4 \ + --model_dir=awq_int4_weight_only_quantized_models \ + --int8_kv_cache \ + --ft_model_dir=gptj_ft_model/1-gpu/ +``` + #### FP8 KV cache One can enable FP8 for KV cache to reduce memory footprint used by KV cache and improve the accuracy over INT8 KV cache. There are 3 options need to be added to the invocation of `build.py` for that: diff --git a/examples/gptj/build.py b/examples/gptj/build.py index 80a32f157d..9a4c0702bc 100644 --- a/examples/gptj/build.py +++ b/examples/gptj/build.py @@ -16,19 +16,20 @@ import argparse import json import os import time +from pathlib import Path import tensorrt as trt import torch import torch.multiprocessing as mp from transformers import AutoModelForCausalLM -from weight import get_scaling_factors, load_from_awq_gpt_j, load_from_hf_gpt_j +from weight import (get_scaling_factors, load_from_awq_gpt_j, + load_from_bin_gpt_j, load_from_hf_gpt_j, parse_config) import tensorrt_llm from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (weight_only_groupwise_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -63,6 +64,13 @@ def parse_arguments(args): type=str, default=None, help='The path to HF GPT-J model / checkpoints to read weights from') + parser.add_argument( + '--ft_model_dir', + type=str, + default=None, + help= + 'The path to FT-format (binary) GPT-J model / checkpoints to read weights from' + ) parser.add_argument('--dtype', type=str, default='float16', @@ -102,12 +110,6 @@ def parse_arguments(args): type=str, default=False, choices=['float16', 'float32']) - parser.add_argument('--use_weight_only_quant_matmul_plugin', - nargs='?', - const='float16', - type=str, - default=False, - choices=['float16']) parser.add_argument('--use_layernorm_plugin', nargs='?', const='float16', @@ -145,6 +147,13 @@ def parse_arguments(args): help= 'By default, we use dtype for KV cache. fp8_kv_cache chooses fp8 quantization for KV' ) + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) parser.add_argument( '--use_inflight_batching', action="store_true", @@ -236,21 +245,41 @@ def parse_arguments(args): args.n_layer = hf_gpt.config.n_layer args.n_positions = hf_gpt.config.n_positions args.vocab_size = hf_gpt.config.vocab_size - - assert not (args.use_weight_only and args.weight_only_precision - == 'int8'), "Not support int8 weight only." - - assert not (args.use_weight_only and args.weight_only_precision == 'int4' - and args.per_group - == False), "We only support AWQ for int4 weight only." 
+ elif args.ft_model_dir is not None: + logger.info(f"Setting model configuration from {args.ft_model_dir}.") + n_embd, n_head, n_layer, n_positions, vocab_size, _, hidden_act, rotary_pct, bias, inter_size, multi_query_mode, dtype, prompt_num_tasks, prompt_max_vocab_size = parse_config( + Path(args.ft_model_dir) / "config.ini") + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size + args.hidden_act = hidden_act + args.rotary_pct = rotary_pct + args.bias = bias + args.dtype = dtype + args.inter_size = inter_size + args.multi_query_mode = multi_query_mode if args.use_weight_only: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') + if args.per_group: + assert args.weight_only_precision == 'int4', "We only support per-group quantization (AWQ/GPT-Q) with INT4 precision" + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + else: + args.quant_mode = QuantMode.use_weight_only( + args.weight_only_precision == 'int4') else: args.quant_mode = QuantMode(0) - if args.fp8_kv_cache: + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + elif args.fp8_kv_cache: assert ( args.use_gpt_attention_plugin ), "You have to use GPT attention plugin when fp8 KV cache is set" @@ -289,6 +318,9 @@ def build_rank_engine(builder: Builder, @return: The built engine. ''' kv_dtype = trt.float16 if args.dtype == 'float16' else trt.float32 + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.world_size) # TP only # Initialize Module tensorrt_llm_gpt = tensorrt_llm.models.GPTJForCausalLM( @@ -301,28 +333,21 @@ def build_rank_engine(builder: Builder, rotary_dim=args.rotary_dim, dtype=kv_dtype, logits_dtype=args.logits_dtype, - mapping=Mapping(world_size=args.world_size, - rank=rank, - tp_size=args.world_size), # TP only + mapping=mapping, quant_mode=args.quant_mode) - if args.use_weight_only_quant_matmul_plugin: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt) - if args.use_weight_only and args.weight_only_precision == 'int4': - if args.per_group: - tensorrt_llm_gpt = weight_only_groupwise_quantize( - model=tensorrt_llm_gpt, - quant_mode=QuantMode.from_description( - quantize_weights=True, - quantize_activations=False, - per_token=False, - per_channel=False, - per_group=True, - use_int4_weights=True), - group_size=128, - zero=False, - pre_quant_scale=True, - exclude_modules=[], - ) + + quantize_kwargs = {} + if args.use_weight_only and args.per_group: + assert args.weight_only_precision == 'int4' + quantize_kwargs = { + "group_size": 128, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": [], + } + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode, + **quantize_kwargs) + if args.model_dir is not None: assert hf_gpt is not None, f'Could not load weights from hf_gpt model as it is not loaded yet.' 
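As a side note on the flag handling above: the snippet below is a minimal sketch of how the resulting quantization mode can be constructed and inspected, combining per-group INT4 weight-only (AWQ-style) with an INT8 KV cache. It uses only `QuantMode` helpers that already appear in this change and is illustrative rather than the build script's exact flow.

```python
# Illustrative only: per-group INT4 weight-only plus INT8 KV cache,
# mirroring the flag handling in build.py above.
from tensorrt_llm.quantization import QuantMode

quant_mode = QuantMode.from_description(quantize_weights=True,
                                        quantize_activations=False,
                                        per_token=False,
                                        per_channel=False,
                                        per_group=True,
                                        use_int4_weights=True)
quant_mode = quant_mode.set_int8_kv_cache()

assert quant_mode.is_weight_only() and quant_mode.is_int4_weight_only()
assert quant_mode.has_int8_kv_cache()
# No activation quantization was requested, so this is not SmoothQuant.
assert not quant_mode.has_act_and_weight_quant()
```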
if args.enable_fp8: @@ -333,13 +358,18 @@ def build_rank_engine(builder: Builder, if args.use_weight_only and args.weight_only_precision == 'int4' and args.per_group: load_from_awq_gpt_j(tensorrt_llm_gpt, awq_gpt_j=hf_gpt, + ft_model_dir=args.ft_model_dir, config=awq_gptj_config, + mapping=mapping, fp16=(args.dtype == 'float16')) else: load_from_hf_gpt_j(tensorrt_llm_gpt, hf_gpt, fp16=(args.dtype == 'float16'), scaling_factors=gptj_scaling_factors) + elif args.ft_model_dir is not None: + load_from_bin_gpt_j(tensorrt_llm_gpt, args.ft_model_dir, rank, + args.world_size, args.dtype) # Module -> Network network = builder.create_network() @@ -362,13 +392,13 @@ def build_rank_engine(builder: Builder, if args.enable_context_fmha_fp32_acc: network.plugin_config.set_context_fmha( ContextFMHAType.enabled_with_fp32_acc) - if args.use_weight_only_quant_matmul_plugin: - network.plugin_config.set_weight_only_quant_matmul_plugin( - dtype=args.use_weight_only_quant_matmul_plugin) if args.use_weight_only: if args.per_group: network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( dtype='float16') + else: + network.plugin_config.set_weight_only_quant_matmul_plugin( + dtype='float16') if args.world_size > 1: network.plugin_config.set_nccl_plugin(args.dtype) if args.remove_input_padding: @@ -400,6 +430,8 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) return engine @@ -417,6 +449,9 @@ def build(rank, args): # skip other ranks if parallel_build is enabled if args.parallel_build and cur_rank != rank: continue + # NOTE(nkorobov): when only int8 kv cache is used together with paged kv cache no int8 tensors are exposed to TRT + int8_trt_flag = args.quant_mode.has_act_and_weight_quant() or ( + not args.paged_kv_cache and args.quant_mode.has_int8_kv_cache()) builder_config = builder.create_builder_config( name=MODEL_NAME, @@ -435,7 +470,7 @@ def build(rank, args): max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, fp8=args.enable_fp8, - int8=args.quant_mode.has_act_or_weight_quant(), + int8=int8_trt_flag, quant_mode=args.quant_mode, strongly_typed=args.strongly_typed) diff --git a/examples/gptj/hf_gptj_convert.py b/examples/gptj/hf_gptj_convert.py new file mode 100644 index 0000000000..073085eaf9 --- /dev/null +++ b/examples/gptj/hf_gptj_convert.py @@ -0,0 +1,349 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Convert huggingface GPT-J model. Use https://huggingface.co/EleutherAI/gpt-j-6b as demo. 
+''' +import argparse +import configparser +import dataclasses +import functools +import os +import platform +from collections import defaultdict +from pathlib import Path + +import torch +import torch.multiprocessing as multiprocessing +import torch.nn as nn +from tqdm import tqdm +from transformers import AutoModelForCausalLM # transformers-4.10.0-py3 +from transformers import AutoTokenizer +from transformers.pytorch_utils import Conv1D +from utils.convert import split_and_save_weight + +from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy + + +@torch.no_grad() +def capture_activation_range(model, + tokenizer, + dataset, + num_samples=512, + seq_len=512): + model.eval() + device = next(model.parameters()).device + act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + + def stat_tensor(name, tensor, act_scales, key): + hidden_dim = tensor.shape[-1] + tensor = tensor.view(-1, hidden_dim).abs().detach() + comming_max = torch.max(tensor, dim=0)[0].float() + + if act_scales[name][key] is None: + act_scales[name][key] = comming_max + else: + act_scales[name][key] = torch.max(act_scales[name][key], + comming_max) + + def stat_input_hook(m, x, y, name): + if isinstance(x, tuple): + x = x[0] + stat_tensor(name, x, act_scales, "x") + stat_tensor(name, y, act_scales, "y") + + if act_scales[name]["w"] is None: + act_scales[name]["w"] = m.weight.abs().clip(1e-8, + None).max(dim=0)[0] + + hooks = [] + for name, m in model.named_modules(): + if isinstance(m, nn.Linear) or isinstance(m, Conv1D): + hooks.append( + m.register_forward_hook( + functools.partial(stat_input_hook, name=name))) + + for i in tqdm(range(num_samples), desc="calibrating model"): + input_ids = tokenizer(dataset[i]["text"], + return_tensors="pt", + max_length=seq_len, + truncation=True).input_ids.to(device) + model(input_ids) + + for h in hooks: + h.remove() + + return act_scales + + +@dataclasses.dataclass(frozen=True) +class ProgArgs: + out_dir: str + in_file: str + tensor_parallelism: int = 1 + processes: int = 4 + calibrate_kv_cache: bool = False + model: str = "gpt" + storage_type: str = "fp32" + dataset_cache_dir: str = None + load_model_on_cpu: bool = False + convert_model_on_cpu: bool = False + + @staticmethod + def parse(args=None) -> 'ProgArgs': + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--out-dir', + '-o', + type=str, + help='file name of output directory', + required=True) + parser.add_argument('--in-file', + '-i', + type=str, + help='file name of input checkpoint file', + required=True) + parser.add_argument('--tensor-parallelism', + '-tp', + type=int, + help='Requested tensor parallelism for inference', + default=1) + parser.add_argument( + "--processes", + "-p", + type=int, + help= + "How many processes to spawn for conversion (default: 4). Set it to a lower value to reduce RAM usage.", + default=4) + parser.add_argument( + "--calibrate-kv-cache", + "-kv", + action="store_true", + help= + "Generate scaling factors for KV cache. Used for storing KV cache in int8." 
+ ) + parser.add_argument( + "--model", + default="gpt2", + type=str, + help="Specify GPT variants to convert checkpoints correctly", + choices=["gpt2", "santacoder", "starcoder"]) + parser.add_argument("--storage-type", + "-t", + type=str, + default="float32", + choices=["float32", "float16", "bfloat16"]) + parser.add_argument("--dataset-cache-dir", + type=str, + default=None, + help="cache dir to load the hugging face dataset") + parser.add_argument("--load-model-on-cpu", action="store_true") + parser.add_argument("--convert-model-on-cpu", action="store_true") + return ProgArgs(**vars(parser.parse_args(args))) + + +def merge_qkv_scales(q_name, hf_model, scales, gptj_qkv_para): + layer_name_q = q_name.replace(".weight", "") + layer_name_k = layer_name_q.replace("q_proj", "k_proj") + layer_name_v = layer_name_q.replace("q_proj", "v_proj") + layer_name_qkv = layer_name_q.replace("q_proj", "qkv_proj") + + q = hf_model.state_dict()[layer_name_q + ".weight"] + k = hf_model.state_dict()[layer_name_k + ".weight"] + v = hf_model.state_dict()[layer_name_v + ".weight"] + + weight = torch.cat([q, k, v], dim=0) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + gptj_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + +def gptj_to_trt_llm_name(orig_name): + global_weights = { + "transformer.wte.weight": "model.wte", + "transformer.ln_f.bias": "model.final_layernorm.bias", + "transformer.ln_f.weight": "model.final_layernorm.weight", + "lm_head.weight": "model.lm_head.weight", + "lm_head.bias": "model.lm_head.bias" + } + + if orig_name in global_weights: + return global_weights[orig_name] + + _, _, layer_id, *weight_name = orig_name.split(".") + layer_id = int(layer_id) + weight_name = "transformer." + ".".join(weight_name) + + per_layer_weights = { + "transformer.ln_1.bias": "input_layernorm.bias", + "transformer.ln_1.weight": "input_layernorm.weight", + "transformer.attn.q_proj.weight": "attention.query.weight", + "transformer.attn.q_proj.bias": "attention.query.bias", + "transformer.attn.k_proj.weight": "attention.key.weight", + "transformer.attn.k_proj.bias": "attention.key.bias", + "transformer.attn.v_proj.weight": "attention.value.weight", + "transformer.attn.v_proj.bias": "attention.value.bias", + "transformer.attn.out_proj.bias": "attention.dense.bias", + "transformer.attn.out_proj.weight": "attention.dense.weight", + "transformer.mlp.fc_in.bias": "mlp.dense_h_to_4h.bias", + "transformer.mlp.fc_in.weight": "mlp.dense_h_to_4h.weight", + "transformer.mlp.fc_out.bias": "mlp.dense_4h_to_h.bias", + "transformer.mlp.fc_out.weight": "mlp.dense_4h_to_h.weight", + } + return f"layers.{layer_id}.{per_layer_weights[weight_name]}" + + +# GPT-J uses nn.Linear for these following ops whose weight matrix is transposed compared to gpt2. +# In order to use the preprocess codes of gpt2, we transpose them firstly. 
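To make the comment above concrete: `torch.nn.Linear` stores its weight as `[out_features, in_features]`, whereas the GPT-2 style `Conv1D` layers the shared preprocessing code expects store `[in_features, out_features]`. The sketch below uses arbitrary toy sizes purely to show that layout difference; `transpose_weights()` below flips the `nn.Linear` layout to match.

```python
# Toy shapes, assumed for illustration of the layout difference only.
import torch.nn as nn
from transformers.pytorch_utils import Conv1D

linear = nn.Linear(in_features=8, out_features=32, bias=False)
conv1d = Conv1D(nf=32, nx=8)  # GPT-2 style projection: weight is [nx, nf]

print(tuple(linear.weight.shape))  # (32, 8)  -> needs transposing
print(tuple(conv1d.weight.shape))  # (8, 32)
```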
+def transpose_weights(hf_name, param): + weight_to_transpose = ["out_proj", "fc_in", "fc_out"] + if any([k in hf_name for k in weight_to_transpose]): + if len(param.shape) == 2: + param = param.transpose(0, 1) + return param + + +@torch.no_grad() +def hf_gptj_converter(args: ProgArgs): + infer_tp = args.tensor_parallelism + multi_query_mode = False + saved_dir = Path(args.out_dir) / f"{infer_tp}-gpu" + saved_dir.mkdir(parents=True, exist_ok=True) + + # load position_embedding from rank 0 + model = AutoModelForCausalLM.from_pretrained(args.in_file, + device_map="auto", + trust_remote_code=True) + if args.load_model_on_cpu: + model = model.cpu() + torch.cuda.empty_cache() + act_range = {} + gptj_qkv_para = {} + + if args.calibrate_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + from datasets import load_dataset + dataset = load_dataset("lambada", + split="validation", + cache_dir=args.dataset_cache_dir) + act_range = capture_activation_range( + model, AutoTokenizer.from_pretrained(args.in_file), dataset) + + config = configparser.ConfigParser() + config["gpt"] = {} + for key in vars(args): + config["gpt"][key] = f"{vars(args)[key]}" + for k, v in vars(model.config).items(): + config["gpt"][k] = f"{v}" + config["gpt"]["storage_dtype"] = args.storage_type + config["gpt"]["multi_query_mode"] = str(multi_query_mode) + with open(saved_dir / "config.ini", 'w') as configfile: + config.write(configfile) + + storage_type = str_dtype_to_torch(args.storage_type) + + global_ft_weights = [ + "model.wte", "model.final_layernorm.bias", + "model.final_layernorm.weight", "model.lm_head.weight", + "model.lm_head.bias" + ] + + int8_outputs = None + if args.calibrate_kv_cache: + int8_outputs = "kv_cache_only" + + starmap_args = [] + for name, param in model.named_parameters(): + if "weight" not in name and "bias" not in name: + continue + trt_llm_name = gptj_to_trt_llm_name(name) + + param = transpose_weights(name, param) + + if args.convert_model_on_cpu: + param = param.cpu() + if trt_llm_name in global_ft_weights: + torch_to_numpy(param.to(storage_type).cpu()).tofile( + saved_dir / f"{trt_llm_name}.bin") + elif 'q_proj' in name: + trt_llm_name = trt_llm_name.replace("query", "query_key_value") + # Needed by QKV projection weight split. 
With multi_query_mode one does not simply take + # out_dim and divide it by 3 to get local_dim because out_dim = local_dim + 2 * head_size + local_dim = model.transformer.h[ + 0].attn.embed_dim if multi_query_mode else None + merge_qkv_scales(name, model, act_range, gptj_qkv_para) + qkv = (0, saved_dir, infer_tp, trt_llm_name, + gptj_qkv_para.get( + name.replace(".weight", + "").replace(".q_proj", + ".qkv_proj")).to(storage_type), + storage_type, + act_range.get( + name.replace(".weight", + "").replace(".q_proj", ".qkv_proj")), { + "int8_outputs": int8_outputs, + "multi_query_mode": multi_query_mode, + "local_dim": local_dim + }) + starmap_args.append(qkv) + elif 'k_proj' in name or 'v_proj' in name: + continue + else: + starmap_args.append( + (0, saved_dir, infer_tp, trt_llm_name, param.to(storage_type), + storage_type, act_range.get(name.replace(".weight", "")), { + "int8_outputs": int8_outputs, + "multi_query_mode": multi_query_mode, + "local_dim": None + })) + + starmap_args = tqdm(starmap_args, desc="saving weights") + if args.processes > 1: + with multiprocessing.Pool(args.processes) as pool: + pool.starmap(split_and_save_weight, starmap_args) + else: + # simpler for debug situations + for starmap_arg in starmap_args: + split_and_save_weight(*starmap_arg) + + +def run_conversion(args: ProgArgs): + if args.processes > 1 and platform.system() == "Windows": + print( + "Resetting processes to 1 because multi-process on Windows is not implemented." + ) + args = dataclasses.replace(args, processes=1) + + print("\n=============== Arguments ===============") + for key, value in vars(args).items(): + print(f"{key}: {value}") + print("========================================") + + assert (args.calibrate_kv_cache), \ + "INT8 kv cache must be enabled for this script. Otherwise you can directly build engines from HuggingFace checkpoints, no need to do this FT-format conversion. " + hf_gptj_converter(args) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + run_conversion(ProgArgs.parse()) diff --git a/examples/gptj/utils/convert.py b/examples/gptj/utils/convert.py new file mode 100644 index 0000000000..f2f204b985 --- /dev/null +++ b/examples/gptj/utils/convert.py @@ -0,0 +1,273 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Utilities for exporting a model to our custom format. 
+""" + +import numpy as np +import torch + +from tensorrt_llm._utils import torch_to_numpy + + +def cpu_map_location(storage, loc): + return storage.cpu() + + +def gpu_map_location(storage, loc): + if loc.startswith("cuda"): + training_gpu_idx = int(loc.split(":")[1]) + inference_gpu_idx = training_gpu_idx % torch.cuda.device_count() + return storage.cuda(inference_gpu_idx) + elif loc.startswith("cpu"): + return storage.cpu() + else: + raise ValueError(f"Not handled {loc}") + + +def save_val(val, dir, key, tp_num=None): + suffix = "bin" if tp_num is None else f"{tp_num}.bin" + val.tofile(dir / f"model.{key}.{suffix}") + + +def save_split(split_vals, dir, key, i, split_factor): + for j, val in enumerate(split_vals): + save_val(val, dir, key, i * split_factor + j) + + +def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): + """ + This function has two purposes: + - compute quantized weights, scaled either per-tensor or per-column + - compute scaling factors + + Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ. + CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W. + CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor. + + Here is the list of what we need (T means per-tensor, C per-column): + - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T) + - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T) + - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C) + - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32) + to quant range (int8) (used for CUBLAS) (T, C) + + Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too, + but then the model would change depending on the number of GPUs used. + + For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it + as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V. + """ + + # compute weight scaling factors for fp->int8 and int8->fp + if is_qkv and not multi_query_mode: + scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max( + dim=-1, keepdims=True)[0].cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].reshape(3, + -1).cpu().numpy() + elif is_qkv and multi_query_mode: + raise ValueError( + f"Multi-query w/ int8 quant has not been supported yet") + else: + scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t + scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c + + # compute the rest of needed scaling factors + scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item()) + scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item()) + scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.) 
+ scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_t) + scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_c) + if is_qkv: + scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, + scale_w_orig_quant_c.shape) + scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, + scale_w_orig_quant_c.shape) + + to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8) + return { + "weight.int8": to_i8(weights * scale_w_orig_quant_t), + "weight.int8.col": to_i8(weights * scale_w_orig_quant_c), + "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32), + "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32), + "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32), + "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32), + "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32), + "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32), + } + + +def write_int8(vals, + dir, + base_key, + split_dim, + tp_rank, + split_factor, + kv_cache_only=False): + if not kv_cache_only: + save_split(np.split(vals["weight.int8"], split_factor, axis=split_dim), + dir, f"{base_key}.weight.int8", tp_rank, split_factor) + save_split( + np.split(vals["weight.int8.col"], split_factor, axis=split_dim), + dir, f"{base_key}.weight.int8.col", tp_rank, split_factor) + + saved_keys_once = ["scale_y_quant_orig"] + if not kv_cache_only: + saved_keys_once += [ + "scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant" + ] + # per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1) + if not kv_cache_only: + if split_dim == -1: + save_split( + np.split(vals["scale_w_quant_orig.col"], + split_factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig.col", tp_rank, split_factor) + save_split( + np.split(vals["scale_y_accum_quant.col"], + split_factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant.col", tp_rank, split_factor) + else: + saved_keys_once += [ + "scale_w_quant_orig.col", "scale_y_accum_quant.col" + ] + + if tp_rank == 0: + for save_key in saved_keys_once: + save_val(vals[save_key], dir, f"{base_key}.{save_key}") + + +# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head +# are not split as there is only one head per key/value. 
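To ground the scaling-factor docstring above, here is a tiny NumPy walk-through of the per-tensor and per-column factors for a toy 2x2 weight matrix; the calibration ranges `amax_x` and `amax_y` are assumed values, and the code mirrors the arithmetic in `generate_int8` as a sketch rather than a drop-in replacement.

```python
# Toy int8 scaling-factor example; calibration ranges are assumed values.
import numpy as np

weights = np.array([[0.2, -1.5],
                    [0.8, 0.4]], dtype=np.float32)   # [in, out] toy weight
amax_w_col = np.abs(weights).max(axis=0)             # per-column |w| max -> [0.8, 1.5]
amax_x, amax_y = 3.0, 6.0                            # assumed activation ranges

scale_w_orig_quant_col = 127.0 / amax_w_col          # fp weight -> int8, per column (C)
scale_w_orig_quant_t = 127.0 / amax_w_col.max()      # fp weight -> int8, per tensor (T)
scale_x_orig_quant_t = 127.0 / amax_x                # fp activation -> int8 (T)
scale_y_quant_orig_t = amax_y / 127.0                # int8 GEMM output -> fp (T)
# int32 accumulator -> int8 output, per column, as in generate_int8
scale_y_accum_quant_col = (127.0 / amax_y) / (scale_x_orig_quant_t * scale_w_orig_quant_col)

weight_int8_col = np.clip(np.round(weights * scale_w_orig_quant_col), -127, 127).astype(np.int8)
print(weight_int8_col)  # [[  32 -127]
                        #  [ 127   34]]
```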
+@torch.no_grad() +def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals, + storage_type, act_range, config): + use_attention_nemo_shape = config.get("use_attention_nemo_shape", False) + split_gated_activation = config.get("split_gated_activation", False) + num_attention_heads = config.get("num_attention_heads", 0) + tp_size = config.get("tp_size", 1) + int8_outputs = config.get("int8_outputs", None) + multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) + + save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + + if not isinstance(vals, list): + vals = [vals] + + if config.get("transpose_weights", False) and vals[0].ndim == 2: + vals = [val.T for val in vals] + if "layernorm.weight" in key and config.get("apply_layernorm_1p", False): + vals = [val + 1.0 for val in vals] + vals = [torch_to_numpy(val.cpu().to(storage_type)) for val in vals] + + if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \ + "final_layernorm.weight" in key or "final_layernorm.bias" in key or \ + "mlp.dense_4h_to_h.bias" in key: + + # shared weights, only need to convert the weights of rank 0 + if tp_rank == 0: + save_val(vals[0], saved_dir, key) + + elif "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key: + cat_dim = 0 + val = np.concatenate(vals, axis=cat_dim) + split_vals = np.split(val, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, + split_factor) + + elif "mlp.dense_h_to_4h.weight" in key or "mlp.dense_h_to_4h.bias" in key: + if split_gated_activation: + splits = [np.split(val, 2, axis=-1) for val in vals] + vals, gates = list(zip(*splits)) + cat_dim = -1 + val = np.concatenate(vals, axis=cat_dim) + split_vals = np.split(val, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank, + split_factor) + + if split_gated_activation: + assert not save_int8 + prefix, dot, suffix = key.rpartition(".") + key = prefix + ".gate" + dot + suffix + + gate = np.concatenate(gates, axis=cat_dim) + split_vals = np.split(gate, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + + elif "attention.query_key_value.weight" in key: + hidden_dim = vals[0].shape[0] + if local_dim is None: + local_dim = vals[0].shape[-1] // 3 + if multi_query_mode: + val = vals[0] + # out_feature = local_dim + 2 * head_size; assumes local_dim equals to hidden_dim + head_size = (val.shape[-1] - local_dim) // 2 + val = val.reshape(hidden_dim, local_dim + 2 * head_size) + w_q, w_kv = np.split(val, [local_dim], axis=-1) + w_q_split = np.split(w_q, split_factor, axis=-1) + split_vals = [np.concatenate((i, w_kv), axis=-1) for i in w_q_split] + else: + if use_attention_nemo_shape: + head_num = num_attention_heads // tp_size + size_per_head = hidden_dim // num_attention_heads + vals = [ + val.reshape(hidden_dim, head_num, 3, size_per_head) + for val in vals + ] + vals = [val.transpose(0, 2, 1, 3) for val in vals] + + vals = [val.reshape(hidden_dim, 3, local_dim) 
for val in vals] + cat_dim = -1 + val = np.concatenate(vals, axis=cat_dim) + split_vals = np.split(val, split_factor, axis=cat_dim) + save_split(split_vals, saved_dir, key, tp_rank, split_factor) + if save_int8: + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + is_qkv=True, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, + saved_dir, + base_key, + cat_dim, + tp_rank, + split_factor, + kv_cache_only=int8_outputs == "kv_cache_only") + elif ("attention.query.weight" in key or "attention.query.bias" in key + or "attention.key_value.weight" in key + or "attention.key_value.bias" in key): + pass + else: + assert False, f"[ERROR] {key} not handled by converter" diff --git a/examples/gptj/weight.py b/examples/gptj/weight.py index 8867fb4b41..9ee593d34d 100644 --- a/examples/gptj/weight.py +++ b/examples/gptj/weight.py @@ -12,6 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +import configparser import time from operator import attrgetter from pathlib import Path @@ -22,6 +23,8 @@ import torch import tensorrt_llm import tensorrt_llm.logger as logger +from tensorrt_llm._utils import pad_vocab_size, str_dtype_to_np +from tensorrt_llm.mapping import Mapping from tensorrt_llm.models import GPTJForCausalLM from tensorrt_llm.models.quantized.quant import get_dummy_quant_scales from tensorrt_llm.quantization import QuantMode @@ -108,6 +111,385 @@ def get_scaling_factors( return scaling_factor +def gen_suffix(rank, use_smooth_quant, quant_per_channel): + suffix = f"{rank}.bin" + if use_smooth_quant: + sq_prefix = "int8." + if quant_per_channel: + sq_prefix += "col." + suffix = sq_prefix + suffix + return suffix + + +def extract_layer_idx(name): + ss = name.split('.') + for s in ss: + if s.isdigit(): + return s + return None + + +def split(v, tp_size, idx, dim=0): + if tp_size == 1: + return v + if len(v.shape) == 1: + return np.ascontiguousarray(np.split(v, tp_size)[idx]) + elif len(v.shape) == 2: + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx]) + return None + + +def parse_config(ini_file): + gpt_config = configparser.ConfigParser() + gpt_config.read(ini_file) + + n_embd = gpt_config.getint('gpt', 'n_embd') + n_head = gpt_config.getint('gpt', 'n_head') + n_layer = gpt_config.getint('gpt', 'n_layer') + n_positions = gpt_config.getint('gpt', 'n_positions') + vocab_size = gpt_config.getint('gpt', 'vocab_size') + do_layer_norm_before = gpt_config.getboolean('gpt', + 'do_layer_norm_before', + fallback=True) + rotary_pct = gpt_config.getfloat('gpt', 'rotary_pct', fallback=0.0) + hidden_act = gpt_config.get('gpt', 'activation_function') + bias = gpt_config.getboolean('gpt', 'bias', fallback=True) + inter_size = gpt_config.getint('gpt', 'intermediate_size', fallback=None) + dtype = gpt_config.get('gpt', 'storage_dtype', fallback='float32') + + if inter_size is None: + inter_size = 4 * n_embd + + multi_query_mode = gpt_config.getboolean('gpt', + 'multi_query_mode', + fallback=False) + prompt_num_tasks = gpt_config.getint('gpt', 'prompt_num_tasks', fallback=0) + prompt_max_vocab_size = gpt_config.getint('gpt', + 'prompt_max_vocab_size', + fallback=0) + return n_embd, n_head, n_layer, n_positions, vocab_size, do_layer_norm_before, hidden_act, rotary_pct, bias, inter_size, multi_query_mode, dtype, prompt_num_tasks, prompt_max_vocab_size + + +def load_from_bin_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, + dir_path, + 
rank=0, + tensor_parallel=1, + dtype='float32', + use_parallel_embedding=False, + sharding_dim=0, + share_embedding_table=False, + scaling_factors=None): + tensorrt_llm.logger.info('Loading weights from bin...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + n_embd, n_head, n_layer, n_positions, vocab_size, do_layer_norm_before, hidden_act, rotary_pct, bias, inter_size, multi_query_mode, *_ = parse_config( + Path(dir_path) / 'config.ini') + np_dtype = str_dtype_to_np(dtype) + + def fromfile(dir_path, name, shape=None, dtype=None): + dtype = np_dtype if dtype is None else dtype + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + def set_smoothquant_scale_factors(module, + pre_scale_weight, + dir_path, + basename, + shape, + per_tok_dyn, + per_channel, + is_qkv=False, + rank=None): + suffix = "bin" + if per_channel: + if rank is not None: + suffix = f"{rank}." + suffix + suffix = "col." + suffix + + col_shape = shape if (per_channel or is_qkv) else [1, 1] + if per_tok_dyn: + if pre_scale_weight is not None: + pre_scale_weight.value = np.array([1.0], dtype=np.float32) + t = fromfile(dir_path, f"{basename}scale_w_quant_orig.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + else: + t = fromfile(dir_path, f"{basename}scale_x_orig_quant.bin", [1], + np.float32) + pre_scale_weight.value = t + t = fromfile(dir_path, f"{basename}scale_y_accum_quant.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + t = fromfile(dir_path, f"{basename}scale_y_quant_orig.bin", [1, 1], + np.float32) + module.act_scale.value = t + + # Do we use SmoothQuant? + use_smooth_quant = quant_mode.has_act_and_weight_quant() + # Do we use quantization per token? + quant_per_token_dyn = quant_mode.has_per_token_dynamic_scaling() + # Do we use quantization per channel? + quant_per_channel = quant_mode.has_per_channel_scaling() + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() + + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() + + #Enable FP8 Gemm + enable_fp8_qdq = quant_mode.has_fp8_qdq() + + def sq_trick(x): + return x.view(np.float32) if use_smooth_quant else x + + # Debug + suffix = gen_suffix(rank, use_smooth_quant, quant_per_channel) + # The type of weights. 
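As a brief illustration of the `split` helper added earlier in this file: for tensor parallelism a column-parallel weight is sliced along its output dimension into one contiguous shard per rank. The sizes below are arbitrary toy values, not the real GPT-J dimensions.

```python
# Toy column-parallel split, mirroring split(v, tp_size, idx, dim=...) above.
import numpy as np

n_embd, tp_size = 4, 2
qkv_weight = np.arange(n_embd * 3 * n_embd, dtype=np.float32).reshape(n_embd, 3 * n_embd)

shards = [np.ascontiguousarray(np.split(qkv_weight, tp_size, axis=-1)[rank])
          for rank in range(tp_size)]
assert all(shard.shape == (n_embd, 3 * n_embd // tp_size) for shard in shards)
```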
+ w_type = np_dtype if not use_smooth_quant else np.int8 + + # pe = fromfile(dir_path, 'model.wpe.bin', [n_positions, n_embd]) + # if pe is not None: + # tensorrt_llm_gpt_j.embedding.position_embedding.weight.value = (pe) + + vocab_embedding_weight = fromfile(dir_path, 'model.wte.bin', + [vocab_size, n_embd]) + if not use_parallel_embedding: + tensorrt_llm_gpt_j.embedding.weight.value = vocab_embedding_weight + else: + if sharding_dim == 0: + if vocab_size % tensor_parallel != 0: + # padding + vocab_size_padded = pad_vocab_size( + tensorrt_llm_gpt_j.embedding.num_embeddings, + tensor_parallel) + pad_width = vocab_size_padded - vocab_size + vocab_embedding_weight = np.pad(vocab_embedding_weight, + ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + tensorrt_llm_gpt_j.embedding.weight.value = np.ascontiguousarray( + split(vocab_embedding_weight, + tensor_parallel, + rank, + dim=sharding_dim)) + + if do_layer_norm_before: + tensorrt_llm_gpt_j.ln_f.bias.value = (fromfile( + dir_path, 'model.final_layernorm.bias.bin')) + tensorrt_llm_gpt_j.ln_f.weight.value = (fromfile( + dir_path, 'model.final_layernorm.weight.bin')) + + # share input embedding + if not share_embedding_table: + lm_head_weight = fromfile(dir_path, 'model.lm_head.weight.bin', + [vocab_size, n_embd]) + lm_head_bias = fromfile(dir_path, 'model.lm_head.bias.bin', + [vocab_size]) + if lm_head_weight is None: + lm_head_weight = fromfile(dir_path, 'model.wte.bin', + [vocab_size, n_embd]) + if vocab_size % tensor_parallel != 0: + # padding + vocab_size_padded = tensorrt_llm_gpt_j.lm_head.out_features * tensor_parallel + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + tensorrt_llm_gpt_j.lm_head.weight.value = np.ascontiguousarray( + split(lm_head_weight, tensor_parallel, rank)) + tensorrt_llm_gpt_j.lm_head.bias.value = np.ascontiguousarray( + split(lm_head_bias, tensor_parallel, rank)) + fake_fp8_sf_dt = np.float32 + for i in range(n_layer): + c_attn_out_dim = (3 * n_embd // + tensor_parallel) if not multi_query_mode else ( + n_embd // tensor_parallel + + (n_embd // n_head) * 2) + tensorrt_llm_gpt_j.layers[i].input_layernorm.weight.value = (fromfile( + dir_path, 'model.layers.' + str(i) + '.input_layernorm.weight.bin')) + tensorrt_llm_gpt_j.layers[i].input_layernorm.bias.value = (fromfile( + dir_path, 'model.layers.' + str(i) + '.input_layernorm.bias.bin')) + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.weight.' + suffix, + [n_embd, c_attn_out_dim], w_type) + if t is not None: + dst = tensorrt_llm_gpt_j.layers[i].attention.qkv.weight + if use_smooth_quant: + dst.value = sq_trick( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].attention.qkv, + tensorrt_llm_gpt_j.layers[i].input_layernorm.scale_to_int, + dir_path, + 'model.layers.' 
+ str(i) + '.attention.query_key_value.', + [1, c_attn_out_dim], + quant_per_token_dyn, + quant_per_channel, + rank=rank, + is_qkv=True) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + i].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].attention.qkv.activation_scaling_factor.value = np.array( + [scaling_factors['qkv_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].attention.qkv.weights_scaling_factor.value = np.array( + [scaling_factors['qkv_weights'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].attention.kv_orig_quant_scale.value = np.array( + [scaling_factors['qkv_output'][i]], dtype=np.float32) + tensorrt_llm_gpt_j.layers[ + i].attention.kv_quant_orig_scale.value = np.array( + [1.0 / scaling_factors['qkv_output'][i]], dtype=np.float32) + + dst = tensorrt_llm_gpt_j.layers[i].attention.dense.weight + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.attention.dense.weight.' + suffix, + [n_embd // tensor_parallel, n_embd], w_type) + if use_smooth_quant: + dst.value = sq_trick(np.ascontiguousarray(np.transpose(t, [1, 0]))) + dense_scale = getattr(tensorrt_llm_gpt_j.layers[i].attention, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].attention.dense, dense_scale, + dir_path, 'model.layers.' + str(i) + '.attention.dense.', + [1, n_embd], quant_per_token_dyn, quant_per_channel) + # change it to the real smoother if dense layer is applied smooth quant + tensorrt_llm_gpt_j.layers[ + i].attention.dense.smoother.value = np.ones( + [1, n_embd // tensor_parallel], dtype=np.float32) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + i].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].attention.dense.activation_scaling_factor.value = np.array( + [scaling_factors['dense_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].attention.dense.weights_scaling_factor.value = np.array( + [scaling_factors['dense_weights'][i]], dtype=fake_fp8_sf_dt) + + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.mlp.dense_h_to_4h.weight.' + suffix, + [n_embd, inter_size // tensor_parallel], w_type) + if use_smooth_quant: + tensorrt_llm_gpt_j.layers[i].mlp.fc.weight.value = sq_trick( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].mlp.fc, + tensorrt_llm_gpt_j.layers[i].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' 
+ str(i) + '.mlp.dense_h_to_4h.', + [1, inter_size // tensor_parallel], + quant_per_token_dyn, + quant_per_channel, + rank=rank) + elif use_weight_only: + dst = tensorrt_llm_gpt_j.layers[i].mlp.fc.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[i].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_gpt_j.layers[ + i].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + if bias: + tensorrt_llm_gpt_j.layers[i].mlp.fc.bias.value = fromfile( + dir_path, 'model.layers.' + str(i) + + '.mlp.dense_h_to_4h.bias.' + str(rank) + '.bin') + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].mlp.fc.activation_scaling_factor.value = np.array( + [scaling_factors['fc_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].mlp.fc.weights_scaling_factor.value = np.array( + [scaling_factors['fc_weights'][i]], dtype=fake_fp8_sf_dt) + + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.mlp.dense_4h_to_h.weight.' + suffix, + [inter_size // tensor_parallel, n_embd], w_type) + if use_smooth_quant: + tensorrt_llm_gpt_j.layers[i].mlp.proj.weight.value = sq_trick( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + proj_scale = getattr(tensorrt_llm_gpt_j.layers[i].mlp, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_gpt_j.layers[i].mlp.proj, proj_scale, dir_path, + 'model.layers.' + str(i) + '.mlp.dense_4h_to_h.', [1, n_embd], + quant_per_token_dyn, quant_per_channel) + # change it to the real smoother if proj layer is applied smooth quant + tensorrt_llm_gpt_j.layers[i].mlp.proj.smoother.value = np.ones( + [1, inter_size // tensor_parallel], dtype=np.float32) + elif use_weight_only: + dst = tensorrt_llm_gpt_j.layers[i].mlp.proj.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[i].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_gpt_j.layers[i].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + if bias: + tensorrt_llm_gpt_j.layers[i].mlp.proj.bias.value = fromfile( + dir_path, + 'model.layers.' + str(i) + '.mlp.dense_4h_to_h.bias.bin') + + if use_int8_kv_cache: + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + tensorrt_llm_gpt_j.layers[ + i].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_gpt_j.layers[i].attention.kv_quant_orig_scale.value = t + + if enable_fp8_qdq: + tensorrt_llm_gpt_j.layers[ + i].mlp.proj.activation_scaling_factor.value = np.array( + [scaling_factors['proj_act'][i]], dtype=fake_fp8_sf_dt) + tensorrt_llm_gpt_j.layers[ + i].mlp.proj.weights_scaling_factor.value = np.array( + [scaling_factors['proj_weights'][i]], dtype=fake_fp8_sf_dt) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') + + def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, hf_gpt_j, fp16=False, @@ -132,6 +514,13 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, ] quant_mode = getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() tensorrt_llm.logger.info('Loading weights from HF GPT-J...') tik = time.time() @@ -171,7 +560,21 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, layer_idx].mlp.proj.weights_scaling_factor.value = np.array( [scaling_factors['proj_weights'][layer_idx]], dtype=np.float32) - setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) + if use_weight_only and (idx == 2 or idx == 4): + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + v.transpose(0, 1).contiguous(), plugin_weight_only_quant_type + ) + layer.value = processed_torch_weights.numpy() + if idx == 2: + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].mlp.fc.per_channel_scale + elif idx == 4: + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) # Attention QKV Linear # concatenate the Q, K, V layers weights. @@ -181,7 +584,16 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, qkv_weights = torch.cat((q_weights, k_weights, v_weights)) layer = attrgetter("attention.qkv.weight")( tensorrt_llm_gpt_j.layers[layer_idx]) - setattr(layer, "value", qkv_weights.to(torch_dtype).cpu().numpy()) + if use_weight_only: + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + qkv_weights.transpose(0, 1).contiguous(), plugin_weight_only_quant_type) + layer.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + setattr(layer, "value", qkv_weights.to(torch_dtype).cpu().numpy()) if scaling_factors: tensorrt_llm_gpt_j.layers[ layer_idx].attention.qkv.activation_scaling_factor.value = np.array( @@ -206,7 +618,16 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, v = hf_gpt_j_state_dict.get(prefix + "attn.out_proj.weight") layer = attrgetter("attention.dense.weight")( tensorrt_llm_gpt_j.layers[layer_idx]) - setattr(layer, "value", v.to(torch_dtype).cpu().numpy()) + if use_weight_only: + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + v.transpose(0, 1).contiguous(), plugin_weight_only_quant_type) + layer.value = processed_torch_weights.numpy() + scales = tensorrt_llm_gpt_j.layers[ + layer_idx].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + setattr(layer, "value", v.to(torch_dtype).cpu().numpy()) if scaling_factors: tensorrt_llm_gpt_j.layers[ layer_idx].attention.dense.activation_scaling_factor.value = np.array( @@ -233,106 +654,13 @@ def load_from_hf_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') -def AWQ_quantize_pack_preprocess(weight, scale, group_size, packer, - preprocessor): - scale = scale.repeat_interleave(group_size, dim=0) - weight = weight / scale - weight = torch.round(weight).char() - weight = torch.where(weight > 7, 7, weight) - qweight_int8 = torch.where(weight < -8, -8, weight) - int4_weight = packer(qweight_int8.cpu()) - int4_weight = preprocessor(int4_weight, torch.quint4x2) - return int4_weight.view(torch.int8).cpu().numpy() - - -def process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype): - weight = awq_gpt_j[mPrefix + ".weight"].T.contiguous() - [k, n] = weight.shape - amax = awq_gpt_j[mPrefix + ".weight_quantizer._amax"].reshape( - (n, int(k / group_size))).T.contiguous() - pre_quant_scale = awq_gpt_j[mPrefix + - ".input_quantizer._pre_quant_scale"].reshape( - (1, k)) - scale = amax / 8.0 - mOp.qweight.value = AWQ_quantize_pack_preprocess(weight, scale, group_size, - packer, preprocessor) - mOp.scale.value = scale.to(torch_dtype).cpu().numpy() - mOp.pre_quant_scale.value = pre_quant_scale.to(torch_dtype).cpu().numpy() - - -def deSmooth(weight, pre_quant_scale): - [k, n] = weight.shape - pre_quant_scale = pre_quant_scale.repeat((n, 1)).transpose(1, - 0).contiguous() - weight = weight * pre_quant_scale - return weight - - -def reSmooth(weight, pre_quant_scale): - [k, n] = weight.shape - pre_quant_scale = pre_quant_scale.repeat((n, 1)).transpose(1, - 0).contiguous() - weight = weight / pre_quant_scale - return weight - - -def get_scale(weight, group_size): - weight = weight.T.contiguous() - [n, k] = weight.shape - weight = weight.reshape(n, int(k / group_size), group_size) - weight = torch.abs(weight.reshape(-1, group_size)) - amax, idx = weight.max(1) - amax = amax.reshape(n, int(k / group_size)).T.contiguous() - return amax / 8 - - -def reSmooth_and_get_scale(weight, pre_quant_scale, avg_pre_quant_scale, - group_size): - weight = deSmooth(weight, pre_quant_scale) - weight = reSmooth(weight, avg_pre_quant_scale) - scale = get_scale(weight, group_size) - return weight, scale - - -def process_and_assign_qkv_weight(awq_gpt_j, prefix, mOp, group_size, packer, - preprocessor, torch_dtype): - q_weight = awq_gpt_j[prefix + "attn.q_proj.weight"].T.contiguous() - k_weight = awq_gpt_j[prefix + "attn.k_proj.weight"].T.contiguous() - v_weight = awq_gpt_j[prefix + "attn.v_proj.weight"].T.contiguous() - [k, n] = q_weight.shape - - q_pre_quant_scale = awq_gpt_j[ - prefix + "attn.q_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) - k_pre_quant_scale = awq_gpt_j[ - prefix + "attn.k_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) - v_pre_quant_scale = awq_gpt_j[ - prefix + "attn.v_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) - - qkv_pre_quant_scale = (q_pre_quant_scale + k_pre_quant_scale + - v_pre_quant_scale) / 3.0 - q_weight, q_scale = reSmooth_and_get_scale(q_weight, q_pre_quant_scale, - qkv_pre_quant_scale, group_size) - k_weight, k_scale = reSmooth_and_get_scale(k_weight, k_pre_quant_scale, - qkv_pre_quant_scale, group_size) - v_weight, v_scale = reSmooth_and_get_scale(v_weight, v_pre_quant_scale, - qkv_pre_quant_scale, group_size) - - qkv_weights = torch.cat((q_weight, k_weight, v_weight), dim=1) - qkv_scale = torch.cat((q_scale, k_scale, v_scale), dim=1) - mOp.pre_quant_scale.value = qkv_pre_quant_scale.to( - torch_dtype).cpu().numpy() - mOp.qweight.value = AWQ_quantize_pack_preprocess(qkv_weights, qkv_scale, - group_size, packer, - preprocessor) - mOp.scale.value = 
qkv_scale.to(torch_dtype).cpu().numpy() - - def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, awq_gpt_j, config, + mapping=Mapping(), fp16=False, - group_size=128): + group_size=128, + ft_model_dir=None): awq_gptj_block_names = [ "ln_1.weight", @@ -348,7 +676,18 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, "mlp.proj.bias", ] - getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + def fromfile(dir_path, name, shape=None, dtype=None): + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + quant_mode = getattr(tensorrt_llm_gpt_j, 'quant_mode', QuantMode(0)) + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm @@ -358,6 +697,103 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, torch_dtype = torch.float16 if fp16 else torch.float32 + def AWQ_quantize_pack_preprocess(weight, scale): + scale = scale.repeat_interleave(group_size, dim=0) + weight = weight / scale + qweight_int8 = torch.clamp(torch.round(weight.cuda()).char(), -8, 7) + int4_weight = packer(qweight_int8.cpu()) + int4_weight = preprocessor(int4_weight, torch.quint4x2) + return int4_weight.view(torch.int8).cpu().numpy() + + def process_and_assign_weight(awq_gpt_j, mPrefix, mOp, tp_dim=0): + weight = awq_gpt_j[mPrefix + ".weight"].T.contiguous() + [k, n] = weight.shape + weight = weight.split(weight.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + amax = awq_gpt_j[mPrefix + ".weight_quantizer._amax"].reshape( + (n, int(k / group_size))).T.contiguous() + amax = amax.split(amax.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + pre_quant_scale = awq_gpt_j[ + mPrefix + ".input_quantizer._pre_quant_scale"].reshape((1, k)) + if tp_dim == 0: + pre_quant_scale = pre_quant_scale.split(k // mapping.tp_size, + dim=1)[mapping.tp_rank] + scale = amax / 8.0 + mOp.qweight.value = AWQ_quantize_pack_preprocess(weight, scale) + mOp.scale.value = scale.to(torch_dtype).cpu().numpy() + mOp.pre_quant_scale.value = pre_quant_scale.to( + torch_dtype).cpu().numpy() + + def deSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight * pre_quant_scale + return weight + + def reSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight / pre_quant_scale + return weight + + def get_scale(weight): + weight = weight.T.contiguous() + [n, k] = weight.shape + weight = weight.reshape(n, int(k / group_size), group_size) + weight = torch.abs(weight.reshape(-1, group_size)) + amax, idx = weight.max(1) + amax = amax.reshape(n, int(k / group_size)).T.contiguous() + return amax / 8 + + def reSmooth_and_get_scale(weight, pre_quant_scale, avg_pre_quant_scale): + weight = deSmooth(weight, pre_quant_scale) + weight = reSmooth(weight, avg_pre_quant_scale) + scale = get_scale(weight) + return weight, scale + + def process_and_assign_qkv_weight(awq_gpt_j, prefix, mOp): + q_weight = awq_gpt_j[prefix + "attn.q_proj.weight"].T.contiguous() + k_weight = awq_gpt_j[prefix + "attn.k_proj.weight"].T.contiguous() + v_weight = awq_gpt_j[prefix + "attn.v_proj.weight"].T.contiguous() + k = q_weight.shape[0] + + q_weight = 
q_weight.split(q_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + k_weight = k_weight.split(k_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + v_weight = v_weight.split(v_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + + q_pre_quant_scale = awq_gpt_j[ + prefix + "attn.q_proj.input_quantizer._pre_quant_scale"].reshape( + (1, k)) + k_pre_quant_scale = awq_gpt_j[ + prefix + "attn.k_proj.input_quantizer._pre_quant_scale"].reshape( + (1, k)) + v_pre_quant_scale = awq_gpt_j[ + prefix + "attn.v_proj.input_quantizer._pre_quant_scale"].reshape( + (1, k)) + + qkv_pre_quant_scale = (q_pre_quant_scale + k_pre_quant_scale + + v_pre_quant_scale) / 3.0 + q_weight, q_scale = reSmooth_and_get_scale(q_weight, q_pre_quant_scale, + qkv_pre_quant_scale) + k_weight, k_scale = reSmooth_and_get_scale(k_weight, k_pre_quant_scale, + qkv_pre_quant_scale) + v_weight, v_scale = reSmooth_and_get_scale(v_weight, v_pre_quant_scale, + qkv_pre_quant_scale) + + qkv_weights = torch.cat((q_weight, k_weight, v_weight), dim=1) + qkv_scale = torch.cat((q_scale, k_scale, v_scale), dim=1) + + mOp.pre_quant_scale.value = qkv_pre_quant_scale.to( + torch_dtype).cpu().numpy() + mOp.qweight.value = AWQ_quantize_pack_preprocess(qkv_weights, qkv_scale) + mOp.scale.value = qkv_scale.to(torch_dtype).cpu().numpy() + #check if we need to pad vocab v = awq_gpt_j.get('transformer.wte.weight') [vocab_size, k] = v.shape @@ -379,6 +815,10 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, tensorrt_llm.logger.info(f'Process weights in layer: {layer_idx}') for idx, awq_attr in enumerate(awq_gptj_block_names): v = awq_gpt_j[prefix + awq_attr] + if awq_attr == "mlp.fc_in.bias": + v = v.split(v.shape[0] // mapping.tp_size, dim=0)[mapping.rank] + elif awq_attr == "mlp.fc_out.bias": + v = torch.zeros_like(v) if mapping.rank != 0 else v layer = attrgetter(tensorrt_llm_model_gptj_block_names[idx])( tensorrt_llm_gpt_j.layers[layer_idx]) setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) @@ -387,26 +827,34 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, # concatenate the Q, K, V layers weights. process_and_assign_qkv_weight( awq_gpt_j, prefix, - tensorrt_llm_gpt_j.layers[layer_idx].attention.qkv, group_size, - packer, preprocessor, torch_dtype) + tensorrt_llm_gpt_j.layers[layer_idx].attention.qkv) # Attention Dense (out_proj) Linear mPrefix = prefix + "attn.out_proj" mOp = tensorrt_llm_gpt_j.layers[layer_idx].attention.dense - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 0) # MLP Dense (mlp.fc) Linear mPrefix = prefix + "mlp.fc_in" mOp = tensorrt_llm_gpt_j.layers[layer_idx].mlp.fc - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 1) # MLP Dense (mlp.proj) Linear mPrefix = prefix + "mlp.fc_out" mOp = tensorrt_llm_gpt_j.layers[layer_idx].mlp.proj - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 0) + + if use_int8_kv_cache: + assert ft_model_dir, "You must pass --ft_model_dir to tell TRT-LLM where to look for scales of INT8 kv cache." + t = fromfile( + ft_model_dir, 'model.layers.' 
+ str(layer_idx) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + assert t is not None, f"{ft_model_dir} does not contain model.layers.{layer_idx}.attention.query_key_value.scale_y_quant_orig.bin" + tensorrt_llm_gpt_j.layers[ + layer_idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_gpt_j.layers[ + layer_idx].attention.kv_quant_orig_scale.value = t v = awq_gpt_j['transformer.ln_f.weight'] tensorrt_llm_gpt_j.ln_f.weight.value = v.to(torch_dtype).cpu().numpy() @@ -421,14 +869,18 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, new_weight = torch.zeros([pad_vocab_size, k]) new_weight[:vocab_size, :] = weight new_weight = new_weight.T.contiguous() + new_weight = new_weight.split(new_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] amax = awq_gpt_j['lm_head.weight_quantizer._amax'].reshape( [vocab_size, int(k / group_size)]) new_amax = torch.ones([pad_vocab_size, int(k / group_size)]) new_amax[:vocab_size, :] = amax new_amax = new_amax.T.contiguous() + new_amax = new_amax.split(new_amax.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] new_scale = new_amax / 8 tensorrt_llm_gpt_j.lm_head.qweight.value = AWQ_quantize_pack_preprocess( - new_weight, new_scale, group_size, packer, preprocessor) + new_weight, new_scale) tensorrt_llm_gpt_j.lm_head.scale.value = new_scale.to( torch_dtype).cpu().numpy() tensorrt_llm_gpt_j.lm_head.pre_quant_scale.value = awq_gpt_j[ @@ -438,13 +890,14 @@ def load_from_awq_gpt_j(tensorrt_llm_gpt_j: GPTJForCausalLM, bias = awq_gpt_j['lm_head.bias'] new_bias = torch.zeros([pad_vocab_size]) new_bias[:vocab_size] = bias + new_bias = new_bias.split(pad_vocab_size // mapping.tp_size, + dim=0)[mapping.tp_rank] tensorrt_llm_gpt_j.lm_head.bias.value = new_bias.to( torch_dtype).cpu().numpy() else: mPrefix = "lm_head" mOp = tensorrt_llm_gpt_j.lm_head - process_and_assign_weight(awq_gpt_j, mPrefix, mOp, group_size, packer, - preprocessor, torch_dtype) + process_and_assign_weight(awq_gpt_j, mPrefix, mOp, 1) v = awq_gpt_j['lm_head.bias'] tensorrt_llm_gpt_j.lm_head.bias.value = v.to(torch_dtype).cpu().numpy() diff --git a/examples/gptneox/build.py b/examples/gptneox/build.py index ca5c50142c..01c55c32f8 100644 --- a/examples/gptneox/build.py +++ b/examples/gptneox/build.py @@ -28,8 +28,7 @@ import tensorrt_llm from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (weight_only_groupwise_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -269,15 +268,22 @@ def build_rank_engine(builder: Builder, use_parallel_embedding=args.use_parallel_embedding, embedding_sharding_dim=args.embedding_sharding_dim) - if args.use_weight_only_quant_matmul_plugin: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt) - - if args.use_weight_only_groupwise_quant_matmul_plugin: - tensorrt_llm_gpt = weight_only_groupwise_quantize( - model=tensorrt_llm_gpt, - quant_mode=QuantMode(0), - group_size=128, - zero=True) + if args.use_weight_only_quant_matmul_plugin or args.use_weight_only_groupwise_quant_matmul_plugin: + quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=args.use_weight_only_groupwise_quant_matmul_plugin, + use_int4_weights=False) + quantize_kwargs = 
{}
+        if args.use_weight_only_groupwise_quant_matmul_plugin:
+            quantize_kwargs = {
+                "group_size": 128,
+                "zero": True,
+            }
+        tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, quant_mode,
+                                          **quantize_kwargs)
     if args.model_dir is not None:
         assert hf_gpt is not None, f'Could not load weights from hf_gpt model as it is not loaded yet.'
@@ -341,6 +347,9 @@ def build_rank_engine(builder: Builder,
     if rank == 0:
         config_path = os.path.join(args.output_dir, 'config.json')
         builder.save_config(builder_config, config_path)
+
+    tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt)
+
     return engine
diff --git a/examples/internlm/.gitignore b/examples/internlm/.gitignore
new file mode 100644
index 0000000000..7ce339719a
--- /dev/null
+++ b/examples/internlm/.gitignore
@@ -0,0 +1,2 @@
+internlm*
+tokenizer.model
diff --git a/examples/internlm/README.md b/examples/internlm/README.md
new file mode 100644
index 0000000000..2f7a3cee7b
--- /dev/null
+++ b/examples/internlm/README.md
@@ -0,0 +1,307 @@
+# InternLM
+
+This document shows how to build and run InternLM 7B / 20B models in TensorRT-LLM on a single GPU, on a single node with multiple GPUs, and on multiple nodes with multiple GPUs.
+
+## Overview
+
+The TensorRT-LLM InternLM implementation can be found in [tensorrt_llm/models/internlm/model.py](../../tensorrt_llm/models/internlm/model.py). The TensorRT-LLM InternLM example code is located in [`examples/internlm`](./). There are three main files in that folder:
+
+ * [`build.py`](./build.py) to build the [TensorRT](https://developer.nvidia.com/tensorrt) engine(s) needed to run the InternLM model,
+ * [`run.py`](./run.py) to run the inference on an input text,
+ * [`summarize.py`](./summarize.py) to summarize the articles in the [cnn_dailymail](https://huggingface.co/datasets/cnn_dailymail) dataset using the model.
+
+## Support Matrix
+  * FP16 / BF16
+  * INT8 & INT4 Weight-Only
+  * SmoothQuant
+  * INT8 KV Cache
+  * Tensor Parallel & Pipeline Parallel
+
+## Usage
+
+The TensorRT-LLM InternLM example code is located in [examples/internlm](./). It takes HF weights as input, and builds the corresponding TensorRT engines. The number of TensorRT engines depends on the number of GPUs used to run inference.
+
+### Build TensorRT engine(s)
+
+TensorRT-LLM InternLM builds TensorRT engine(s) from an HF checkpoint. If no checkpoint directory is specified, TensorRT-LLM will build engine(s) with dummy weights.
+
+InternLM has released several checkpoints of different sizes and capabilities under https://huggingface.co/internlm. Users can pick any one repository and follow its instructions to prepare the checkpoint.
+
+The examples below use [internlm-chat-7b](https://huggingface.co/internlm/internlm-chat-7b) and [internlm-chat-20b](https://huggingface.co/internlm/internlm-chat-20b) and assume these repositories are cloned or linked under this directory, for example `./internlm-chat-7b/`.
+
+Normally `build.py` only requires a single GPU, but if you already have all the GPUs needed for inference, you can make engine building faster by adding the `--parallel_build` argument. Please note that the `parallel_build` feature currently supports a single node only.
+
+Here are some examples:
+
+```bash
+# Build a single-GPU float16 engine from HF weights.
+# use_gpt_attention_plugin is necessary in InternLM.
+# Try use_gemm_plugin to prevent accuracy issues.
+# It is recommended to use --remove_input_padding along with --use_gpt_attention_plugin for better performance.
+
+# Build the InternLM 7B model using a single GPU and FP16.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-7b/trt_engines/fp16/1-gpu/
+
+# Build the InternLM 7B model using a single GPU and BF16.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype bfloat16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin bfloat16 \
+                --enable_context_fmha \
+                --use_gemm_plugin bfloat16 \
+                --output_dir ./internlm-chat-7b/trt_engines/bf16/1-gpu/
+
+# Build the InternLM 7B model using a single GPU and apply INT8 weight-only quantization.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --use_weight_only \
+                --output_dir ./internlm-chat-7b/trt_engines/weight_only/1-gpu/
+
+# Note: set `--weight_only_precision int4` to use INT4 weight-only quantization.
+
+# Build InternLM 7B using 2-way tensor parallelism.
+python build.py --model_dir ./internlm-chat-7b/ \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-7b/trt_engines/fp16/2-gpu/ \
+                --world_size 2 \
+                --tp_size 2 \
+                --parallel_build
+
+# Build InternLM 20B using 2-way tensor parallelism and 2-way pipeline parallelism.
+python build.py --model_dir ./internlm-chat-20b/ \
+                --dtype bfloat16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin bfloat16 \
+                --enable_context_fmha \
+                --use_gemm_plugin bfloat16 \
+                --output_dir ./internlm-chat-20b/trt_engines/bf16/4-gpu/ \
+                --world_size 4 \
+                --tp_size 2 \
+                --pp_size 2 \
+                --parallel_build
+```
+
+#### INT8 weight only + INT8 KV cache
+
+For INT8 KV cache, [`hf_internlm_convert.py`](./hf_internlm_convert.py) features a
+`--calibrate-kv-cache, -kv` option. Setting `-kv` will calibrate the model,
+and then export the scaling factors needed for INT8 KV cache inference.
+
+Example:
+
+```bash
+# For 7B models
+python hf_internlm_convert.py -i ./internlm-chat-7b -o ./internlm-chat-7b/smooth_internlm/int8_kv_cache/ --calibrate-kv-cache -t fp16
+# For 20B models
+python hf_internlm_convert.py -i ./internlm-chat-20b -o ./internlm-chat-20b/smooth_internlm/int8_kv_cache/ --calibrate-kv-cache -t fp16
+```
+
+[`build.py`](./build.py) adds new options to support INT8 KV cache.
+
+`--int8_kv_cache` is the command-line option to enable INT8 KV cache.
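+
+For illustration, INT8 KV cache can also be enabled on its own, without weight-only quantization. The sketch below reuses the calibrated scales exported above; the output directory is only an example and is not referenced elsewhere in this document:
+
+```bash
+# Build the 7B model with INT8 KV cache only (example output path).
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/int8_kv_cache/1-gpu/ \
+                --dtype float16 \
+                --use_gpt_attention_plugin float16 \
+                --use_gemm_plugin float16 \
+                --int8_kv_cache \
+                --output_dir ./internlm-chat-7b/trt_engines/int8_kv_cache/1-gpu
+```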
+
+In addition, it can be combined with INT8 weight-only quantization, as follows:
+
+Examples of INT8 weight-only quantization + INT8 KV cache:
+
+```bash
+# Build 7B model with both INT8 weight-only and INT8 KV cache enabled
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/int8_kv_cache/1-gpu/ \
+                --dtype float16 \
+                --use_gpt_attention_plugin float16 \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-7b/trt_engines/int8_kv_cache_weight_only/1-gpu \
+                --int8_kv_cache \
+                --use_weight_only
+
+# Build 20B model with both INT8 weight-only and INT8 KV cache enabled
+python build.py --ft_model_dir=./internlm-chat-20b/smooth_internlm/int8_kv_cache/1-gpu/ \
+                --dtype float16 \
+                --use_gpt_attention_plugin float16 \
+                --use_gemm_plugin float16 \
+                --output_dir ./internlm-chat-20b/trt_engines/int8_kv_cache_weight_only/1-gpu \
+                --int8_kv_cache \
+                --use_weight_only
+```
+
+Test with `run.py` or `summarize.py`:
+
+```bash
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir ./internlm-chat-7b/trt_engines/int8_kv_cache_weight_only/1-gpu
+
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-20b/ \
+              --engine_dir ./internlm-chat-20b/trt_engines/int8_kv_cache_weight_only/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-7b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-7b/trt_engines/int8_kv_cache_weight_only/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-20b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-20b/trt_engines/int8_kv_cache_weight_only/1-gpu
+```
+
+#### SmoothQuant
+
+Unlike the FP16 build, where the HF weights are processed and loaded into TensorRT-LLM directly, SmoothQuant needs to load INT8 weights that are pre-processed before building an engine.
+
+Example:
+```bash
+# For 7B models
+python hf_internlm_convert.py -i ./internlm-chat-7b -o ./internlm-chat-7b/smooth_internlm/sq0.5/ -sq 0.5 --tensor-parallelism 1 --storage-type fp16
+# For 20B models
+python hf_internlm_convert.py -i ./internlm-chat-20b -o ./internlm-chat-20b/smooth_internlm/sq0.5/ -sq 0.5 --tensor-parallelism 1 --storage-type fp16
+```
+
+[`build.py`](./build.py) adds new options to support INT8 inference of SmoothQuant models.
+
+`--use_smooth_quant` is the starting point of INT8 inference. By default, it
+will run the model in the _per-tensor_ mode.
+
+Then, you can add any combination of `--per_token` and `--per_channel` to get the corresponding behaviors.
+
+Examples of build invocations:
+
+```bash
+# Build model for SmoothQuant in the _per_tensor_ mode.
+
+# 7B model
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --output_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+# 20B model
+python build.py --ft_model_dir=./internlm-chat-20b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --output_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+
+# OR build model for SmoothQuant in the _per_token_ + _per_channel_ mode
+# 7B model
+python build.py --ft_model_dir=./internlm-chat-7b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --per_token \
+                --per_channel \
+                --output_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+# 20B model
+python build.py --ft_model_dir=./internlm-chat-20b/smooth_internlm/sq0.5/1-gpu/ \
+                --use_smooth_quant \
+                --per_token \
+                --per_channel \
+                --output_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+```
+
+Note that we use `--ft_model_dir` instead of `--model_dir` and `--meta_ckpt_dir`, since the SmoothQuant model needs INT8 weights and various scales from the binary files.
+
+Test with `run.py` or `summarize.py`:
+
+```bash
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-20b/ \
+              --engine_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-7b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-7b/trt_engines/smoothquant/1-gpu
+
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-20b \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-20b/trt_engines/smoothquant/1-gpu
+```
+
+### Run
+
+To run a TensorRT-LLM InternLM model using the engines generated by `build.py`:
+
+```bash
+# InternLM 7B with fp16
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir=./internlm-chat-7b/trt_engines/fp16/1-gpu/
+
+# InternLM 7B with bf16
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir=./internlm-chat-7b/trt_engines/bf16/1-gpu/
+
+# InternLM 7B with int8 weight-only quantization
+python run.py --max_output_len=120 \
+              --input_text 'Tell me about yourself.' \
+              --tokenizer_dir ./internlm-chat-7b/ \
+              --engine_dir=./internlm-chat-7b/trt_engines/weight_only/1-gpu/
+
+# InternLM 7B with fp16 and 2-way tensor parallelism
+mpirun -n 2 --allow-run-as-root \
+    python run.py --max_output_len=120 \
+                  --input_text 'Tell me about yourself.' \
+                  --tokenizer_dir ./internlm-chat-7b/ \
+                  --engine_dir=./internlm-chat-7b/trt_engines/fp16/2-gpu/
+
+# InternLM 20B with bf16, 2-way tensor parallelism and 2-way pipeline parallelism
+mpirun -n 4 --allow-run-as-root \
+    python run.py --max_output_len=120 \
+                  --input_text 'Tell me about yourself.' \
+                  --tokenizer_dir ./internlm-chat-20b/ \
+                  --engine_dir=./internlm-chat-20b/trt_engines/bf16/4-gpu/
+```
+
+### Summarization using the InternLM model
+
+```bash
+# Run summarization using the InternLM 7B model in FP16.
+python summarize.py --test_trt_llm --test_hf \
+                    --hf_model_location ./internlm-chat-7b/ \
+                    --data_type fp16 \
+                    --engine_dir ./internlm-chat-7b/trt_engines/fp16/1-gpu/
+
+# Run summarization using the InternLM 7B model quantized to INT8.
+python summarize.py --test_trt_llm --test_hf \ + --hf_model_location ./internlm-chat-7b/ \ + --data_type fp16 \ + --engine_dir ./internlm-chat-7b/trt_engines/weight_only/1-gpu/ + +# Run summarization using the InternLM 7B model in FP16 using two GPUs. +mpirun -n 2 --allow-run-as-root \ + python summarize.py --test_trt_llm --test_hf \ + --hf_model_location ./internlm-chat-7b/ \ + --data_type fp16 \ + --engine_dir ./internlm-chat-7b/trt_engines/fp16/2-gpu/ + +# Run summarization using the InternLM 20B model in BF16 using 4 GPUs. +mpirun -n 4 --allow-run-as-root \ + python summarize.py --test_trt_llm --test_hf \ + --hf_model_location ./internlm-chat-20b/ \ + --data_type bf16 \ + --engine_dir ./internlm-chat-20b/trt_engines/bf16/4-gpu/ +``` diff --git a/examples/internlm/build.py b/examples/internlm/build.py new file mode 100644 index 0000000000..f72706319f --- /dev/null +++ b/examples/internlm/build.py @@ -0,0 +1,724 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import argparse +import json +import os +import time +from pathlib import Path + +import tensorrt as trt +import torch +import torch.multiprocessing as mp +from transformers import AutoConfig, AutoModelForCausalLM +from weight import (get_scaling_factors, load_from_awq_internlm, + load_from_binary, load_from_gptq_internlm, + load_from_hf_internlm, load_from_meta_internlm) + +import tensorrt_llm +from tensorrt_llm._utils import str_dtype_to_trt +from tensorrt_llm.builder import Builder +from tensorrt_llm.layers.attention import PositionEmbeddingType +from tensorrt_llm.logger import logger +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import quantize_model +from tensorrt_llm.network import net_guard +from tensorrt_llm.plugin.plugin import ContextFMHAType +from tensorrt_llm.quantization import QuantMode + +from weight import parse_ft_config # isort:skip + +MODEL_NAME = "internlm" + +# 2 routines: get_engine_name, serialize_engine +# are direct copy from gpt example, TODO: put in utils? 
+ +import onnx +import tensorrt as trt +from onnx import TensorProto, helper + + +def trt_dtype_to_onnx(dtype): + if dtype == trt.float16: + return TensorProto.DataType.FLOAT16 + elif dtype == trt.float32: + return TensorProto.DataType.FLOAT + elif dtype == trt.int32: + return TensorProto.DataType.INT32 + else: + raise TypeError("%s is not supported" % dtype) + + +def to_onnx(network, path): + inputs = [] + for i in range(network.num_inputs): + network_input = network.get_input(i) + inputs.append( + helper.make_tensor_value_info( + network_input.name, trt_dtype_to_onnx(network_input.dtype), + list(network_input.shape))) + + outputs = [] + for i in range(network.num_outputs): + network_output = network.get_output(i) + outputs.append( + helper.make_tensor_value_info( + network_output.name, trt_dtype_to_onnx(network_output.dtype), + list(network_output.shape))) + + nodes = [] + for i in range(network.num_layers): + layer = network.get_layer(i) + layer_inputs = [] + for j in range(layer.num_inputs): + ipt = layer.get_input(j) + if ipt is not None: + layer_inputs.append(layer.get_input(j).name) + layer_outputs = [ + layer.get_output(j).name for j in range(layer.num_outputs) + ] + nodes.append( + helper.make_node(str(layer.type), + name=layer.name, + inputs=layer_inputs, + outputs=layer_outputs, + domain="com.nvidia")) + + onnx_model = helper.make_model(helper.make_graph(nodes, + 'attention', + inputs, + outputs, + initializer=None), + producer_name='NVIDIA') + onnx.save(onnx_model, path) + + +def get_engine_name(model, dtype, tp_size, pp_size, rank): + if pp_size == 1: + return '{}_{}_tp{}_rank{}.engine'.format(model, dtype, tp_size, rank) + return '{}_{}_tp{}_pp{}_rank{}.engine'.format(model, dtype, tp_size, + pp_size, rank) + + +def serialize_engine(engine, path): + logger.info(f'Serializing engine to {path}...') + tik = time.time() + with open(path, 'wb') as f: + f.write(bytearray(engine)) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine serialized. 
Total time: {t}') + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--world_size', type=int, default=1) + parser.add_argument('--tp_size', type=int, default=1) + parser.add_argument('--pp_size', type=int, default=1) + parser.add_argument('--model_dir', type=str, default=None) + parser.add_argument('--ft_model_dir', type=str, default=None) + parser.add_argument('--meta_ckpt_dir', type=str, default=None) + parser.add_argument('--quant_ckpt_path', type=str, default=None) + parser.add_argument('--dtype', + type=str, + default='float16', + choices=['float32', 'bfloat16', 'float16']) + parser.add_argument( + '--timing_cache', + type=str, + default='model.cache', + help= + 'The path of to read timing cache from, will be ignored if the file does not exist' + ) + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--vocab_size', type=int, default=103168) + parser.add_argument('--n_layer', type=int, default=32) + parser.add_argument('--n_positions', type=int, default=2048) + parser.add_argument('--n_embd', type=int, default=4096) + parser.add_argument('--n_head', type=int, default=32) + parser.add_argument('--n_kv_head', type=int, default=None) + parser.add_argument('--multiple_of', type=int, default=256) + parser.add_argument('--ffn_dim_multiplier', type=float, default=1.0) + parser.add_argument('--inter_size', type=int, default=None) + parser.add_argument('--hidden_act', type=str, default='silu') + parser.add_argument('--rms_norm_eps', type=float, default=1e-06) + parser.add_argument('--max_batch_size', type=int, default=8) + parser.add_argument('--max_input_len', type=int, default=2048) + parser.add_argument('--max_output_len', type=int, default=512) + parser.add_argument('--max_beam_width', type=int, default=1) + parser.add_argument('--rotary_base', type=float, default=10000.0) + parser.add_argument('--rotary_scaling', nargs=2, type=str, default=None) + parser.add_argument('--use_gpt_attention_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_gemm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'bfloat16', 'float32']) + parser.add_argument('--use_rmsnorm_plugin', + nargs='?', + const='float16', + type=str, + default=False, + choices=['float16', 'float32', 'bfloat16']) + parser.add_argument('--parallel_build', default=False, action='store_true') + parser.add_argument('--enable_context_fmha', + default=False, + action='store_true') + parser.add_argument('--enable_context_fmha_fp32_acc', + default=False, + action='store_true') + parser.add_argument('--visualize', default=False, action='store_true') + parser.add_argument('--enable_debug_output', + default=False, + action='store_true') + parser.add_argument('--gpus_per_node', type=int, default=8) + parser.add_argument('--builder_opt', type=int, default=None) + parser.add_argument( + '--output_dir', + type=str, + default='internlm_outputs', + help= + 'The path to save the serialized engine files, timing cache file and model configs' + ) + parser.add_argument('--remove_input_padding', + default=False, + action='store_true') + + # Arguments related to the quantization of the model. + parser.add_argument( + '--use_smooth_quant', + default=False, + action="store_true", + help= + 'Use the SmoothQuant method to quantize activations and weights for the various GEMMs.' + 'See --per_channel and --per_token for finer-grained quantization options.' 
+ ) + parser.add_argument( + '--per_channel', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor for the GEMM\'s result. ' + 'per_channel instead uses a different static scaling factor for each channel. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_token', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale activations in the int8 range. ' + 'per_token chooses at run time, and for each token, a custom scaling factor. ' + 'The latter is usually more accurate, but a little slower.') + parser.add_argument( + '--per_group', + default=False, + action="store_true", + help= + 'By default, we use a single static scaling factor to scale weights in the int4 range. ' + 'per_group chooses at run time, and for each group, a custom scaling factor. ' + 'The flag is built for GPTQ/AWQ quantization.') + parser.add_argument('--group_size', + type=int, + default=128, + help='Group size used in GPTQ/AWQ quantization.') + parser.add_argument( + '--int8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. int8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--use_parallel_embedding', + action="store_true", + default=False, + help= + 'By default embedding parallelism is disabled. By setting this flag, embedding parallelism is enabled' + ) + parser.add_argument( + '--embedding_sharding_dim', + type=int, + default=1, # Meta does TP on hidden dim + choices=[0, 1], + help= + 'By default the embedding lookup table is sharded along vocab dimension (embedding_sharding_dim=0). ' + 'To shard it along hidden dimension, set embedding_sharding_dim=1' + 'Note: embedding sharing is only enabled when embedding_sharding_dim = 0' + ) + parser.add_argument( + '--enable_fp8', + default=False, + action='store_true', + help='Use FP8 Linear layer for Attention QKV/Dense and MLP.') + parser.add_argument( + '--fp8_kv_cache', + default=False, + action="store_true", + help= + 'By default, we use dtype for KV cache. fp8_kv_cache chooses int8 quantization for KV' + ) + parser.add_argument( + '--quantized_fp8_model_path', + type=str, + default=None, + help='Path of a quantized model checkpoint in .npz format') + parser.add_argument( + '--use_weight_only', + default=False, + action="store_true", + help='Quantize weights for the various GEMMs to INT4/INT8.' + 'See --weight_only_precision to set the precision') + parser.add_argument( + '--weight_only_precision', + const='int8', + type=str, + nargs='?', + default='int8', + choices=['int8', 'int4', 'int4_awq', 'int4_gptq'], + help= + 'Define the precision for the weights when using weight-only quantization.' + 'You must also use --use_weight_only for that argument to have an impact.' + ) + parser.add_argument( + '--use_inflight_batching', + action="store_true", + default=False, + help="Activates inflight batching mode of gptAttentionPlugin.") + parser.add_argument( + '--paged_kv_cache', + action="store_true", + default=False, + help= + 'By default we use contiguous KV cache. 
By setting this flag you enable paged KV cache' + ) + parser.add_argument('--tokens_per_block', + type=int, + default=64, + help='Number of tokens per block in paged KV cache') + parser.add_argument( + '--max_num_tokens', + type=int, + default=None, + help='Define the max number of tokens supported by the engine') + parser.add_argument( + '--strongly_typed', + default=False, + action="store_true", + help= + 'This option is introduced with trt 9.1.0.1+ and will reduce the building time significantly for fp8.' + ) + parser.add_argument( + '--use_custom_all_reduce', + action='store_true', + help= + 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') + + args = parser.parse_args() + tensorrt_llm.logger.set_level(args.log_level) + + assert not ( + args.use_smooth_quant and args.use_weight_only + ), "You cannot enable both SmoothQuant and INT8 weight-only together." + + if not args.remove_input_padding: + if args.use_gpt_attention_plugin: + logger.warning( + f"It is recommended to specify --remove_input_padding when using GPT attention plugin" + ) + + if args.use_inflight_batching: + if not args.use_gpt_attention_plugin: + args.use_gpt_attention_plugin = 'float16' + logger.info( + f"Using GPT attention plugin for inflight batching mode. Setting to default '{args.use_gpt_attention_plugin}'" + ) + if not args.remove_input_padding: + args.remove_input_padding = True + logger.info( + "Using remove input padding for inflight batching mode.") + if not args.paged_kv_cache: + args.paged_kv_cache = True + logger.info("Using paged KV cache for inflight batching mode.") + + if args.use_smooth_quant: + args.quant_mode = QuantMode.use_smooth_quant(args.per_token, + args.per_channel) + elif args.use_weight_only: + if args.per_group: + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=True, + use_int4_weights=True) + else: + args.quant_mode = QuantMode.use_weight_only( + args.weight_only_precision == 'int4') + else: + args.quant_mode = QuantMode(0) + + if args.int8_kv_cache: + args.quant_mode = args.quant_mode.set_int8_kv_cache() + elif args.fp8_kv_cache: + args.quant_mode = args.quant_mode.set_fp8_kv_cache() + if args.enable_fp8: + args.quant_mode = args.quant_mode.set_fp8_qdq() + + if args.rotary_scaling is not None: + rotary_scaling = { + "type": args.rotary_scaling[0], + "factor": float(args.rotary_scaling[1]) + } + assert rotary_scaling["type"] in ["linear", "dynamic"] + assert rotary_scaling["factor"] > 1.0 + args.rotary_scaling = rotary_scaling + if rotary_scaling["type"] == "dynamic": + assert not args.remove_input_padding, "TODO: Not supported yet" + + # Since gpt_attenttion_plugin is the only way to apply RoPE now, + # force use the plugin for now with the correct data type. 
+ args.use_gpt_attention_plugin = args.dtype + if args.model_dir is not None: + hf_config = AutoConfig.from_pretrained(args.model_dir, + trust_remote_code=True) + args.inter_size = hf_config.intermediate_size # override the inter_size for InternLM + args.n_embd = hf_config.hidden_size + args.n_head = hf_config.num_attention_heads + if hasattr(hf_config, "num_key_value_heads"): + args.n_kv_head = hf_config.num_key_value_heads + args.n_layer = hf_config.num_hidden_layers + args.n_positions = hf_config.max_position_embeddings + args.vocab_size = hf_config.vocab_size + args.hidden_act = hf_config.hidden_act + args.rms_norm_eps = hf_config.rms_norm_eps + args.attn_bias = hf_config.bias + elif args.meta_ckpt_dir is not None: + # Not tested + with open(Path(args.meta_ckpt_dir, "params.json")) as fp: + meta_config: dict = json.load(fp) + args.n_embd = meta_config["dim"] + args.n_head = meta_config["n_heads"] + args.n_layer = meta_config["n_layers"] + args.n_kv_head = meta_config.get("n_kv_heads", args.n_head) + args.multiple_of = meta_config["multiple_of"] + args.ffn_dim_multiplier = meta_config.get("ffn_dim_multiplier", 1) + n_embd = int(4 * args.n_embd * 2 / 3) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // + args.multiple_of) + args.rms_norm_eps = meta_config["norm_eps"] + elif args.ft_model_dir is not None: + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head, attn_bias = parse_ft_config( + Path(args.ft_model_dir) / "config.ini") + args.inter_size = inter_size # override the inter_size for InternLM + args.n_kv_head = n_kv_head + args.n_embd = n_embd + args.n_head = n_head + args.n_layer = n_layer + args.n_positions = n_positions + args.vocab_size = vocab_size + args.hidden_act = hidden_act + args.rms_norm_eps = 1e-06 + logger.warning("Set rms_norm_eps to 1e-06 directly.") + args.attn_bias = attn_bias + assert args.use_gpt_attention_plugin, "InternLM must use gpt attention plugin" + if args.n_kv_head is None: + args.n_kv_head = args.n_head + elif args.n_kv_head != args.n_head: + assert (args.n_head % args.n_kv_head) == 0, \ + "MQA/GQA requires the number of heads to be divisible by the number of K/V heads." + assert (args.n_kv_head % args.tp_size) == 0 or (args.tp_size % args.n_kv_head) == 0, \ + "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ + "the tensor parallelism size to be divisible by the number of K/V heads." + + if args.dtype == 'bfloat16': + assert args.use_gemm_plugin, "Please use gemm plugin when dtype is bfloat16" + + assert args.pp_size * args.tp_size == args.world_size + + if args.max_num_tokens is not None: + assert args.enable_context_fmha + + if args.inter_size is None: + # this should not be need when loading a real model + # but it is helpful when creating a dummy model without loading any real weights + n_embd = int(4 * args.n_embd * 2 / 3) + args.inter_size = args.multiple_of * ( + (int(n_embd * args.ffn_dim_multiplier) + args.multiple_of - 1) // + args.multiple_of) + logger.info(f"Setting inter_size to {args.inter_size}.") + + return args + + +def build_rank_engine(builder: Builder, + builder_config: tensorrt_llm.builder.BuilderConfig, + engine_name, rank, args): + ''' + @brief: Build the engine on the given rank. + @param rank: The rank to build the engine. + @param args: The cmd line arguments. + @return: The built engine. 
+ ''' + dtype = str_dtype_to_trt(args.dtype) + mapping = Mapping(world_size=args.world_size, + rank=rank, + tp_size=args.tp_size, + pp_size=args.pp_size) + + assert args.n_layer % args.pp_size == 0, \ + f"num_layers {args.n_layer} must be a multiple of pipeline parallelism size {args.pp_size}" + + # Initialize Module + tensorrt_llm_internlm = tensorrt_llm.models.InternLMForCausalLM( + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + attn_bias=args.attn_bias, + max_position_embeddings=args.n_positions, + dtype=dtype, + mlp_hidden_size=args.inter_size, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + mapping=mapping, + rotary_base=args.rotary_base, + rotary_scaling=args.rotary_scaling, + use_parallel_embedding=args.use_parallel_embedding, + embedding_sharding_dim=args.embedding_sharding_dim, + quant_mode=args.quant_mode, + rms_norm_eps=args.rms_norm_eps) + if args.use_smooth_quant: + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + args.quant_mode) + elif args.use_weight_only: + if args.weight_only_precision == 'int8': + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + args.quant_mode) + elif args.weight_only_precision == 'int4': + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + args.quant_mode) + elif args.weight_only_precision == 'int4_awq': + tensorrt_llm_internlm = quantize_model(model=tensorrt_llm_internlm, + quant_mode=args.quant_mode, + group_size=args.group_size, + zero=False, + pre_quant_scale=True, + exclude_modules=[]) + elif args.weight_only_precision == 'int4_gptq': + tensorrt_llm_internlm = quantize_model(model=tensorrt_llm_internlm, + quant_mode=args.quant_mode, + group_size=args.group_size, + zero=True, + pre_quant_scale=False) + elif args.enable_fp8 or args.fp8_kv_cache: + logger.info(f'Loading scaling factors from ' + f'{args.quantized_fp8_model_path}') + quant_scales = get_scaling_factors(args.quantized_fp8_model_path, + num_layers=args.n_layer, + quant_mode=args.quant_mode) + tensorrt_llm_internlm = quantize_model(tensorrt_llm_internlm, + quant_mode=args.quant_mode, + quant_scales=quant_scales) + if args.per_group: + load_func = load_from_awq_internlm if args.weight_only_precision == 'int4_awq' else load_from_gptq_internlm + load_func(tensorrt_llm_internlm=tensorrt_llm_internlm, + quant_ckpt_path=args.quant_ckpt_path, + mapping=mapping, + dtype=args.dtype) + elif args.meta_ckpt_dir is not None: + load_from_meta_internlm(tensorrt_llm_internlm, args.meta_ckpt_dir, + mapping, args.dtype) + elif args.model_dir is not None: + logger.info(f'Loading HF InternLM ... from {args.model_dir}') + tik = time.time() + hf_internlm = AutoModelForCausalLM.from_pretrained( + args.model_dir, + device_map={ + "model": "cpu", + "lm_head": "cpu" + }, # Load to CPU memory + torch_dtype="auto", + trust_remote_code=True) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'HF InternLM loaded. 
Total time: {t}') + load_from_hf_internlm(tensorrt_llm_internlm, + hf_internlm, + mapping=mapping, + dtype=args.dtype) + del hf_internlm + elif args.ft_model_dir is not None: + load_from_binary(tensorrt_llm_internlm, + args.ft_model_dir, + mapping, + fp16=(args.dtype == 'float16'), + multi_query_mode=(args.n_kv_head != args.n_head)) + + # Module -> Network + network = builder.create_network() + network.trt_network.name = engine_name + if args.use_gpt_attention_plugin: + network.plugin_config.set_gpt_attention_plugin( + dtype=args.use_gpt_attention_plugin) + if args.use_gemm_plugin: + network.plugin_config.set_gemm_plugin(dtype=args.use_gemm_plugin) + if args.use_rmsnorm_plugin: + network.plugin_config.set_rmsnorm_plugin(dtype=args.use_rmsnorm_plugin) + + # Quantization plugins. + if args.use_smooth_quant: + network.plugin_config.set_smooth_quant_gemm_plugin(dtype=args.dtype) + network.plugin_config.set_rmsnorm_quantization_plugin(dtype=args.dtype) + network.plugin_config.set_quantize_tensor_plugin() + network.plugin_config.set_quantize_per_token_plugin() + assert not (args.enable_context_fmha and args.enable_context_fmha_fp32_acc) + if args.enable_context_fmha: + network.plugin_config.set_context_fmha(ContextFMHAType.enabled) + if args.enable_context_fmha_fp32_acc: + network.plugin_config.set_context_fmha( + ContextFMHAType.enabled_with_fp32_acc) + if args.use_weight_only: + if args.per_group: + network.plugin_config.set_weight_only_groupwise_quant_matmul_plugin( + dtype='float16') + else: + network.plugin_config.set_weight_only_quant_matmul_plugin( + dtype='float16') + if args.world_size > 1: + network.plugin_config.set_nccl_plugin(args.dtype, + args.use_custom_all_reduce) + if args.remove_input_padding: + network.plugin_config.enable_remove_input_padding() + if args.paged_kv_cache: + network.plugin_config.enable_paged_kv_cache(args.tokens_per_block) + + with net_guard(network): + # Prepare + network.set_named_parameters(tensorrt_llm_internlm.named_parameters()) + + # Forward + inputs = tensorrt_llm_internlm.prepare_inputs(args.max_batch_size, + args.max_input_len, + args.max_output_len, True, + args.max_beam_width, + args.max_num_tokens) + tensorrt_llm_internlm(*inputs) + if args.enable_debug_output: + # mark intermediate nodes' outputs + for k, v in tensorrt_llm_internlm.named_network_outputs(): + v = v.trt_tensor + v.name = k + network.trt_network.mark_output(v) + v.dtype = dtype + if args.visualize: + model_path = os.path.join(args.output_dir, 'test.onnx') + to_onnx(network.trt_network, model_path) + + tensorrt_llm.graph_rewriting.optimize(network) + + engine = None + + # Network -> Engine + engine = builder.build_engine(network, builder_config) + if rank == 0: + config_path = os.path.join(args.output_dir, 'config.json') + builder.save_config(builder_config, config_path) + return engine + + +def build(rank, args): + torch.cuda.set_device(rank % args.gpus_per_node) + logger.set_level(args.log_level) + if not os.path.exists(args.output_dir): + os.makedirs(args.output_dir) + + # when doing serializing build, all ranks share one engine + builder = Builder() + + cache = None + for cur_rank in range(args.world_size): + # skip other ranks if parallel_build is enabled + if args.parallel_build and cur_rank != rank: + continue + # NOTE: when only int8 kv cache is used together with paged kv cache no int8 tensors are exposed to TRT + int8_trt_flag = args.quant_mode.has_act_or_weight_quant() or ( + not args.paged_kv_cache and args.quant_mode.has_int8_kv_cache()) + builder_config = 
builder.create_builder_config( + name=MODEL_NAME, + precision=args.dtype, + timing_cache=args.timing_cache if cache is None else cache, + tensor_parallel=args.tp_size, + pipeline_parallel=args.pp_size, + parallel_build=args.parallel_build, + num_layers=args.n_layer, + num_heads=args.n_head, + num_kv_heads=args.n_kv_head, + hidden_size=args.n_embd, + vocab_size=args.vocab_size, + hidden_act=args.hidden_act, + max_position_embeddings=args.n_positions, + max_batch_size=args.max_batch_size, + max_input_len=args.max_input_len, + max_output_len=args.max_output_len, + max_num_tokens=args.max_num_tokens, + int8=int8_trt_flag, + fp8=args.quant_mode.has_fp8_qdq(), + quant_mode=args.quant_mode, + strongly_typed=args.strongly_typed, + opt_level=args.builder_opt) + engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, + args.pp_size, cur_rank) + engine = build_rank_engine(builder, builder_config, engine_name, + cur_rank, args) + assert engine is not None, f'Failed to build engine for rank {cur_rank}' + + if cur_rank == 0: + # Use in-memory timing cache for multiple builder passes. + if not args.parallel_build: + cache = builder_config.trt_builder_config.get_timing_cache() + + serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + + if rank == 0: + ok = builder.save_timing_cache( + builder_config, os.path.join(args.output_dir, "model.cache")) + assert ok, "Failed to save timing cache." + + +if __name__ == '__main__': + args = parse_arguments() + tik = time.time() + if args.parallel_build and args.world_size > 1 and \ + torch.cuda.device_count() >= args.world_size: + logger.warning( + f'Parallelly build TensorRT engines. Please make sure that all of the {args.world_size} GPUs are totally free.' + ) + mp.spawn(build, nprocs=args.world_size, args=(args, )) + else: + args.parallel_build = False + logger.info('Serially build TensorRT engines.') + build(0, args) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Total time of building all {args.world_size} engines: {t}') diff --git a/examples/internlm/convert.py b/examples/internlm/convert.py new file mode 100644 index 0000000000..26831c1c54 --- /dev/null +++ b/examples/internlm/convert.py @@ -0,0 +1,322 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" + Utilities for exporting a model to our custom format. 
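+    Note added for clarity: each tensor is written as a raw binary file named
+    model.<key>.bin (shared tensors) or model.<key>.<tp_rank>.bin (tensor-parallel
+    shards) under a "<tp>-gpu" output directory; see save_val/save_split below and
+    hf_internlm_convert.py for how the directory is created.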
+""" +import numpy as np +import torch + + +def save_val(val, dir, key, tp_num=None): + suffix = "bin" if tp_num is None else f"{tp_num}.bin" + val.tofile(dir / f"model.{key}.{suffix}") + + +def save_split(split_vals, dir, key, i, factor): + for j, val in enumerate(split_vals): + save_val(val, dir, key, i * factor + j) + + +def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False): + """ + This function has two purposes: + - compute quantized weights, scaled either per-tensor or per-column + - compute scaling factors + + Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ. + CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W. + CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor. + + Here is the list of what we need (T means per-tensor, C per-column): + - scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T) + - scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T) + - scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C) + - scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32) + to quant range (int8) (used for CUBLAS) (T, C) + + Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too, + but then the model would change depending on the number of GPUs used. + + For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it + as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V. + For our GEMM implementation to respect this behavior, we use per-column mode and replicate values along columns. + """ + + # compute weight scaling factors for fp->int8 and int8->fp + if is_qkv and not multi_query_mode: + scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max( + dim=-1, keepdims=True)[0].cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].reshape(3, + -1).cpu().numpy() + elif is_qkv and multi_query_mode: + hidden_dim = weights.shape[0] + local_dim = act_range["w"].shape[0] + kv_dim = (local_dim - hidden_dim) // 2 + scale_w_q = act_range["w"][0:hidden_dim] + scale_w_k = act_range["w"][hidden_dim:hidden_dim + kv_dim] + scale_w_v = act_range["w"][-kv_dim:] + + scale_w_qkv_t = torch.concat([ + scale_w_q.max(dim=0, keepdim=True)[0], + scale_w_k.max(dim=0, keepdim=True)[0], + scale_w_v.max(dim=0, keepdim=True)[0] + ]) + + scale_w_orig_quant_t = 127. / scale_w_qkv_t.cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + else: + scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy() + scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy() + scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t + scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c + + # compute the rest of needed scaling factors + scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item()) + scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item()) + scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.) 
+ scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_t) + scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t * + scale_w_orig_quant_c) + if is_qkv and not multi_query_mode: + scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t, + scale_w_orig_quant_c.shape) + scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t, + scale_w_orig_quant_c.shape) + if is_qkv and multi_query_mode: + scale_q_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[0], + scale_w_q.shape) + scale_k_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[1], + scale_w_k.shape) + scale_v_y_accum_t = np.broadcast_to(scale_y_accum_quant_t[2], + scale_w_v.shape) + scale_y_accum_quant_t = np.concatenate( + [scale_q_y_accum_t, scale_k_y_accum_t, scale_v_y_accum_t]) + scale_w_quant_orig_t = np.concatenate([ + np.broadcast_to(scale_w_quant_orig_t[0], scale_w_q.shape), + np.broadcast_to(scale_w_quant_orig_t[1], scale_w_k.shape), + np.broadcast_to(scale_w_quant_orig_t[2], scale_w_v.shape) + ]) + + to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8) + + if is_qkv and multi_query_mode: + scale_w_quant_orig_t_expand = np.ones([weights.shape[-1]]) + scale_w_quant_orig_t_expand[:hidden_dim] = scale_w_quant_orig_t[0] + scale_w_quant_orig_t_expand[hidden_dim:hidden_dim + + kv_dim] = scale_w_quant_orig_t[1] + scale_w_quant_orig_t_expand[-kv_dim:] = scale_w_quant_orig_t[2] + weight_int8 = to_i8(weights * scale_w_quant_orig_t_expand) + else: + weight_int8 = to_i8(weights * scale_w_orig_quant_t) + return { + "weight.int8": weight_int8, + "weight.int8.col": to_i8(weights * scale_w_orig_quant_c), + "scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32), + "scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32), + "scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32), + "scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32), + "scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32), + "scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32), + } + + +def save_multi_query_mode_qkv_int8(val, dir, base_key, saved_key, factor, rank, + local_dim, head_size): + q, k, v = np.split(val, [local_dim, local_dim + head_size], axis=-1) + q_split = np.split(q, factor, axis=-1) + k_split = np.split(k, factor, axis=-1) + v_split = np.split(v, factor, axis=-1) + split_vals = [ + np.concatenate((q_split[ii], k_split[ii], v_split[ii]), axis=-1) + for ii in range(factor) + ] + save_split(split_vals, dir, f"{base_key}.{saved_key}", rank, factor) + + +def write_int8(vals, + dir, + base_key, + split_dim, + i, + factor, + is_qkv=False, + multi_query_mode=False): + saved_keys_once = [ + "scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant", + "scale_y_quant_orig" + ] + + if is_qkv and multi_query_mode: + assert split_dim == -1 + local_dim = vals["weight.int8"].shape[0] + head_size = (vals["weight.int8"].shape[1] - local_dim) // 2 + + save_multi_query_mode_qkv_int8(vals["weight.int8"], dir, base_key, + "weight.int8", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["weight.int8.col"], dir, base_key, + "weight.int8.col", factor, i, local_dim, + head_size) + save_multi_query_mode_qkv_int8(vals["scale_w_quant_orig.col"], dir, + base_key, "scale_w_quant_orig.col", + factor, i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_y_accum_quant.col"], dir, + base_key, "scale_y_accum_quant.col", + factor, i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_w_quant_orig"], dir, + 
base_key, "scale_w_quant_orig", factor, + i, local_dim, head_size) + save_multi_query_mode_qkv_int8(vals["scale_y_accum_quant"], dir, + base_key, "scale_y_accum_quant", factor, + i, local_dim, head_size) + saved_keys_once = ["scale_x_orig_quant", "scale_y_quant_orig"] + else: + save_split(np.split(vals["weight.int8"], factor, axis=split_dim), dir, + f"{base_key}.weight.int8", i, factor) + save_split(np.split(vals["weight.int8.col"], factor, axis=split_dim), + dir, f"{base_key}.weight.int8.col", i, factor) + + if split_dim == -1: + save_split( + np.split(vals["scale_w_quant_orig.col"], factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig.col", i, factor) + save_split( + np.split(vals["scale_y_accum_quant.col"], + factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant.col", i, factor) + if is_qkv: + save_split( + np.split(vals["scale_y_accum_quant"], + factor, + axis=split_dim), dir, + f"{base_key}.scale_y_accum_quant", i, factor) + save_split( + np.split(vals["scale_w_quant_orig"], factor, + axis=split_dim), dir, + f"{base_key}.scale_w_quant_orig", i, factor) + saved_keys_once = ["scale_x_orig_quant", "scale_y_quant_orig"] + else: + saved_keys_once += [ + "scale_w_quant_orig.col", "scale_y_accum_quant.col" + ] + + if i == 0: + for save_key in saved_keys_once: + save_val(vals[save_key], dir, f"{base_key}.{save_key}") + + +def str_to_np_dtype(type_str): + convert_dict = { + "fp32": np.float32, + "fp16": np.float16, + } + dtype = convert_dict.get(type_str) + if dtype is None: + raise ValueError(f"{type_str} is an invalid storage type") + return dtype + + +def split_and_save_weight(i, saved_dir, factor, key, val, act_range, config): + # The split_factor indicates the number of ranks to implement + # distributed GEMMs. For Tensor Parallelism, each rank/GPU works + # on split_hidden_dim // split_factor channels. 
+ + int8_outputs = config.get("int8_outputs", None) + multi_query_mode = config.get("multi_query_mode", False) + local_dim = config.get("local_dim", None) + + save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only" + + if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \ + "attention.dense.bias" in key or "post_layernorm.weight" in key or \ + "post_attention_layernorm.bias" in key or "mlp.dense_4h_to_h.bias" in key or \ + "final_layernorm.weight" in key or "final_layernorm.bias" in key: + + # shared weights, only need to convert the weights of rank 0 + if i == 0: + save_val(val, saved_dir, key) + + elif "attention.dense.weight" in key or "mlp.proj.weight" in key: + split_dim = 0 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "mlp.fc.weight" in key or "mlp.gate.weight" in key: + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if act_range is not None and int8_outputs == "all": + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, act_range) + write_int8(vals_i8, saved_dir, base_key, split_dim, i, factor) + + elif "attention.query_key_value.weight" in key: + hidden_dim = val.shape[0] + if local_dim is None: + local_dim = val.shape[-1] // 3 + if multi_query_mode: + head_size = (val.shape[-1] - local_dim) // 2 + val = val.reshape(hidden_dim, local_dim + 2 * head_size) + w_q, w_k, w_v = np.split(val, [local_dim, local_dim + head_size], + axis=-1) + w_q_split = np.split(w_q, factor, axis=-1) + w_k_split = np.split(w_k, factor, axis=-1) + w_v_split = np.split(w_v, factor, axis=-1) + split_vals = [ + np.concatenate((w_q_split[ii], w_k_split[ii], w_v_split[ii]), + axis=-1) for ii in range(factor) + ] + split_dim = -1 + else: + val = val.reshape(hidden_dim, 3, local_dim) + split_dim = -1 + split_vals = np.split(val, factor, axis=split_dim) + save_split(split_vals, saved_dir, key, i, factor) + if save_int8: + base_key = key.replace(".weight", "") + vals_i8 = generate_int8(val, + act_range, + is_qkv=True, + multi_query_mode=multi_query_mode) + write_int8(vals_i8, + saved_dir, + base_key, + split_dim, + i, + factor, + is_qkv=True, + multi_query_mode=multi_query_mode) + + elif "attention.query_key_value.bias" in key: + if local_dim is None: + local_dim = val.shape[-1] // 3 + + val = val.reshape(3, local_dim) + split_vals = np.split(val, factor, axis=-1) + save_split(split_vals, saved_dir, key, i, factor) + + elif "attention.dense.smoother" in key or "mlp.proj.smoother" in key: + split_vals = np.split(val, factor, axis=0) + save_split(split_vals, saved_dir, key, i, factor) + + else: + print(f"[WARNING] {key} not handled by converter") diff --git a/examples/internlm/hf_internlm_convert.py b/examples/internlm/hf_internlm_convert.py new file mode 100644 index 0000000000..23f80c3102 --- /dev/null +++ b/examples/internlm/hf_internlm_convert.py @@ -0,0 +1,368 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Convert huggingface GPT model. Use https://huggingface.co/gpt2 as demo. +''' +import argparse +import configparser +import os +from pathlib import Path + +import torch +import torch.multiprocessing as multiprocessing +from convert import split_and_save_weight, str_to_np_dtype +from smoothquant import (capture_activation_range, smooth_gemm, + smooth_gemm_fc1_gate) +from tqdm import tqdm +from transformers import AutoModelForCausalLM, AutoTokenizer + + +def merge_qkv_scales(q_name, hf_model, scales, internlm_qkv_para): + layer_name_q = q_name.replace(".weight", "") + layer_name_k = layer_name_q.replace("q_proj", "k_proj") + layer_name_v = layer_name_q.replace("q_proj", "v_proj") + layer_name_qkv = layer_name_q.replace("q_proj", "qkv_proj") + + q = hf_model.state_dict()[layer_name_q + ".weight"] + k = hf_model.state_dict()[layer_name_k + ".weight"] + v = hf_model.state_dict()[layer_name_v + ".weight"] + + weight = torch.cat([q, k, v], dim=0) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + print(scales[layer_name_q]) + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + internlm_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + +def merge_qkv_bias(q_name, hf_model, internlm_qkv_para={}): + layer_name_q = q_name.replace(".bias", "") + layer_name_k = layer_name_q.replace("q_proj", "k_proj") + layer_name_v = layer_name_q.replace("q_proj", "v_proj") + # layer_name_qkv = layer_name_q.replace("q_proj", "qkv_proj") + + q = hf_model.state_dict()[layer_name_q + ".bias"] + k = hf_model.state_dict()[layer_name_k + ".bias"] + v = hf_model.state_dict()[layer_name_v + ".bias"] + + bias = torch.cat([q, k, v], dim=0) + + return bias + + +@torch.no_grad() +def smooth_internlm_model(model, scales, alpha, internlm_qkv_para, + internlm_smoother): + # Smooth the activation and weights with smoother = $\diag{s}$ + for name, module in model.named_modules(): + if not module.__class__.__name__ == "InternLMDecoderLayer": + continue + # qkv_proj + layer_name_q = name + ".self_attn.q_proj" + layer_name_k = name + ".self_attn.k_proj" + layer_name_v = name + ".self_attn.v_proj" + layer_name_qkv = name + ".self_attn.qkv_proj" + + weight = torch.cat([ + module.self_attn.q_proj.weight, module.self_attn.k_proj.weight, + module.self_attn.v_proj.weight + ], + dim=0) + + smoother = smooth_gemm(weight, scales[layer_name_q]["x"], + module.input_layernorm.weight, None, alpha) + + scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] / smoother + scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] + scales[layer_name_qkv]["y"] = torch.cat([ + scales[layer_name_q]["y"], scales[layer_name_k]["y"], + scales[layer_name_v]["y"] + ], + dim=0) + + # see transpose_weights function + internlm_qkv_para[layer_name_qkv] = weight.transpose(0, 1) + + # ================================================================= + layer_name = name + ".self_attn.o_proj" + smoother = smooth_gemm(module.self_attn.o_proj.weight, + scales[layer_name]["x"], None, None, alpha) + 
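+        # Background note (added; see smoothquant.py): smooth_gemm returns a
+        # per-channel smoother s = amax(x)**alpha / amax(w)**(1 - alpha).
+        # The activation is conceptually divided by s and the weight is
+        # multiplied by s, which keeps x @ W unchanged while migrating
+        # activation outliers into the weights so that int8 quantization of
+        # the activations loses less accuracy.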
internlm_smoother[layer_name] = smoother.float() + + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.self_attn.o_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + fc1_layer_name = name + ".mlp.gate_proj" + gate_layer_name = name + ".mlp.up_proj" + + smoother = smooth_gemm_fc1_gate(module.mlp.gate_proj.weight, + module.mlp.up_proj.weight, + scales[fc1_layer_name]["x"], + module.post_attention_layernorm.weight, + None, alpha) + + scales[fc1_layer_name]["x"] = scales[fc1_layer_name]["x"] / smoother + scales[fc1_layer_name]["w"] = module.mlp.gate_proj.weight.abs().max( + dim=1)[0] + + scales[gate_layer_name]["x"] = scales[gate_layer_name]["x"] / smoother + scales[gate_layer_name]["w"] = module.mlp.up_proj.weight.abs().max( + dim=1)[0] + + # ================================================================== + layer_name = name + ".mlp.down_proj" + smoother = smooth_gemm(module.mlp.down_proj.weight, + scales[layer_name]["x"], None, None, alpha) + internlm_smoother[layer_name] = smoother.float() + scales[layer_name]["x"] = scales[layer_name]["x"] / smoother + scales[layer_name]["w"] = module.mlp.down_proj.weight.abs().max( + dim=1)[0] + + +def gpt_to_ft_name(orig_name): + global_ft_weights = { + "model.embed_tokens.weight": 'vocab_embedding.weight', + "model.norm.weight": 'ln_f.weight', + "lm_head.weight": 'lm_head.weight', + } + + if orig_name in global_ft_weights: + return global_ft_weights[orig_name] + + _, _, layer_id, *weight_name = orig_name.split(".") + + layer_id = int(layer_id) + weight_name = ".".join(weight_name) + + if weight_name == 'self_attn.q_proj.weight': + return f"layers.{layer_id}.attention.query_key_value.weight" + elif weight_name == 'self_attn.k_proj.weight' or weight_name == 'self_attn.v_proj.weight': + return f"layers.{layer_id}.attention.kv.weight" + if weight_name == 'self_attn.q_proj.bias': + return f"layers.{layer_id}.attention.query_key_value.bias" + elif weight_name == 'self_attn.k_proj.bias' or weight_name == 'self_attn.v_proj.bias': + return f"layers.{layer_id}.attention.kv.bias" + + per_layer_weights = { + "input_layernorm.weight": "input_layernorm.weight", + "self_attn.o_proj.weight": "attention.dense.weight", + "self_attn.o_proj.bias": "attention.dense.bias", + "mlp.gate_proj.weight": "mlp.fc.weight", + "mlp.down_proj.weight": "mlp.proj.weight", + "mlp.up_proj.weight": "mlp.gate.weight", + "post_attention_layernorm.weight": "post_layernorm.weight", + } + + return f"layers.{layer_id}.{per_layer_weights[weight_name]}" + + +# LLaMA uses nn.Linear for these following ops whose weight matrix is transposed compared to gpt2. +# In order to use the preprocess codes of gpt2, we transpose them firstly. 
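+# Illustrative example (added): an InternLM/LLaMA-style o_proj stores its weight
+# as [out_features, in_features]; transposing it to [in_features, out_features]
+# matches the GPT-2/FT layout that split_and_save_weight expects, so the same
+# row/column splitting code can be reused unchanged.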
+def transpose_weights(hf_name, param): + weight_to_transpose = ["o_proj", "gate_proj", "down_proj", "up_proj"] + if any([k in hf_name for k in weight_to_transpose]): + if len(param.shape) == 2: + param = param.transpose(0, 1) + return param + + +def hf_gpt_converter(args): + infer_tp = args.tensor_parallelism + saved_dir = Path(args.out_dir) / f"{infer_tp}-gpu" + saved_dir.mkdir(parents=True, exist_ok=True) + + model = AutoModelForCausalLM.from_pretrained(args.in_file, + device_map="auto", + trust_remote_code=True) + + act_range = {} + internlm_qkv_para = {} + # smoother for inputs of self_attn.o_proj and mlp.down_proj + internlm_smoother = {} + + if args.smoothquant is not None or args.calibrate_kv_cache: + os.environ["TOKENIZERS_PARALLELISM"] = os.environ.get( + "TOKENIZERS_PARALLELISM", "false") + act_range = capture_activation_range( + model, + AutoTokenizer.from_pretrained(args.in_file, + padding_side='left', + trust_remote_code=True)) + if args.smoothquant is not None: + smooth_internlm_model(model, act_range, args.smoothquant, + internlm_qkv_para, internlm_smoother) + + config = configparser.ConfigParser() + config["internlm"] = {} + for key in vars(args): + config["internlm"][key] = f"{vars(args)[key]}" + for k, v in vars(model.config).items(): + config["internlm"][k] = f"{v}" + config["internlm"]["weight_data_type"] = args.storage_type + config["internlm"]["multi_query_mode"] = str(args.multi_query_mode) + with open(saved_dir / "config.ini", 'w') as configfile: + config.write(configfile) + + storage_type = str_to_np_dtype(args.storage_type) + + global_ft_weights = [ + 'vocab_embedding.weight', 'ln_f.weight', 'lm_head.weight' + ] + + int8_outputs = None + if args.calibrate_kv_cache: + int8_outputs = "kv_cache_only" + if args.smoothquant is not None: + int8_outputs = "all" + + starmap_args = [] + for name, param in model.named_parameters(): + if "weight" not in name and "bias" not in name: + continue + ft_name = gpt_to_ft_name(name) + + if name.replace(".weight", "") in internlm_smoother.keys(): + smoother = internlm_smoother[name.replace(".weight", "")] + smoother = smoother.detach().cpu().numpy() + starmap_args.append( + (0, saved_dir, infer_tp, + f"{ft_name}.smoother".replace(".weight", ""), smoother, None, { + "int8_outputs": int8_outputs, + "multi_query_mode": args.multi_query_mode, + "local_dim": None, + })) + + param = transpose_weights(name, param) + + param = param.detach().cpu().numpy().astype(storage_type) + + if ft_name in global_ft_weights: + param.tofile(saved_dir / f"{ft_name}.bin") + elif ft_name.split('.')[-2:] == ['query_key_value', 'bias']: + param = merge_qkv_bias(name, model) + param = param.cpu().numpy().astype(storage_type) + bias = (0, saved_dir, infer_tp, ft_name, param, None, { + "int8_outputs": int8_outputs, + "multi_query_mode": args.multi_query_mode, + "local_dim": None + }) + starmap_args.append(bias) + elif ft_name.split('.')[-2:] == ['query_key_value', 'weight']: + # Is there other ways to get local_dim? 
local_dim = hidden_size in internlm + local_dim = model.config.hidden_size if args.multi_query_mode else None + if args.smoothquant is None: + merge_qkv_scales(name, model, act_range, internlm_qkv_para) + qkv = (0, saved_dir, infer_tp, ft_name, + internlm_qkv_para.get( + name.replace(".weight", "").replace( + ".q_proj", + ".qkv_proj")).cpu().numpy().astype(storage_type), + act_range.get( + name.replace(".weight", + "").replace(".q_proj", ".qkv_proj")), { + "int8_outputs": int8_outputs, + "multi_query_mode": + args.multi_query_mode, + "local_dim": local_dim, + }) + starmap_args.append(qkv) + elif ft_name.split('.')[-2] == 'kv': + continue + else: + starmap_args.append((0, saved_dir, infer_tp, ft_name, param, + act_range.get(name.replace(".weight", "")), { + "int8_outputs": int8_outputs, + "multi_query_mode": args.multi_query_mode, + "local_dim": None, + })) + + starmap_args = tqdm(starmap_args, desc="saving weights") + if args.processes > 1: + with multiprocessing.Pool(args.processes) as pool: + pool.starmap(split_and_save_weight, starmap_args) + else: + # simpler for debug situations + for starmap_arg in starmap_args: + split_and_save_weight(*starmap_arg) + + +if __name__ == "__main__": + torch.multiprocessing.set_start_method("spawn") + + parser = argparse.ArgumentParser( + formatter_class=argparse.RawTextHelpFormatter) + parser.add_argument('--out-dir', + '-o', + type=str, + help='file name of output directory', + required=True) + parser.add_argument('--in-file', + '-i', + type=str, + help='file name of input checkpoint file', + required=True) + parser.add_argument('--tensor-parallelism', + '-tp', + type=int, + help='Requested tensor parallelism for inference', + default=1) + parser.add_argument( + "--processes", + "-p", + type=int, + help="How many processes to spawn for conversion (default: 8)", + default=8) + parser.add_argument( + "--calibrate-kv-cache", + "-kv", + action="store_true", + help= + "Generate scaling factors for KV cache. Used for storing KV cache in int8." + ) + parser.add_argument( + "--smoothquant", + "-sq", + type=float, + default=None, + help="Set the α parameter (see https://arxiv.org/pdf/2211.10438.pdf)" + " to Smoothquant the model, and output int8 weights." + " A good first try is 0.5. Must be in [0, 1]") + parser.add_argument("--storage-type", + "-t", + type=str, + default="fp32", + choices=["fp32", "fp16"]) + parser.add_argument("--multi-query-mode", + action="store_true", + help="Use multi-query-attention.") + + args = parser.parse_args() + print("\n=============== Argument ===============") + for key in vars(args): + print("{}: {}".format(key, vars(args)[key])) + print("========================================") + + assert (args.calibrate_kv_cache or args.smoothquant), \ + "Either INT8 kv cache or SmoothQuant must be enabled for this script. Otherwise you can directly build engines from HuggingFace checkpoints, no need to do this FT-format conversion. " + + hf_gpt_converter(args) diff --git a/examples/internlm/quantize.py b/examples/internlm/quantize.py new file mode 100644 index 0000000000..3dc088ee92 --- /dev/null +++ b/examples/internlm/quantize.py @@ -0,0 +1,135 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Adapted from examples/quantization/hf_ptq.py +""" + +import argparse +import random + +import numpy as np +import torch +from datasets import load_dataset +from torch.utils.data import DataLoader +from transformers import AutoModelForCausalLM, AutoTokenizer + +from tensorrt_llm._utils import str_dtype_to_torch +from tensorrt_llm.logger import logger +from tensorrt_llm.models.quantized.ammo import quantize_and_export + + +def get_calib_dataloader(data="cnn_dailymail", + tokenizer=None, + batch_size=1, + calib_size=512, + block_size=512): + print("Loading calibration dataset") + if data == "pileval": + dataset = load_dataset( + "json", + data_files="https://the-eye.eu/public/AI/pile/val.jsonl.zst", + split="train") + dataset = dataset["text"][:calib_size] + elif data == "cnn_dailymail": + dataset = load_dataset("cnn_dailymail", name="3.0.0", split="train") + dataset = dataset["article"][:calib_size] + else: + raise NotImplementedError + + batch_encoded = tokenizer.batch_encode_plus(dataset, + return_tensors="pt", + padding=True, + max_length=block_size) + batch_encoded = batch_encoded["input_ids"] + batch_encoded = batch_encoded.cuda() + + calib_dataloader = DataLoader(batch_encoded, + batch_size=batch_size, + shuffle=False) + + return calib_dataloader + + +def get_tokenizer(ckpt_path, **kwargs): + logger.info(f"Loading tokenizer from {ckpt_path}") + tokenizer = AutoTokenizer.from_pretrained(ckpt_path, + padding_side="left", + **kwargs) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + return tokenizer + + +def get_model(ckpt_path, dtype="float16"): + logger.info(f"Loading model from {ckpt_path}") + torch_dtype = str_dtype_to_torch(dtype) + model = AutoModelForCausalLM.from_pretrained( + ckpt_path, + device_map="auto", + trust_remote_code=True, + torch_dtype=torch_dtype, + ) + model.eval() + model = model.to(memory_format=torch.channels_last) + return model + + +def get_args(): + parser = argparse.ArgumentParser(description=__doc__) + parser.add_argument("--model_dir", + type=str, + required=True, + help="Directory of a HF model checkpoint") + parser.add_argument("--dtype", help="Model data type.", default="float16") + parser.add_argument( + "--qformat", + type=str, + choices=['fp8', 'int4_awq'], + default='fp8', + help='Quantization format. Currently only fp8 is supported. ' + 'For int8 smoothquant, use smoothquant.py instead. 
') + parser.add_argument("--calib_size", + type=int, + default=512, + help="Number of samples for calibration.") + parser.add_argument("--export_path", default="exported_model") + parser.add_argument('--seed', type=int, default=None, help='Random seed') + args = parser.parse_args() + return args + + +def main(): + if not torch.cuda.is_available(): + raise EnvironmentError("GPU is required for inference.") + + args = get_args() + + if args.seed is not None: + random.seed(args.seed) + np.random.seed(args.seed) + + tokenizer = get_tokenizer(args.model_dir) + model = get_model(args.model_dir, args.dtype) + + calib_dataloader = get_calib_dataloader(tokenizer=tokenizer, + calib_size=args.calib_size) + model = quantize_and_export(model, + qformat=args.qformat, + calib_dataloader=calib_dataloader, + export_path=args.export_path) + + +if __name__ == "__main__": + main() diff --git a/examples/internlm/requirements.txt b/examples/internlm/requirements.txt new file mode 100644 index 0000000000..926de5f086 --- /dev/null +++ b/examples/internlm/requirements.txt @@ -0,0 +1,3 @@ +datasets==2.14.5 +rouge_score~=0.1.2 +sentencepiece~=0.1.99 diff --git a/examples/internlm/run.py b/examples/internlm/run.py new file mode 100644 index 0000000000..1976027d37 --- /dev/null +++ b/examples/internlm/run.py @@ -0,0 +1,275 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
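+# Example invocation (added; paths and directory names are placeholders, adjust
+# to your own setup):
+#   python run.py --max_output_len 160 \
+#       --tokenizer_dir ./internlm-chat-7b \
+#       --engine_dir ./internlm_checkpoints/trt_engines/fp16/1-gpu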
+import argparse +import csv +import json +from pathlib import Path + +import numpy as np +import torch +from transformers import AutoTokenizer + +import tensorrt_llm +from tensorrt_llm.quantization import QuantMode +from tensorrt_llm.runtime import ModelConfig, SamplingConfig + +from build import get_engine_name # isort:skip + +EOS_TOKEN = 2 +PAD_TOKEN = 2 + + +def throttle_generator(generator, stream_interval): + for i, out in enumerate(generator): + if not i % stream_interval: + yield out + + if i % stream_interval: + yield out + + +def read_config(config_path: Path): + with open(config_path, 'r') as f: + config = json.load(f) + use_gpt_attention_plugin = config['plugin_config']['gpt_attention_plugin'] + remove_input_padding = config['plugin_config']['remove_input_padding'] + dtype = config['builder_config']['precision'] + tp_size = config['builder_config']['tensor_parallel'] + pp_size = config['builder_config']['pipeline_parallel'] + world_size = tp_size * pp_size + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + num_heads = config['builder_config']['num_heads'] // tp_size + hidden_size = config['builder_config']['hidden_size'] // tp_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + quant_mode = QuantMode(config['builder_config']['quant_mode']) + if config['builder_config'].get('multi_query_mode', False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." 
+ ) + num_kv_heads = 1 + num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + False) + + model_config = ModelConfig(num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode, + use_custom_all_reduce=use_custom_all_reduce) + + return model_config, tp_size, pp_size, dtype + + +def parse_input(input_text: str, input_file: str, tokenizer, end_id: int, + remove_input_padding: bool): + input_tokens = [] + if input_file is None: + input_text = f'<|User|>:{input_text}\n<|Bot|>:' + input_tokens.append( + tokenizer.encode(input_text, add_special_tokens=False)) + print(f'Input: \"{input_text}\"') + print(f'Input: {input_tokens[0]}') + else: + if input_file.endswith('.csv'): + with open(input_file, 'r') as csv_file: + csv_reader = csv.reader(csv_file, delimiter=',') + for line in csv_reader: + input_tokens.append(np.array(line, dtype='int32')) + elif input_file.endswith('.npy'): + inputs = np.load(input_file) + for row in inputs: + row = row[row != end_id] + input_tokens.append(row) + else: + print('Input file format not supported.') + raise SystemExit + + input_ids = None + input_lengths = torch.tensor([len(x) for x in input_tokens], + dtype=torch.int32, + device='cuda') + if remove_input_padding: + input_ids = np.concatenate(input_tokens) + input_ids = torch.tensor(input_ids, dtype=torch.int32, + device='cuda').unsqueeze(0) + else: + input_ids = torch.nested.to_padded_tensor( + torch.nested.nested_tensor(input_tokens, dtype=torch.int32), + end_id).cuda() + + return input_ids, input_lengths + + +def print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy): + num_beams = output_ids.size(1) + if output_csv is None and output_npy is None: + for b in range(input_lengths.size(0)): + inputs = output_ids[b][0][:input_lengths[b]].tolist() + input_text = tokenizer.decode(inputs) + print(f'Input: \"{input_text}\"') + for beam in range(num_beams): + output_begin = input_lengths[b] + output_end = input_lengths[b] + max_output_len + outputs = output_ids[b][beam][output_begin:output_end].tolist() + output_text = tokenizer.decode(outputs) + print(f'Output ids: {outputs}') + print(f'Output: \"{output_text}\"') + + output_ids = output_ids.reshape((-1, output_ids.size(2))) + + if output_csv is not None: + output_file = Path(output_csv) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = output_ids.tolist() + with open(output_file, 'w') as csv_file: + writer = csv.writer(csv_file, delimiter=',') + writer.writerows(outputs) + + if output_npy is not None: + output_file = Path(output_npy) + output_file.parent.mkdir(exist_ok=True, parents=True) + outputs = np.array(output_ids.cpu().contiguous(), dtype='int32') + np.save(output_file, outputs) + + +def parse_arguments(): + parser = argparse.ArgumentParser() + parser.add_argument('--max_output_len', type=int, required=True) + parser.add_argument('--log_level', type=str, default='error') + parser.add_argument('--engine_dir', type=str, default='internlm_outputs') + parser.add_argument('--tokenizer_dir', + type=str, + default=".", + help="Directory containing the tokenizer.model.") + parser.add_argument('--input_text', + type=str, + default='Tell me about yourself.') + 
parser.add_argument( + '--input_tokens', + dest='input_file', + type=str, + help= + 'CSV or Numpy file containing tokenized input. Alternative to text input.', + default=None) + parser.add_argument('--output_csv', + type=str, + help='CSV file where the tokenized output is stored.', + default=None) + parser.add_argument('--output_npy', + type=str, + help='Numpy file where the tokenized output is stored.', + default=None) + parser.add_argument('--num_beams', + type=int, + help="Use beam search if num_beams >1", + default=1) + parser.add_argument('--streaming', default=False, action='store_true') + parser.add_argument('--streaming_interval', + type=int, + help="How often to return tokens when streaming.", + default=5) + return parser.parse_args() + + +def generate( + max_output_len: int, + log_level: str = 'error', + engine_dir: str = 'internlm_outputs', + input_text: str = 'Tell me about yourself.', + input_file: str = None, + output_csv: str = None, + output_npy: str = None, + tokenizer_dir: str = None, + num_beams: int = 1, + streaming: bool = False, + streaming_interval: int = 5, +): + tensorrt_llm.logger.set_level(log_level) + + engine_dir = Path(engine_dir) + config_path = engine_dir / 'config.json' + model_config, tp_size, pp_size, dtype = read_config(config_path) + world_size = tp_size * pp_size + + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, + runtime_rank, + tp_size=tp_size, + pp_size=pp_size) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + tokenizer = AutoTokenizer.from_pretrained(tokenizer_dir, + legacy=False, + trust_remote_code=True) + + sampling_config = SamplingConfig(end_id=EOS_TOKEN, + pad_id=PAD_TOKEN, + num_beams=num_beams) + + engine_name = get_engine_name('internlm', dtype, tp_size, pp_size, + runtime_rank) + serialize_path = engine_dir / engine_name + with open(serialize_path, 'rb') as f: + engine_buffer = f.read() + decoder = tensorrt_llm.runtime.GenerationSession(model_config, + engine_buffer, + runtime_mapping, + debug_mode=False, + debug_tensors_to_save=None) + if runtime_rank == 0: + print(f"Running the {dtype} engine ...") + + input_ids, input_lengths = parse_input(input_text, input_file, tokenizer, + EOS_TOKEN, + model_config.remove_input_padding) + + max_input_length = torch.max(input_lengths).item() + decoder.setup(input_lengths.size(0), max_input_length, max_output_len, + num_beams) + + output_gen_ids = decoder.decode(input_ids, + input_lengths, + sampling_config, + streaming=streaming) + torch.cuda.synchronize() + if streaming: + for output_ids in throttle_generator(output_gen_ids, + streaming_interval): + if runtime_rank == 0: + print_output(output_ids, input_lengths, max_output_len, + tokenizer, output_csv, output_npy) + else: + output_ids = output_gen_ids + if runtime_rank == 0: + print_output(output_ids, input_lengths, max_output_len, tokenizer, + output_csv, output_npy) + + +if __name__ == '__main__': + args = parse_arguments() + generate(**vars(args)) diff --git a/examples/internlm/smoothquant.py b/examples/internlm/smoothquant.py new file mode 100644 index 0000000000..4e4145cb4e --- /dev/null +++ b/examples/internlm/smoothquant.py @@ -0,0 +1,204 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +''' +Utilities for SmoothQuant models +''' + +import copy +import functools +from collections import defaultdict + +import torch +import torch.nn as nn +from tqdm import tqdm +from transformers.pytorch_utils import Conv1D + + +@torch.no_grad() +def apply_smoothing(scales, + gemm_weights, + layernorm_weights=None, + layernorm_bias=None, + dtype=torch.float32, + layernorm_1p=False): + if not isinstance(gemm_weights, list): + gemm_weights = [gemm_weights] + + if layernorm_weights is not None: + assert layernorm_weights.numel() == scales.numel() + layernorm_weights.div_(scales).to(dtype) + if layernorm_bias is not None: + assert layernorm_bias.numel() == scales.numel() + layernorm_bias.div_(scales).to(dtype) + if layernorm_1p: + layernorm_weights += (1 / scales) - 1 + + for gemm in gemm_weights: + gemm.mul_(scales.view(1, -1)).to(dtype) + + +@torch.no_grad() +def smooth_gemm(gemm_weights, + act_scales, + layernorm_weights=None, + layernorm_bias=None, + alpha=0.5, + weight_scales=None): + if not isinstance(gemm_weights, list): + gemm_weights = [gemm_weights] + orig_dtype = gemm_weights[0].dtype + + for gemm in gemm_weights: + # gemm_weights are expected to be transposed + assert gemm.shape[1] == act_scales.numel() + + if weight_scales is None: + weight_scales = torch.cat( + [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights], + dim=0) + weight_scales = weight_scales.max(dim=0)[0] + weight_scales.to(float).clamp(min=1e-5) + scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5) + + apply_smoothing(scales, gemm_weights, layernorm_weights, layernorm_bias, + orig_dtype) + + return scales + + +@torch.no_grad() +def smooth_gemm_fc1_gate(fc1_weights, + gate_weights, + act_scales, + layernorm_weights=None, + layernorm_bias=None, + alpha=0.5, + weight_scales=None): + gemm_weights = [] + if not isinstance(fc1_weights, list): + fc1_weights = [fc1_weights] + if not isinstance(gate_weights, list): + gate_weights = [gate_weights] + + for i in range(len(fc1_weights)): + gemm_weight = torch.cat([fc1_weights[i], gate_weights[i]], dim=0) + gemm_weights.append(gemm_weight) + + orig_dtype = gemm_weights[0].dtype + + for gemm in gemm_weights: + # gemm_weights are expected to be transposed + assert gemm.shape[1] == act_scales.numel() + + if weight_scales is None: + weight_scales = torch.cat( + [gemm.abs().max(dim=0, keepdim=True)[0] for gemm in gemm_weights], + dim=0) + weight_scales = weight_scales.max(dim=0)[0] + weight_scales.to(float).clamp(min=1e-5) + scales = (act_scales.to(gemm_weights[0].device).to(float).pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5) + + apply_smoothing(scales, fc1_weights + gate_weights, layernorm_weights, + layernorm_bias, orig_dtype) + + return scales + + +@torch.no_grad() +def smooth_ln_fcs(ln, fcs, act_scales, alpha=0.5): + if not isinstance(fcs, list): + fcs = [fcs] + for fc in fcs: + assert isinstance(fc, nn.Linear) + assert ln.weight.numel() == fc.in_features == act_scales.numel() + + device, dtype = fcs[0].weight.device, fcs[0].weight.dtype + act_scales = 
act_scales.to(device=device, dtype=dtype) + weight_scales = torch.cat( + [fc.weight.abs().max(dim=0, keepdim=True)[0] for fc in fcs], dim=0) + weight_scales = weight_scales.max(dim=0)[0].clamp(min=1e-5) + + scales = (act_scales.pow(alpha) / + weight_scales.pow(1 - alpha)).clamp(min=1e-5).to(device).to(dtype) + + if ln is not None: + ln.weight.div_(scales) + ln.bias.div_(scales) + + for fc in fcs: + fc.weight.mul_(scales.view(1, -1)) + return scales + + +@torch.no_grad() +def capture_activation_range(model, tokenizer, num_samples=512, seq_len=512): + model.eval() + next(model.parameters()).device + act_scales = defaultdict(lambda: {"x": None, "y": None, "w": None}) + + test_token_num = 923 + tokenizer.pad_token = tokenizer.eos_token + + def stat_tensor(name, tensor, act_scales, key): + hidden_dim = tensor.shape[-1] + tensor = tensor.view(-1, hidden_dim).abs().detach() + comming_max = torch.max(tensor, dim=0)[0].float() + + if act_scales[name][key] is None: + act_scales[name][key] = comming_max + else: + act_scales[name][key] = torch.max(act_scales[name][key], + comming_max) + + def stat_input_hook(m, x, y, name): + if isinstance(x, tuple): + x = x[0] + stat_tensor(name, x, act_scales, "x") + stat_tensor(name, y, act_scales, "y") + + if act_scales[name]["w"] is None: + act_scales[name]["w"] = m.weight.abs().clip(1e-8, + None).max(dim=1)[0] + + hooks = [] + for name, m in model.named_modules(): + if isinstance(m, nn.Linear) or isinstance(m, Conv1D): + hooks.append( + m.register_forward_hook( + functools.partial(stat_input_hook, name=name))) + + from datasets import load_dataset + dataset_cnn = load_dataset("ccdv/cnn_dailymail", '3.0.0') + + for i in tqdm(range(num_samples), desc="calibrating model"): + datapoint = dataset_cnn['train'][i:i + 1] + line = copy.copy(datapoint['article']) + line[0] = line[0] + ' TL;DR: ' + line[0] = line[0].strip() + line[0] = line[0].replace(" n't", "n't") + line_encoded = tokenizer(line, + return_tensors="pt", + padding=True, + truncation=True)["input_ids"].type(torch.int64) + line_encoded = line_encoded[:, -test_token_num:] + line_encoded = line_encoded.cuda() + model(line_encoded) + + for h in hooks: + h.remove() + + return act_scales diff --git a/examples/internlm/summarize.py b/examples/internlm/summarize.py new file mode 100644 index 0000000000..2199014aaf --- /dev/null +++ b/examples/internlm/summarize.py @@ -0,0 +1,414 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
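+# Example invocation (added; paths are placeholders, adjust to your own setup):
+#   python summarize.py --test_trt_llm --check_accuracy \
+#       --hf_model_location ./internlm-chat-7b \
+#       --engine_dir ./internlm_checkpoints/trt_engines/fp16/1-gpu \
+#       --batch_size 1 --max_ite 20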
+import argparse +import copy +import json +import os + +import numpy as np +import torch +from datasets import load_dataset, load_metric +from transformers import AutoModelForCausalLM, AutoTokenizer + +import tensorrt_llm +import tensorrt_llm.profiler as profiler +from tensorrt_llm.logger import logger +from tensorrt_llm.quantization import QuantMode + +from build import get_engine_name # isort:skip + + +def TRTInternLM(args, config): + dtype = config['builder_config']['precision'] + tp_size = config['builder_config']['tensor_parallel'] + pp_size = config['builder_config']['pipeline_parallel'] + world_size = tp_size * pp_size + + assert world_size == tensorrt_llm.mpi_world_size(), \ + f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' + + num_heads = config['builder_config']['num_heads'] // tp_size + hidden_size = config['builder_config']['hidden_size'] // tp_size + vocab_size = config['builder_config']['vocab_size'] + num_layers = config['builder_config']['num_layers'] + use_gpt_attention_plugin = bool( + config['plugin_config']['gpt_attention_plugin']) + remove_input_padding = config['plugin_config']['remove_input_padding'] + num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) + paged_kv_cache = config['plugin_config']['paged_kv_cache'] + tokens_per_block = config['plugin_config']['tokens_per_block'] + use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', + False) + + quant_mode = QuantMode(config['builder_config']['quant_mode']) + if config['builder_config'].get('multi_query_mode', False): + tensorrt_llm.logger.warning( + "`multi_query_mode` config is deprecated. Please rebuild the engine." + ) + num_kv_heads = 1 + num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + + model_config = tensorrt_llm.runtime.ModelConfig( + vocab_size=vocab_size, + num_layers=num_layers, + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + gpt_attention_plugin=use_gpt_attention_plugin, + remove_input_padding=remove_input_padding, + use_custom_all_reduce=use_custom_all_reduce, + dtype=dtype, + quant_mode=quant_mode) + + runtime_rank = tensorrt_llm.mpi_rank() + runtime_mapping = tensorrt_llm.Mapping(world_size, + runtime_rank, + tp_size=tp_size, + pp_size=pp_size) + torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) + + engine_name = get_engine_name('internlm', dtype, tp_size, pp_size, + runtime_rank) + serialize_path = os.path.join(args.engine_dir, engine_name) + + tensorrt_llm.logger.set_level(args.log_level) + + profiler.start('load tensorrt_llm engine') + with open(serialize_path, 'rb') as f: + engine_buffer = f.read() + decoder = tensorrt_llm.runtime.GenerationSession(model_config, + engine_buffer, + runtime_mapping) + profiler.stop('load tensorrt_llm engine') + tensorrt_llm.logger.info( + f'Load engine takes: {profiler.elapsed_time_in_sec("load tensorrt_llm engine")} sec' + ) + return decoder + + +def main(args): + runtime_rank = tensorrt_llm.mpi_rank() + logger.set_level(args.log_level) + + test_hf = args.test_hf and runtime_rank == 0 # only run hf on rank 0 + test_trt_llm = args.test_trt_llm + hf_model_location = args.hf_model_location + profiler.start('load tokenizer') + tokenizer = AutoTokenizer.from_pretrained(hf_model_location, + legacy=False, + padding_side='left', + trust_remote_code=True) + profiler.stop('load tokenizer') + tensorrt_llm.logger.info( + f'Load tokenizer takes: 
{profiler.elapsed_time_in_sec("load tokenizer")} sec' + ) + tokenizer.pad_token = tokenizer.eos_token + + dataset_cnn = load_dataset("ccdv/cnn_dailymail", + '3.0.0', + cache_dir=args.dataset_path) + + max_batch_size = args.batch_size + + # runtime parameters + # repetition_penalty = 1 + top_k = args.top_k + output_len = 100 + test_token_num = 923 + # top_p = 0.0 + # random_seed = 5 + temperature = 1 + num_beams = args.num_beams + + pad_id = tokenizer.encode(tokenizer.pad_token, add_special_tokens=False)[0] + end_id = tokenizer.encode(tokenizer.eos_token, add_special_tokens=False)[0] + + if test_trt_llm: + config_path = os.path.join(args.engine_dir, 'config.json') + with open(config_path, 'r') as f: + config = json.load(f) + + tensorrt_llm_internlm = TRTInternLM(args, config) + + if test_hf: + profiler.start('load HF model') + model = AutoModelForCausalLM.from_pretrained(hf_model_location, + trust_remote_code=True) + profiler.stop('load HF model') + tensorrt_llm.logger.info( + f'Load HF model takes: {profiler.elapsed_time_in_sec("load HF model")} sec' + ) + if args.data_type == 'fp16': + model.half() + elif args.data_type == 'fp32': + model = model.float() + elif args.data_type == 'bf16': + model = model.to(dtype=torch.bfloat16) + # else use dtype in hf config, which is by default + model.cuda() + + def summarize_tensorrt_llm(datapoint): + batch_size = len(datapoint['article']) + + line = copy.copy(datapoint['article']) + line_encoded = [] + input_lengths = [] + for i in range(batch_size): + line[i] = line[i] + ' TL;DR: ' + + line[i] = line[i].strip() + line[i] = line[i].replace(" n't", "n't") + + input_id = tokenizer.encode(line[i], + return_tensors='pt').type(torch.int32) + input_id = input_id[:, -test_token_num:] + + line_encoded.append(input_id) + input_lengths.append(input_id.shape[-1]) + + # do padding, should move outside the profiling to prevent the overhead + max_length = max(input_lengths) + if tensorrt_llm_internlm.remove_input_padding: + line_encoded = [ + torch.as_tensor(t, dtype=torch.int32, device='cuda') + for t in line_encoded + ] + else: + # do padding, should move outside the profiling to prevent the overhead + for i in range(batch_size): + pad_size = max_length - input_lengths[i] + + pad = torch.ones([1, pad_size]).type(torch.int32) * pad_id + line_encoded[i] = torch.cat( + [torch.tensor(line_encoded[i], dtype=torch.int32), pad], + axis=-1) + + line_encoded = torch.cat(line_encoded, axis=0).cuda() + input_lengths = torch.tensor(input_lengths, + dtype=torch.int32).cuda() + + sampling_config = tensorrt_llm.runtime.SamplingConfig( + end_id=end_id, pad_id=pad_id, top_k=top_k, num_beams=num_beams) + + with torch.no_grad(): + tensorrt_llm_internlm.setup(batch_size, + max_context_length=max_length, + max_new_tokens=output_len, + beam_width=num_beams) + + if tensorrt_llm_internlm.remove_input_padding: + output_ids = tensorrt_llm_internlm.decode_batch( + line_encoded, sampling_config) + else: + output_ids = tensorrt_llm_internlm.decode( + line_encoded, + input_lengths, + sampling_config, + ) + + torch.cuda.synchronize() + + # Extract a list of tensors of shape beam_width x output_ids. 
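+        # Added note: output_ids has shape [batch_size, num_beams, seq_len] and
+        # holds the prompt tokens followed by the generated continuation, so the
+        # slices below keep only the newly generated summary tokens per beam.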
+ if tensorrt_llm_internlm.mapping.is_first_pp_rank(): + output_beams_list = [ + tokenizer.batch_decode(output_ids[batch_idx, :, + input_lengths[batch_idx]:], + skip_special_tokens=True) + for batch_idx in range(batch_size) + ] + return output_beams_list, output_ids[:, :, max_length:].tolist() + return [], [] + + def summarize_hf(datapoint): + batch_size = len(datapoint['article']) + if batch_size > 1: + logger.warning( + f"HF does not support batch_size > 1 to verify correctness due to padding. Current batch size is {batch_size}" + ) + + line = copy.copy(datapoint['article']) + for i in range(batch_size): + line[i] = line[i] + ' TL;DR: ' + + line[i] = line[i].strip() + line[i] = line[i].replace(" n't", "n't") + + line_encoded = tokenizer(line, + return_tensors='pt', + padding=True, + truncation=True)["input_ids"].type(torch.int64) + + line_encoded = line_encoded[:, -test_token_num:] + line_encoded = line_encoded.cuda() + + with torch.no_grad(): + output = model.generate(line_encoded, + max_length=len(line_encoded[0]) + + output_len, + top_k=top_k, + temperature=temperature, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, + num_beams=num_beams, + num_return_sequences=num_beams, + early_stopping=True) + + tokens_list = output[:, len(line_encoded[0]):].tolist() + output = output.reshape([batch_size, num_beams, -1]) + output_lines_list = [ + tokenizer.batch_decode(output[:, i, len(line_encoded[0]):], + skip_special_tokens=True) + for i in range(num_beams) + ] + + return output_lines_list, tokens_list + + if test_trt_llm: + datapoint = dataset_cnn['test'][0:1] + summary, _ = summarize_tensorrt_llm(datapoint) + if runtime_rank == 0: + logger.info( + "---------------------------------------------------------") + logger.info("TensorRT-LLM Generated : ") + logger.info(f" Article : {datapoint['article']}") + logger.info(f"\n Highlights : {datapoint['highlights']}") + logger.info(f"\n Summary : {summary}") + logger.info( + "---------------------------------------------------------") + + if test_hf: + datapoint = dataset_cnn['test'][0:1] + summary, _ = summarize_hf(datapoint) + logger.info("---------------------------------------------------------") + logger.info("HF Generated : ") + logger.info(f" Article : {datapoint['article']}") + logger.info(f"\n Highlights : {datapoint['highlights']}") + logger.info(f"\n Summary : {summary}") + logger.info("---------------------------------------------------------") + + metric_tensorrt_llm = [load_metric("rouge") for _ in range(num_beams)] + metric_hf = [load_metric("rouge") for _ in range(num_beams)] + for i in range(num_beams): + metric_tensorrt_llm[i].seed = 0 + metric_hf[i].seed = 0 + + ite_count = 0 + data_point_idx = 0 + while (data_point_idx < len(dataset_cnn['test'])) and (ite_count < + args.max_ite): + if runtime_rank == 0: + logger.debug( + f"run data_point {data_point_idx} ~ {data_point_idx + max_batch_size}" + ) + datapoint = dataset_cnn['test'][data_point_idx:(data_point_idx + + max_batch_size)] + + if test_trt_llm: + profiler.start('tensorrt_llm') + summary_tensorrt_llm, tokens_tensorrt_llm = summarize_tensorrt_llm( + datapoint) + profiler.stop('tensorrt_llm') + + if test_hf: + profiler.start('hf') + summary_hf, tokens_hf = summarize_hf(datapoint) + profiler.stop('hf') + + if runtime_rank == 0: + if test_trt_llm: + for batch_idx in range(len(summary_tensorrt_llm)): + for beam_idx in range(num_beams): + metric_tensorrt_llm[beam_idx].add_batch( + predictions=[ + summary_tensorrt_llm[batch_idx][beam_idx] + ], + 
references=[datapoint['highlights'][batch_idx]]) + if test_hf: + for beam_idx in range(num_beams): + for batch_idx in range(len(summary_hf[beam_idx])): + metric_hf[beam_idx].add_batch( + predictions=[summary_hf[beam_idx][batch_idx]], + references=[datapoint['highlights'][batch_idx]]) + + logger.debug('-' * 100) + logger.debug(f"Article : {datapoint['article']}") + if test_trt_llm: + logger.debug(f'TensorRT-LLM Summary: {summary_tensorrt_llm}') + if test_hf: + logger.debug(f'HF Summary: {summary_hf}') + logger.debug(f"highlights : {datapoint['highlights']}") + + data_point_idx += max_batch_size + ite_count += 1 + + if runtime_rank == 0: + if test_trt_llm: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'TensorRT-LLM (total latency: {profiler.elapsed_time_in_sec("tensorrt_llm")} sec)' + ) + for beam_idx in range(num_beams): + logger.info(f"TensorRT-LLM beam {beam_idx} result") + computed_metrics_tensorrt_llm = metric_tensorrt_llm[ + beam_idx].compute() + for key in computed_metrics_tensorrt_llm.keys(): + logger.info( + f' {key} : {computed_metrics_tensorrt_llm[key].mid[2]*100}' + ) + + if args.check_accuracy and beam_idx == 0: + assert computed_metrics_tensorrt_llm['rouge1'].mid[ + 2] * 100 > args.tensorrt_llm_rouge1_threshold + if test_hf: + np.random.seed(0) # rouge score use sampling to compute the score + logger.info( + f'Hugging Face (total latency: {profiler.elapsed_time_in_sec("hf")} sec)' + ) + for beam_idx in range(num_beams): + logger.info(f"HF beam {beam_idx} result") + computed_metrics_hf = metric_hf[beam_idx].compute() + for key in computed_metrics_hf.keys(): + logger.info( + f' {key} : {computed_metrics_hf[key].mid[2]*100}') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--hf_model_location', + type=str, + default='internlm-7b-hf') + parser.add_argument('--test_hf', action='store_true') + parser.add_argument('--test_trt_llm', action='store_true') + parser.add_argument('--data_type', + type=str, + choices=['fp32', 'fp16', 'bf16'], + default='auto') + parser.add_argument('--dataset_path', type=str, default='') + parser.add_argument('--log_level', type=str, default='info') + parser.add_argument('--engine_dir', type=str, default='internlm_outputs') + parser.add_argument('--batch_size', type=int, default=1) + parser.add_argument('--max_ite', type=int, default=20) + parser.add_argument('--check_accuracy', action='store_true') + parser.add_argument('--tensorrt_llm_rouge1_threshold', + type=float, + default=15.0) + parser.add_argument('--num_beams', type=int, default=1) + parser.add_argument('--top_k', type=int, default=1) + + args = parser.parse_args() + + main(args) diff --git a/examples/internlm/weight.py b/examples/internlm/weight.py new file mode 100644 index 0000000000..7293962c74 --- /dev/null +++ b/examples/internlm/weight.py @@ -0,0 +1,1318 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +import configparser +import math +import time +from operator import attrgetter +from pathlib import Path +from typing import Dict, List, Optional, Union + +import numpy as np +import torch +from safetensors import safe_open + +import tensorrt_llm +import tensorrt_llm.logger as logger +from tensorrt_llm._utils import str_dtype_to_torch, torch_to_numpy +from tensorrt_llm.mapping import Mapping +from tensorrt_llm.models import InternLMForCausalLM +from tensorrt_llm.models.quantized.quant import get_dummy_quant_scales +from tensorrt_llm.quantization import QuantMode + + +def get_scaling_factors( + model_path: Union[str, Path], + num_layers: int, + quant_mode: Optional[QuantMode] = None, +) -> Optional[Dict[str, List[int]]]: + """ Get the scaling factors for InternLM model + + Returns a dictionary of scaling factors for the selected layers of the + InternLM model. + + Args: + model_path (str): Path to the quantized InternLM model + layers (list): List of layers to get the scaling factors for. If None, + all layers are selected. + + Returns: + dict: Dictionary of scaling factors for the selected layers of the + InternLM model. + + example: + + { + 'qkv_act': qkv_act_scale, + 'qkv_weights': qkv_weights_scale, + 'qkv_output' : qkv_outputs_scale, + 'dense_act': dense_act_scale, + 'dense_weights': dense_weights_scale, + 'fc_act': fc_act_scale, + 'fc_weights': fc_weights_scale, + 'gate_act': gate_act_scale, + 'gate_weights': gate_weights_scale, + 'proj_act': proj_act_scale, + 'proj_weights': proj_weights_scale, + } + """ + + if model_path is None: + logger.warning(f"--quantized_fp8_model_path not specified. " + f"Initialize quantization scales automatically.") + return get_dummy_quant_scales(num_layers) + weight_dict = np.load(model_path) + + # yapf: disable + scaling_factor = { + 'qkv_act': [], + 'qkv_weights': [], + 'qkv_output': [], + 'dense_act': [], + 'dense_weights': [], + 'fc_act': [], + 'fc_weights': [], + 'gate_act': [], + 'gate_weights': [], + 'proj_act': [], + 'proj_weights': [], + } + + for layer in range(num_layers): + scaling_factor['qkv_act'].append(max( + weight_dict[f'_np:layers:{layer}:attention:qkv:q:activation_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:k:activation_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:v:activation_scaling_factor'].item() + )) + scaling_factor['qkv_weights'].append(max( + weight_dict[f'_np:layers:{layer}:attention:qkv:q:weights_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:k:weights_scaling_factor'].item(), + weight_dict[f'_np:layers:{layer}:attention:qkv:v:weights_scaling_factor'].item() + )) + if quant_mode is not None and quant_mode.has_fp8_kv_cache(): + # Not calibrarting KV cache. 
+ scaling_factor['qkv_output'].append(1.0) + scaling_factor['dense_act'].append(weight_dict[f'_np:layers:{layer}:attention:dense:activation_scaling_factor'].item()) + scaling_factor['dense_weights'].append(weight_dict[f'_np:layers:{layer}:attention:dense:weights_scaling_factor'].item()) + scaling_factor['fc_act'].append(weight_dict[f'_np:layers:{layer}:mlp:fc:activation_scaling_factor'].item()) + scaling_factor['fc_weights'].append(weight_dict[f'_np:layers:{layer}:mlp:fc:weights_scaling_factor'].item()) + scaling_factor['gate_act'].append(weight_dict[f'_np:layers:{layer}:mlp:gate:activation_scaling_factor'].item()) + scaling_factor['gate_weights'].append(weight_dict[f'_np:layers:{layer}:mlp:gate:weights_scaling_factor'].item()) + scaling_factor['proj_act'].append(weight_dict[f'_np:layers:{layer}:mlp:proj:activation_scaling_factor'].item()) + scaling_factor['proj_weights'].append(weight_dict[f'_np:layers:{layer}:mlp:proj:weights_scaling_factor'].item()) + # yapf: enable + for k, v in scaling_factor.items(): + assert len(v) == num_layers, \ + f'Expect scaling factor {k} of length {num_layers}, got {len(v)}' + + return scaling_factor + + +def gen_suffix(rank, use_smooth_quant, quant_per_channel): + suffix = f"{rank}.bin" + if use_smooth_quant: + sq_prefix = "int8." + if quant_per_channel: + sq_prefix += "col." + suffix = sq_prefix + suffix + return suffix + + +def extract_layer_idx(name): + ss = name.split('.') + for s in ss: + if s.isdigit(): + return s + return None + + +def split(v, tp_size, idx, dim=0): + if tp_size == 1: + return v + if len(v.shape) == 1: + return np.ascontiguousarray(np.split(v, tp_size)[idx].copy()) + else: + return np.ascontiguousarray(np.split(v, tp_size, axis=dim)[idx].copy()) + + +def dup_kv_weight(v, num_head, tp_size): + assert tp_size % num_head == 0 + reps = tp_size // num_head + head_size = v.shape[0] // num_head + v = v.reshape(num_head, head_size, + -1)[:, None, :, :].expand(num_head, reps, head_size, + v.shape[1]) + return v.reshape(num_head * reps * head_size, -1).clone() + + +def parse_ft_config(ini_file): + gpt_config = configparser.ConfigParser() + gpt_config.read(ini_file) + + n_embd = gpt_config.getint('internlm', 'hidden_size') + n_head = gpt_config.getint('internlm', 'num_attention_heads') + n_layer = gpt_config.getint('internlm', 'num_hidden_layers') + n_positions = gpt_config.getint('internlm', 'max_position_embeddings') + vocab_size = gpt_config.getint('internlm', 'vocab_size') + hidden_act = gpt_config.get('internlm', 'hidden_act') + inter_size = gpt_config.getint('internlm', + 'intermediate_size', + fallback=None) + n_kv_head = gpt_config.getint('internlm', + 'num_key_value_heads', + fallback=n_head) + attn_bias = gpt_config.getboolean('internlm', 'bias', fallback=False) + + if inter_size is None: + inter_size = math.ceil(8 / 3 * n_embd / 256) * 256 + + return n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head, attn_bias + + +def load_from_hf_internlm( + tensorrt_llm_internlm: tensorrt_llm.models.InternLMForCausalLM, + hf_internlm, + mapping=Mapping(), + dtype='float32'): + tensorrt_llm.logger.info('Loading weights from HF InternLM...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + use_weight_only = quant_mode.is_weight_only() + num_kv_heads = tensorrt_llm_internlm.num_kv_heads + mha_mode = 
(num_kv_heads == tensorrt_llm_internlm.num_heads) + assert mha_mode, "All InternLM variants should be MHA mode" + + model_params = dict(hf_internlm.named_parameters()) + for l in range(hf_internlm.config.num_hidden_layers): + prefix = f'model.layers.{l}.self_attn.' + q_weight = model_params[prefix + 'q_proj.weight'] + k_weight = model_params[prefix + 'k_proj.weight'] + v_weight = model_params[prefix + 'v_proj.weight'] + + if not mha_mode: + head_size = tensorrt_llm_internlm.hidden_size // tensorrt_llm_internlm.num_heads + if num_kv_heads < mapping.tp_size: + # duplicate the KV heads up to tensor_parallel + k_weight = dup_kv_weight(k_weight, num_kv_heads, + mapping.tp_size) + v_weight = dup_kv_weight(v_weight, num_kv_heads, + mapping.tp_size) + assert (k_weight.shape[0] % (mapping.tp_size * head_size)) == 0 + assert (v_weight.shape[0] % (mapping.tp_size * head_size)) == 0 + qkv_weight = [q_weight, k_weight, v_weight] + else: + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + + model_params[prefix + 'qkv_proj.weight'] = qkv_weight + + if prefix + 'q_proj.bias' in model_params: + # only used in 7B models + # assert not mha_mode, "MHA mode not used in internlm 7B models" + q_bias = model_params[prefix + 'q_proj.bias'] + k_bias = model_params[prefix + 'k_proj.bias'] + v_bias = model_params[prefix + 'v_proj.bias'] + qkv_bias = torch.cat([q_bias, k_bias, v_bias], dim=0) + model_params[prefix + 'qkv_proj.bias'] = qkv_bias + + torch_dtype = str_dtype_to_torch(dtype) + layers_per_pipeline_stage = hf_internlm.config.num_hidden_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1)) + for k, v in model_params.items(): + if isinstance(v, list): + v = [torch_to_numpy(vv.to(torch_dtype).detach().cpu()) for vv in v] + else: + v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) + if 'model.embed_tokens.weight' in k: + if tensorrt_llm_internlm.use_parallel_embedding: + v = split(v, mapping.tp_size, mapping.tp_rank, + tensorrt_llm_internlm.embedding_sharding_dim) + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v + elif 'model.norm.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v + elif 'lm_head.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = np.ascontiguousarray( + split(v, mapping.tp_size, mapping.tp_rank)) + else: + layer_idx = extract_layer_idx(k) + if layer_idx is None or int(layer_idx) not in layers_range: + continue + idx = int(layer_idx) - mapping.pp_rank * layers_per_pipeline_stage + if idx >= tensorrt_llm_internlm.num_layers: + continue + if 'input_layernorm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].input_layernorm.weight.value = v + elif 'post_attention_layernorm.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].post_layernorm.weight + dst.value = v + elif 'self_attn.qkv_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.weight + if not mha_mode: + assert isinstance(v, list) and len(v) == 3 + wq = split(v[0], mapping.tp_size, mapping.tp_rank) + wk = split(v[1], mapping.tp_size, mapping.tp_rank) + wv = split(v[2], mapping.tp_size, mapping.tp_rank) + split_v = np.concatenate((wq, wk, wv)) + else: + q_emb = v.shape[0] // 3 + model_emb = v.shape[1] + v = v.reshape(3, q_emb, model_emb) + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + split_v = split_v.reshape(3 * (q_emb // mapping.tp_size), + model_emb) + 
if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'self_attn.qkv_proj.bias' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.bias + if not mha_mode: + assert isinstance(v, list) and len(v) == 3 + bq = split(v[0], mapping.tp_size, mapping.tp_rank) + bk = split(v[1], mapping.tp_size, mapping.tp_rank) + bv = split(v[2], mapping.tp_size, mapping.tp_rank) + split_v = np.concatenate((bq, bk, bv)) + else: + q_emb = v.shape[0] // 3 + v = v.reshape(3, q_emb) + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + split_v = split_v.reshape(3 * (q_emb // mapping.tp_size)) + dst.value = np.ascontiguousarray(split_v) + elif 'self_attn.o_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.dense.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'self_attn.o_proj.bias' in k: + dst = tensorrt_llm_internlm.layers[idx].attention.dense.bias + split_v = v # no need to divide among ranks? 
+ dst.value = np.ascontiguousarray(split_v) + elif 'mlp.up_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].mlp.gate.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=0) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].mlp.gate.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'mlp.down_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].mlp.proj.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=1) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + elif 'mlp.gate_proj.weight' in k: + dst = tensorrt_llm_internlm.layers[idx].mlp.fc.weight + split_v = split(v, mapping.tp_size, mapping.tp_rank, dim=0) + if use_weight_only: + v = np.ascontiguousarray(split_v.transpose()) + processed_torch_weights, torch_weight_scales = \ + torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(v), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + idx].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(split_v) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') + return + + +def load_from_meta_internlm( + tensorrt_llm_internlm: tensorrt_llm.models.InternLMForCausalLM, + meta_ckpt_dir, + mapping=Mapping(), + dtype="float32"): + torch_dtype = str_dtype_to_torch(dtype) + + def gather_ckpts(ckpts): + gathered = {} + for k in ckpts[0]: + d = 0 + if any([n in k for n in ["wo", "w2", "tok"]]): + d = 1 + if "norm" in k or "rope" in k: # no TP + gathered[k] = ckpts[0][k].clone() + else: + gathered[k] = torch.cat([pt[k] for pt in ckpts], dim=d).clone() + return gathered + + def split_ckpt(ckpt, ranks_per_ckpt, ckpt_rank): + split_ckpt = {} + for k in ckpt: + d = 0 + if any([n in k for n in ["wo", "w2", "tok"]]): + d = 1 + if "norm" in k or "rope" in k: # no TP + split_ckpt[k] = ckpt[k].clone() + elif tensorrt_llm_internlm.num_kv_heads < mapping.tp_size and any( + [n in k for n in ["wk", "wv"]]): + assert mapping.tp_size % tensorrt_llm_internlm.num_kv_heads == 0 + # special case: we need to duplicate KV head + tmp = dup_kv_weight(ckpt[k], tensorrt_llm_internlm.num_kv_heads, + mapping.tp_size) + split_ckpt[k] = torch.split(tmp, + tmp.shape[d] // ranks_per_ckpt, + dim=d)[ckpt_rank].clone() + else: + split_ckpt[k] = torch.split(ckpt[k], + ckpt[k].shape[d] // ranks_per_ckpt, + dim=d)[ckpt_rank].clone() + return split_ckpt + + def get_current_weights(num_ckpts): + if num_ckpts > mapping.tp_size: + # combine ckpts + assert (num_ckpts % mapping.tp_size) == 0 + nf = num_ckpts // mapping.tp_size + fs = nf * mapping.tp_rank + file_ids = list(range(fs, fs + nf)) + ckpts = [] + for f in file_ids: + ckpt = torch.load(Path(meta_ckpt_dir, + f"consolidated.{f:02d}.pth"), + map_location="cpu") + ckpts.append(ckpt) + return gather_ckpts(ckpts) + elif num_ckpts < mapping.tp_size: + # split ckpt + assert (mapping.tp_size % num_ckpts) == 0 + ranks_per_ckpt = mapping.tp_size // num_ckpts + ckpt_fid = mapping.tp_rank // ranks_per_ckpt + ckpt_rank = mapping.tp_rank % ranks_per_ckpt + nH_per_ckpt = tensorrt_llm_internlm.num_heads // num_ckpts + assert (nH_per_ckpt % ranks_per_ckpt) == 0 + ckpt = torch.load(Path(meta_ckpt_dir, + f"consolidated.{ckpt_fid:02d}.pth"), + map_location="cpu") + return split_ckpt(ckpt, ranks_per_ckpt, ckpt_rank) + + # num_ckpts == tensor_parallel, 1:1 mapping from files to TP + return torch.load(Path(meta_ckpt_dir, + f"consolidated.{mapping.tp_rank:02d}.pth"), + map_location="cpu") + + def permute(w, nH, d, dH): + # due to MQA's wk, nH*dH != d could be true + return w.view(nH, dH // 2, 2, d).transpose(1, 2).reshape(nH * dH, d) + + if not hasattr(load_from_meta_internlm, "saved_embed"): + load_from_meta_internlm.saved_embed = None + + def gather_embedding(cur_embed, name: str, num_ckpts): + if mapping.tp_size == 1: + # even if num_ckpts > 1, get_current_weights will already have it gathered + return cur_embed + if load_from_meta_internlm.saved_embed is None: + embeds = [None] * num_ckpts + for i in range(num_ckpts): + ckpt = torch.load(Path(meta_ckpt_dir, + f"consolidated.{i:02d}.pth"), + map_location="cpu") + embeds[i] = ckpt[name] + embed = torch.cat(embeds, dim=1).to(torch_dtype) + load_from_meta_internlm.saved_embed = torch_to_numpy( + embed) # cache the embedding, not needed if no refit + return load_from_meta_internlm.saved_embed + + tensorrt_llm.logger.info( + 'Loading weights from Meta InternLM checkpoints ...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + if quant_mode.is_int8_weight_only(): + torch.int8 + elif quant_mode.is_int4_weight_only(): + torch.quint4x2 + 
quant_mode.is_weight_only() + num_kv_heads = tensorrt_llm_internlm.num_kv_heads + mha_mode = (num_kv_heads == tensorrt_llm_internlm.num_heads) + + ckpts = list(Path(meta_ckpt_dir).glob("consolidated.*.pth")) + num_ckpts = len(ckpts) + # internlm/internlm2 doesn't have MQA. So, simplifying loader logic by not worrying about it. + assert num_kv_heads > 1 or num_kv_heads >= num_ckpts, \ + f"We don't know how the {num_kv_heads} KV heads are distributed among {num_ckpts} checkpoints." + + head_size = tensorrt_llm_internlm.hidden_size // tensorrt_llm_internlm.num_heads + ckpt = get_current_weights(num_ckpts) + layers_range = list( + range(mapping.pp_rank * tensorrt_llm_internlm.num_layers, + (mapping.pp_rank + 1) * tensorrt_llm_internlm.num_layers, 1)) + + for l in layers_range: + prefix = f'layers.{l}.attention.' + q_weight = permute(ckpt[prefix + 'wq.weight'].clone(), + nH=(tensorrt_llm_internlm.num_heads // + mapping.tp_size), + d=tensorrt_llm_internlm.hidden_size, + dH=head_size) + if num_kv_heads < mapping.tp_size and num_ckpts >= mapping.tp_size: + assert mapping.tp_size % num_kv_heads == 0 + assert False, "Not supported yet" + k_weight = permute(ckpt[prefix + 'wk.weight'].clone(), + nH=((num_kv_heads + mapping.tp_size - 1) // + mapping.tp_size), + d=tensorrt_llm_internlm.hidden_size, + dH=head_size) + v_weight = ckpt[prefix + 'wv.weight'].clone() + + qkv_weight = torch.cat([q_weight, k_weight, v_weight], dim=0) + ckpt[prefix + 'qkv.weight'] = qkv_weight + + for k, v in ckpt.items(): + v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) + if "tok_embeddings" in k: + if not tensorrt_llm_internlm.use_parallel_embedding: + v = gather_embedding(v, k, num_ckpts) + elif tensorrt_llm_internlm.embedding_sharding_dim == 0: + # this needs a gather and then resplit along different dims + v = gather_embedding(v, k, num_ckpts) + v = split(v, mapping.tp_size, mapping.tp_rank, 0) + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v + elif "output" in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = v + elif k == "norm.weight": + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v + else: + # layer specific weights + layer_idx = extract_layer_idx(k) + if layer_idx is None: + continue + idx = int( + layer_idx) - mapping.pp_rank * tensorrt_llm_internlm.num_layers + if idx >= tensorrt_llm_internlm.num_layers: + continue + if 'attention_norm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].input_layernorm.weight.value = v + elif 'ffn_norm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].post_layernorm.weight.value = v + elif 'feed_forward.w3.weight' in k: + tensorrt_llm_internlm.layers[idx].mlp.gate.weight.value = v + elif 'feed_forward.w2.weight' in k: + tensorrt_llm_internlm.layers[idx].mlp.proj.weight.value = v + elif 'feed_forward.w1.weight' in k: + tensorrt_llm_internlm.layers[idx].mlp.fc.weight.value = v + elif 'attention.wo.weight' in k: + tensorrt_llm_internlm.layers[ + idx].attention.dense.weight.value = v + elif 'attention.qkv.weight' in k: + tensorrt_llm_internlm.layers[idx].attention.qkv.weight.value = v + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. 
Total time: {t}') + return + + +def load_from_binary(tensorrt_llm_internlm: InternLMForCausalLM, + dir_path, + mapping=Mapping(), + fp16=False, + multi_query_mode=False): + tensorrt_llm.logger.info('Loading weights from FT...') + tik = time.time() + + quant_mode = getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + + n_embd, n_head, n_layer, n_positions, vocab_size, hidden_act, inter_size, n_kv_head, attn_bias = parse_ft_config( + Path(dir_path) / 'config.ini') + np_dtype = np.float16 if fp16 else np.float32 + + def fromfile(dir_path, name, shape=None, dtype=None): + dtype = np_dtype if dtype is None else dtype + p = dir_path + '/' + name + print(f"Loading from {str(p)}") + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + + def set_smoothquant_scale_factors(module, + pre_scale_weight, + dir_path, + basename, + shape, + per_tok_dyn, + per_channel, + is_qkv=False, + rank=None): + suffix = "bin" + if per_channel: + if rank is not None: + suffix = f"{rank}." + suffix + suffix = "col." + suffix + + col_shape = shape if (per_channel or is_qkv) else [1, 1] + + if per_tok_dyn: + if pre_scale_weight is not None: + pre_scale_weight.value = np.array([1.0], dtype=np.float32) + if is_qkv and not per_channel: + t = fromfile(dir_path, + f"{basename}scale_w_quant_orig.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, f"{basename}scale_w_quant_orig.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + else: + t = fromfile(dir_path, f"{basename}scale_x_orig_quant.bin", [1], + np.float32) + pre_scale_weight.value = t + if is_qkv: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{rank}.{suffix}", + col_shape, np.float32) + else: + t = fromfile(dir_path, + f"{basename}scale_y_accum_quant.{suffix}", + col_shape, np.float32) + module.per_channel_scale.value = t + t = fromfile(dir_path, f"{basename}scale_y_quant_orig.bin", [1, 1], + np.float32) + module.act_scale.value = t + + def set_smoother(module, dir_path, base_name, shape, rank): + suffix = f"{rank}.bin" + t = fromfile(dir_path, f"{base_name}.smoother.{suffix}", shape, + np.float32) + module.smoother.value = t + + # Determine the quantization mode. + quant_mode = getattr(tensorrt_llm_internlm, "quant_mode", QuantMode(0)) + if quant_mode.is_int8_weight_only(): + plugin_weight_only_quant_type = torch.int8 + elif quant_mode.is_int4_weight_only(): + plugin_weight_only_quant_type = torch.quint4x2 + # Do we use SmoothQuant? + use_smooth_quant = quant_mode.has_act_and_weight_quant() + # Do we use quantization per token? + quant_per_token_dyn = quant_mode.has_per_token_dynamic_scaling() + # Do we use quantization per channel? + quant_per_channel = quant_mode.has_per_channel_scaling() + + # Do we use INT4/INT8 weight-only? + use_weight_only = quant_mode.is_weight_only() + + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() + + # Debug + suffix = gen_suffix(mapping.tp_rank, use_smooth_quant, quant_per_channel) + # The type of weights. 
+ w_type = np_dtype if not use_smooth_quant else np.int8 + + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = (fromfile( + dir_path, 'vocab_embedding.weight.bin', [vocab_size, n_embd])) + + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = (fromfile( + dir_path, 'ln_f.weight.bin')) + # share input embedding + lm_head_weight = fromfile(dir_path, 'lm_head.weight.bin', + [vocab_size, n_embd]) + + if vocab_size % mapping.tp_size != 0: + # padding + vocab_size_padded = tensorrt_llm_internlm.lm_head.out_features * mapping.tp_size + pad_width = vocab_size_padded - vocab_size + lm_head_weight = np.pad(lm_head_weight, ((0, pad_width), (0, 0)), + 'constant', + constant_values=0) + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = np.ascontiguousarray( + split(lm_head_weight, mapping.tp_size, mapping.tp_rank)) + + layers_range = list( + range(mapping.pp_rank * tensorrt_llm_internlm.num_layers, + (mapping.pp_rank + 1) * tensorrt_llm_internlm.num_layers, 1)) + + for i in layers_range: + n_groups = n_head // n_kv_head + c_attn_out_dim = ( + 3 * n_embd // mapping.tp_size) if not multi_query_mode else ( + n_embd // mapping.tp_size + + (n_embd // n_head * n_groups) // mapping.tp_size * 2) + idx = i - mapping.pp_rank * tensorrt_llm_internlm.num_layers + tensorrt_llm_internlm.layers[idx].input_layernorm.weight.value = ( + fromfile(dir_path, + 'model.layers.' + str(i) + '.input_layernorm.weight.bin')) + + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.weight.' + suffix, + [n_embd, c_attn_out_dim], w_type) + if t is not None: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.weight + if use_smooth_quant: + dst.value = (np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].attention.qkv, + tensorrt_llm_internlm.layers[idx].input_layernorm. + scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.attention.query_key_value.', + [1, c_attn_out_dim], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank, + is_qkv=True) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + # workaround for trt not supporting int8 inputs in plugins currently + dst.value = processed_torch_weights.view( + dtype=torch.float32).numpy() + scales = tensorrt_llm_internlm.layers[ + i].attention.qkv.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + dst = tensorrt_llm_internlm.layers[idx].attention.dense.weight + t = fromfile( + dir_path, + 'model.layers.' + str(i) + '.attention.dense.weight.' + suffix, + [n_embd // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + dst.value = (np.ascontiguousarray(np.transpose(t, [1, 0]))) + dense_scale = getattr(tensorrt_llm_internlm.layers[idx].attention, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].attention.dense, dense_scale, + dir_path, 'model.layers.' + str(i) + '.attention.dense.', + [1, n_embd], quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_internlm.layers[idx].attention.dense, + dir_path, + 'model.layers.' 
+ str(i) + '.attention.dense', + [1, n_embd // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[ + i].attention.dense.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + dst.value = np.ascontiguousarray(np.transpose(t, [1, 0])) + + if attn_bias: + dst = tensorrt_llm_internlm.layers[idx].attention.qkv.bias + t = fromfile( + dir_path, 'model.layers.' + str(i) + + f'.attention.query_key_value.bias.{mapping.tp_rank}.bin') + dst.value = np.ascontiguousarray(t) + + dst = tensorrt_llm_internlm.layers[idx].attention.dense.bias + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.attention.dense.bias.bin') + dst.value = np.ascontiguousarray(t) + + dst = tensorrt_llm_internlm.layers[idx].post_layernorm.weight + dst.value = fromfile( + dir_path, 'model.layers.' + str(i) + '.post_layernorm.weight.bin') + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.fc.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + + if use_smooth_quant: + tensorrt_llm_internlm.layers[idx].mlp.fc.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].mlp.fc, + tensorrt_llm_internlm.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.mlp.fc.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_internlm.layers[i].mlp.fc.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[i].mlp.fc.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_internlm.layers[ + idx].mlp.fc.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.gate.weight.' + suffix, + [n_embd, inter_size // mapping.tp_size], w_type) + if use_smooth_quant: + tensorrt_llm_internlm.layers[idx].mlp.gate.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].mlp.gate, + tensorrt_llm_internlm.layers[idx].post_layernorm.scale_to_int, + dir_path, + 'model.layers.' + str(i) + '.mlp.gate.', + [1, inter_size // mapping.tp_size], + quant_per_token_dyn, + quant_per_channel, + rank=mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_internlm.layers[i].mlp.gate.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[i].mlp.gate.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_internlm.layers[ + idx].mlp.gate.weight.value = np.ascontiguousarray( + np.transpose(t, [1, 0])) + + t = fromfile(dir_path, + 'model.layers.' + str(i) + '.mlp.proj.weight.' 
+ suffix, + [inter_size // mapping.tp_size, n_embd], w_type) + if use_smooth_quant: + tensorrt_llm_internlm.layers[idx].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + proj_scale = getattr(tensorrt_llm_internlm.layers[idx].mlp, + "quantization_scaling_factor", None) + set_smoothquant_scale_factors( + tensorrt_llm_internlm.layers[idx].mlp.proj, proj_scale, + dir_path, 'model.layers.' + str(i) + '.mlp.proj.', [1, n_embd], + quant_per_token_dyn, quant_per_channel) + set_smoother(tensorrt_llm_internlm.layers[idx].mlp.proj, dir_path, + 'model.layers.' + str(i) + '.mlp.proj', + [1, inter_size // mapping.tp_size], mapping.tp_rank) + elif use_weight_only: + dst = tensorrt_llm_internlm.layers[i].mlp.proj.weight + processed_torch_weights, torch_weight_scales = torch.ops.fastertransformer.symmetric_quantize_last_axis_of_batched_matrix( + torch.tensor(t), plugin_weight_only_quant_type) + dst.value = processed_torch_weights.numpy() + scales = tensorrt_llm_internlm.layers[i].mlp.proj.per_channel_scale + scales.value = torch_weight_scales.numpy() + else: + tensorrt_llm_internlm.layers[idx].mlp.proj.weight.value = ( + np.ascontiguousarray(np.transpose(t, [1, 0]))) + + if use_int8_kv_cache: + t = fromfile( + dir_path, 'model.layers.' + str(i) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + tensorrt_llm_internlm.layers[ + idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_internlm.layers[ + idx].attention.kv_quant_orig_scale.value = t + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') + + +def load_from_gptq_internlm(tensorrt_llm_internlm, + quant_ckpt_path, + mapping=Mapping(), + dtype="float16"): + tensorrt_llm.logger.info( + 'Loading weights from groupwise GPTQ InternLM safetensors...') + tik = time.time() + + if quant_ckpt_path.endswith(".safetensors"): + groupwise_qweight_safetensors = safe_open(quant_ckpt_path, + framework="pt", + device=0) + model_params = { + key: groupwise_qweight_safetensors.get_tensor(key) + for key in groupwise_qweight_safetensors.keys() + } + elif quant_ckpt_path.endswith(".pt"): + model_params = torch.load(quant_ckpt_path, + map_location=torch.device('cpu')) + else: + assert False, "Quantized checkpoint format not supported!" 
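The GPTQ loader that continues below relies on a specific packing convention: each int32 element of `qweight`/`qzeros` holds eight unsigned 4-bit values, low nibble first, which are later recentered from the unsigned [0, 15] range to the signed [-8, 7] range expected by the TensorRT-LLM plugin. A minimal sketch of that convention (values and shapes are illustrative only), mirroring `unpack_int32_into_int8()` and the `- 8` shift in `preprocess_groupwise_weight_params()`:

```python
import torch

# One byte packing two uint4 values: 3 in the low nibble, 12 in the high nibble.
packed = torch.tensor([[3 + 12 * 16]], dtype=torch.uint8)

# Even output columns take the low nibble, odd columns the high nibble.
unpacked = torch.zeros(packed.shape[0], packed.shape[1] * 2, dtype=torch.int8)
unpacked[:, 0::2] = packed % 16   # -> 3
unpacked[:, 1::2] = packed // 16  # -> 12

# GPTQ stores unsigned values; subtracting 8 recenters them to signed int4.
print((unpacked - 8).tolist())    # [[-5, 4]]
```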
+ + def unpack_int32_into_int8(w_packed): + # Unpack inputs packed in int32/float32 into uint4 and store them in int8 format + w_packed_int4x2 = w_packed.contiguous().view(torch.uint8) + w_unpacked = torch.zeros(w_packed_int4x2.shape[0], + w_packed_int4x2.shape[1] * 2, + dtype=torch.int8) + w_unpacked[:, ::2] = w_packed_int4x2 % 16 + w_unpacked[:, 1::2] = w_packed_int4x2 // 16 + return w_unpacked.contiguous() + + def preprocess_groupwise_weight_params(weight_name, + qweight_int32=None, + qzeros_int32=None, + scales_fp16=None): + if weight_name is not None: + qweight_int32 = model_params[weight_name].cpu() + qzeros_int32 = model_params[weight_name[:-7] + 'qzeros'].cpu() + scales_fp16 = model_params[weight_name[:-7] + 'scales'].cpu() + + UINT4_TO_INT4_FLAG = 1 + GPTQ_FLAG = 1 + packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 + preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm + + qweight_unpacked_int8 = unpack_int32_into_int8( + qweight_int32.T).T.contiguous() - 8 + qweight_interleaved = preprocessor(packer(qweight_unpacked_int8), + torch.quint4x2).view(torch.float32) + # zeros = zeros * scales + qzeros_unpacked_int32 = unpack_int32_into_int8(qzeros_int32) + zeros_x_scales_fp16 = (-qzeros_unpacked_int32 + 8 * UINT4_TO_INT4_FLAG - + GPTQ_FLAG) * scales_fp16 + zeros_x_scales_fp16 = zeros_x_scales_fp16.half() + + # return processed interleaved weight, original scales and zeros * scales + return qweight_interleaved.contiguous(), scales_fp16.contiguous( + ), zeros_x_scales_fp16.contiguous() + + layer_ids = [ + extract_layer_idx(key) for key in groupwise_qweight_safetensors.keys() + ] + layer_ids = [ + int(layer_idx) for layer_idx in layer_ids if layer_idx is not None + ] + num_hidden_layers = max(layer_ids) + 1 + num_kv_heads = tensorrt_llm_internlm.num_kv_heads + mha_mode = (num_kv_heads == tensorrt_llm_internlm.num_heads) + suffixs = ['qweight', 'qzeros', 'scales'] + + layers_per_pipeline_stage = num_hidden_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1)) + + for l in layers_range: + prefix = f'model.layers.{l}.self_attn.' + split_qkv_suf = [] + + for suf in suffixs: + q_part = model_params[prefix + 'q_proj.' + suf].cpu() + k_part = model_params[prefix + 'k_proj.' + suf].cpu() + v_part = model_params[prefix + 'v_proj.' 
+ suf].cpu() + qkv_part = torch.cat([q_part, k_part, v_part], dim=0) + dim = qkv_part.shape + qkv_part = qkv_part.reshape(3, dim[0] // 3, dim[1]) + split_qkv = qkv_part.split(dim[1] // mapping.tp_size, + dim=2)[mapping.tp_rank] + split_qkv = torch.cat([ + split_qkv[0, :, :].squeeze(0), split_qkv[1, :, :].squeeze(0), + split_qkv[2, :, :].squeeze(0) + ], + dim=1) + split_qkv_suf.append(split_qkv) + + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_qkv_suf[0], split_qkv_suf[1], split_qkv_suf[2]) + + idx = l - mapping.pp_rank * layers_per_pipeline_stage + tensorrt_llm_internlm.layers[ + idx].attention.qkv.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.qkv.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.qkv.zero.value = th_scale.numpy() + + torch_dtype = str_dtype_to_torch(dtype) + + for k, v in model_params.items(): + if isinstance(v, list): + v = [torch_to_numpy(vv.to(torch_dtype).detach().cpu()) for vv in v] + else: + v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) + if 'model.embed_tokens.weight' in k: + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v + elif 'model.norm.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v + elif 'lm_head.weight' in k: + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.lm_head.weight.value = np.ascontiguousarray( + split(v, mapping.tp_size, mapping.tp_rank)) + else: + layer_idx = extract_layer_idx(k) + if layer_idx is None: + continue + idx = int(layer_idx) + if idx not in layers_range: + continue + idx = idx - mapping.pp_rank * layers_per_pipeline_stage + + if 'input_layernorm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].input_layernorm.weight.value = v + elif 'post_attention_layernorm.weight' in k: + tensorrt_llm_internlm.layers[ + idx].post_layernorm.weight.value = v + elif 'self_attn.o_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[0] // mapping.tp_size, + dim=0)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].attention.dense.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.dense.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].attention.dense.zero.value = th_scale.numpy() + elif 'mlp.up_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].mlp.gate.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.gate.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.gate.zero.value = th_scale.numpy() + elif 'mlp.down_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[0] // mapping.tp_size, + dim=0)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].mlp.proj.qweight.value = 
th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.proj.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.proj.zero.value = th_scale.numpy() + elif 'mlp.gate_proj.qweight' in k: + split_v_suf = [] + for suf in suffixs: + v = model_params[k[:-7] + suf].cpu() + split_v = v.split(v.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + split_v_suf.append(split_v) + th_qweight, th_zero, th_scale = preprocess_groupwise_weight_params( + None, split_v_suf[0], split_v_suf[1], split_v_suf[2]) + tensorrt_llm_internlm.layers[ + idx].mlp.fc.qweight.value = th_qweight.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.fc.scale.value = th_zero.numpy() + tensorrt_llm_internlm.layers[ + idx].mlp.fc.zero.value = th_scale.numpy() + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') + return + + +def load_from_awq_internlm(tensorrt_llm_internlm: InternLMForCausalLM, + quant_ckpt_path, + mapping=Mapping(), + dtype="float16"): + tensorrt_llm.logger.info( + 'Loading weights from groupwise AWQ InternLM safetensors...') + tik = time.time() + + if quant_ckpt_path.endswith(".safetensors"): + groupwise_qweight_safetensors = safe_open(quant_ckpt_path, + framework="pt", + device=0) + awq_internlm = { + key: groupwise_qweight_safetensors.get_tensor(key) + for key in groupwise_qweight_safetensors.keys() + } + elif quant_ckpt_path.endswith(".pt"): + awq_internlm = torch.load(quant_ckpt_path, + map_location=torch.device('cpu')) + else: + assert False, "Quantized checkpoint format not supported!" + + group_size = awq_internlm["model.layers.0.self_attn.o_proj.weight"].numel( + ) // awq_internlm[ + "model.layers.0.self_attn.o_proj.weight_quantizer._amax"].numel() + + awq_internlm_block_names = [ + "input_layernorm.weight", + "post_attention_layernorm.weight", + ] + + tensorrt_llm_internlm_block_names = [ + "input_layernorm.weight", + "post_layernorm.weight", + ] + + getattr(tensorrt_llm_internlm, 'quant_mode', QuantMode(0)) + + packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 + preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm + torch_dtype = str_dtype_to_torch(dtype) + + def AWQ_quantize_pack_preprocess(weight, scale): + scale = scale.repeat_interleave(group_size, dim=0) + weight = weight / scale + qweight_int8 = torch.clamp(torch.round(weight.cuda()).char(), -8, 7) + int4_weight = packer(qweight_int8.cpu()) + int4_weight = preprocessor(int4_weight, torch.quint4x2) + return int4_weight.view(torch.float32).cpu().numpy() + + def process_and_assign_weight(awq_internlm, mPrefix, mOp, tp_dim=0): + weight = awq_internlm[mPrefix + ".weight"].T.contiguous() + [k, n] = weight.shape + weight = weight.split(weight.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + amax = awq_internlm[mPrefix + ".weight_quantizer._amax"].reshape( + (n, int(k / group_size))).T.contiguous() + amax = amax.split(amax.shape[tp_dim] // mapping.tp_size, + dim=tp_dim)[mapping.tp_rank] + pre_quant_scale = awq_internlm[ + mPrefix + ".input_quantizer._pre_quant_scale"].reshape((1, k)) + if tp_dim == 0: + pre_quant_scale = pre_quant_scale.split(k // mapping.tp_size, + dim=1)[mapping.tp_rank] + scale = amax / 8.0 + mOp.qweight.value = AWQ_quantize_pack_preprocess(weight, scale) + mOp.scale.value = scale.to(torch_dtype).cpu().numpy() + mOp.pre_quant_scale.value = pre_quant_scale.to( + torch_dtype).cpu().numpy() + + def deSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + 
pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight * pre_quant_scale + return weight + + def reSmooth(weight, pre_quant_scale): + [k, n] = weight.shape + pre_quant_scale = pre_quant_scale.repeat( + (n, 1)).transpose(1, 0).contiguous() + weight = weight / pre_quant_scale + return weight + + def get_scale(weight): + weight = weight.T.contiguous() + [n, k] = weight.shape + weight = weight.reshape(n, int(k / group_size), group_size) + weight = torch.abs(weight.reshape(-1, group_size)) + amax, idx = weight.max(1) + amax = amax.reshape(n, int(k / group_size)).T.contiguous() + return amax / 8 + + def reSmooth_and_get_scale(weight, pre_quant_scale, avg_pre_quant_scale): + weight = deSmooth(weight, pre_quant_scale) + weight = reSmooth(weight, avg_pre_quant_scale) + scale = get_scale(weight) + return weight, scale + + def process_and_assign_qkv_weight(awq_internlm, prefix, mOp): + q_weight = awq_internlm[prefix + + "self_attn.q_proj.weight"].T.contiguous() + k_weight = awq_internlm[prefix + + "self_attn.k_proj.weight"].T.contiguous() + v_weight = awq_internlm[prefix + + "self_attn.v_proj.weight"].T.contiguous() + k = q_weight.shape[0] + + q_weight = q_weight.split(q_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + k_weight = k_weight.split(k_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + v_weight = v_weight.split(v_weight.shape[1] // mapping.tp_size, + dim=1)[mapping.tp_rank] + + q_pre_quant_scale = awq_internlm[ + prefix + + "self_attn.q_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) + k_pre_quant_scale = awq_internlm[ + prefix + + "self_attn.k_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) + v_pre_quant_scale = awq_internlm[ + prefix + + "self_attn.v_proj.input_quantizer._pre_quant_scale"].reshape((1, k)) + + qkv_pre_quant_scale = (q_pre_quant_scale + k_pre_quant_scale + + v_pre_quant_scale) / 3.0 + q_weight, q_scale = reSmooth_and_get_scale(q_weight, q_pre_quant_scale, + qkv_pre_quant_scale) + k_weight, k_scale = reSmooth_and_get_scale(k_weight, k_pre_quant_scale, + qkv_pre_quant_scale) + v_weight, v_scale = reSmooth_and_get_scale(v_weight, v_pre_quant_scale, + qkv_pre_quant_scale) + + qkv_weights = torch.cat((q_weight, k_weight, v_weight), dim=1) + qkv_scale = torch.cat((q_scale, k_scale, v_scale), dim=1) + + mOp.pre_quant_scale.value = qkv_pre_quant_scale.to( + torch_dtype).cpu().numpy() + mOp.qweight.value = AWQ_quantize_pack_preprocess(qkv_weights, qkv_scale) + mOp.scale.value = qkv_scale.to(torch_dtype).cpu().numpy() + + # Check if we need to pad vocab + v = awq_internlm.get('model.embed_tokens.weight') + [vocab_size, k] = v.shape + pad_vocab = False + pad_vocab_size = vocab_size + if vocab_size % 64 != 0: + pad_vocab = True + pad_vocab_size = int((vocab_size + 63) / 64) * 64 + if pad_vocab: + new_v = torch.zeros([pad_vocab_size, k]) + new_v[:vocab_size, :] = v + v = new_v + if mapping.is_first_pp_rank(): + tensorrt_llm_internlm.vocab_embedding.weight.value = v.to( + torch_dtype).cpu().numpy() + + layer_ids = [extract_layer_idx(key) for key in awq_internlm.keys()] + layer_ids = [ + int(layer_idx) for layer_idx in layer_ids if layer_idx is not None + ] + + num_hidden_layers = max(layer_ids) + 1 + layers_per_pipeline_stage = num_hidden_layers // mapping.pp_size + layers_range = list( + range(mapping.pp_rank * layers_per_pipeline_stage, + (mapping.pp_rank + 1) * layers_per_pipeline_stage, 1)) + + for layer_idx in layers_range: + prefix = "model.layers." + str(layer_idx) + "." 
+ tensorrt_llm.logger.info(f'Process weights in layer: {layer_idx}') + for idx, awq_attr in enumerate(awq_internlm_block_names): + v = awq_internlm[prefix + awq_attr] + layer = attrgetter(tensorrt_llm_internlm_block_names[idx])( + tensorrt_llm_internlm.layers[layer_idx]) + setattr(layer, 'value', v.to(torch_dtype).cpu().numpy()) + + # Attention QKV Linear + # concatenate the Q, K, V layers weights. + process_and_assign_qkv_weight( + awq_internlm, prefix, + tensorrt_llm_internlm.layers[layer_idx].attention.qkv) + + # Attention Dense (out_proj) Linear + mPrefix = prefix + "self_attn.o_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].attention.dense + process_and_assign_weight(awq_internlm, mPrefix, mOp, 0) + + # MLP up_proj (mlp.gate) Linear + mPrefix = prefix + "mlp.up_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].mlp.gate + process_and_assign_weight(awq_internlm, mPrefix, mOp, 1) + + # MLP down_proj (mlp.proj) Linear + mPrefix = prefix + "mlp.down_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].mlp.proj + process_and_assign_weight(awq_internlm, mPrefix, mOp, 0) + + # MLP gate_proj (mlp.fc) Linear + mPrefix = prefix + "mlp.gate_proj" + mOp = tensorrt_llm_internlm.layers[layer_idx].mlp.fc + process_and_assign_weight(awq_internlm, mPrefix, mOp, 1) + + v = awq_internlm['model.norm.weight'] + if mapping.is_last_pp_rank(): + tensorrt_llm_internlm.ln_f.weight.value = v.to( + torch_dtype).cpu().numpy() + + #lm_head + if pad_vocab: + weight = awq_internlm['lm_head.weight'] + [vocab_size, k] = weight.shape + new_weight = torch.zeros([pad_vocab_size, k]) + new_weight[:vocab_size, :] = weight + new_weight = new_weight.T.contiguous() + amax = awq_internlm['lm_head.weight_quantizer._amax'].reshape( + [vocab_size, k // group_size]) + new_amax = torch.ones([pad_vocab_size, k // group_size]) + new_amax[:vocab_size, :] = amax + new_amax = new_amax.T.contiguous() + new_scale = new_amax / 8 + tensorrt_llm_internlm.lm_head.qweight.value = AWQ_quantize_pack_preprocess( + new_weight, new_scale) + tensorrt_llm_internlm.lm_head.scale.value = new_scale.to( + torch_dtype).cpu().numpy() + tensorrt_llm_internlm.lm_head.pre_quant_scale.value = awq_internlm[ + 'lm_head.input_quantizer._pre_quant_scale'].to( + torch_dtype).cpu().numpy() + else: + mPrefix = "lm_head" + mOp = tensorrt_llm_internlm.lm_head + if mapping.is_last_pp_rank(): + process_and_assign_weight(awq_internlm, mPrefix, mOp, 1) + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + tensorrt_llm.logger.info(f'Weights loaded. Total time: {t}') diff --git a/examples/llama/README.md b/examples/llama/README.md index b147e68875..d7df10f86e 100644 --- a/examples/llama/README.md +++ b/examples/llama/README.md @@ -14,7 +14,10 @@ The TensorRT-LLM LLaMA implementation can be found in [tensorrt_llm/models/llama * FP16 * FP8 * INT8 & INT4 Weight-Only + * SmoothQuant + * Groupwise quantization (AWQ/GPTQ) * FP8 KV CACHE + * INT8 KV CACHE (+ AWQ/per-channel weight-only) * Tensor Parallel * STRONGLY TYPED @@ -152,8 +155,18 @@ python build.py --meta_ckpt_dir ./tmp/llama/70B \ Same instructions can be applied to fine-tuned versions of the LLaMA v2 models (e.g. 7Bf or llama-2-7b-chat). -#### INT8 weight only + INT8 KV cache -For INT8 KV cache, [`hf_llama_convert.py`](./hf_llama_convert.py) features a +### Using RoPE Scaling +RoPE scaling is supported through GPT Attention Plugin. You can add `--rotary_scaling ` during the build command to enable it. +- The value of `type` can be either `linear` and `dynamic`. 
+- The value of `factor` can be any value larger than `1.0`.
+
+The implementation is identical to Huggingface's.
+Please refer to https://huggingface.co/docs/transformers/model_doc/llama2#transformers.LlamaConfig.rope_scaling for more details.
+
+#### INT8 KV cache
+INT8 KV cache can be enabled to reduce the memory footprint. It brings larger performance gains as the batch size grows.
+
+You can get the INT8 scale of KV cache through [`hf_llama_convert.py`](./hf_llama_convert.py), which features a
 `--calibrate-kv-cache, -kv` option. Setting `-kv` will calibrate the model,
 and then export the scaling factors needed for INT8 KV cache inference.

@@ -166,9 +179,11 @@ python3 hf_llama_convert.py -i /llama-models/llama-7b-hf -o /llama/smooth_llama_

 [`build.py`](./build.py) add new options for the support of INT8 KV cache.

-`--int8_kv_cache` is the command-line option to enable INT8 KV cache.
+`--int8_kv_cache` is the command-line option to enable INT8 KV cache, and `--ft_model_dir` should point to the directory that contains the INT8 KV cache scales.

-In addition, it could be combined with INT8 weight-only quantization, as follows:
+**INT8 KV cache + per-channel weight-only quantization**
+
+INT8 KV cache can be combined with per-channel weight-only quantization, as follows:

 Examples of INT8 weight-only quantization + INT8 KV cache

@@ -193,6 +208,38 @@ python summarize.py --test_trt_llm \
     --test_hf
 ```

+**INT8 KV cache + AWQ**
+
+In addition, you can enable INT8 KV cache together with AWQ (per-group INT4 weight-only quantization) as in the following command.
+
+**NOTE**: The AWQ checkpoint is passed through `--model_dir`, and the INT8 KV cache scales are passed through `--ft_model_dir`.
+
+```bash
+# --int8_kv_cache turns on INT8 KV cache.
+# --ft_model_dir is the directory to look for the INT8 scale of KV cache.
+python build.py --model_dir ./tmp/llama/7B/ \
+                --quant_ckpt_path ./llama-7b-4bit-gs128-awq.pt \
+                --dtype float16 \
+                --remove_input_padding \
+                --use_gpt_attention_plugin float16 \
+                --enable_context_fmha \
+                --use_gemm_plugin float16 \
+                --use_weight_only \
+                --weight_only_precision int4_awq \
+                --per_group \
+                --output_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_int4_AWQ/1-gpu/ \
+                --int8_kv_cache \
+                --ft_model_dir /llama/smooth_llama_7B/int8_kv_cache/1-gpu/
+```
+
+Test with `summarize.py`:
+
+```bash
+python summarize.py --test_trt_llm \
+                    --hf_model_location /llama-models/llama-7b-hf \
+                    --data_type fp16 \
+                    --engine_dir ./tmp/llama/7B/trt_engines/int8_kv_cache_int4_AWQ/1-gpu \
+                    --test_hf
+```
+
 #### SmoothQuant
 The smoothquant supports both LLaMA v1 and LLaMA v2. Unlike the FP16 build where the HF weights are processed and loaded into the TensorRT-LLM directly, the SmoothQuant needs to load INT8 weights which should be pre-processed before building an engine.
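About the INT8 KV cache scales referenced above: calibration exports one per-tensor `scale_y_quant_orig` value per attention layer, and the loaders keep both that value and its reciprocal (for example, `load_from_binary` in the InternLM `weight.py` above sets `kv_orig_quant_scale = 1.0 / t` and `kv_quant_orig_scale = t`). A conceptual sketch, with made-up numbers, of how such a pair quantizes values written to the cache and restores them on read:

```python
import numpy as np

# Per-tensor scale exported by calibration (roughly amax / 127 for int8).
scale_y_quant_orig = np.float32(0.05)           # quant -> original domain
kv_orig_quant_scale = 1.0 / scale_y_quant_orig  # original -> quant domain

kv = np.array([-1.7, 0.0, 2.3], dtype=np.float32)

# Quantize on write to the KV cache, dequantize on read.
kv_int8 = np.clip(np.rint(kv * kv_orig_quant_scale), -128, 127).astype(np.int8)
kv_restored = kv_int8.astype(np.float32) * scale_y_quant_orig

print(kv_int8)      # [-34   0  46]
print(kv_restored)  # approximately the original values
```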
diff --git a/examples/llama/build.py b/examples/llama/build.py index 7351cf541f..5a4b524f52 100644 --- a/examples/llama/build.py +++ b/examples/llama/build.py @@ -32,9 +32,7 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers.attention import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (fp8_quantize, smooth_quantize, - weight_only_groupwise_quantize, - weight_only_quantize) +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -340,6 +338,11 @@ def parse_arguments(): action='store_true', help= 'Activates latency-optimized algorithm for all-reduce instead of NCCL.') + parser.add_argument( + '--max_prompt_embedding_table_size', + type=int, + default=0, + help='Setting to a value > 0 enables support for prompt tuning.') args = parser.parse_args() tensorrt_llm.logger.set_level(args.log_level) @@ -372,17 +375,13 @@ def parse_arguments(): args.quant_mode = QuantMode.use_smooth_quant(args.per_token, args.per_channel) elif args.use_weight_only: - if args.per_group: - args.quant_mode = QuantMode.from_description( - quantize_weights=True, - quantize_activations=False, - per_token=False, - per_channel=False, - per_group=True, - use_int4_weights=True) - else: - args.quant_mode = QuantMode.use_weight_only( - args.weight_only_precision == 'int4') + args.quant_mode = QuantMode.from_description( + quantize_weights=True, + quantize_activations=False, + per_token=False, + per_channel=False, + per_group=args.per_group, + use_int4_weights=args.weight_only_precision == "int4") else: args.quant_mode = QuantMode(0) @@ -394,6 +393,7 @@ def parse_arguments(): args.quant_mode = args.quant_mode.set_fp8_qdq() if args.rotary_scaling is not None: + assert args.use_gpt_attention_plugin, "RoPE scaling is only supported through GPT attention plugin." rotary_scaling = { "type": args.rotary_scaling[0], "factor": float(args.rotary_scaling[1]) @@ -401,8 +401,6 @@ def parse_arguments(): assert rotary_scaling["type"] in ["linear", "dynamic"] assert rotary_scaling["factor"] > 1.0 args.rotary_scaling = rotary_scaling - if rotary_scaling["type"] == "dynamic": - assert not args.remove_input_padding, "TODO: Not supported yet" if args.model_dir is not None: hf_config = LlamaConfig.from_pretrained(args.model_dir) @@ -452,9 +450,6 @@ def parse_arguments(): "MQA/GQA requires either the number of K/V heads to be divisible by the tensor parallelism size OR " \ "the tensor parallelism size to be divisible by the number of K/V heads." 
- if args.dtype == 'bfloat16': - assert args.use_gemm_plugin, "Please use gemm plugin when dtype is bfloat16" - assert args.pp_size * args.tp_size == args.world_size if args.max_num_tokens is not None: @@ -509,47 +504,40 @@ def build_rank_engine(builder: Builder, embedding_sharding_dim=args.embedding_sharding_dim, quant_mode=args.quant_mode, rms_norm_eps=args.rms_norm_eps, - use_fused_mlp=args.use_fused_mlp) - if args.use_smooth_quant: - tensorrt_llm_llama = smooth_quantize(tensorrt_llm_llama, - args.quant_mode) - elif args.use_weight_only: - if args.weight_only_precision == 'int8': - tensorrt_llm_llama = weight_only_quantize(tensorrt_llm_llama, - args.quant_mode) - elif args.weight_only_precision == 'int4': - tensorrt_llm_llama = weight_only_quantize(tensorrt_llm_llama, - args.quant_mode) - elif args.weight_only_precision == 'int4_awq': - tensorrt_llm_llama = weight_only_groupwise_quantize( - model=tensorrt_llm_llama, - quant_mode=args.quant_mode, - group_size=args.group_size, - zero=False, - pre_quant_scale=True, - exclude_modules=[]) + use_fused_mlp=args.use_fused_mlp, + use_prompt_tuning=args.max_prompt_embedding_table_size > 0, + ) + quantize_kwargs = {} + if args.use_smooth_quant or args.use_weight_only: + if args.weight_only_precision == 'int4_awq': + quantize_kwargs = { + "group_size": args.group_size, + "zero": False, + "pre_quant_scale": True, + "exclude_modules": [], + } elif args.weight_only_precision == 'int4_gptq': - tensorrt_llm_llama = weight_only_groupwise_quantize( - model=tensorrt_llm_llama, - quant_mode=args.quant_mode, - group_size=args.group_size, - zero=True, - pre_quant_scale=False) + quantize_kwargs = { + "group_size": args.group_size, + "zero": True, + "pre_quant_scale": False, + } elif args.enable_fp8 or args.fp8_kv_cache: logger.info(f'Loading scaling factors from ' f'{args.quantized_fp8_model_path}') quant_scales = get_scaling_factors(args.quantized_fp8_model_path, num_layers=args.n_layer, quant_mode=args.quant_mode) - tensorrt_llm_llama = fp8_quantize(tensorrt_llm_llama, - quant_mode=args.quant_mode, - quant_scales=quant_scales) + quantize_kwargs = {"quant_scales": quant_scales} + tensorrt_llm_llama = quantize_model(tensorrt_llm_llama, args.quant_mode, + **quantize_kwargs) if args.per_group: load_func = load_from_awq_llama if args.weight_only_precision == 'int4_awq' else load_from_gptq_llama load_func(tensorrt_llm_llama=tensorrt_llm_llama, quant_ckpt_path=args.quant_ckpt_path, mapping=mapping, - dtype=args.dtype) + dtype=args.dtype, + ft_model_dir=args.ft_model_dir) elif args.meta_ckpt_dir is not None: load_from_meta_llama(tensorrt_llm_llama, args.meta_ckpt_dir, mapping, args.dtype) @@ -625,11 +613,15 @@ def build_rank_engine(builder: Builder, network.set_named_parameters(tensorrt_llm_llama.named_parameters()) # Forward - inputs = tensorrt_llm_llama.prepare_inputs(args.max_batch_size, - args.max_input_len, - args.max_output_len, True, - args.max_beam_width, - args.max_num_tokens) + inputs = tensorrt_llm_llama.prepare_inputs( + args.max_batch_size, + args.max_input_len, + args.max_output_len, + True, + args.max_beam_width, + args.max_num_tokens, + prompt_embedding_table_size=args.max_prompt_embedding_table_size, + ) tensorrt_llm_llama(*inputs) if args.enable_debug_output: # mark intermediate nodes' outputs @@ -651,6 +643,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_llama) + return engine 
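The `build.py` changes above funnel every quantization flavor through a single `quantize_model` call driven by `QuantMode` plus a method-specific `quantize_kwargs` dict. Below is a minimal sketch of that call pattern for the groupwise int4 modes, assuming a group size of 128; the helper `groupwise_quantize_kwargs` is illustrative and not part of the library.

```python
from tensorrt_llm.models import quantize_model
from tensorrt_llm.quantization import QuantMode

def groupwise_quantize_kwargs(weight_only_precision: str, group_size: int = 128) -> dict:
    """Illustrative: pick the kwargs used for groupwise weight-only modes."""
    if weight_only_precision == 'int4_awq':
        # AWQ: no zero point, but a pre-quantization activation scale.
        return {"group_size": group_size, "zero": False,
                "pre_quant_scale": True, "exclude_modules": []}
    if weight_only_precision == 'int4_gptq':
        # GPTQ: zero point, no pre-quantization scale.
        return {"group_size": group_size, "zero": True, "pre_quant_scale": False}
    return {}

# Mirrors the int4_awq path: per-group INT4 weights, no activation quantization.
quant_mode = QuantMode.from_description(quantize_weights=True,
                                        quantize_activations=False,
                                        per_token=False,
                                        per_channel=False,
                                        per_group=True,
                                        use_int4_weights=True)

# `model` would be the LLaMAForCausalLM instance constructed in build_rank_engine:
# model = quantize_model(model, quant_mode, **groupwise_quantize_kwargs('int4_awq'))
```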
@@ -690,10 +685,12 @@ def build(rank, args): max_output_len=args.max_output_len, max_num_tokens=args.max_num_tokens, int8=int8_trt_flag, - fp8=args.quant_mode.has_fp8_qdq(), quant_mode=args.quant_mode, strongly_typed=args.strongly_typed, - opt_level=args.builder_opt) + opt_level=args.builder_opt, + max_prompt_embedding_table_size=args. + max_prompt_embedding_table_size, + ) engine_name = get_engine_name(MODEL_NAME, args.dtype, args.tp_size, args.pp_size, cur_rank) engine = build_rank_engine(builder, builder_config, engine_name, @@ -706,6 +703,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/examples/llama/hf_llama_convert.py b/examples/llama/hf_llama_convert.py index 3db3b8f350..f16627c30d 100644 --- a/examples/llama/hf_llama_convert.py +++ b/examples/llama/hf_llama_convert.py @@ -44,7 +44,6 @@ def merge_qkv_scales(q_name, hf_model, scales, llama_qkv_para): scales[layer_name_qkv]["x"] = scales[layer_name_q]["x"] scales[layer_name_qkv]["w"] = weight.abs().max(dim=1)[0] - print(scales[layer_name_q]) scales[layer_name_qkv]["y"] = torch.cat([ scales[layer_name_q]["y"], scales[layer_name_k]["y"], scales[layer_name_v]["y"] @@ -188,6 +187,7 @@ def hf_gpt_converter(args): smooth_llama_model(model, act_range, args.smoothquant, llama_qkv_para, llama_smoother) + args.multi_query_mode = model.config.num_attention_heads != model.config.num_key_value_heads config = configparser.ConfigParser() config["llama"] = {} for key in vars(args): @@ -319,9 +319,6 @@ if __name__ == "__main__": type=str, default="fp32", choices=["fp32", "fp16"]) - parser.add_argument("--multi-query-mode", - action="store_true", - help="Use multi-query-attention.") args = parser.parse_args() print("\n=============== Argument ===============") diff --git a/examples/llama/run.py b/examples/llama/run.py index 7c2493735e..2091a9b20c 100644 --- a/examples/llama/run.py +++ b/examples/llama/run.py @@ -19,7 +19,7 @@ from pathlib import Path import numpy as np import torch -from transformers import LlamaTokenizer +from transformers import LlamaTokenizerFast import tensorrt_llm from tensorrt_llm.quantization import QuantMode @@ -51,8 +51,8 @@ def read_config(config_path: Path): world_size = tp_size * pp_size assert world_size == tensorrt_llm.mpi_world_size(), \ f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})' - num_heads = config['builder_config']['num_heads'] // tp_size - hidden_size = config['builder_config']['hidden_size'] // tp_size + num_heads = config['builder_config']['num_heads'] + hidden_size = config['builder_config']['hidden_size'] vocab_size = config['builder_config']['vocab_size'] num_layers = config['builder_config']['num_layers'] num_kv_heads = config['builder_config'].get('num_kv_heads', num_heads) @@ -65,21 +65,28 @@ def read_config(config_path: Path): ) num_kv_heads = 1 num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size + assert (num_heads % tp_size) == 0 + num_heads = num_heads // tp_size + hidden_size = hidden_size // tp_size use_custom_all_reduce = config['plugin_config'].get('use_custom_all_reduce', False) + max_prompt_embedding_table_size = config['builder_config'].get( + 'max_prompt_embedding_table_size', 0) - model_config = ModelConfig(num_heads=num_heads, - num_kv_heads=num_kv_heads, - hidden_size=hidden_size, - vocab_size=vocab_size, - num_layers=num_layers, - 
gpt_attention_plugin=use_gpt_attention_plugin, - paged_kv_cache=paged_kv_cache, - tokens_per_block=tokens_per_block, - remove_input_padding=remove_input_padding, - dtype=dtype, - quant_mode=quant_mode, - use_custom_all_reduce=use_custom_all_reduce) + model_config = ModelConfig( + num_heads=num_heads, + num_kv_heads=num_kv_heads, + hidden_size=hidden_size, + vocab_size=vocab_size, + num_layers=num_layers, + gpt_attention_plugin=use_gpt_attention_plugin, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + remove_input_padding=remove_input_padding, + dtype=dtype, + quant_mode=quant_mode, + use_custom_all_reduce=use_custom_all_reduce, + max_prompt_embedding_table_size=max_prompt_embedding_table_size) return model_config, tp_size, pp_size, dtype @@ -121,6 +128,37 @@ def parse_input(input_text: str, input_file: str, tokenizer, end_id: int, return input_ids, input_lengths +def ptuning_setup(prompt_table, dtype, hidden_size, tasks, input_ids, + input_lengths, remove_input_padding): + if prompt_table is not None: + prompt_table = torch.from_numpy(np.load(prompt_table)) + task_vocab_size = torch.tensor([prompt_table.shape[1]], + dtype=torch.int32, + device="cuda") + prompt_table = prompt_table.view( + (prompt_table.shape[0] * prompt_table.shape[1], + prompt_table.shape[2])) + prompt_table = prompt_table.cuda().to( + dtype=tensorrt_llm._utils.str_dtype_to_torch(dtype)) + else: + prompt_table = torch.empty([1, hidden_size]).cuda() + task_vocab_size = torch.zeros([1]).cuda() + + num_sequences = input_lengths.size( + 0) if remove_input_padding else input_ids.size(0) + + if tasks is not None: + tasks = torch.tensor([int(t) for t in tasks.split(',')], + dtype=torch.int32, + device="cuda") + assert tasks.shape[ + 0] == num_sequences, "Number of supplied tasks must match input batch size" + else: + tasks = torch.zeros([num_sequences]).cuda() + + return [prompt_table, tasks, task_vocab_size] + + def print_output(output_ids, input_lengths, max_output_len, tokenizer, output_csv, output_npy, sequence_lengths): num_beams = output_ids.size(1) @@ -138,6 +176,7 @@ def print_output(output_ids, input_lengths, max_output_len, tokenizer, print(f'Output: \"{output_text}\"') output_ids = output_ids.reshape((-1, output_ids.size(2))) + print(output_ids) if output_csv is not None: output_file = Path(output_csv) @@ -190,6 +229,13 @@ def parse_arguments(): type=int, help="How often to return tokens when streaming.", default=5) + parser.add_argument( + '--prompt_table', + type=Path, + help="Path to .npy file, exported by nemo_prompt_convert.py") + parser.add_argument( + '--tasks', + help="Comma-separated list of tasks for prompt tuning: ex 0,3,1,0") return parser.parse_args() @@ -205,6 +251,8 @@ def generate( num_beams: int = 1, streaming: bool = False, streaming_interval: int = 5, + prompt_table: Path = None, + tasks: str = None, ): tensorrt_llm.logger.set_level(log_level) @@ -220,7 +268,7 @@ def generate( pp_size=pp_size) torch.cuda.set_device(runtime_rank % runtime_mapping.gpus_per_node) - tokenizer = LlamaTokenizer.from_pretrained(tokenizer_dir, legacy=False) + tokenizer = LlamaTokenizerFast.from_pretrained(tokenizer_dir, legacy=False) sampling_config = SamplingConfig(end_id=EOS_TOKEN, pad_id=PAD_TOKEN, @@ -242,14 +290,20 @@ def generate( input_ids, input_lengths = parse_input(input_text, input_file, tokenizer, EOS_TOKEN, model_config.remove_input_padding) + print(input_ids) max_input_length = torch.max(input_lengths).item() decoder.setup(input_lengths.size(0), max_input_length, max_output_len, 
num_beams) + ptuning_args = [] if model_config.max_prompt_embedding_table_size == 0 else ptuning_setup( + prompt_table, dtype, model_config.hidden_size, tasks, input_ids, + input_lengths, model_config.remove_input_padding) + outputs = decoder.decode(input_ids, input_lengths, sampling_config, + *ptuning_args, streaming=streaming, output_sequence_lengths=True, return_dict=True) diff --git a/examples/llama/weight.py b/examples/llama/weight.py index 37e9e81505..d347c341e0 100644 --- a/examples/llama/weight.py +++ b/examples/llama/weight.py @@ -223,6 +223,11 @@ def load_from_hf_llama(tensorrt_llm_llama: tensorrt_llm.models.LLaMAForCausalLM, else: v = torch_to_numpy(v.to(torch_dtype).detach().cpu()) if 'model.embed_tokens.weight' in k: + if hf_llama.config.tie_word_embeddings: + # lm_head.weight has the same weights as embedding + if mapping.is_last_pp_rank(): + tensorrt_llm_llama.lm_head.weight.value = np.ascontiguousarray( + split(v, mapping.tp_size, mapping.tp_rank)) if tensorrt_llm_llama.use_parallel_embedding: v = split(v, mapping.tp_size, mapping.tp_rank, tensorrt_llm_llama.embedding_sharding_dim) @@ -818,7 +823,8 @@ def load_from_binary(tensorrt_llm_llama: LLaMAForCausalLM, def load_from_gptq_llama(tensorrt_llm_llama, quant_ckpt_path, mapping=Mapping(), - dtype="float16"): + dtype="float16", + ft_model_dir=None): tensorrt_llm.logger.info( 'Loading weights from groupwise GPTQ LLaMA safetensors...') tik = time.time() @@ -1019,7 +1025,8 @@ def load_from_gptq_llama(tensorrt_llm_llama, def load_from_awq_llama(tensorrt_llm_llama: LLaMAForCausalLM, quant_ckpt_path, mapping=Mapping(), - dtype="float16"): + dtype="float16", + ft_model_dir=None): tensorrt_llm.logger.info( 'Loading weights from groupwise AWQ LLaMA safetensors...') tik = time.time() @@ -1052,12 +1059,23 @@ def load_from_awq_llama(tensorrt_llm_llama: LLaMAForCausalLM, "post_layernorm.weight", ] - getattr(tensorrt_llm_llama, 'quant_mode', QuantMode(0)) + quant_mode = getattr(tensorrt_llm_llama, 'quant_mode', QuantMode(0)) + # Int8 KV cache + use_int8_kv_cache = quant_mode.has_int8_kv_cache() packer = torch.ops.fastertransformer.pack_int8_tensor_to_packed_int4 preprocessor = torch.ops.fastertransformer.preprocess_weights_for_mixed_gemm torch_dtype = str_dtype_to_torch(dtype) + def fromfile(dir_path, name, shape=None, dtype=None): + p = dir_path + '/' + name + if Path(p).exists(): + t = np.fromfile(p, dtype=dtype) + if shape is not None: + t = t.reshape(shape) + return t + return None + def AWQ_quantize_pack_preprocess(weight, scale): scale = scale.repeat_interleave(group_size, dim=0) weight = weight / scale @@ -1217,6 +1235,18 @@ def load_from_awq_llama(tensorrt_llm_llama: LLaMAForCausalLM, mOp = tensorrt_llm_llama.layers[layer_idx].mlp.fc process_and_assign_weight(awq_llama, mPrefix, mOp, 1) + if use_int8_kv_cache: + assert ft_model_dir, "You must pass --ft_model_dir to tell TRT-LLM where to look for scales of INT8 kv cache." + t = fromfile( + ft_model_dir, 'model.layers.' 
+ str(layer_idx) + + '.attention.query_key_value.scale_y_quant_orig.bin', [1], + np.float32) + assert t is not None, f"{ft_model_dir} does not contain model.layers.{layer_idx}.attention.query_key_value.scale_y_quant_orig.bin" + tensorrt_llm_llama.layers[ + layer_idx].attention.kv_orig_quant_scale.value = 1.0 / t + tensorrt_llm_llama.layers[ + layer_idx].attention.kv_quant_orig_scale.value = t + v = awq_llama['model.norm.weight'] if mapping.is_last_pp_rank(): tensorrt_llm_llama.ln_f.weight.value = v.to(torch_dtype).cpu().numpy() diff --git a/examples/mpt/build.py b/examples/mpt/build.py index f947a8d83b..2f88cd320d 100644 --- a/examples/mpt/build.py +++ b/examples/mpt/build.py @@ -26,7 +26,7 @@ from tensorrt_llm.builder import Builder from tensorrt_llm.layers import PositionEmbeddingType from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import smooth_quantize, weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -429,11 +429,9 @@ def build_rank_engine(builder: Builder, use_parallel_embedding=args.use_parallel_embedding, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table) - if args.use_smooth_quant: - tensorrt_llm_gpt = smooth_quantize(tensorrt_llm_gpt, args.quant_mode) - elif args.use_weight_only: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt, - args.quant_mode) + + if args.use_smooth_quant or args.use_weight_only: + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode) if args.model_dir is not None: gpt_dummy_fp8_scaling_factors = { @@ -528,6 +526,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = args.output_dir / 'config.json' builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) + return engine @@ -569,7 +570,7 @@ def build(rank, args): multi_query_mode=args.multi_query_mode, strongly_typed=args.strongly_typed, use_prompt_tuning=args.max_prompt_embedding_table_size > 0, - fp8=args.enable_fp8, + quant_mode=args.quant_mode, use_parallel_embedding=args.use_parallel_embedding) engine_name = get_engine_name(MODEL_NAME, args.dtype, args.world_size, @@ -585,6 +586,7 @@ def build(rank, args): ) serialize_engine(engine, args.output_dir / engine_name) + del engine if rank == 0: ok = builder.save_timing_cache(builder_config, timing_cache_file) diff --git a/examples/mpt/requirements.txt b/examples/mpt/requirements.txt index 61be4accb8..f46bff3100 100644 --- a/examples/mpt/requirements.txt +++ b/examples/mpt/requirements.txt @@ -1,2 +1,2 @@ -datasets~=2.3.2 +datasets~=2.14.5 rouge_score~=0.1.2 diff --git a/examples/mpt/run.py b/examples/mpt/run.py index 5ad7ab361b..ff7b88a610 100644 --- a/examples/mpt/run.py +++ b/examples/mpt/run.py @@ -42,7 +42,6 @@ def read_config(config_path: Path): multi_query_mode = config['builder_config']['multi_query_mode'] paged_kv_cache = config['plugin_config']['paged_kv_cache'] tokens_per_block = config['plugin_config']['tokens_per_block'] - use_prompt_tuning = config['builder_config']['use_prompt_tuning'] num_kv_heads = 1 if multi_query_mode else num_heads dtype = config['builder_config']['precision'] @@ -55,13 +54,13 @@ def read_config(config_path: Path): remove_input_padding=remove_input_padding, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, - use_prompt_tuning=use_prompt_tuning, 
dtype=dtype) dtype = config['builder_config']['precision'] max_input_len = config['builder_config']['max_input_len'] + use_prompt_tuning = config['builder_config']['use_prompt_tuning'] - return model_config, world_size, dtype, max_input_len + return model_config, world_size, dtype, max_input_len, use_prompt_tuning def parse_input(input_text: str, input_file: str, tokenizer, pad_id: int, @@ -234,7 +233,8 @@ def generate( engine_dir = Path(engine_dir) config_path = engine_dir / 'config.json' - model_config, world_size, dtype, max_input_len = read_config(config_path) + model_config, world_size, dtype, max_input_len, use_prompt_tuning = read_config( + config_path) runtime_rank = tensorrt_llm.mpi_rank() runtime_mapping = tensorrt_llm.Mapping(world_size, @@ -284,7 +284,7 @@ def generate( max_output_len, beam_width=num_beams) - ptuning_args = [] if not model_config.use_prompt_tuning else ptuning_setup( + ptuning_args = [] if not use_prompt_tuning else ptuning_setup( prompt_table, dtype, model_config.hidden_size, tasks, input_ids, input_lengths, model_config.remove_input_padding) diff --git a/examples/opt/build.py b/examples/opt/build.py index 2bcd53f81d..7f974800d9 100644 --- a/examples/opt/build.py +++ b/examples/opt/build.py @@ -25,7 +25,7 @@ from tensorrt_llm._utils import str_dtype_to_trt from tensorrt_llm.builder import Builder from tensorrt_llm.logger import logger from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import weight_only_quantize +from tensorrt_llm.models import quantize_model from tensorrt_llm.network import net_guard from tensorrt_llm.plugin.plugin import ContextFMHAType from tensorrt_llm.quantization import QuantMode @@ -251,8 +251,8 @@ def build_rank_engine(builder: Builder, embedding_sharding_dim=args.embedding_sharding_dim, share_embedding_table=share_embedding_table) if args.use_weight_only: - tensorrt_llm_gpt = weight_only_quantize(tensorrt_llm_gpt, - args.quant_mode) + tensorrt_llm_gpt = quantize_model(tensorrt_llm_gpt, args.quant_mode) + if args.model_dir is not None: load_from_ft(tensorrt_llm_gpt, args.model_dir, @@ -313,6 +313,9 @@ def build_rank_engine(builder: Builder, if rank == 0: config_path = os.path.join(args.output_dir, 'config.json') builder.save_config(builder_config, config_path) + + tensorrt_llm.tools.cleanup(network, tensorrt_llm_gpt) + return engine @@ -359,6 +362,7 @@ def build(rank, args): cache = builder_config.trt_builder_config.get_timing_cache() serialize_engine(engine, os.path.join(args.output_dir, engine_name)) + del engine if rank == 0: ok = builder.save_timing_cache( diff --git a/requirements-dev-windows.txt b/requirements-dev-windows.txt index b7cd8dcb7c..5fa7659a9b 100644 --- a/requirements-dev-windows.txt +++ b/requirements-dev-windows.txt @@ -1,9 +1,9 @@ ---pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -# torch is CPU-only on Windows, so need to specify a torch version with GPU support -torch==2.1.0.dev20230828+cu121 -torchvision==0.16.0.dev20230828 -torchdata==0.7.0.dev20230828 -torchtext==0.16.0.dev20230828 +--extra-index-url https://download.pytorch.org/whl/cu121 +# Default torch is CPU-only on Windows, so need to specify a torch version with GPU support +torch==2.1.0+cu121 +torchvision==0.16.0+cu121 +torchdata==0.7.0 +torchtext==0.16.0+cpu tokenizers==0.13.3 transformers==4.33.1 diffusers==0.15.0 @@ -23,4 +23,4 @@ einops parameterized graphviz pywin32 -pynvml +pynvml>=11.5.0 diff --git a/requirements-dev.txt b/requirements-dev.txt index 86e4fae049..02536864b2 100644 --- a/requirements-dev.txt +++ 
b/requirements-dev.txt @@ -17,4 +17,4 @@ pre-commit einops parameterized graphviz -pynvml +pynvml>=11.5.0 diff --git a/requirements-windows.txt b/requirements-windows.txt index a904da9ff8..2c1de9a1a3 100644 --- a/requirements-windows.txt +++ b/requirements-windows.txt @@ -1,10 +1,10 @@ build ---pre --extra-index-url https://download.pytorch.org/whl/nightly/cu121 -# torch is CPU-only on Windows, so need to specify a torch version with GPU support -torch==2.1.0.dev20230828+cu121 -torchvision==0.16.0.dev20230828 -torchdata==0.7.0.dev20230828 -torchtext==0.16.0.dev20230828 +--extra-index-url https://download.pytorch.org/whl/cu121 +# Default torch is CPU-only on Windows, so need to specify a torch version with GPU support +torch==2.1.0+cu121 +torchvision==0.16.0+cu121 +torchdata==0.7.0 +torchtext==0.16.0+cpu tokenizers==0.13.3 transformers==4.33.1 diffusers==0.15.0 diff --git a/scripts/build_wheel.py b/scripts/build_wheel.py index 1b567710c5..d2af07eddc 100755 --- a/scripts/build_wheel.py +++ b/scripts/build_wheel.py @@ -51,7 +51,8 @@ def main(build_type: str = "Release", use_ccache: bool = False, cpp_only: bool = False, install: bool = False, - skip_building_wheel: bool = False): + skip_building_wheel: bool = False, + python_bindings: bool = False): project_dir = Path(__file__).parent.resolve().parent os.chdir(project_dir) build_run = partial(run, shell=True, check=True) @@ -142,22 +143,28 @@ def main(build_type: str = "Release", build_pyt = "OFF" if cpp_only else "ON" th_common_lib = "" if cpp_only else "th_common" + build_pybind = "ON" if python_bindings else "OFF" + bindings_lib = "bindings" if python_bindings else "" with working_directory(build_dir): cmake_def_args = " ".join(cmake_def_args) if clean or first_build: build_run( - f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" {cmake_cuda_architectures}' - f' {cmake_def_args} -S "{source_dir}"') + f'cmake -DCMAKE_BUILD_TYPE="{build_type}" -DBUILD_PYT="{build_pyt}" -DBUILD_PYBIND="{build_pybind}"' + f' {cmake_cuda_architectures} {cmake_def_args} -S "{source_dir}"' + ) build_run( - f'cmake --build . --config {build_type} --parallel {job_count} --target tensorrt_llm tensorrt_llm_static nvinfer_plugin_tensorrt_llm {th_common_lib} ' + f'cmake --build . --config {build_type} --parallel {job_count} ' + f'--target tensorrt_llm tensorrt_llm_static nvinfer_plugin_tensorrt_llm {th_common_lib} {bindings_lib}' f'{" ".join(extra_make_targets)}') if cpp_only: assert not install, "Installing is not supported for cpp_only builds" return - lib_dir = project_dir / "tensorrt_llm/libs" + pkg_dir = project_dir / "tensorrt_llm" + assert pkg_dir.is_dir(), f"{pkg_dir} is not a directory" + lib_dir = pkg_dir / "libs" if lib_dir.exists(): rmtree(lib_dir) lib_dir.mkdir(parents=True) @@ -176,6 +183,15 @@ def main(build_type: str = "Release", "tensorrt_llm/plugins/libnvinfer_plugin_tensorrt_llm.so", lib_dir / "libnvinfer_plugin_tensorrt_llm.so") + if python_bindings: + # TODO Add windows support for python bindings. 
+ pybind_lib = list( + (build_dir / "tensorrt_llm" / "pybind").glob("bindings.*.so")) + assert len( + pybind_lib + ) == 1, f"Exactly one pybind library should be present: {pybind_lib}" + copy(pybind_lib[0], pkg_dir) + if dist_dir is None: dist_dir = project_dir / "build" else: @@ -244,5 +260,9 @@ if __name__ == "__main__": action="store_true", help= "Do not build the *.whl files (they are only needed for distribution).") + parser.add_argument("--python_bindings", + "-p", + action="store_true", + help="Build the python bindings for the C++ runtime.") args = parser.parse_args() main(**vars(args)) diff --git a/setup.py b/setup.py index bf4e055f45..1644c422fa 100644 --- a/setup.py +++ b/setup.py @@ -56,13 +56,16 @@ setup( install_requires=required_deps, dependency_links=extra_URLs, zip_safe=True, + license="Apache License 2.0", packages=find_packages(), + # TODO Add windows support for python bindings. package_data={ 'tensorrt_llm': (['libs/th_common.dll', 'libs/nvinfer_plugin_tensorrt_llm.dll'] - if platform.system() == "Windows" else - ['libs/libth_common.so', 'libs/libnvinfer_plugin_tensorrt_llm.so']) + - ['tools/plugin_gen/templates/*'] + if platform.system() == "Windows" else [ + 'libs/libth_common.so', 'libs/libnvinfer_plugin_tensorrt_llm.so', + 'bindings.*.so' + ]) + ['tools/plugin_gen/templates/*'], }, python_requires=">=3.7, <4", distclass=BinaryDistribution, diff --git a/tensorrt_llm/_common.py b/tensorrt_llm/_common.py index 9b8ccb12d4..01bbc72017 100644 --- a/tensorrt_llm/_common.py +++ b/tensorrt_llm/_common.py @@ -13,9 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. import contextlib +import ctypes import platform +import time from pathlib import Path +import numpy as np +import tensorrt as trt import torch from ._utils import str_dtype_to_trt @@ -80,3 +84,79 @@ def precision(dtype): prev_dtype = switch_net_dtype(dtype) yield switch_net_dtype(prev_dtype) + + +def serialize_engine(engine, path): + logger.info(f'Serializing engine to {path}...') + tik = time.time() + if isinstance(engine, trt.ICudaEngine): + engine = engine.serialize() + with open(path, 'wb') as f: + f.write(bytearray(engine)) + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine serialized. Total time: {t}') + + +def deserialize_engine(path): + runtime = trt.Runtime(logger.trt_logger) + with open(path, 'rb') as f: + logger.info(f'Loading engine from {path}...') + tik = time.time() + + engine = runtime.deserialize_cuda_engine(f.read()) + assert engine is not None + + tok = time.time() + t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) + logger.info(f'Engine loaded. 
Total time: {t}') + return engine + + +_field_dtype_to_np_dtype_dict = { + trt.PluginFieldType.FLOAT16: np.float16, + trt.PluginFieldType.FLOAT32: np.float32, + trt.PluginFieldType.FLOAT64: np.float64, + trt.PluginFieldType.INT8: np.int8, + trt.PluginFieldType.INT16: np.int16, + trt.PluginFieldType.INT32: np.int32, +} + + +def field_dtype_to_np_dtype(dtype): + ret = _field_dtype_to_np_dtype_dict.get(dtype) + assert ret is not None, f'Unsupported dtype: {dtype}' + return ret + + +def convert_capsule_to_void_p(capsule): + ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p + ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ + ctypes.py_object, ctypes.c_char_p + ] + return ctypes.pythonapi.PyCapsule_GetPointer(capsule, None) + + +def get_nparray_from_void_p(void_pointer, elem_size, field_dtype): + ctypes.pythonapi.PyMemoryView_FromMemory.restype = ctypes.py_object + ctypes.pythonapi.PyMemoryView_FromMemory.argtypes = [ + ctypes.c_char_p, ctypes.c_ssize_t, ctypes.c_int + ] + logger.info( + f'get_nparray: pointer = {void_pointer}, elem_size = {elem_size}') + char_pointer = ctypes.cast(void_pointer, ctypes.POINTER(ctypes.c_char)) + np_dtype = field_dtype_to_np_dtype(field_dtype) + buf_bytes = elem_size * np.dtype(np_dtype).itemsize + logger.info(f'get_nparray: buf_bytes = {buf_bytes}') + mem_view = ctypes.pythonapi.PyMemoryView_FromMemory( + char_pointer, buf_bytes, 0) # number 0 represents PyBUF_READ + logger.info( + f'get_nparray: mem_view = {mem_view}, field_dtype = {field_dtype}') + buf = np.frombuffer(mem_view, np_dtype) + return buf + + +def get_scalar_from_field(field): + void_p = convert_capsule_to_void_p(field.data) + np_array = get_nparray_from_void_p(void_p, 1, field.type) + return np_array[0] diff --git a/tensorrt_llm/_utils.py b/tensorrt_llm/_utils.py index 91a695ac30..b5f0ae4193 100644 --- a/tensorrt_llm/_utils.py +++ b/tensorrt_llm/_utils.py @@ -13,26 +13,25 @@ # See the License for the specific language governing permissions and # limitations under the License. import copy -import ctypes import json import math -import time +import struct from functools import partial import numpy as np import tensorrt as trt import torch -from .logger import logger - # numpy doesn't know bfloat16, define abstract binary type instead np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"}) -def torch_to_numpy(x): +def torch_to_numpy(x: torch.Tensor): + assert isinstance(x, torch.Tensor), \ + f'x must be a torch.Tensor object, but got {type(x)}.' if x.dtype != torch.bfloat16: - return x.numpy() - return x.view(torch.int16).numpy().view(np_bfloat16) + return x.cpu().numpy() + return x.view(torch.int16).cpu().numpy().view(np_bfloat16) fp32_array = partial(np.array, dtype=np.float32) @@ -192,33 +191,6 @@ def dim_resolve_negative(dim, ndim): return tuple(pos) -def serialize_engine(engine, path): - logger.info(f'Serializing engine to {path}...') - tik = time.time() - if isinstance(engine, trt.ICudaEngine): - engine = engine.serialize() - with open(path, 'wb') as f: - f.write(bytearray(engine)) - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine serialized. Total time: {t}') - - -def deserialize_engine(path): - runtime = trt.Runtime(logger.trt_logger) - with open(path, 'rb') as f: - logger.info(f'Loading engine from {path}...') - tik = time.time() - - engine = runtime.deserialize_cuda_engine(f.read()) - assert engine is not None - - tok = time.time() - t = time.strftime('%H:%M:%S', time.gmtime(tok - tik)) - logger.info(f'Engine loaded. 
Total time: {t}') - return engine - - def mpi_comm(): from mpi4py import MPI return MPI.COMM_WORLD @@ -251,50 +223,16 @@ def to_json_file(obj, json_file_path): writer.write(to_json_string(obj)) -_field_dtype_to_np_dtype_dict = { - trt.PluginFieldType.FLOAT16: np.float16, - trt.PluginFieldType.FLOAT32: np.float32, - trt.PluginFieldType.FLOAT64: np.float64, - trt.PluginFieldType.INT8: np.int8, - trt.PluginFieldType.INT16: np.int16, - trt.PluginFieldType.INT32: np.int32, -} +def numpy_fp32_to_bf16(src): + # Numpy doesn't support bfloat16 type + # Convert float32 to bfloat16 manually and assign with bf16 abstract type + original_shape = src.shape + src = src.flatten() + src = np.ascontiguousarray(src) - -def field_dtype_to_np_dtype(dtype): - ret = _field_dtype_to_np_dtype_dict.get(dtype) - assert ret is not None, f'Unsupported dtype: {dtype}' - return ret - - -def convert_capsule_to_void_p(capsule): - ctypes.pythonapi.PyCapsule_GetPointer.restype = ctypes.c_void_p - ctypes.pythonapi.PyCapsule_GetPointer.argtypes = [ - ctypes.py_object, ctypes.c_char_p - ] - return ctypes.pythonapi.PyCapsule_GetPointer(capsule, None) - - -def get_nparray_from_void_p(void_pointer, elem_size, field_dtype): - ctypes.pythonapi.PyMemoryView_FromMemory.restype = ctypes.py_object - ctypes.pythonapi.PyMemoryView_FromMemory.argtypes = [ - ctypes.c_char_p, ctypes.c_ssize_t, ctypes.c_int - ] - logger.info( - f'get_nparray: pointer = {void_pointer}, elem_size = {elem_size}') - char_pointer = ctypes.cast(void_pointer, ctypes.POINTER(ctypes.c_char)) - np_dtype = field_dtype_to_np_dtype(field_dtype) - buf_bytes = elem_size * np.dtype(np_dtype).itemsize - logger.info(f'get_nparray: buf_bytes = {buf_bytes}') - mem_view = ctypes.pythonapi.PyMemoryView_FromMemory( - char_pointer, buf_bytes, 0) # number 0 represents PyBUF_READ - logger.info( - f'get_nparray: mem_view = {mem_view}, field_dtype = {field_dtype}') - buf = np.frombuffer(mem_view, np_dtype) - return buf - - -def get_scalar_from_field(field): - void_p = convert_capsule_to_void_p(field.data) - np_array = get_nparray_from_void_p(void_p, 1, field.type) - return np_array[0] + assert src.dtype == np.float32 + dst = np.empty_like(src, dtype=np.uint16) + for i in range(len(dst)): + bytes = struct.pack(' BuilderConfig: @@ -114,6 +115,7 @@ class Builder(): ''' self.strongly_typed = strongly_typed + quant_mode = kwargs.get("quant_mode", QuantMode(0)) if not strongly_typed and precision not in self._ALLOWED_PRECISIONS: logger.error( f"precision should be one of {self._ALLOWED_PRECISIONS}") @@ -125,6 +127,8 @@ class Builder(): config = self.trt_builder.create_builder_config() if not strongly_typed: + fp8 = quant_mode.has_fp8_qdq() or quant_mode.has_fp8_kv_cache() + if precision == 'float16': config.set_flag(trt.BuilderFlag.FP16) config.set_flag(trt.BuilderFlag.OBEY_PRECISION_CONSTRAINTS) @@ -173,7 +177,6 @@ class Builder(): tensor_parallel=tensor_parallel, use_refit=use_refit, int8=int8, - fp8=fp8, **kwargs) def _add_optimization_profile(self, network: Network, diff --git a/tensorrt_llm/functional.py b/tensorrt_llm/functional.py index 4c6efcae2d..23dd159d43 100644 --- a/tensorrt_llm/functional.py +++ b/tensorrt_llm/functional.py @@ -1772,7 +1772,7 @@ def _lookup_plugin(input: Tensor, weight: Tensor, rank: int) -> Tensor: Parameters: input : Tensor - The input tensor the contains the indices to perform the lookup. + The input tensor contains the indices to perform the lookup. weight : Tensor The table to gather from. 
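The new `numpy_fp32_to_bf16` helper in `_utils.py` exists because NumPy has no native bfloat16 type, so values are stored through the abstract `np_bfloat16` view; the attention layer in the next diff uses it to build the rotary `embed_positions` constant when the model dtype is bfloat16. The snippet below is a rough, vectorized sketch of the same idea (truncating each float32 to its upper 16 bits, without rounding), not the library's element-wise `struct.pack` implementation.

```python
import numpy as np

# Mirrors the abstract 2-byte dtype defined in tensorrt_llm/_utils.py.
np_bfloat16 = np.dtype('V2', metadata={"dtype": "bfloat16"})

def fp32_to_bf16_truncate(src: np.ndarray) -> np.ndarray:
    """Reinterpret each float32 as uint32, keep the 16 high bits, view as bf16."""
    assert src.dtype == np.float32
    bits = np.ascontiguousarray(src).view(np.uint32)  # raw IEEE-754 bit patterns
    upper = (bits >> 16).astype(np.uint16)            # bf16 = top 16 bits of fp32
    return upper.view(np_bfloat16)
```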
diff --git a/tensorrt_llm/layers/attention.py b/tensorrt_llm/layers/attention.py index c2ac95b807..c7cddac874 100644 --- a/tensorrt_llm/layers/attention.py +++ b/tensorrt_llm/layers/attention.py @@ -19,9 +19,10 @@ import numpy as np import tensorrt as trt from .._common import default_net, precision +from .._utils import numpy_fp32_to_bf16 from ..functional import (AttentionMaskType, PositionEmbeddingType, RotaryScalingType, Tensor, bert_attention, cast, clip, - concat, constant, expand_dims, expand_mask, + concat, constant, embedding, expand_dims, expand_mask, generate_alibi_biases, generate_alibi_slopes, gpt_attention, matmul, repeat_interleave, round, shape, slice, softmax, split, view, where) @@ -48,7 +49,7 @@ class RopeEmbeddingUtils: dtype=np.float32) concat = np.concatenate((np.sin(sinusoid_inp), np.cos(sinusoid_inp)), axis=1) - return np.expand_dims(concat, axis=0) + return np.expand_dims(concat, axis=0).astype(np.float32) @staticmethod def rotate_every_two(tensor: Tensor) -> Tensor: @@ -140,6 +141,83 @@ class RopeEmbeddingUtils: raise ValueError('The PositionEmbeddingType is not RoPE') return (tensor * cos) + (rotate_func(tensor) * sin) + @staticmethod + def apply_rotary_pos_emb_chatglm( + qkv, + position_embedding, + num_attention_heads, + attention_head_size, + max_position_embeddings, + ) -> Tensor: + + half_head_size = attention_head_size // 2 + qkv_shape = shape(qkv) + qkv = qkv.view( + concat([ + shape(qkv, 0), + shape(qkv, 1), + num_attention_heads, + 3, + attention_head_size, + ])) + query, key, value = split(qkv, 1, dim=3) + q_shape = concat([ + shape(qkv, 0), + shape(qkv, 1), + num_attention_heads, + attention_head_size, + ]) + query = query.view(q_shape) + key = key.view(q_shape) + value = value.view(q_shape) + + embedding_weight = RopeEmbeddingUtils.create_sinusoidal_positions( + max_position_embeddings, half_head_size) + embedding_weight = np.split(embedding_weight.squeeze(0), 2, axis=1) + embedding_weight = np.concatenate( + [ + embedding_weight[0], + embedding_weight[0], + embedding_weight[1], + embedding_weight[1], + ], + axis=1, + ) + + embedding_weight = constant(embedding_weight) + position_embedding = embedding(position_embedding, embedding_weight) + position_embedding, block_embedding = split( + position_embedding, + 1, + dim=1, + ) + sin0, cos0 = split(position_embedding, half_head_size, dim=3) + sin1, cos1 = split(block_embedding, half_head_size, dim=3) + + new_shape = concat([ + shape(qkv, 0), + shape(qkv, 1), + 1, + half_head_size, + ]) + position_embedding = [ + tensor.view(new_shape) for tensor in [cos0, cos1, sin0, sin1] + ] + + query = RopeEmbeddingUtils.apply_rotary_pos_emb( + tensor=query, + position_embedding=position_embedding, + pos_emb_type=PositionEmbeddingType.chatglm) + key = RopeEmbeddingUtils.apply_rotary_pos_emb( + tensor=key, + position_embedding=position_embedding, + pos_emb_type=PositionEmbeddingType.chatglm) + + qkv = concat([query, key, value], dim=2) + qkv = qkv.view(qkv_shape) + + return qkv + class AttentionParams(object): @@ -381,6 +459,7 @@ class Attention(Module): encoder_output: Optional[Tensor] = None, workspace=None, position_embedding=None, + norm_before_bmm1=False, ): assert isinstance(hidden_states, Tensor) @@ -399,36 +478,15 @@ class Attention(Module): alibi_scale=alibi_scale) qkv = self.qkv(hidden_states) + if self.position_embedding_type == PositionEmbeddingType.chatglm: - qkv = qkv.view( - concat([ - shape(qkv, 0), - shape(qkv, 1), self.num_attention_heads, 3, - self.attention_head_size - ])) - query, key, value = 
split(qkv, 1, dim=3) - q_shape = concat([ - shape(qkv, 0), - shape(qkv, 1), self.num_attention_heads, - self.attention_head_size - ]) - query = query.view(q_shape) - key = key.view(q_shape) - value = value.view(q_shape) - - query = RopeEmbeddingUtils.apply_rotary_pos_emb( - query, - position_embedding=position_embedding, - pos_emb_type=PositionEmbeddingType.chatglm) - key = RopeEmbeddingUtils.apply_rotary_pos_emb( - key, - position_embedding=position_embedding, - pos_emb_type=PositionEmbeddingType.chatglm) - - qkv = concat([query, key, value], dim=2) - qkv = qkv.view( - concat([shape(qkv, 0), - shape(qkv, 1), self.hidden_size * 3])) + qkv = RopeEmbeddingUtils.apply_rotary_pos_emb_chatglm( + qkv, + position_embedding, + self.num_attention_heads, + self.attention_head_size, + self.max_position_embeddings, + ) paged_kv_cache = default_net().plugin_config.paged_kv_cache @@ -548,7 +606,12 @@ class Attention(Module): value = transpose_for_scores(value, is_kv=True) if self.rotary_enabled: - embed_positions = constant(self.embed_positions) + if self.dtype == trt.bfloat16: + embed_positions = numpy_fp32_to_bf16( + self.embed_positions.astype(np.float32)) + embed_positions = constant(embed_positions) + else: + embed_positions = constant(self.embed_positions) if self.rotary_embedding_dim is not None: # When shape(hidden_states, 1) > 1(Context phase), the embedding start from 0, @@ -726,10 +789,13 @@ class Attention(Module): key = key.permute([0, 1, 3, 2]) with precision('float32'): + if norm_before_bmm1: + # Apply norm on query earlier to prevent matmul fp16 overflow. + query /= self.norm_factor attention_scores = matmul(cast(query, 'float32'), cast(key, 'float32')) - - attention_scores = attention_scores / self.norm_factor + if not norm_before_bmm1: + attention_scores = attention_scores / self.norm_factor if self.attention_mask_type in [ AttentionMaskType.causal, diff --git a/tensorrt_llm/layers/embedding.py b/tensorrt_llm/layers/embedding.py index 8e5be06e7b..d742902bdf 100644 --- a/tensorrt_llm/layers/embedding.py +++ b/tensorrt_llm/layers/embedding.py @@ -74,8 +74,11 @@ class Embedding(Module): class PromptTuningEmbedding(Embedding): """ - Pass all tokens though both normal and prompt embedding tables. - Then, combine results based on whether the token was "normal" or "prompt/virtual". + PromptTuningEmbedding handles fine-tuned prompts with virtual tokens. At runtime, + a supplementary embedding dictionary is passed. Tokens whose ids are >= vocab_size are embedded + with that additional dictionary. + The prompt tuning dictionary holds multiple tasks, and each sequence is assigned a given task. + Prompt-tuned tokens from a given sequence use the adequate task dictionary, as defined by the `tasks` input. """ def __init__(self, @@ -100,6 +103,27 @@ class PromptTuningEmbedding(Embedding): tasks, task_vocab_size, workspace: Optional[Tensor] = None): + """ + Pass all tokens through both normal and prompt embedding tables. + Tokens are masked so that "normal" embedding only see "normal" tokens. Same logic for "prompt" embedding. + After those two embedding, combine results based on whether the token was "normal" or "prompt-tuned". 
+ + Parameters: + tokens : Tensor + the ids to embbed, size [batch_size, seq_len] + + prompt_embedding_table : Tensor + the additional embedding table for prompt-tuned tokens, size [num_tasks * num_tokens_per_task, hidden_size] + + tasks: Tensor + the task required by each token, size [batch_size, seq_len] + + task_vocab_size: Tensor + the number of tokens used for each task, should be equal to prompt_embedding_table's num_tokens_per_task, size [1] + + Returns: + Tokens' embedding + """ # do not use ">=" because internally the layer works with floating points prompt_tokens_mask = tokens > (self.vocab_size - 1) diff --git a/tensorrt_llm/logger.py b/tensorrt_llm/logger.py index 5fa4193ef9..7e8c2179e0 100644 --- a/tensorrt_llm/logger.py +++ b/tensorrt_llm/logger.py @@ -16,7 +16,8 @@ import logging import os import tensorrt as trt -from mpi4py import MPI + +from ._utils import mpi_rank, mpi_world_size try: from polygraphy.logger import G_LOGGER @@ -62,8 +63,8 @@ class Logger(metaclass=Singleton): self._polygraphy_logger.module_severity = severity_map[ min_severity][2] - self.mpi_rank = MPI.COMM_WORLD.Get_rank() - self.mpi_size = MPI.COMM_WORLD.Get_size() + self.mpi_rank = mpi_rank() + self.mpi_size = mpi_world_size() if invalid_severity: self.warning( f"Requested log level {environ_severity} is invalid. Using 'warning' instead" diff --git a/tensorrt_llm/models/__init__.py b/tensorrt_llm/models/__init__.py index 1460593b94..193e7a2c92 100755 --- a/tensorrt_llm/models/__init__.py +++ b/tensorrt_llm/models/__init__.py @@ -15,17 +15,15 @@ from .baichuan.model import BaichuanForCausalLM from .bert.model import BertForQuestionAnswering, BertModel from .bloom.model import BloomForCausalLM, BloomModel -from .chatglm2_6b.model import ChatGLM2_6BHeadModel, ChatGLM2_6BModel -from .chatglm6b.model import ChatGLM6BHeadModel, ChatGLM6BModel +from .chatglm.model import ChatGLMHeadModel, ChatGLMModel from .falcon.model import FalconForCausalLM, FalconModel from .gpt.model import GPTLMHeadModel, GPTModel from .gptj.model import GPTJForCausalLM, GPTJModel from .gptneox.model import GPTNeoXForCausalLM, GPTNeoXModel +from .internlm.model import InternLMForCausalLM, InternLMModel from .llama.model import LLaMAForCausalLM, LLaMAModel from .opt.model import OPTLMHeadModel, OPTModel -from .quantized.quant import (fp8_quantize, smooth_quantize, - weight_only_groupwise_quantize, - weight_only_quantize) +from .quantized.quant import quantize_model # noqa __all__ = [ 'BertModel', @@ -44,13 +42,10 @@ __all__ = [ 'GPTJForCausalLM', 'GPTNeoXModel', 'GPTNeoXForCausalLM', - 'smooth_quantize', - 'weight_only_quantize', - 'weight_only_groupwise_quantize', - 'fp8_quantize', - 'ChatGLM6BHeadModel', - 'ChatGLM6BModel', - 'ChatGLM2_6BHeadModel', - 'ChatGLM2_6BModel', + 'quantize_model', + 'ChatGLMHeadModel', + 'ChatGLMModel', 'BaichuanForCausalLM', + 'InternLMForCausalLM', + 'InternLMModel', ] diff --git a/tensorrt_llm/models/baichuan/model.py b/tensorrt_llm/models/baichuan/model.py index b1a6da7f91..a9cf262e2f 100644 --- a/tensorrt_llm/models/baichuan/model.py +++ b/tensorrt_llm/models/baichuan/model.py @@ -22,6 +22,7 @@ from ...layers import (Attention, AttentionMaskType, AttentionParams, RmsNorm) from ...mapping import Mapping from ...module import Module, ModuleList +from ...quantization import QuantMode from ..generation_mixin import GenerationMixin @@ -32,13 +33,28 @@ class BaichuanDecoderLayer(Module): num_attention_heads, max_position_embeddings, position_embedding_type, + num_kv_heads=None, dtype=None, + 
attention_mask_type=AttentionMaskType.causal, hidden_act='silu', mlp_hidden_size=None, tp_group=None, tp_size=1, - tp_rank=0): + tp_rank=0, + quant_mode=QuantMode(0)): super().__init__() + # used for quantizing model + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_kv_heads = num_kv_heads + self.max_position_embeddings = max_position_embeddings + self.dtype = dtype + self.hidden_act = hidden_act + self.tp_group = tp_group + self.tp_size = tp_size + self.mlp_hidden_size = mlp_hidden_size + self.attention_mask_type = attention_mask_type + self.position_embedding_type = position_embedding_type self.input_layernorm = RmsNorm(normalized_shape=hidden_size, dtype=dtype) @@ -46,23 +62,27 @@ class BaichuanDecoderLayer(Module): self.attention = Attention( hidden_size, num_attention_heads, + num_kv_heads=num_kv_heads, max_position_embeddings=max_position_embeddings, dtype=dtype, - attention_mask_type=AttentionMaskType.causal, + attention_mask_type=attention_mask_type, bias=False, position_embedding_type=position_embedding_type, tp_group=tp_group, tp_size=tp_size, - tp_rank=tp_rank) + tp_rank=tp_rank, + use_int8_kv_cache=quant_mode.has_int8_kv_cache(), + quant_mode=quant_mode) if not mlp_hidden_size: - mlp_hidden_size = hidden_size * 4 + self.mlp_hidden_size = hidden_size * 4 self.mlp = GatedMLP(hidden_size=hidden_size, - ffn_hidden_size=mlp_hidden_size, + ffn_hidden_size=self.mlp_hidden_size, hidden_act=hidden_act, dtype=dtype, bias=False, tp_group=tp_group, - tp_size=tp_size) + tp_size=tp_size, + quant_mode=quant_mode) self.post_layernorm = RmsNorm(normalized_shape=hidden_size, dtype=dtype) def forward(self, @@ -101,6 +121,7 @@ class BaichuanModel(Module): def __init__(self, num_layers, num_heads, + num_kv_heads, hidden_size, vocab_size, hidden_act, @@ -108,8 +129,10 @@ class BaichuanModel(Module): position_embedding_type, dtype, mlp_hidden_size=None, - mapping=Mapping()): + mapping=Mapping(), + quant_mode=QuantMode(0)): super().__init__() + self.mapping = mapping self.num_layers = num_layers self.vocab_embedding = Embedding(vocab_size, hidden_size, dtype=dtype) @@ -119,12 +142,14 @@ class BaichuanModel(Module): num_attention_heads=num_heads, max_position_embeddings=max_position_embeddings, position_embedding_type=position_embedding_type, + num_kv_heads=num_kv_heads, dtype=dtype, hidden_act=hidden_act, mlp_hidden_size=mlp_hidden_size, tp_group=mapping.tp_group, tp_size=mapping.tp_size, - tp_rank=mapping.tp_rank) for _ in range(num_layers) + tp_rank=mapping.tp_rank, + quant_mode=quant_mode) for _ in range(num_layers) ]) self.ln_f = RmsNorm(normalized_shape=hidden_size, dtype=dtype) @@ -176,6 +201,7 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): def __init__(self, num_layers, num_heads, + num_kv_heads, hidden_size, vocab_size, hidden_act, @@ -183,22 +209,35 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): position_embedding_type, dtype, mlp_hidden_size=None, - mapping=Mapping()): + mapping=Mapping(), + quant_mode=QuantMode(0)): if isinstance(dtype, str): - self._kv_dtype = str_dtype_to_trt(dtype) + self.dtype = str_dtype_to_trt(dtype) else: assert isinstance(dtype, trt.DataType) - self._kv_dtype = dtype - self._num_layers = num_layers + self.dtype = dtype + + self.num_layers = num_layers self.num_heads = num_heads - self.num_kv_heads = num_heads + if num_kv_heads is None or num_kv_heads <= 0: + num_kv_heads = num_heads + self.num_kv_heads = num_kv_heads self.hidden_size = hidden_size self.vocab_size = vocab_size self.tp_size = 
mapping.tp_size - super().__init__(num_layers, num_heads, hidden_size, vocab_size, - hidden_act, max_position_embeddings, + + self.kv_dtype = self.dtype + if quant_mode.has_int8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('int8') + elif quant_mode.has_fp8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('fp8') + + self.quant_mode = quant_mode + + super().__init__(num_layers, num_heads, num_kv_heads, hidden_size, + vocab_size, hidden_act, max_position_embeddings, position_embedding_type, dtype, mlp_hidden_size, - mapping) + mapping, quant_mode) vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) self.lm_head = ColumnLinear(hidden_size, vocab_size_padded, @@ -229,11 +268,11 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): # [batch_size, hidden_size] -> [batch_size, vocab_size] lm_logits = self.lm_head(hidden_states) - lm_logits.mark_output('logits', self._kv_dtype) + lm_logits.mark_output('logits', self.dtype) if use_cache and default_net().plugin_config.paged_kv_cache == False: for i, present in enumerate(presents): - present.mark_output(f'present_key_value_{i}', self._kv_dtype) + present.mark_output(f'present_key_value_{i}', self.kv_dtype) return (lm_logits, presents) return lm_logits @@ -253,8 +292,6 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): # Prepare inputs head_size = self.hidden_size // self.num_heads - num_heads_kv = (self.num_kv_heads + self.tp_size - 1) // self.tp_size - remove_input_padding = default_net().plugin_config.remove_input_padding use_gpt_attention_plugin = default_net( ).plugin_config.gpt_attention_plugin @@ -267,15 +304,18 @@ class BaichuanForCausalLM(BaichuanModel, GenerationMixin): max_beam_width, max_input_len, max_new_tokens, - num_heads_kv, + self.num_kv_heads, head_size, - self._num_layers, - self._kv_dtype, + self.num_layers, + self.kv_dtype, remove_input_padding=remove_input_padding, use_gpt_attention_plugin=use_gpt_attention_plugin, use_gemm_plugin=use_gemm_plugin, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, + dtype=self.dtype, + num_heads=self.num_heads, + mapping=self.mapping, max_num_tokens=max_num_tokens) return (model_inputs['input_ids'], model_inputs['position_ids'], True, diff --git a/tensorrt_llm/models/chatglm2_6b/__init__.py b/tensorrt_llm/models/chatglm/__init__.py similarity index 100% rename from tensorrt_llm/models/chatglm2_6b/__init__.py rename to tensorrt_llm/models/chatglm/__init__.py diff --git a/tensorrt_llm/models/chatglm2_6b/model.py b/tensorrt_llm/models/chatglm/model.py similarity index 63% rename from tensorrt_llm/models/chatglm2_6b/model.py rename to tensorrt_llm/models/chatglm/model.py index b13d064a1f..5c0d18fd78 100644 --- a/tensorrt_llm/models/chatglm2_6b/model.py +++ b/tensorrt_llm/models/chatglm/model.py @@ -27,39 +27,57 @@ from ...module import Module, ModuleList from ..generation_mixin import GenerationMixin -class ChatGLM2_6BDecoderLayer(Module): +class ChatGLMDecoderLayer(Module): - def __init__(self, args): + def __init__(self, layer_id, args): super().__init__() - self.apply_residual_connection_post_layernorm = args.apply_residual_connection_post_layernorm - self.norm = RmsNorm if args.rmsnorm else LayerNorm + self.model_version = args.model_version self.use_cache = args.use_cache - self.input_layernorm = self.norm( + if self.model_version == "1": + self.alpha = (2 * args.num_layers)**0.5 + self.norm = LayerNorm + else: + self.apply_residual_connection_post_layernorm = args.apply_residual_connection_post_layernorm + self.norm = RmsNorm if args.rmsnorm else 
LayerNorm + + self.pre_norm = self.norm( normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, + eps=args.norm_epsilon, + elementwise_affine=True, dtype=args.dtype, ) - self.self_attention = Attention( + self.attention = Attention( hidden_size=args.hidden_size, num_attention_heads=args.num_heads, num_kv_heads=args.num_kv_heads, max_position_embeddings=args.max_seq_length, num_layers=args.num_layers, apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - attention_mask_type=AttentionMaskType.causal, + attention_mask_type=AttentionMaskType.bidirectional + if args.model_version == "1" else AttentionMaskType.causal, bias=args.qkv_bias, dtype=args.dtype, - position_embedding_type=PositionEmbeddingType.rope_gptj, + position_embedding_type=PositionEmbeddingType.chatglm + if args.model_version == "1" else PositionEmbeddingType.rope_gptj, + rotary_embedding_base=10000.0, + rotary_embedding_scaling=None, use_int8_kv_cache=args.quant_mode.has_int8_kv_cache(), + rotary_embedding_percentage=0.5, tp_group=args.mapping.tp_group, tp_size=args.mapping.tp_size, + tp_rank=args.mapping.rank, multi_block_mode=args.multi_block_mode, quant_mode=args.quant_mode, - rotary_embedding_percentage=0.5, + q_scaling=1.0, + cross_attention=False, + relative_attention=False, + max_distance=0, + num_buckets=0, + instance_id=layer_id * 2, dense_bias=args.linear_bias, ) @@ -67,76 +85,114 @@ class ChatGLM2_6BDecoderLayer(Module): hidden_size=args.hidden_size, ffn_hidden_size=args.ffn_hidden_size, hidden_act=args.hidden_act, - dtype=args.dtype, bias=args.linear_bias, + dtype=args.dtype, tp_group=args.mapping.tp_group, tp_size=args.mapping.tp_size, + quant_mode=args.quant_mode, + instance_id=layer_id * 2 + 1, ) - self.post_layernorm = self.norm( + self.post_norm = self.norm( normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, + eps=args.norm_epsilon, + elementwise_affine=True, dtype=args.dtype, ) def forward( self, - hidden_states: Tensor = None, + hidden_states: Tensor, + position_ids: Tensor = None, # only used in ChatGLM-6B kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, ): - layernorm_output = self.input_layernorm(hidden_states) + norm_output = self.pre_norm(hidden_states) - attention_output = self.self_attention( - hidden_states=layernorm_output, + attention_output = self.attention( + hidden_states=norm_output, + attention_mask=None, use_cache=self.use_cache, kv_cache_params=kv_cache_params, attention_params=attention_params, + encoder_output=None, + workspace=None, + position_embedding=position_ids, ) if self.use_cache: attention_output, presents = attention_output - residual = layernorm_output if self.apply_residual_connection_post_layernorm else hidden_states + if self.model_version == "1": + residual = norm_output - layernorm_input = residual + attention_output + norm_input = residual * self.alpha + attention_output - layernorm_output = self.post_layernorm(layernorm_input) + norm_output = self.post_norm(norm_input) - mlp_output = self.mlp(layernorm_output) + mlp_output = self.mlp(norm_output) - residual = layernorm_output if self.apply_residual_connection_post_layernorm else layernorm_input + residual = norm_output - output = residual + mlp_output + output = residual * self.alpha + mlp_output + + else: + residual = norm_output if self.apply_residual_connection_post_layernorm else hidden_states + + norm_input = residual + attention_output + + norm_output = self.post_norm(norm_input) + + mlp_output = self.mlp(norm_output) + + residual = 
norm_output if self.apply_residual_connection_post_layernorm else norm_input + + output = residual + mlp_output return (output, presents) if self.use_cache else output -class ChatGLM2_6BTransformer(Module): +class ChatGLMModel(Module): def __init__(self, args): super().__init__() + self.norm = LayerNorm if args.model_version == "1" else RmsNorm self.use_cache = args.use_cache - self.layers = ModuleList( - ChatGLM2_6BDecoderLayer(args) for _ in range(args.num_layers)) + self.embedding = Embedding( + num_embeddings=args.vocab_size, + embedding_dim=args.hidden_size, + dtype=args.dtype, + tp_size=1, #args.mapping.tp_size, + tp_group=None, #args.mapping.tp_group, + sharding_dim=0, + tp_rank=0, #args.mapping.rank, + instance_id=args.num_layers * 2, + ) - self.final_layernorm = RmsNorm( + self.layers = ModuleList( + ChatGLMDecoderLayer(i, args) for i in range(args.num_layers)) + + self.final_norm = self.norm( normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, + eps=args.norm_epsilon, + elementwise_affine=True, dtype=args.dtype, ) def forward( self, - hidden_states, + input_ids: Tensor = None, + position_ids: Tensor = None, # only used in ChatGLM-6B kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, ): + hidden_states = self.embedding(input_ids) + if self.use_cache: presents = [] @@ -145,6 +201,7 @@ class ChatGLM2_6BTransformer(Module): kv_cache_params.kv_cache_block_pointers): layer_output = layer( hidden_states, + position_ids, kv_cache_params=KeyValueCacheParams( past_key_value=[past_key_value], kv_cache_block_pointers=[kv_cache_block_pointers], @@ -159,44 +216,13 @@ class ChatGLM2_6BTransformer(Module): hidden_states = layer_output[0] presents.append(layer_output[1]) - hidden_states = self.final_layernorm(hidden_states) + hidden_states = self.final_norm(hidden_states) return (hidden_states, tuple(presents)) if self.use_cache else hidden_states -class ChatGLM2_6BModel(Module): - - def __init__(self, args): - - super().__init__() - - self.embedding = Embedding( - num_embeddings=args.vocab_size, - embedding_dim=args.hidden_size, - dtype=args.dtype, - ) - - self.encoder = ChatGLM2_6BTransformer(args) - - def forward( - self, - input_ids: Tensor = None, - kv_cache_params: bool = None, - attention_params: bool = None, - ): - - inputs_embeds = self.embedding(input_ids) - - hidden_states, presents = self.encoder( - inputs_embeds, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - ) - return hidden_states, presents - - -class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): +class ChatGLMHeadModel(ChatGLMModel, GenerationMixin): def __init__(self, **args): @@ -204,17 +230,27 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): argNamespace = argparse.Namespace() for key, value in args.items(): argNamespace.__setattr__(key, value) + assert "model_version" in args.keys(), "model_version not set" # Other default values - argNamespace.apply_residual_connection_post_layernorm = False - argNamespace.ffn_hidden_size = 13696 - argNamespace.kv_channels = 128 - argNamespace.layernorm_epsilon = 1.0e-5 - argNamespace.linear_bias = False argNamespace.multi_block_mode = False - argNamespace.num_kv_heads = 2 - argNamespace.qkv_bias = True - argNamespace.rmsnorm = True + argNamespace.norm_epsilon = 1.0e-5 + argNamespace.tokens_per_block = 64 argNamespace.use_cache = True + if argNamespace.model_version == "1": + argNamespace.ffn_hidden_size = 16384 + argNamespace.linear_bias = True + argNamespace.max_seq_length = min( + 2048, 
argNamespace.max_position_embeddings) + argNamespace.num_kv_heads = 32 + argNamespace.qkv_bias = True + else: + argNamespace.apply_residual_connection_post_layernorm = False + argNamespace.ffn_hidden_size = 13696 + argNamespace.linear_bias = False + argNamespace.num_kv_heads = 2 + argNamespace.qkv_bias = True + argNamespace.rmsnorm = True + args = argNamespace else: args = args["args"] @@ -238,26 +274,29 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): self.kv_dtype = str_dtype_to_trt('fp8') self.hidden_size = args.hidden_size + self.mapping = args.mapping + self.max_num_tokens = args.max_output_len + args.max_input_len + self.model_version = args.model_version self.num_heads = args.num_heads self.num_kv_heads = args.num_kv_heads self.num_layers = args.num_layers - self.tp_size = args.mapping.tp_size + self.tokens_per_block = args.tokens_per_block self.use_cache = args.use_cache self.lm_head = ColumnLinear( in_features=self.hidden_size, - out_features=pad_vocab_size(args.vocab_size, self.tp_size), + out_features=pad_vocab_size(args.vocab_size, self.mapping.tp_size), bias=False, dtype=self.dtype, - tp_group=args.mapping.tp_group, - tp_size=self.tp_size, + tp_group=self.mapping.tp_group, + tp_size=self.mapping.tp_size, gather_output=True, ) def forward( self, input_ids: Tensor = None, - position_ids: Tensor = None, + position_ids: Tensor = None, # only used in ChatGLM-6B last_token_ids: Tensor = None, kv_cache_params: KeyValueCacheParams = None, attention_params: AttentionParams = None, @@ -265,6 +304,7 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): hidden_states = super().forward( input_ids, + position_ids, kv_cache_params, attention_params, ) @@ -306,7 +346,7 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): max_beam_width=max_beam_width, max_input_len=max_input_len, max_new_tokens=max_new_tokens, - num_kv_heads=self.num_kv_heads // self.tp_size, + num_kv_heads=self.num_kv_heads // self.mapping.tp_size, head_size=self.hidden_size // self.num_heads, num_layers=self.num_layers, kv_dtype=self.kv_dtype, @@ -315,6 +355,16 @@ class ChatGLM2_6BHeadModel(ChatGLM2_6BModel, GenerationMixin): use_gpt_attention_plugin=default_net().plugin_config. gpt_attention_plugin, use_gemm_plugin=default_net().plugin_config.gemm_plugin, + use_custom_all_reduce=False, + paged_kv_cache=default_net().plugin_config.paged_kv_cache, + tokens_per_block=self.tokens_per_block, + gather_all_token_logits=False, + dtype=self.kv_dtype, + num_heads=self.num_heads, + mapping=self.mapping, + max_num_tokens=self.max_num_tokens, + prompt_embedding_table_size=0, + is_chatglm6b=(self.model_version == "1"), ) return (model_inputs['input_ids'], model_inputs['position_ids'], diff --git a/tensorrt_llm/models/chatglm6b/model.py b/tensorrt_llm/models/chatglm6b/model.py deleted file mode 100644 index 4f66e4e43a..0000000000 --- a/tensorrt_llm/models/chatglm6b/model.py +++ /dev/null @@ -1,370 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -# SPDX-License-Identifier: Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. -import argparse - -import numpy as np -import tensorrt as trt - -from ..._common import default_net -from ..._utils import (pad_vocab_size, str_dtype_to_np, str_dtype_to_trt, - trt_dtype_to_np) -from ...functional import (PositionEmbeddingType, Tensor, concat, - gather_last_token_logits, shape) -from ...layers import (MLP, Attention, AttentionMaskType, AttentionParams, - ColumnLinear, Embedding, KeyValueCacheParams, LayerNorm) -from ...module import Module, ModuleList -from ..generation_mixin import GenerationMixin - - -class ChatGLM6BDecoderLayer(Module): - - def __init__(self, args): - - super().__init__() - - self.use_cache = args.use_cache - - self.input_layernorm = LayerNorm( - normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, - dtype=args.dtype, - ) - - self.attention = Attention( - hidden_size=args.hidden_size, - num_attention_heads=args.num_heads, - num_kv_heads=args.num_heads, - max_position_embeddings=args.max_seq_length, - num_layers=args.num_layers, - apply_query_key_layer_scaling=args.apply_query_key_layer_scaling, - attention_mask_type=AttentionMaskType.bidirectional, - bias=args.bias, - dtype=args.dtype, - position_embedding_type=PositionEmbeddingType.chatglm, - use_int8_kv_cache=args.quant_mode.has_int8_kv_cache(), - tp_group=args.mapping.tp_group, - tp_size=args.mapping.tp_size, - multi_block_mode=args.multi_block_mode, - quant_mode=args.quant_mode, - ) - - self.mlp = MLP( - hidden_size=args.hidden_size, - ffn_hidden_size=args.ffn_hidden_size, - hidden_act=args.hidden_act, - dtype=args.dtype, - bias=args.bias, - tp_group=args.mapping.tp_group, - tp_size=args.mapping.tp_size, - ) - - self.post_layernorm = LayerNorm( - normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, - dtype=args.dtype, - ) - - def forward( - self, - hidden_states: Tensor, - position_embedding: Tensor, - kv_cache_params: KeyValueCacheParams = None, - attention_params: AttentionParams = None, - ): - - layernorm_output = self.input_layernorm(hidden_states) - - attention_output = self.attention( - hidden_states=layernorm_output, - attention_mask=None, - use_cache=self.use_cache, - kv_cache_params=kv_cache_params, - attention_params=attention_params, - encoder_output=None, - workspace=None, - position_embedding=position_embedding, - ) - - if self.use_cache: - attention_output, presents = attention_output - - layernorm_input = layernorm_output * 7.484375 + attention_output - - layernorm_output = self.post_layernorm(layernorm_input) - - mlp_output = self.mlp(layernorm_output) - - output = layernorm_output * 7.484375 + mlp_output - - return (output, presents) if self.use_cache else output - - -class ChatGLM6BModel(Module): - - def __init__(self, args): - - super().__init__() - - self.use_cache = args.use_cache - self.half_head_size = args.hidden_size // args.num_heads // 2 - - self.embedding = Embedding( - num_embeddings=args.vocab_size, - embedding_dim=args.hidden_size, - dtype=args.dtype, - ) - - # pre-compute weight of position embedding manually - if isinstance(args.dtype, trt.DataType): - np_dtype = trt_dtype_to_np(args.dtype) - else: - np_dtype = str_dtype_to_np(args.dtype) - - inv_freq = 10**(-1 / 16 * - np.arange(0, 64, 2, dtype=np.float32)).reshape(1, 32) - valueTable = np.matmul( - np.arange(args.max_seq_length, dtype=np.float32).reshape(-1, 1), - np.tile(inv_freq, [1, 2]), - ).reshape(args.max_seq_length, 64) - - self.position_embedding_cos = Embedding( - 
num_embeddings=args.max_seq_length, - embedding_dim=self.half_head_size, - dtype=args.dtype, - ) - self.position_embedding_sin = Embedding( - num_embeddings=args.max_seq_length, - embedding_dim=self.half_head_size, - dtype=args.dtype, - ) - - self.position_embedding_cos.weight.value = np.cos(valueTable).astype( - np_dtype) - self.position_embedding_sin.weight.value = np.sin(valueTable).astype( - np_dtype) - - self.layers = ModuleList( - ChatGLM6BDecoderLayer(args) for _ in range(args.num_layers)) - - self.final_layernorm = LayerNorm( - normalized_shape=args.hidden_size, - eps=args.layernorm_epsilon, - dtype=args.dtype, - ) - - def forward( - self, - input_ids: Tensor = None, - position_ids: Tensor = None, - kv_cache_params: KeyValueCacheParams = None, - attention_params: AttentionParams = None, - ): - - batch_size = shape(input_ids, 0) - input_len = shape(input_ids, 1) - - hidden_states = self.embedding(input_ids) - - position_embedding_cos = self.position_embedding_cos(position_ids) - position_embedding_sin = self.position_embedding_sin(position_ids) - - position_embedding_cos0, position_embedding_cos1 = position_embedding_cos.split( - 1, dim=1) - position_embedding_sin0, position_embedding_sin1 = position_embedding_sin.split( - 1, dim=1) - - position_embedding_cos0 = position_embedding_cos0.view( - concat([batch_size, input_len, 1, self.half_head_size])) - position_embedding_cos1 = position_embedding_cos1.view( - concat([batch_size, input_len, 1, self.half_head_size])) - position_embedding_sin0 = position_embedding_sin0.view( - concat([batch_size, input_len, 1, self.half_head_size])) - position_embedding_sin1 = position_embedding_sin1.view( - concat([batch_size, input_len, 1, self.half_head_size])) - - position_embedding = [ - position_embedding_cos0, position_embedding_cos1, - position_embedding_sin0, position_embedding_sin1 - ] - - if kv_cache_params.past_key_value is None: - kv_cache_params.past_key_value = tuple([None] * len(self.layers)) - - if self.use_cache: - presents = [] - - for layer, past_key_value, kv_cache_block_pointers in zip( - self.layers, kv_cache_params.past_key_value, - kv_cache_params.kv_cache_block_pointers): - layer_output = layer( - hidden_states, - position_embedding, - kv_cache_params=KeyValueCacheParams( - past_key_value=[past_key_value], - kv_cache_block_pointers=[kv_cache_block_pointers], - host_past_key_value_lengths=kv_cache_params. 
- host_past_key_value_lengths, - cache_indirection=kv_cache_params.cache_indirection, - ), - attention_params=attention_params, - ) - - if self.use_cache: - hidden_states = layer_output[0] - presents.append(layer_output[1]) - - hidden_states = self.final_layernorm(hidden_states) - - return (hidden_states, - tuple(presents)) if self.use_cache else hidden_states - - -class ChatGLM6BHeadModel(ChatGLM6BModel, GenerationMixin): - - def __init__(self, **args): - - if "args" not in args.keys(): - argNamespace = argparse.Namespace() - for key, value in args.items(): - argNamespace.__setattr__(key, value) - # Other default values - argNamespace.bias = True - argNamespace.ffn_hidden_size = 16384 - argNamespace.layernorm_epsilon = 1.0e-5 - argNamespace.max_seq_length = argNamespace.max_position_embeddings - argNamespace.multi_block_mode = False - argNamespace.num_kv_heads = 32 - argNamespace.use_cache = True - args = argNamespace - else: - args = args["args"] - - self.init(args) - - def init(self, args): - - super().__init__(args) - - if isinstance(args.dtype, str): - self.kv_dtype = str_dtype_to_trt(args.dtype) - else: - assert isinstance(args.dtype, trt.DataType) - self.kv_dtype = args.dtype - self.dtype = self.kv_dtype - - if args.quant_mode.has_int8_kv_cache(): - self.kv_dtype = str_dtype_to_trt('int8') - elif args.quant_mode.has_fp8_kv_cache(): - self.kv_dtype = str_dtype_to_trt('fp8') - - self.hidden_size = args.hidden_size - self.num_heads = args.num_heads - self.num_kv_heads = args.num_kv_heads - self.num_layers = args.num_layers - self.tp_size = args.mapping.tp_size - self.use_cache = args.use_cache - - self.lm_head = ColumnLinear( - in_features=self.hidden_size, - out_features=pad_vocab_size(args.vocab_size, self.tp_size), - bias=False, - dtype=self.dtype, - tp_group=args.mapping.tp_group, - tp_size=self.tp_size, - gather_output=True, - ) - - def forward( - self, - input_ids: Tensor = None, - position_ids: Tensor = None, - last_token_ids: Tensor = None, - kv_cache_params: KeyValueCacheParams = None, - attention_params: AttentionParams = None, - ): - - hidden_states = super().forward( - input_ids, - position_ids, - kv_cache_params, - attention_params, - ) - - if self.use_cache: - hidden_states, presents = hidden_states - - hidden_states = gather_last_token_logits( - hidden_states, last_token_ids, - default_net().plugin_config.remove_input_padding) - - lm_logits = self.lm_head(hidden_states) - lm_logits.mark_output('logits', self.dtype) - - if self.use_cache and default_net( - ).plugin_config.paged_kv_cache == False: - for i, present in enumerate(presents): - present.mark_output(f'present_key_value_{i}', self.kv_dtype) - return (lm_logits, presents) - - return lm_logits - - def prepare_inputs( - self, - max_batch_size: int = 0, - max_input_len: int = 0, - max_new_tokens: int = 0, - use_cache: bool = True, - max_beam_width: int = 1, - ): - '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the - ranges of the dimensions of when using TRT dynamic shapes. 
- - @return: a list contains values which can be fed into the self.forward() - ''' - - model_inputs = self.prepare_basic_inputs( - max_batch_size=max_batch_size, - max_beam_width=max_beam_width, - max_input_len=max_input_len, - max_new_tokens=max_new_tokens, - num_kv_heads=self.num_kv_heads // self.tp_size, - head_size=self.hidden_size // self.num_heads, - num_layers=self.num_layers, - kv_dtype=self.kv_dtype, - remove_input_padding=default_net( - ).plugin_config.remove_input_padding, - use_gpt_attention_plugin=default_net().plugin_config. - gpt_attention_plugin, - use_gemm_plugin=default_net().plugin_config.gemm_plugin, - is_chatglm6b=True, - ) - - return (model_inputs['input_ids'], model_inputs['position_ids'], - model_inputs['last_token_ids'], - KeyValueCacheParams( - past_key_value=model_inputs['past_key_value'], - host_past_key_value_lengths=model_inputs[ - 'host_past_key_value_lengths'], - kv_cache_block_pointers=model_inputs[ - 'kv_cache_block_pointers_list'], - cache_indirection=model_inputs['cache_indirection'], - ), - AttentionParams( - sequence_length=model_inputs['sequence_length'], - context_lengths=model_inputs['context_lengths'], - host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types'], - )) diff --git a/tensorrt_llm/models/generation_mixin.py b/tensorrt_llm/models/generation_mixin.py index 8a2cac99a7..61be1f3b68 100644 --- a/tensorrt_llm/models/generation_mixin.py +++ b/tensorrt_llm/models/generation_mixin.py @@ -22,7 +22,6 @@ from ..mapping import Mapping class GenerationMixin: - _use_prompt_tuning = False def get_transformer_layers(self, mapping, num_layers): layers_per_pipeline_stage = num_layers // mapping.pp_size @@ -51,7 +50,7 @@ class GenerationMixin: num_heads=None, mapping=Mapping(), max_num_tokens=None, - prompt_embedding_table_size=None, + prompt_embedding_table_size: int = 0, is_chatglm6b=False): max_len = max_input_len + max_new_tokens @@ -135,15 +134,30 @@ class GenerationMixin: [1, 1] if enable_two_optimization_profiles else [1]), ('num_tokens', num_tokens_range), ])) - position_ids = Tensor( - name='position_ids', - dtype=trt.int32, - shape=[1, -1], - dim_range=OrderedDict([ - ('batch_size_fake', - [1, 1] if enable_two_optimization_profiles else [1]), - ('num_tokens', num_tokens_range), - ])) + if is_chatglm6b: + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[1, 2, -1], + dim_range=OrderedDict([ + ('batch_size_fake', [1, 1] + if enable_two_optimization_profiles else [1]), + ('2', [2, 2] + if enable_two_optimization_profiles else [2]), + ('num_tokens', num_tokens_range), + ]), + ) + else: + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[1, -1], + dim_range=OrderedDict([ + ('batch_size_fake', [1, 1] + if enable_two_optimization_profiles else [1]), + ('num_tokens', num_tokens_range), + ]), + ) else: assert dtype is not None assert num_heads is not None @@ -180,16 +194,18 @@ class GenerationMixin: ('2', [2, 2] if enable_two_optimization_profiles else [2]), ('input_len', inlen_range), - ])) + ]), + ) else: - position_ids = Tensor(name='position_ids', - dtype=trt.int32, - shape=[-1, -1], - dim_range=OrderedDict([ - ('batch_size_beam_width', - bb_range), - ('input_len', inlen_range), - ])) + position_ids = Tensor( + name='position_ids', + dtype=trt.int32, + shape=[-1, -1], + dim_range=OrderedDict([ + ('batch_size_beam_width', bb_range), + ('input_len', inlen_range), + ]), + ) else: assert dtype is not None assert 
num_heads is not None @@ -389,9 +405,8 @@ class GenerationMixin: prompt_embedding_table = None tasks = None prompt_vocab_size = None - if self._use_prompt_tuning: + if prompt_embedding_table_size > 0: hidden_size = num_heads * head_size - assert prompt_embedding_table_size is not None, "prompt_embedding_table_size cannot be None when self._use_prompt_tuning is True" _p_embedding_range = [ 1, prompt_embedding_table_size // 2, prompt_embedding_table_size ] diff --git a/tensorrt_llm/models/gpt/model.py b/tensorrt_llm/models/gpt/model.py index 0135bc8eea..98580565a2 100644 --- a/tensorrt_llm/models/gpt/model.py +++ b/tensorrt_llm/models/gpt/model.py @@ -376,7 +376,6 @@ class GPTLMHeadModel(GPTModel, GenerationMixin): self._vocab_size = vocab_size self._tp_size = mapping.tp_size self._multi_query_mode = multi_query_mode - self._use_prompt_tuning = use_prompt_tuning super().__init__(num_layers, num_heads, hidden_size, vocab_size, hidden_act, max_position_embeddings, dtype, mapping, @@ -444,7 +443,7 @@ class GPTLMHeadModel(GPTModel, GenerationMixin): use_cache, max_beam_width: int = 1, max_num_tokens: int = None, - prompt_embedding_table_size: int = 128, + prompt_embedding_table_size: int = 0, gather_all_token_logits: bool = False): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. diff --git a/tensorrt_llm/models/gptj/model.py b/tensorrt_llm/models/gptj/model.py index 0f1d6a7e63..4fdeee9db6 100644 --- a/tensorrt_llm/models/gptj/model.py +++ b/tensorrt_llm/models/gptj/model.py @@ -295,7 +295,8 @@ class GPTJForCausalLM(GPTJModel, GenerationMixin): use_custom_all_reduce=use_custom_all_reduce, paged_kv_cache=paged_kv_cache, tokens_per_block=tokens_per_block, - mapping=self.mapping) + mapping=self.mapping, + max_num_tokens=max_num_tokens) return (model_inputs['input_ids'], model_inputs['position_ids'], True, model_inputs['last_token_ids'], diff --git a/tensorrt_llm/models/gptneox/model.py b/tensorrt_llm/models/gptneox/model.py index 47e985f7f9..0202cc59dc 100644 --- a/tensorrt_llm/models/gptneox/model.py +++ b/tensorrt_llm/models/gptneox/model.py @@ -17,108 +17,14 @@ import tensorrt as trt from ..._common import default_net from ..._utils import pad_vocab_size, str_dtype_to_trt from ...functional import (PositionEmbeddingType, Tensor, - gather_last_token_logits, gpt_attention) -from ...layers import (MLP, AttentionMaskType, AttentionParams, ColumnLinear, - Embedding, KeyValueCacheParams, LayerNorm, RowLinear) + gather_last_token_logits) +from ...layers import (MLP, Attention, AttentionMaskType, AttentionParams, + ColumnLinear, Embedding, KeyValueCacheParams, LayerNorm) from ...mapping import Mapping from ...module import Module, ModuleList -from ...parameter import Parameter -from ...quantization import QuantMode from ..generation_mixin import GenerationMixin -class GPTNeoXAttention(Module): - - def __init__(self, - hidden_size, - num_attention_heads, - rotary_dim, - max_position_embeddings, - dtype=None, - multi_block_mode=False, - position_embedding_type=PositionEmbeddingType.rope_gpt_neox, - quant_mode=QuantMode(0), - tp_group=None, - tp_size=1): - super().__init__() - self.attention_head_size = hidden_size // num_attention_heads - self.num_attention_heads = num_attention_heads // tp_size - self.max_position_embeddings = max_position_embeddings - self.rotary_dim = rotary_dim - self.position_embedding_type = position_embedding_type - self.multi_block_mode = multi_block_mode - self.multi_query_mode = 
False - self.quant_mode = quant_mode - - if self.quant_mode.has_int8_kv_cache(): - self.kv_quantization_scale = Parameter(shape=(1, ), dtype='float32') - self.kv_dequantization_scale = Parameter(shape=(1, ), - dtype='float32') - else: - self.register_parameter('kv_quantization_scale', None) - self.register_parameter('kv_dequantization_scale', None) - - self.qkv = ColumnLinear(in_features=hidden_size, - out_features=hidden_size * 3, - bias=True, - tp_group=tp_group, - tp_size=tp_size, - gather_output=False, - dtype=dtype) - self.dense = RowLinear(in_features=hidden_size, - out_features=hidden_size, - bias=True, - dtype=dtype, - tp_group=tp_group, - tp_size=tp_size) - - def forward(self, - hidden_states: Tensor, - attention_mask=None, - use_cache=False, - kv_cache_params=None, - attention_params=None): - if not default_net().plugin_config.gpt_attention_plugin: - raise ValueError( - 'GPT-NeoX RoPE is only supported with GPTAttention plugin') - qkv = self.qkv(hidden_states) - - assert attention_params.is_valid( - default_net().plugin_config.gpt_attention_plugin, - default_net().plugin_config.remove_input_padding) - assert kv_cache_params.is_valid( - default_net().plugin_config.gpt_attention_plugin) - - context, past_key_value = gpt_attention( - tensor=qkv, - past_key_value=kv_cache_params.get_first_past_key_value(), - sequence_length=attention_params.sequence_length, - host_past_key_value_lengths=kv_cache_params. - host_past_key_value_lengths, - context_lengths=attention_params.context_lengths, - cache_indirection=kv_cache_params.cache_indirection, - host_request_types=attention_params.host_request_types, - num_heads=self.num_attention_heads, - num_kv_heads=self.num_attention_heads, - hidden_size_per_head=self.attention_head_size, - q_scaling=1.0, - rotary_embedding_dim=self.rotary_dim, - position_embedding_type=self.position_embedding_type, - multi_block_mode=self.multi_block_mode, - kv_orig_quant_scale=self.kv_quantization_scale, - kv_quant_orig_scale=self.kv_dequantization_scale, - kv_cache_quant_mode=self.quant_mode, - max_context_length=attention_params.max_context_length, - host_context_lengths=attention_params.host_context_lengths) - - context = self.dense(context) - - if use_cache: - return (context, past_key_value) - - return context - - class GPTNeoXDecoderLayer(Module): def __init__(self, @@ -141,13 +47,16 @@ class GPTNeoXDecoderLayer(Module): self.post_attention_layernorm = LayerNorm(normalized_shape=hidden_size, dtype=dtype) - self.attention = GPTNeoXAttention( + self.attention = Attention( hidden_size=hidden_size, num_attention_heads=num_attention_heads, - rotary_dim=rotary_dim, + rotary_embedding_percentage=rotary_dim / + (hidden_size // num_attention_heads), + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, max_position_embeddings=max_position_embeddings, dtype=dtype, - position_embedding_type=position_embedding_type, + attention_mask_type=AttentionMaskType.causal, + bias=True, tp_group=tp_group, tp_size=tp_size) @@ -179,7 +88,8 @@ class GPTNeoXDecoderLayer(Module): attention_mask=attention_mask, use_cache=use_cache, kv_cache_params=kv_cache_params, - attention_params=attention_params) + attention_params=attention_params, + norm_before_bmm1=True) if use_cache: attention_output, presents = attention_output diff --git a/tensorrt_llm/models/chatglm6b/__init__.py b/tensorrt_llm/models/internlm/__init__.py similarity index 100% rename from tensorrt_llm/models/chatglm6b/__init__.py rename to tensorrt_llm/models/internlm/__init__.py diff --git 
a/tensorrt_llm/models/internlm/model.py b/tensorrt_llm/models/internlm/model.py new file mode 100644 index 0000000000..2324757e42 --- /dev/null +++ b/tensorrt_llm/models/internlm/model.py @@ -0,0 +1,427 @@ +# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import tensorrt as trt + +from ..._common import default_net +from ..._utils import pad_vocab_size, str_dtype_to_trt +from ...functional import gather_last_token_logits, recv, send +from ...layers import (Attention, AttentionMaskType, AttentionParams, + ColumnLinear, Embedding, GatedMLP, KeyValueCacheParams, + PositionEmbeddingType, RmsNorm) +from ...mapping import Mapping +from ...module import Module, ModuleList +from ...quantization import QuantMode +from ..generation_mixin import GenerationMixin + + +class InternLMDecoderLayer(Module): + + def __init__(self, + layer_id, + hidden_size, + num_attention_heads, + num_kv_heads=None, + max_position_embeddings=2048, + dtype=None, + attention_mask_type=AttentionMaskType.causal, + hidden_act='silu', + attn_bias=True, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_base=10000.0, + rotary_scaling=None, + mlp_hidden_size=None, + tp_group=None, + tp_size=1, + quant_mode=QuantMode(0), + rms_norm_eps=1e-06): + super().__init__() + self._layer_id = layer_id # useful for debugging + # used for quantizing model + self.hidden_size = hidden_size + self.num_attention_heads = num_attention_heads + self.num_kv_heads = num_kv_heads + self.max_position_embeddings = max_position_embeddings + self.dtype = dtype + self.hidden_act = hidden_act + self.tp_group = tp_group + self.tp_size = tp_size + self.mlp_hidden_size = mlp_hidden_size + self.attention_mask_type = attention_mask_type + self.position_embedding_type = position_embedding_type + self.input_layernorm = RmsNorm(normalized_shape=hidden_size, + eps=rms_norm_eps, + dtype=dtype) + + self.attention = Attention( + hidden_size, + num_attention_heads, + num_kv_heads, + max_position_embeddings, + dtype=dtype, + attention_mask_type=AttentionMaskType.causal, + bias=attn_bias, + position_embedding_type=position_embedding_type, + rotary_embedding_base=rotary_base, + rotary_embedding_scaling=rotary_scaling, + tp_group=tp_group, + tp_size=tp_size, + use_int8_kv_cache=quant_mode.has_int8_kv_cache(), + quant_mode=quant_mode, + instance_id=2 * layer_id, + ) + if not mlp_hidden_size: + self.mlp_hidden_size = hidden_size * 4 + self.mlp = GatedMLP(hidden_size=hidden_size, + ffn_hidden_size=self.mlp_hidden_size, + hidden_act=hidden_act, + dtype=dtype, + bias=False, + tp_group=tp_group, + tp_size=tp_size, + quant_mode=quant_mode, + instance_id=2 * layer_id + 1) + self.post_layernorm = RmsNorm(normalized_shape=hidden_size, + eps=rms_norm_eps, + dtype=dtype) + + def forward(self, + hidden_states, + attention_mask=None, + use_cache=False, + kv_cache_params=None, + attention_params=None, + all_reduce_workspace=None): + 
residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + if self._layer_id == 0: + self.register_network_output(f"norm0", hidden_states) + + attention_output = self.attention(hidden_states, + attention_mask=attention_mask, + use_cache=use_cache, + kv_cache_params=kv_cache_params, + attention_params=attention_params, + workspace=all_reduce_workspace) + + if use_cache: + attention_output, presents = attention_output + if self._layer_id == 0: + self.register_network_output(f"attn", attention_output) + + hidden_states = residual + attention_output + + residual = hidden_states + hidden_states = self.post_layernorm(hidden_states) + if self._layer_id == 0: + self.register_network_output(f"norm1", hidden_states) + + hidden_states = self.mlp(hidden_states, all_reduce_workspace) + if self._layer_id == 0: + self.register_network_output(f"mlp", hidden_states) + + hidden_states = residual + hidden_states + if use_cache: + return (hidden_states, presents) + return hidden_states + + +class InternLMModel(Module): + + def __init__(self, + num_layers, + num_heads, + num_kv_heads, + hidden_size, + vocab_size, + hidden_act, + attn_bias, + max_position_embeddings, + dtype, + mlp_hidden_size=None, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_base=10000.0, + rotary_scaling=None, + mapping=Mapping(), + quant_mode=QuantMode(0), + use_parallel_embedding=False, + embedding_sharding_dim=0, + rms_norm_eps=1e-06): + super().__init__() + self.mapping = mapping + + if self.mapping.is_first_pp_rank(): + self.vocab_embedding = Embedding( + num_embeddings=vocab_size, + embedding_dim=hidden_size, + dtype=dtype, + tp_size=mapping.tp_size if use_parallel_embedding else 1, + tp_group=mapping.tp_group if use_parallel_embedding else None, + sharding_dim=embedding_sharding_dim, + tp_rank=mapping.tp_rank) + + self.layers = ModuleList([ + InternLMDecoderLayer( + layer_id=i, + hidden_size=hidden_size, + num_attention_heads=num_heads, + num_kv_heads=num_kv_heads, + max_position_embeddings=max_position_embeddings, + dtype=dtype, + hidden_act=hidden_act, + attn_bias=attn_bias, + mlp_hidden_size=mlp_hidden_size, + position_embedding_type=position_embedding_type, + rotary_base=rotary_base, + rotary_scaling=rotary_scaling, + tp_group=mapping.tp_group, + tp_size=mapping.tp_size, + quant_mode=quant_mode, + rms_norm_eps=rms_norm_eps) + for i in self.get_transformer_layers(self.mapping, num_layers) + ]) + + if self.mapping.is_last_pp_rank(): + self.ln_f = RmsNorm(normalized_shape=hidden_size, + eps=rms_norm_eps, + dtype=dtype) + + def forward(self, + input_ids, + position_ids=None, + use_cache=False, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + all_reduce_workspace=None): + + if kv_cache_params.past_key_value is None: + tuple([None] * len(self.layers)) + + if use_cache: + presents = [] + + if self.mapping.is_first_pp_rank(): + hidden_states = self.vocab_embedding(input_ids) + else: + hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) + self.register_network_output(f"embd", hidden_states) + + for layer, past, pointer in zip( + self.layers, kv_cache_params.past_key_value, + kv_cache_params.kv_cache_block_pointers): + hidden_states = layer( + hidden_states, + use_cache=use_cache, + attention_mask=attention_mask, + kv_cache_params=KeyValueCacheParams( + past_key_value=[past], + host_past_key_value_lengths=kv_cache_params. 
+ host_past_key_value_lengths, + kv_cache_block_pointers=[pointer], + cache_indirection=kv_cache_params.cache_indirection), + attention_params=attention_params, + all_reduce_workspace=all_reduce_workspace) + + if use_cache: + presents.append(hidden_states[1]) + hidden_states = hidden_states[0] + + if self.mapping.is_last_pp_rank(): + hidden_states = self.ln_f(hidden_states) + else: + hidden_states = send(hidden_states, self.mapping.next_pp_rank()) + + if use_cache: + return (hidden_states, tuple(presents)) + return hidden_states + + +class InternLMForCausalLM(InternLMModel, GenerationMixin): + + def __init__(self, + num_layers, + num_heads, + num_kv_heads, + hidden_size, + vocab_size, + hidden_act, + attn_bias, + max_position_embeddings, + dtype, + logits_dtype="float32", + mlp_hidden_size=None, + position_embedding_type=PositionEmbeddingType.rope_gpt_neox, + rotary_base=10000.0, + rotary_scaling=None, + mapping=Mapping(), + quant_mode=QuantMode(0), + use_parallel_embedding=False, + embedding_sharding_dim=0, + rms_norm_eps=1e-06): + + if isinstance(dtype, str): + self.dtype = str_dtype_to_trt(dtype) + else: + assert isinstance(dtype, trt.DataType) + self.dtype = dtype + + if isinstance(logits_dtype, str): + self.logits_dtype = str_dtype_to_trt(logits_dtype) + else: + assert isinstance(logits_dtype, trt.DataType) + self.logits_dtype = logits_dtype + + self.num_layers = num_layers + self.num_heads = num_heads + if num_kv_heads is None or num_kv_heads <= 0: + num_kv_heads = num_heads + self.num_kv_heads = num_kv_heads + self.hidden_size = hidden_size + self.attn_bias = attn_bias + self.vocab_size = vocab_size + self.tp_size = mapping.tp_size + + self.kv_dtype = self.dtype + if quant_mode.has_int8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('int8') + elif quant_mode.has_fp8_kv_cache(): + self.kv_dtype = str_dtype_to_trt('fp8') + + self.quant_mode = quant_mode + self.use_parallel_embedding = use_parallel_embedding + self.embedding_sharding_dim = embedding_sharding_dim + + super().__init__(num_layers, num_heads, num_kv_heads, hidden_size, + vocab_size, hidden_act, attn_bias, + max_position_embeddings, dtype, mlp_hidden_size, + position_embedding_type, rotary_base, rotary_scaling, + mapping, quant_mode, use_parallel_embedding, + embedding_sharding_dim, rms_norm_eps) + + vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) + if self.mapping.is_last_pp_rank(): + self.lm_head = ColumnLinear(hidden_size, + vocab_size_padded, + bias=False, + dtype=dtype, + tp_group=mapping.tp_group, + tp_size=mapping.tp_size, + gather_output=True) + + def forward(self, + input_ids, + position_ids=None, + use_cache=False, + last_token_ids=None, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + all_reduce_workspace=None): + hidden_states = super().forward(input_ids, position_ids, use_cache, + attention_mask, kv_cache_params, + attention_params, hidden_states, + all_reduce_workspace) + + if use_cache: + hidden_states, presents = hidden_states + + if self.mapping.is_last_pp_rank(): + hidden_states = gather_last_token_logits( + hidden_states, last_token_ids, + default_net().plugin_config.remove_input_padding) + + # [batch_size, hidden_size] -> [batch_size, vocab_size] + lm_logits = self.lm_head(hidden_states) + lm_logits.mark_output('logits', self.logits_dtype) + else: + hidden_states.mark_output('hidden_states_output', self.dtype) + + if use_cache and default_net().plugin_config.paged_kv_cache == False: + for i, present in zip( + 
self.get_transformer_layers(self.mapping, self.num_layers), + presents): + present.mark_output(f'present_key_value_{i}', self.kv_dtype) + if self.mapping.is_last_pp_rank(): + return (lm_logits, presents) + return (hidden_states, presents) + else: + if self.mapping.is_last_pp_rank(): + return lm_logits + return hidden_states + + def prepare_inputs(self, + max_batch_size, + max_input_len, + max_new_tokens, + use_cache, + max_beam_width, + max_num_tokens: int = None): + '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the + ranges of the dimensions of when using TRT dynamic shapes. + + @return: a list contains values which can be fed into the self.forward() + ''' + + # Prepare inputs + head_size = self.hidden_size // self.num_heads + remove_input_padding = default_net().plugin_config.remove_input_padding + use_gpt_attention_plugin = default_net( + ).plugin_config.gpt_attention_plugin + use_gemm_plugin = default_net().plugin_config.gemm_plugin + paged_kv_cache = default_net().plugin_config.paged_kv_cache + tokens_per_block = default_net().plugin_config.tokens_per_block + use_custom_all_reduce = default_net( + ).plugin_config.use_custom_all_reduce + + model_inputs = self.prepare_basic_inputs( + max_batch_size, + max_beam_width, + max_input_len, + max_new_tokens, + self.num_kv_heads, + head_size, + self.num_layers, + self.kv_dtype, + remove_input_padding=remove_input_padding, + use_gpt_attention_plugin=use_gpt_attention_plugin, + use_gemm_plugin=use_gemm_plugin, + use_custom_all_reduce=use_custom_all_reduce, + paged_kv_cache=paged_kv_cache, + tokens_per_block=tokens_per_block, + dtype=self.dtype, + num_heads=self.num_heads, + mapping=self.mapping, + max_num_tokens=max_num_tokens) + + return (model_inputs['input_ids'], model_inputs['position_ids'], True, + model_inputs['last_token_ids'], model_inputs['attention_mask'], + KeyValueCacheParams( + past_key_value=model_inputs['past_key_value'], + host_past_key_value_lengths=model_inputs[ + 'host_past_key_value_lengths'], + kv_cache_block_pointers=model_inputs[ + 'kv_cache_block_pointers_list'], + cache_indirection=model_inputs['cache_indirection'], + ), + AttentionParams( + sequence_length=model_inputs['sequence_length'], + context_lengths=model_inputs['context_lengths'], + host_context_lengths=model_inputs['host_context_lengths'], + max_context_length=max_input_len, + host_request_types=model_inputs['host_request_types']), + model_inputs['hidden_states_input'], + model_inputs['all_reduce_workspace']) diff --git a/tensorrt_llm/models/llama/model.py b/tensorrt_llm/models/llama/model.py index 2270f61894..896ca77fa0 100644 --- a/tensorrt_llm/models/llama/model.py +++ b/tensorrt_llm/models/llama/model.py @@ -12,14 +12,17 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
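# Illustrative sketch (not part of the diff): how the new InternLMForCausalLM and its
# prepare_inputs() added above are typically traced into a TensorRT network. The tiny
# hyper-parameters are placeholders and the builder plumbing is assumed to follow the
# pattern of the existing example build scripts; only the class and method names come
# from the code added in this diff.
import tensorrt_llm
from tensorrt_llm.models import InternLMForCausalLM
from tensorrt_llm.network import net_guard

model = InternLMForCausalLM(num_layers=2, num_heads=8, num_kv_heads=8, hidden_size=512,
                            vocab_size=32000, hidden_act='silu', attn_bias=True,
                            max_position_embeddings=2048, dtype='float16')

builder = tensorrt_llm.Builder()
network = builder.create_network()
with net_guard(network):
    network.set_named_parameters(model.named_parameters())
    # prepare_inputs() returns a tuple ordered to match forward(), so the graph can be
    # traced with a single splat call.
    inputs = model.prepare_inputs(max_batch_size=8, max_input_len=1024,
                                  max_new_tokens=512, use_cache=True, max_beam_width=1)
    model(*inputs)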
+from typing import Optional + import tensorrt as trt from ..._common import default_net from ..._utils import pad_vocab_size, str_dtype_to_trt -from ...functional import gather_last_token_logits, recv, send +from ...functional import Tensor, gather_last_token_logits, recv, send from ...layers import (Attention, AttentionMaskType, AttentionParams, ColumnLinear, Embedding, FusedGatedMLP, GatedMLP, - KeyValueCacheParams, PositionEmbeddingType, RmsNorm) + KeyValueCacheParams, PositionEmbeddingType, + PromptTuningEmbedding, RmsNorm) from ...mapping import Mapping from ...module import Module, ModuleList from ...quantization import QuantMode @@ -158,12 +161,15 @@ class LLaMAModel(Module): use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06, - use_fused_mlp=False): + use_fused_mlp=False, + use_prompt_tuning: bool = False): super().__init__() self.mapping = mapping + self.use_prompt_tuning = use_prompt_tuning + EmbeddingCls = PromptTuningEmbedding if use_prompt_tuning else Embedding if self.mapping.is_first_pp_rank(): - self.vocab_embedding = Embedding( + self.vocab_embedding = EmbeddingCls( num_embeddings=vocab_size, embedding_dim=hidden_size, dtype=dtype, @@ -200,15 +206,20 @@ class LLaMAModel(Module): eps=rms_norm_eps, dtype=dtype) - def forward(self, - input_ids, - position_ids=None, - use_cache=False, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - hidden_states=None, - all_reduce_workspace=None): + def forward( + self, + input_ids, + position_ids=None, + use_cache=False, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + hidden_states=None, + all_reduce_workspace=None, + prompt_embedding_table: Optional[Tensor] = None, + prompt_tasks: Optional[Tensor] = None, + prompt_vocab_size: Optional[Tensor] = None, + ): if kv_cache_params.past_key_value is None: tuple([None] * len(self.layers)) @@ -216,8 +227,13 @@ class LLaMAModel(Module): if use_cache: presents = [] + ptuning_args = [] + if self.use_prompt_tuning: + ptuning_args = [ + prompt_embedding_table, prompt_tasks, prompt_vocab_size + ] if self.mapping.is_first_pp_rank(): - hidden_states = self.vocab_embedding(input_ids, + hidden_states = self.vocab_embedding(input_ids, *ptuning_args, all_reduce_workspace) else: hidden_states = recv(hidden_states, self.mapping.prev_pp_rank()) @@ -274,7 +290,8 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): use_parallel_embedding=False, embedding_sharding_dim=0, rms_norm_eps=1e-06, - use_fused_mlp=False): + use_fused_mlp=False, + use_prompt_tuning: bool = False): if isinstance(dtype, str): self.dtype = str_dtype_to_trt(dtype) @@ -312,7 +329,7 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): mlp_hidden_size, position_embedding_type, rotary_base, rotary_scaling, mapping, quant_mode, use_parallel_embedding, embedding_sharding_dim, - rms_norm_eps, use_fused_mlp) + rms_norm_eps, use_fused_mlp, use_prompt_tuning) vocab_size_padded = pad_vocab_size(vocab_size, mapping.tp_size) if self.mapping.is_last_pp_rank(): @@ -324,20 +341,27 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): tp_size=mapping.tp_size, gather_output=True) - def forward(self, - input_ids, - position_ids=None, - use_cache=False, - last_token_ids=None, - attention_mask=None, - kv_cache_params=None, - attention_params=None, - hidden_states=None, - all_reduce_workspace=None): + def forward( + self, + input_ids, + position_ids=None, + use_cache=False, + last_token_ids=None, + attention_mask=None, + kv_cache_params=None, + attention_params=None, + 
hidden_states=None, + all_reduce_workspace=None, + prompt_embedding_table: Optional[Tensor] = None, + prompt_tasks: Optional[Tensor] = None, + prompt_vocab_size: Optional[Tensor] = None, + ): hidden_states = super().forward(input_ids, position_ids, use_cache, attention_mask, kv_cache_params, attention_params, hidden_states, - all_reduce_workspace) + all_reduce_workspace, + prompt_embedding_table, prompt_tasks, + prompt_vocab_size) if use_cache: hidden_states, presents = hidden_states @@ -366,13 +390,16 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): return lm_logits return hidden_states - def prepare_inputs(self, - max_batch_size, - max_input_len, - max_new_tokens, - use_cache, - max_beam_width, - max_num_tokens: int = None): + def prepare_inputs( + self, + max_batch_size, + max_input_len, + max_new_tokens, + use_cache, + max_beam_width, + max_num_tokens: int = None, + prompt_embedding_table_size: int = 0, + ): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic shapes. @@ -408,23 +435,33 @@ class LLaMAForCausalLM(LLaMAModel, GenerationMixin): dtype=self.dtype, num_heads=self.num_heads, mapping=self.mapping, - max_num_tokens=max_num_tokens) + max_num_tokens=max_num_tokens, + prompt_embedding_table_size=prompt_embedding_table_size, + ) - return (model_inputs['input_ids'], model_inputs['position_ids'], True, - model_inputs['last_token_ids'], model_inputs['attention_mask'], - KeyValueCacheParams( - past_key_value=model_inputs['past_key_value'], - host_past_key_value_lengths=model_inputs[ - 'host_past_key_value_lengths'], - kv_cache_block_pointers=model_inputs[ - 'kv_cache_block_pointers_list'], - cache_indirection=model_inputs['cache_indirection'], - ), - AttentionParams( - sequence_length=model_inputs['sequence_length'], - context_lengths=model_inputs['context_lengths'], - host_context_lengths=model_inputs['host_context_lengths'], - max_context_length=max_input_len, - host_request_types=model_inputs['host_request_types']), - model_inputs['hidden_states_input'], - model_inputs['all_reduce_workspace']) + return ( + model_inputs['input_ids'], + model_inputs['position_ids'], + True, + model_inputs['last_token_ids'], + model_inputs['attention_mask'], + KeyValueCacheParams( + past_key_value=model_inputs['past_key_value'], + host_past_key_value_lengths=model_inputs[ + 'host_past_key_value_lengths'], + kv_cache_block_pointers=model_inputs[ + 'kv_cache_block_pointers_list'], + cache_indirection=model_inputs['cache_indirection'], + ), + AttentionParams( + sequence_length=model_inputs['sequence_length'], + context_lengths=model_inputs['context_lengths'], + host_context_lengths=model_inputs['host_context_lengths'], + max_context_length=max_input_len, + host_request_types=model_inputs['host_request_types']), + model_inputs['hidden_states_input'], + model_inputs['all_reduce_workspace'], + model_inputs['prompt_embedding_table'], + model_inputs['tasks'], + model_inputs['prompt_vocab_size'], + ) diff --git a/tensorrt_llm/models/opt/model.py b/tensorrt_llm/models/opt/model.py index 1c31c98e25..f2469c9729 100644 --- a/tensorrt_llm/models/opt/model.py +++ b/tensorrt_llm/models/opt/model.py @@ -291,7 +291,7 @@ class OPTLMHeadModel(OPTModel, GenerationMixin): max_new_tokens, use_cache, max_beam_width, - prompt_embedding_table_size=32): + prompt_embedding_table_size: int = 0): '''@brief: Prepare inputs Tensors for the model, the given sizes are used to determine the ranges of the dimensions of when using TRT dynamic 
shapes. diff --git a/tensorrt_llm/models/quantized/ammo.py b/tensorrt_llm/models/quantized/ammo.py index 9747a6c612..cce07f4279 100644 --- a/tensorrt_llm/models/quantized/ammo.py +++ b/tensorrt_llm/models/quantized/ammo.py @@ -27,6 +27,29 @@ except ImportError: from ...logger import logger +def _register_falcon_linears(model): + """Register Falcon linear modules for quantization. + + Falcon models may be built from remote code that is loaded dynamically, so + their linear class has to be registered on the fly, right before + quantization. + + """ + if type(model).__name__ in ["RWForCausalLM", "FalconForCausalLM"]: + from ammo.torch.quantization import tensor_quant + from ammo.torch.quantization.nn.modules.quant_module import \ + QuantLinearConvBase + + linear_type = type(model.transformer.h[0].self_attention.dense) + + class QuantFalconLinearRW1B(linear_type, + QuantLinearConvBase): # type: ignore + default_quant_desc_weight = tensor_quant.QUANT_DESC_8BIT_LINEAR_WEIGHT_PER_ROW + + atq.module_mapping.QUANT_MODULE_MAPPING[ + linear_type] = QuantFalconLinearRW1B.convert + + def _quantize_model(model: torch.nn.Module, qformat: Literal['fp8', 'int8_sq', 'int4_awq'], calib_dataloader: DataLoader, @@ -51,6 +74,8 @@ def _quantize_model(model: torch.nn.Module, logger.debug(f"Calibrating batch {idx}") model(data) + _register_falcon_linears(model) + logger.debug("Starting quantization...") atq.quantize(model, quant_cfg, forward_loop=calibrate_loop) logger.debug("Quantization done") diff --git a/tensorrt_llm/models/quantized/quant.py b/tensorrt_llm/models/quantized/quant.py index e35772802e..246eee0421 100644 --- a/tensorrt_llm/models/quantized/quant.py +++ b/tensorrt_llm/models/quantized/quant.py @@ -12,13 +12,15 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
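# Illustrative sketch (not part of the diff): why _register_falcon_linears() in ammo.py
# above has to run lazily. The Hugging Face loading call is an assumption about how such
# a model reaches _quantize_model(); quant_cfg and calibrate_loop are the names used in
# the diff above and are not defined here.
from transformers import AutoModelForCausalLM

# The RWForCausalLM / FalconForCausalLM classes only come into existence once the
# checkpoint's remote code has been imported, so their Linear type cannot be added to
# AMMO's module mapping statically at import time.
model = AutoModelForCausalLM.from_pretrained('tiiuae/falcon-rw-1b',
                                             trust_remote_code=True)

# _quantize_model() therefore registers the mapping right before quantization:
#     _register_falcon_linears(model)
#     atq.quantize(model, quant_cfg, forward_loop=calibrate_loop)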
-from typing import Union +from typing import Any, Union import numpy as np from ...layers import ColumnLinear, RowLinear -from ...models import (BloomForCausalLM, FalconForCausalLM, GPTJForCausalLM, - GPTLMHeadModel, LLaMAForCausalLM) +from ...models import (BaichuanForCausalLM, BloomForCausalLM, FalconForCausalLM, + GPTJForCausalLM, GPTLMHeadModel, InternLMForCausalLM, + LLaMAForCausalLM) +from ...module import Module from ...quantization import QuantMode from ...quantization.layers import FP8Linear, FP8RowLinear @@ -68,7 +70,6 @@ def _smooth_quantize_gpt(model, quant_mode): dtype=layer.dtype, quant_mode=quant_mode) - setattr(model, 'quant_mode', quant_mode) return model @@ -113,7 +114,6 @@ def _smooth_quantize_llama(model, quant_mode): dtype=layer.dtype, quant_mode=quant_mode) - setattr(model, 'quant_mode', quant_mode) return model @@ -160,24 +160,78 @@ def _smooth_quantize_bloom(model, quant_mode): return model -def smooth_quantize(model, quant_mode): +def _smooth_quantize_baichuan(model, quant_mode): + # Baichuan models' structures are similar to LLaMA's, so we can reuse the impl + return _smooth_quantize_llama(model, quant_mode) + + +def _smooth_quantize_internlm(model, quant_mode): + assert quant_mode.has_act_and_weight_quant() + for layer in model.layers: + assert hasattr(layer, + "input_layernorm"), "The layer has no input_layernorm" + layer.input_layernorm = SmoothQuantRmsNorm( + normalized_shape=layer.hidden_size, + dtype=layer.dtype, + quant_mode=quant_mode) + assert hasattr(layer, "attention"), "The layer has no attention" + layer.attention = SmoothQuantAttention( + layer.hidden_size, + num_attention_heads=layer.num_attention_heads, + num_kv_heads=layer.num_kv_heads, + max_position_embeddings=layer.max_position_embeddings, + num_layers=model.num_layers, + dtype=layer.dtype, + attention_mask_type=layer.attention_mask_type, + position_embedding_type=layer.position_embedding_type, + tp_group=layer.tp_group, + tp_size=layer.tp_size, + quant_mode=quant_mode, + bias=model.attn_bias) + + assert hasattr(layer, "mlp"), "The layer has no mlp" + layer.mlp = SmoothQuantGatedMLP(hidden_size=model.hidden_size, + ffn_hidden_size=layer.mlp_hidden_size, + hidden_act=layer.hidden_act, + dtype=layer.dtype, + tp_group=layer.tp_group, + tp_size=layer.tp_size, + quant_mode=quant_mode, + bias=False) + assert hasattr( + layer, + "post_layernorm"), "The layer has no post_layernorm" + layer.post_layernorm = SmoothQuantRmsNorm( + normalized_shape=layer.hidden_size, + dtype=layer.dtype, + quant_mode=quant_mode) + + setattr(model, 'quant_mode', quant_mode) + return model + + +def _smooth_quantize(model, quant_mode): assert isinstance(model, GPTLMHeadModel) or isinstance(model, LLaMAForCausalLM) \ - or isinstance(model, BloomForCausalLM),\ - "Only GPTLMHeadModel, LLaMAForCausalLM and BloomForCausalLM are well tested now" + or isinstance(model, BloomForCausalLM) or isinstance(model, BaichuanForCausalLM) or isinstance(model, InternLMForCausalLM), \ + "Only GPTLMHeadModel, LLaMAForCausalLM, BloomForCausalLM, InternLMForCausalLM and BaichuanForCausalLM are well tested now" if isinstance(model, GPTLMHeadModel): return _smooth_quantize_gpt(model, quant_mode) elif isinstance(model, LLaMAForCausalLM): return _smooth_quantize_llama(model, quant_mode) elif isinstance(model, BloomForCausalLM): return _smooth_quantize_bloom(model, quant_mode) + elif isinstance(model, BaichuanForCausalLM): + return _smooth_quantize_baichuan(model, quant_mode) + elif isinstance(model, InternLMForCausalLM): + return
_smooth_quantize_internlm(model, quant_mode) else: assert False, f"Model {type(model).__name__} is not supported by SmoothQuant yet" -def weight_only_quantize(model, - quant_mode, - exclude_modules=None, - current_key_name=None): +def _weight_only_quantize(model, + quant_mode, + exclude_modules=None, + current_key_name=None): assert quant_mode.is_weight_only() exclude_modules = ['lm_head' @@ -189,8 +243,8 @@ def weight_only_quantize(model, current_key_name.append(name) if len(list(module.children())) > 0: - weight_only_quantize(module, quant_mode, exclude_modules, - current_key_name) + _weight_only_quantize(module, quant_mode, exclude_modules, + current_key_name) if isinstance(module, ColumnLinear) and name not in exclude_modules: if not any(key in '.'.join(current_key_name) @@ -218,18 +272,16 @@ def weight_only_quantize(model, current_key_name.pop(-1) - setattr(model, 'quant_mode', quant_mode) - return model -def weight_only_groupwise_quantize(model, - quant_mode, - group_size=128, - pre_quant_scale=False, - zero=False, - exclude_modules=None, - current_key_name=None): +def _weight_only_groupwise_quantize(model, + quant_mode, + group_size=128, + pre_quant_scale=False, + zero=False, + exclude_modules=None, + current_key_name=None): exclude_modules = ['lm_head' ] if exclude_modules is None else exclude_modules @@ -239,9 +291,9 @@ def weight_only_groupwise_quantize(model, current_key_name.append(name) if len(list(module.children())) > 0: - weight_only_groupwise_quantize(module, quant_mode, group_size, - pre_quant_scale, zero, - exclude_modules, current_key_name) + _weight_only_groupwise_quantize(module, quant_mode, group_size, + pre_quant_scale, zero, + exclude_modules, current_key_name) if isinstance(module, ColumnLinear) and name not in exclude_modules: if not any(key in '.'.join(current_key_name) @@ -273,8 +325,21 @@ def weight_only_groupwise_quantize(model, current_key_name.pop(-1) - setattr(model, 'quant_mode', quant_mode) + return model + +def quantize_model(model: Module, quant_mode: QuantMode, **kwargs: Any): + if quant_mode.has_fp8_qdq() or quant_mode.has_fp8_kv_cache(): + model = _fp8_quantize(model, quant_mode, **kwargs) + elif quant_mode.has_act_and_weight_quant(): + model = _smooth_quantize(model, quant_mode) + elif quant_mode.is_weight_only(): + if quant_mode.has_per_group_scaling(): + model = _weight_only_groupwise_quantize(model, quant_mode, **kwargs) + else: + model = _weight_only_quantize(model, quant_mode, **kwargs) + + setattr(model, "quant_mode", quant_mode) return model @@ -364,7 +429,7 @@ def _default_fp8_quantize(model: Union[GPTLMHeadModel, LLaMAForCausalLM, return model -def fp8_quantize(model, quant_mode: QuantMode, quant_scales: dict = None): +def _fp8_quantize(model, quant_mode: QuantMode, quant_scales: dict = None): if isinstance( model, (FalconForCausalLM, GPTJForCausalLM, GPTLMHeadModel, LLaMAForCausalLM)): diff --git a/tensorrt_llm/parameter.py b/tensorrt_llm/parameter.py index 69fe514804..b14a2953fa 100644 --- a/tensorrt_llm/parameter.py +++ b/tensorrt_llm/parameter.py @@ -12,21 +12,23 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
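# Illustrative sketch (not part of the diff): the new quantize_model() entry point in
# quant.py above dispatches on the QuantMode flags, so callers no longer pick the
# _smooth_quantize / _weight_only_quantize / _fp8_quantize helpers themselves. The model
# construction and the exact QuantMode helper used below are assumptions; quantize_model()
# and the dispatch order come from the code above.
from tensorrt_llm.models import InternLMForCausalLM
from tensorrt_llm.models.quantized.quant import quantize_model
from tensorrt_llm.quantization import QuantMode

model = InternLMForCausalLM(num_layers=2, num_heads=8, num_kv_heads=8, hidden_size=512,
                            vocab_size=32000, hidden_act='silu', attn_bias=True,
                            max_position_embeddings=2048, dtype='float16')
quant_mode = QuantMode.use_weight_only(use_int4_weights=True)

model = quantize_model(model, quant_mode)  # is_weight_only() without per-group scaling
                                           # routes to _weight_only_quantize()
assert model.quant_mode == quant_mode      # quant_mode is now attached once, centrally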
-from typing import Sequence, Union +import math +from typing import Optional, Sequence, Union import numpy as np import tensorrt as trt +import torch from ._utils import str_dtype_to_trt, torch_to_numpy, trt_dtype_to_torch from .functional import Tensor, constant from .logger import logger -class Parameter(object): +class Parameter: _DEFAULT_DTYPE = trt.DataType.FLOAT def __init__(self, - value: Union[np.ndarray] = None, + value: Optional[Union[np.ndarray, torch.Tensor]] = None, shape: Sequence[int] = None, dtype: Union[str, trt.DataType] = None): if dtype is None: @@ -37,11 +39,10 @@ class Parameter(object): if isinstance(dtype, str): dtype = str_dtype_to_trt(dtype) if value is None: - import torch assert isinstance(shape, (list, tuple)) if len(shape) == 2: # Xavier initialization see https://paperswithcode.com/method/xavier-initialization - v_range = np.sqrt(6) / np.sqrt(shape[0] + shape[1]) + v_range = math.sqrt(6) / math.sqrt(shape[0] + shape[1]) else: v_range = 0.1 @@ -56,9 +57,8 @@ class Parameter(object): (shape), dtype=trt_dtype_to_torch(dtype), device='cuda') * 2 - 1 # value ~ U[-v_range, v_range] - value = torch_to_numpy((value * v_range).cpu()) - - self._value = value + value = value * v_range + self._value = self._regularize_value(value) @property def value(self) -> Tensor: @@ -75,13 +75,21 @@ class Parameter(object): return self._value @value.setter - def value(self, v: np.ndarray): - assert isinstance(v, np.ndarray) + def value(self, v: Union[np.ndarray, torch.Tensor]): + v = self._regularize_value(v) assert v.shape == self._value.shape, \ - ('The value updated is not the same shape as the original. ', \ - f'Updated: {v.shape}, original: {self._value.shape}') + f'The value updated is not the same shape as the original. ' \ + f'Updated: {v.shape}, original: {self._value.shape}' self._value = v def _get_weights(self) -> trt.Weights: return self._value.producer.weights if isinstance(self._value, Tensor) else None + + def _regularize_value(self, value): + if isinstance(value, np.ndarray): + return value + elif isinstance(value, torch.Tensor): + return torch_to_numpy(value) + raise TypeError( + f'Expected numpy.ndarray or torch.Tensor, got {type(value)}') diff --git a/tensorrt_llm/profiler.py b/tensorrt_llm/profiler.py index 36803b7a43..25c296f425 100644 --- a/tensorrt_llm/profiler.py +++ b/tensorrt_llm/profiler.py @@ -13,6 +13,20 @@ # See the License for the specific language governing permissions and # limitations under the License. 
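# Illustrative sketch (not part of the diff): with the _regularize_value() helper added to
# Parameter above, weights can now be assigned as torch tensors as well as numpy arrays
# and are converted to numpy internally. The shapes below are arbitrary example values.
import numpy as np
import torch
from tensorrt_llm.parameter import Parameter

p = Parameter(value=np.zeros((16, 16), dtype=np.float32), dtype='float32')
p.value = torch.ones(16, 16)  # accepted: converted via torch_to_numpy()
# A tensor with a different shape still fails the shape assertion, and any other type
# now raises a TypeError from _regularize_value().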
import time
+from functools import partial
+from typing import Literal, Optional, Tuple, Union
+
+try:
+    import psutil
+except ImportError:
+    psutil = None
+try:
+    import pynvml
+except ImportError:
+    pynvml = None
+import torch
+
+from tensorrt_llm.logger import logger


 class Timer:
@@ -41,9 +55,9 @@ class Timer:
         self._total_elapsed_times.clear()

     def summary(self):
-        print('Profile Results')
+        logger.info('Profile Results')
         for tag, elapsed_time in self._total_elapsed_times.items():
-            print(f' - {tag.ljust(30, ".")}: {elapsed_time:.6f} (sec)')
+            logger.info(f' - {tag.ljust(30, ".")}: {elapsed_time:.6f} (sec)')


 _default_timer = Timer()
@@ -67,3 +81,189 @@ def reset():

 def summary():
     _default_timer.summary()
+
+
+_pynvml_initialized = False
+
+
+def initialize_pynvml():
+    global _pynvml_initialized
+    if pynvml is not None and not _pynvml_initialized:
+        pynvml.nvmlInit()
+        _pynvml_initialized = True
+
+
+def finalize_pynvml():
+    global _pynvml_initialized
+    if pynvml is not None and _pynvml_initialized:
+        pynvml.nvmlShutdown()
+        _pynvml_initialized = False
+
+
+class MemoryMonitor:
+
+    TAG = '[MemUsage]'
+    UnitType = Literal['GiB', 'MiB', 'KiB']
+    units = {'GiB': 1 << 30, 'MiB': 1 << 20, 'KiB': 1 << 10}
+    # For convenience.
+    _rename_map = {'GB': 'GiB', 'MB': 'MiB', 'KB': 'KiB'}
+
+    _maybe_warned = False
+
+    def __init__(self):
+        # bytes
+        self._peak_host_memory = 0
+        self._peak_device_memory = 0
+        self._check_required_packages()
+
+        self.device_handles = {}
+        initialize_pynvml()
+
+        if pynvml.__version__ < '11.5.0':
+            logger.warning(f'Found pynvml=={pynvml.__version__}. Please use '
+                           f'pynvml>=11.5.0 to get accurate memory usage')
+            # Support legacy pynvml. Note that an old API could return
+            # wrong GPU memory usage.
+            self._device_mem__fn = pynvml.nvmlDeviceGetMemoryInfo
+        else:
+            self._device_mem__fn = partial(pynvml.nvmlDeviceGetMemoryInfo,
+                                           version=pynvml.nvmlMemory_v2)
+
+    @classmethod
+    def _check_required_packages(cls):
+        if cls._maybe_warned:
+            return
+        if psutil is None:
+            # Warn once.
+            logger.warning(
+                "A required package 'psutil' is not installed. Will not "
+                "monitor the host memory usage. Please install the package "
+                "first, e.g., 'pip install psutil'.")
+        if pynvml is None:
+            # Warn once.
+            logger.warning(
+                "A required package 'pynvml' is not installed. Will not "
+                "monitor the device memory usage. Please install the package "
+                "first, e.g., 'pip install pynvml>=11.5.0'.")
+        cls._maybe_warned = True
+
+    def host_memory_info(self) -> Tuple[int, int, int]:
+        process = psutil.Process()
+        # USS reports the amount of memory that would be freed if the process
+        # was terminated right now.
+        # https://psutil.readthedocs.io/en/latest/index.html#psutil.Process.memory_full_info
+        vmem = psutil.virtual_memory()
+        total_mem = vmem.total
+        free_mem = vmem.available
+        alloc_mem = process.memory_full_info().uss
+        if alloc_mem > self._peak_host_memory:
+            self._peak_host_memory = alloc_mem
+        return alloc_mem, free_mem, total_mem
+
+    def device_memory_info(
+        self,
+        device: Optional[Union[torch.device, int]] = None,
+    ) -> Tuple[int, int, int]:
+        index = torch._utils._get_device_index(device, optional=True)
+        if index not in self.device_handles:
+            handle = pynvml.nvmlDeviceGetHandleByIndex(index)
+            self.device_handles[index] = handle
+        mem_info = self._device_mem__fn(self.device_handles[index])
+        if mem_info.used > self._peak_device_memory:
+            self._peak_device_memory = mem_info.used
+        return mem_info.used, mem_info.free, mem_info.total
+
+    @staticmethod
+    def _normalize_unit_name(unit: str):
+        # Rename GB -> GiB.
+        return {'GB': 'GiB', 'MB': 'MiB', 'KB': 'KiB'}[unit]
+
+    @classmethod
+    def _format(cls, mem_bytes: int, unit: UnitType) -> str:
+        if unit not in cls.units:
+            unit = cls._rename_map[unit]
+        mem_usage = float(mem_bytes) / cls.units[unit]
+        return f'{mem_usage:.4f} ({unit})'
+
+    @classmethod
+    def _print_message(cls, msg: str, tag: Optional[str] = None):
+        if tag:
+            msg = f'{tag} - {msg}'
+        logger.info(f'{cls.TAG} {msg}')
+
+    def print_host_memory_usage(self,
+                                tag: Optional[str] = None,
+                                unit: UnitType = 'GiB'):
+        if psutil is None:
+            return
+        alloc_mem, _, _ = self.host_memory_info()
+        msg = f'Allocated Host Memory {self._format(alloc_mem, unit)}'
+        self._print_message(msg, tag)
+
+    def print_device_memory_usage(
+        self,
+        tag: Optional[str] = None,
+        unit: UnitType = 'GiB',
+        device: Optional[Union[torch.device, int]] = None,
+    ):
+        alloc_mem, _, _ = self.device_memory_info(device)
+        msg = f'Allocated Device Memory {self._format(alloc_mem, unit)}'
+        self._print_message(msg, tag)
+
+    def print_memory_usage(
+        self,
+        tag: Optional[str] = None,
+        unit: UnitType = 'GiB',
+        device: Optional[Union[torch.device, int]] = None,
+    ):
+        alloc_host_mem, _, _ = self.host_memory_info()
+        alloc_device_mem, _, _ = self.device_memory_info(device=device)
+        msg = f'Allocated Memory: Host {self._format(alloc_host_mem, unit)} '\
+              f'Device {self._format(alloc_device_mem, unit)}'
+        self._print_message(msg, tag)
+
+    def print_peak_memory_usage(self, unit: UnitType = 'GiB'):
+        self._print_message(
+            f'Peak Memory Usage: '
+            f'Host {self._format(self._peak_host_memory, unit)} '
+            f'Device {self._format(self._peak_device_memory, unit)}')
+
+
+if psutil is not None and pynvml is not None:
+    _default_memory_monitor = MemoryMonitor()
+else:
+    _default_memory_monitor = None
+
+
+def host_memory_info():
+    if _default_memory_monitor is not None:
+        return _default_memory_monitor.host_memory_info()
+
+
+def device_memory_info(device: Optional[Union[torch.device, int]] = None):
+    if _default_memory_monitor is not None:
+        return _default_memory_monitor.device_memory_info(device)
+
+
+def print_host_memory_usage(tag: Optional[str] = None,
+                            unit: MemoryMonitor.UnitType = 'GiB'):
+    if _default_memory_monitor is not None:
+        _default_memory_monitor.print_host_memory_usage(tag=tag, unit=unit)
+
+
+def print_device_memory_usage(tag: Optional[str] = None,
+                              unit: MemoryMonitor.UnitType = 'GiB'):
+    if _default_memory_monitor is not None:
+        _default_memory_monitor.print_device_memory_usage(tag=tag, unit=unit)
+
+
+def print_memory_usage(tag: Optional[str] = None,
+                       unit: MemoryMonitor.UnitType = 'GiB'):
+    if _default_memory_monitor
is not None: + _default_memory_monitor.print_memory_usage(tag=tag, unit=unit) + + +def print_peak_memory_usage(unit: MemoryMonitor.UnitType = 'GiB'): + if _default_memory_monitor is not None: + _default_memory_monitor.print_peak_memory_usage(unit=unit) diff --git a/tensorrt_llm/runtime/__init__.py b/tensorrt_llm/runtime/__init__.py index 703f1f9456..d1075424d7 100644 --- a/tensorrt_llm/runtime/__init__.py +++ b/tensorrt_llm/runtime/__init__.py @@ -12,7 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -from .generation import (ChatGLM6BHeadModelGenerationSession, GenerationSession, +from .generation import (ChatGLMGenerationSession, GenerationSession, ModelConfig, SamplingConfig, to_word_list_format) from .kv_cache_manager import GenerationSequence, KVCacheManager from .session import Session, TensorInfo @@ -25,6 +25,6 @@ __all__ = [ 'SamplingConfig', 'Session', 'TensorInfo', - 'ChatGLM6BHeadModelGenerationSession', + 'ChatGLMGenerationSession', 'to_word_list_format', ] diff --git a/tensorrt_llm/runtime/generation.py b/tensorrt_llm/runtime/generation.py index 687bbb2851..6606afdc6e 100755 --- a/tensorrt_llm/runtime/generation.py +++ b/tensorrt_llm/runtime/generation.py @@ -249,7 +249,7 @@ class ModelConfig: has_position_embedding: bool = True has_token_type_embedding: bool = False tokens_per_block: int = 64 - use_prompt_tuning: bool = False + max_prompt_embedding_table_size: int = 0 quant_mode: QuantMode = QuantMode(0) gather_all_token_logits: bool = False dtype: str = "" @@ -402,7 +402,7 @@ class GenerationSession(object): 'attention_mask', ] - if model_config.use_prompt_tuning: + if model_config.max_prompt_embedding_table_size > 0: expected_tensor_names += [ 'prompt_embedding_table', 'tasks', 'prompt_vocab_size' ] @@ -1656,6 +1656,7 @@ class GenerationSession(object): next_step_buffer = None attention_mask = None context_logits = None + generation_logits = [] def get_outputs_dict(output_ids): outputs = {} @@ -1666,6 +1667,7 @@ class GenerationSession(object): [batch_size, beam_width]) if self.gather_all_token_logits: outputs['context_logits'] = context_logits + outputs['generation_logits'] = generation_logits return outputs for step in range(0, self.max_new_tokens): @@ -1680,6 +1682,10 @@ class GenerationSession(object): encoder_input_lengths) if step == 0: context_logits = logits + if self.gather_all_token_logits: + generation_logits.append( + next_step_buffer['logits'].clone().detach()) + if should_stop is not None and should_stop.item(): final_output_ids = self.finalize_decoder( context_lengths, batch_size, beam_width, scfg) @@ -1783,12 +1789,14 @@ class GenerationSession(object): def decode_batch(self, input_ids: Sequence[torch.Tensor], sampling_config: SamplingConfig, - streaming: bool = False): + streaming: bool = False, + **kwargs): input_ids, context_lengths = _prepare_input_ids(input_ids) return self.decode(input_ids, context_lengths, sampling_config, - streaming=streaming) + streaming=streaming, + **kwargs) # As dynamic_decoder uses torch's current stream, we must ensure it runs on the same stream that # dynamic_decoder was set up with @@ -1907,24 +1915,42 @@ class GenerationSession(object): encoder_output, encoder_input_lengths) -class ChatGLM6BHeadModelGenerationSession(GenerationSession): +class ChatGLMGenerationSession(GenerationSession): def _prepare_context_inputs(self, batch_size, context_lengths, use_gpt_attention_plugin, 
remove_input_padding, **kwargs): - assert not remove_input_padding last_token_ids = context_lengths.detach().clone() max_context_length = kwargs.pop('max_context_length') - position_ids = torch.zeros([batch_size, 2, max_context_length], - dtype=torch.int32) - position_ids[:, 0, :] = torch.arange(max_context_length) - for i in range(batch_size): - length = context_lengths[i] - position_ids[i, 0, length - 1] = length - 2 - position_ids[i, 1, length - 1] = 1 - position_ids[i, :, length:] = 0 - position_ids = position_ids.cuda() + + if remove_input_padding: + input_lengths_acc = torch.cumsum(torch.cat( + [torch.IntTensor([0]).cuda(), context_lengths], dim=0), + dim=0) + position_ids = torch.zeros([1, 2, input_lengths_acc[-1]], + dtype=torch.int32) + for i in range(batch_size): + position_ids[0, 0, input_lengths_acc[i]:input_lengths_acc[ + i + 1]] = torch.arange(0, + context_lengths[i], + dtype=torch.int32) + position_ids[0, 0, input_lengths_acc[i + 1] - + 1] = context_lengths[i] - 2 + position_ids[0, 1, input_lengths_acc[i + 1] - 1] = 1 + position_ids = position_ids.int().cuda() + last_token_ids = torch.cumsum(last_token_ids, dim=0).int().cuda() + else: + position_ids = torch.zeros([batch_size, 2, max_context_length], + dtype=torch.int32) + position_ids[:, 0, :] = torch.arange(max_context_length) + for i in range(batch_size): + length = context_lengths[i] + position_ids[i, 0, length - 1] = length - 2 + position_ids[i, 1, length - 1] = 1 + position_ids[i, :, length:] = 0 + position_ids = position_ids.cuda() + inputs = { 'position_ids': position_ids, 'last_token_ids': last_token_ids @@ -1937,17 +1963,25 @@ class ChatGLM6BHeadModelGenerationSession(GenerationSession): def _prepare_generation_inputs(self, batch_size, context_lengths, use_gpt_attention_plugin, remove_input_padding, **kwargs): - assert not remove_input_padding - last_token_ids = torch.ones_like(context_lengths) step = kwargs.pop('step') num_beams = kwargs.pop('num_beams') + last_token_ids = torch.ones_like(context_lengths) - data = [] - for i in range(batch_size): - data.append([[context_lengths[i * num_beams] - 2], [step + 2]]) - position_ids = torch.tensor(data, dtype=torch.int32, device='cuda') - position_ids = _tile_beam_width(position_ids, num_beams) + if remove_input_padding: + position_ids = torch.zeros([1, 2, batch_size], dtype=torch.int32) + for i in range(batch_size): + position_ids[0, 0, i] = context_lengths[i * num_beams] - 2 + position_ids[0, 1, i] = step + 2 + position_ids = _tile_beam_width(position_ids, num_beams) + position_ids = position_ids.int().cuda() + last_token_ids = torch.cumsum(last_token_ids, dim=0).int().cuda() + else: + data = [] + for i in range(batch_size): + data.append([[context_lengths[i * num_beams] - 2], [step + 2]]) + position_ids = torch.tensor(data, dtype=torch.int32, device='cuda') + position_ids = _tile_beam_width(position_ids, num_beams) inputs = { 'position_ids': position_ids, diff --git a/tensorrt_llm/tools/__init__.py b/tensorrt_llm/tools/__init__.py index e69de29bb2..96a0c34d85 100644 --- a/tensorrt_llm/tools/__init__.py +++ b/tensorrt_llm/tools/__init__.py @@ -0,0 +1,16 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
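The `ChatGLMGenerationSession` update above builds two-channel position ids for both the padded and the packed (`remove_input_padding`) paths. A standalone illustration of the padded-mode layout it produces, using the same logic with concrete context lengths (CPU-only for readability):

```python
import torch

# Two requests with context lengths 5 and 3, padded to max_context_length=5.
# Channel 0 carries token positions, channel 1 marks the generation block.
batch_size, max_context_length = 2, 5
context_lengths = torch.tensor([5, 3], dtype=torch.int32)

position_ids = torch.zeros([batch_size, 2, max_context_length],
                           dtype=torch.int32)
position_ids[:, 0, :] = torch.arange(max_context_length)
for i in range(batch_size):
    length = context_lengths[i]
    position_ids[i, 0, length - 1] = length - 2  # last token reuses position length-2
    position_ids[i, 1, length - 1] = 1           # ...and opens the generation block
    position_ids[i, :, length:] = 0              # padded tail stays zero

print(position_ids[1])
# tensor([[0, 1, 1, 0, 0],
#         [0, 0, 1, 0, 0]], dtype=torch.int32)
```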
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .gc_helper import cleanup # noqa diff --git a/tensorrt_llm/tools/gc_helper.py b/tensorrt_llm/tools/gc_helper.py new file mode 100644 index 0000000000..843b0d8d15 --- /dev/null +++ b/tensorrt_llm/tools/gc_helper.py @@ -0,0 +1,12 @@ +from ..module import Module +from ..network import Network + + +def cleanup(network: Network, model: Module): + # TODO: A quick fix for the memory leak caused by Parameter. + # Remove this method once the issue fixed in a proper way. + for _, param in model.named_parameters(): + # param._value captures the numpy array so that gc can't collect + # those buffers. + param._value = None + network._registered_ndarrays = None diff --git a/tensorrt_llm/tools/plugin_gen/core.py b/tensorrt_llm/tools/plugin_gen/core.py index 910e17f97d..034ba6431a 100644 --- a/tensorrt_llm/tools/plugin_gen/core.py +++ b/tensorrt_llm/tools/plugin_gen/core.py @@ -269,7 +269,7 @@ class KernelMetaData: if yaml_path: with open(yaml_path, "r") as f: yaml_str = f.read() - yaml_data = yaml.load(yaml_str, Loader=yaml.Loader) + yaml_data = yaml.load(yaml_str, Loader=yaml.SafeLoader) kernel_name = yaml_data["name"] ios = [] @@ -682,13 +682,12 @@ class PluginCmakeCodegen: def setup_jinja_env() -> jinja2.Environment: - env = jinja2.Environment( - loader=jinja2.PackageLoader( - package_name="tensorrt_llm.tools.plugin_gen", - package_path="templates", - ), - undefined=jinja2.StrictUndefined, - ) + env = jinja2.Environment(loader=jinja2.PackageLoader( + package_name="tensorrt_llm.tools.plugin_gen", + package_path="templates", + ), + undefined=jinja2.StrictUndefined, + autoescape=jinja2.select_autoescape()) env.variable_start_string = '[[' env.variable_end_string = ']]' return env diff --git a/tensorrt_llm/tools/plugin_gen/plugin_gen.py b/tensorrt_llm/tools/plugin_gen/plugin_gen.py index 2095c6cd8b..83aaea539e 100644 --- a/tensorrt_llm/tools/plugin_gen/plugin_gen.py +++ b/tensorrt_llm/tools/plugin_gen/plugin_gen.py @@ -296,14 +296,14 @@ def _mkdir(path: str): ''' mkdir if not exists ''' - subprocess.run(['mkdir', '-p', path], check=True) + subprocess.run(['/usr/bin/mkdir', '-p', path], check=True) def _rmdir(path: str): ''' rmdir if exists ''' - subprocess.run(['rm', '-rf', path], check=True) + subprocess.run(['/usr/bin/rm', '-rf', path], check=True) def _run_command(args, cwd=None): diff --git a/tensorrt_llm/tools/ppl.py b/tensorrt_llm/tools/ppl.py new file mode 100644 index 0000000000..4fd4d67753 --- /dev/null +++ b/tensorrt_llm/tools/ppl.py @@ -0,0 +1,7 @@ +def ppl(logits, output_ids): + """ + Calculate per-token perplexity. 
+ """ + nlls = -logits.log_softmax(dim=-1) + ppls = nlls.gather(-1, output_ids.long().unsqueeze(-1)) + return ppls.mean().exp().item() diff --git a/tests/attention/test_gpt_attention.py b/tests/attention/test_gpt_attention.py index e09c28ebc7..a706e8ee0b 100644 --- a/tests/attention/test_gpt_attention.py +++ b/tests/attention/test_gpt_attention.py @@ -427,11 +427,16 @@ class TestFunctional(unittest.TestCase): stream = torch.cuda.current_stream() # NOTE: when 8-bit kv cache is used together with paged kv cache no 8-bit tensors are exposed to TRT int8_trt_flag = use_int8_kv_cache and not paged_kv_cache - fp8_trt_flag = use_fp8_kv_cache and not paged_kv_cache - builder_config = builder.create_builder_config(name=attention_type, - precision=dtype, - int8=int8_trt_flag, - fp8=fp8_trt_flag) + use_fp8_kv_cache and not paged_kv_cache + quant_mode = QuantMode.from_description( + use_fp8_kv_cache=use_fp8_kv_cache + ) if use_fp8_kv_cache and not paged_kv_cache else QuantMode(0) + builder_config = builder.create_builder_config( + name=attention_type, + precision=dtype, + int8=int8_trt_flag, + quant_mode=quant_mode) + if session is None: engine = builder.build_engine(net, builder_config) session = tensorrt_llm.runtime.Session.from_serialized_engine( diff --git a/tests/attention/test_gpt_attention_IFB.py b/tests/attention/test_gpt_attention_IFB.py index 1726c05992..55c3245722 100644 --- a/tests/attention/test_gpt_attention_IFB.py +++ b/tests/attention/test_gpt_attention_IFB.py @@ -71,53 +71,59 @@ class TestFunctional(unittest.TestCase): test_cases = [] test_cases += list( product(['gpt2_attention', 'llama_attention', 'gptj_attention'], - [ContextFMHAType.disabled], ['float16'], [2], [128], [4], - [64], [0], [False], [False], [1], [True, False])) + [ContextFMHAType.disabled], ['float16'], [2], [128], [8], + [4], [64], [0], [False], [False], [1], [True, False])) # TODO: add more unit tests test_cases += list( product(['llama_attention'], [ ContextFMHAType.disabled, ContextFMHAType.enabled, ContextFMHAType.enabled_with_fp32_acc - ], ['float16'], [2], [90], [4], [32], [0], [False], [False], [1], - [False])) + ], ['float16'], [2], [90], [8], [4], [32], [0], [False], [False], + [1], [False])) # Test cases for the multi-block MMHA. test_cases += list( product(['llama_attention'], [ ContextFMHAType.enabled, ContextFMHAType.enabled_with_fp32_acc - ], ['float16', 'float32'], [2], [2048], [4], [64], [0], [True], + ], ['float16', 'float32'], [2], [2048], [8], [4], [64], [0], [True], [False], [1], [True, False])) + test_cases += list( + product(['llama_attention'], + [ContextFMHAType.enabled_with_fp32_acc], ['float16'], [16], + [2048], [32], [4], [64], [0], [True], [False], [1], + [False])) # Test cases for the int8 K/V cache. 
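For the `ppl` helper added in `tensorrt_llm/tools/ppl.py` above, a quick usage sketch on dummy data (shapes assumed to be `[batch, seq, vocab]` logits and `[batch, seq]` generated token ids):

```python
import torch

from tensorrt_llm.tools.ppl import ppl

batch, seq, vocab = 2, 4, 32
logits = torch.randn(batch, seq, vocab)
output_ids = torch.randint(0, vocab, (batch, seq))

# exp of the mean negative log-likelihood of the generated tokens
print(f'perplexity: {ppl(logits, output_ids):.3f}')

# Sanity check: uniform logits give a perplexity equal to the vocab size.
uniform = torch.zeros(batch, seq, vocab)
assert abs(ppl(uniform, output_ids) - vocab) < 1e-3
```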
test_cases += list( product(['gpt2_attention'], [ContextFMHAType.disabled], - ['float16', 'float32'], [2], [128], [4], [64], [0], [False], - [True], [1], [False])) + ['float16', 'float32'], [2], [128], [8], [4], [64], [0], + [False], [True], [1], [False])) # test cases for multi-query attention test_cases += list( product(['gpt_bigcode_attention'], [ ContextFMHAType.disabled, ContextFMHAType.enabled, ContextFMHAType.enabled_with_fp32_acc - ], ['float16'], [2], [128], [4], [64], [1], [False], [False], [1], - [False])) + ], ['float16'], [2], [128], [8], [4], [64], [1], [False], [False], + [1], [False])) # test cases for beam search test_cases += list( product(['gpt2_attention'], [ContextFMHAType.disabled], ['float16'], - [2], [128], [4], [64], [0], [False], [False], [4], [False])) + [2], [128], [8], [4], [64], [0], [False], [False], [4], + [False])) # test cases for grouped-query attention test_cases += list( product(['llama_attention'], [ContextFMHAType.disabled], - ['float16'], [2], [128], [8], [32], [2, 4], [False], + ['float16'], [2], [128], [8], [8], [32], [2, 4], [False], [False], [1], [False])) # test cases for rotary scaling test_cases += list( product(['llama_attention'], [ContextFMHAType.disabled], - ['float32'], [2], [128], [8], [32], [2, 8], [False], + ['float32'], [2], [128], [8], [8], [32], [2, 8], [False], [False], [1], [False], [10000.0, 1000000.0], [ { "type": "linear", @@ -143,6 +149,7 @@ class TestFunctional(unittest.TestCase): dtype, batch_size, in_len, + out_len, num_heads, head_size, num_kv_heads, @@ -376,9 +383,9 @@ class TestFunctional(unittest.TestCase): plugin_kv_num_heads = num_kv_heads if attention_type == 'llama_attention' or attention_type == 'gpt_bigcode_attention' else num_heads kv_hidden_size = plugin_kv_num_heads * head_size qkv_hidden_size = hidden_size + 2 * kv_hidden_size - out_len = 8 - max_seq_len = in_len + 24 + max_seq_len = in_len + out_len * 3 num_req = batch_size + in_lens = torch.randint(1, in_len + 1, (num_req, )) max_blocks_per_seq = math.ceil(max_seq_len / tokens_per_block) blocks = math.ceil( (num_req * beam_width * max_seq_len) / tokens_per_block) @@ -413,8 +420,10 @@ class TestFunctional(unittest.TestCase): cache_indirection = torch.zeros(shape_dict['cache_indirection'], dtype=torch.int32, device='cuda') - for iteration in range(1, beam_width): - cache_indirection[:, iteration, in_len:] = iteration + for req_idx in range(num_req): + in_len_req = in_lens[req_idx] + for iteration in range(1, beam_width): + cache_indirection[req_idx, iteration, in_len_req:] = iteration kv_int8_dequant_scale = torch.randint( 1, @@ -612,11 +621,13 @@ class TestFunctional(unittest.TestCase): def torch_exec(step: int, input: torch.Tensor, ctx_attention_mask: torch.Tensor, + req_idx: int, layer_past=None): assert layer_past != None or input.shape[0] == 1 nonlocal attention nonlocal attention_type - nonlocal in_len + nonlocal in_lens + in_len = in_lens[req_idx] position_ids = ctx_attention_mask.long().cumsum(-1) - 1 position_ids.masked_fill_(ctx_attention_mask == 0, 1) if step != 0: @@ -700,35 +711,38 @@ class TestFunctional(unittest.TestCase): batch_req_ids = [] for req_idx in reversed(range(num_req)): step = get_step(req_idx) + in_len_req = in_lens[req_idx] if is_valid_step(step): batch_req_ids.append(req_idx) if step == 0: - input_length_list.append([in_len]) - context_length_list += [in_len] + input_length_list.append([in_len_req]) + context_length_list += [in_len_req] request_type_list += [0] host_past_key_value_length_list += [0] sequence_selection += [req_idx 
* beam_width] num_context_req += 1 else: input_length_list.append([1] * beam_width) - context_length_list += [in_len] * beam_width + context_length_list += [in_len_req] * beam_width request_type_list += [1] * beam_width - host_past_key_value_length_list += [in_len + step - 1 - ] * beam_width + host_past_key_value_length_list += [ + in_len_req + step - 1 + ] * beam_width num_generation_req += 1 sequence_selection += list( range(req_idx * beam_width, (req_idx + 1) * beam_width)) - sequence_length_list += [in_len + step] * beam_width + sequence_length_list += [in_len_req + step] * beam_width num_seq = num_context_req + num_generation_req * beam_width # Check if new sequence arrived if iteration < num_req: + in_len_req = in_lens[iteration] # Add sequence to the manager sequence = GenerationSequence(seq_idx=iteration, batch_idx=iteration) - kv_cache_manager.add_sequence(sequence, in_len) + kv_cache_manager.add_sequence(sequence, in_len_req.clone()) # Get arrays of pointers to the "pages" of KV values pointer_arrays = kv_cache_manager.get_pointer_arrays(beam_width)[0] @@ -796,13 +810,14 @@ class TestFunctional(unittest.TestCase): for req_idx in batch_req_ids: step = get_step(req_idx) assert is_valid_step(step) + in_len_req = in_lens[req_idx] if step == 0: ctx_attention_mask_list[req_idx] = torch.ones( - (1, in_len), dtype=torch.int32, device='cuda') + (1, in_len_req), dtype=torch.int32, device='cuda') else: if step == 1: ctx_attention_mask_list[req_idx] = torch.ones( - (beam_width, in_len), + (beam_width, in_len_req), dtype=torch.int32, device='cuda') ctx_attention_mask_list[req_idx] = torch.cat( @@ -831,7 +846,7 @@ class TestFunctional(unittest.TestCase): torch_in = input_tensor[:, offset:offset_next, :].reshape( (local_beam_width, input_length, hidden_size)) torch_out, torch_cache_list[req_idx] = torch_exec( - step, torch_in, ctx_attention_mask_list[req_idx], + step, torch_in, ctx_attention_mask_list[req_idx], req_idx, torch_cache_list[req_idx]) np.testing.assert_allclose( diff --git a/tests/bindings/test_bindings.py b/tests/bindings/test_bindings.py new file mode 100644 index 0000000000..1d0ffcaba2 --- /dev/null +++ b/tests/bindings/test_bindings.py @@ -0,0 +1,343 @@ +import json +import tempfile +from pathlib import Path + +import torch + +import tensorrt_llm.bindings as _tb + + +def test_generation_output(): + ids = torch.ones(1) + lengths = torch.ones(2) + gen_output = _tb.GenerationOutput(ids, lengths) + assert torch.equal(gen_output.ids, ids) + assert torch.equal(gen_output.lengths, lengths) + + assert gen_output.log_probs is None + log_probs = torch.ones(1) + gen_output.log_probs = log_probs + assert gen_output.log_probs == log_probs + + assert gen_output.context_logits is None + torch.ones(1) + gen_output.context_logits = log_probs + assert gen_output.context_logits == log_probs + + +def test_generation_input(): + end_id = 42 + pad_id = 13 + ids = torch.ones(1) + lengths = torch.ones(2) + packed = True + gen_input = _tb.GenerationInput(end_id, pad_id, ids, lengths, packed) + assert gen_input.end_id == end_id + assert gen_input.pad_id == pad_id + assert torch.equal(gen_input.ids, ids) + assert torch.equal(gen_input.lengths, lengths) + assert gen_input.packed == packed + + assert gen_input.max_new_tokens is None + max_new_tokens = 100 + gen_input.max_new_tokens = max_new_tokens + assert gen_input.max_new_tokens == max_new_tokens + + assert gen_input.embedding_bias is None + embedding_bias = torch.ones(3) + gen_input.embedding_bias = embedding_bias + assert 
torch.equal(gen_input.embedding_bias, embedding_bias) + + assert gen_input.prompt_tuning_params.embedding_table is None + assert gen_input.prompt_tuning_params.tasks is None + assert gen_input.prompt_tuning_params.vocab_size is None + + embedding_table = torch.ones(3) + tasks = torch.ones(2) + vocab_size = torch.ones(1) + prompt_tuning_params = _tb.PromptTuningParams( + embedding_table=embedding_table, tasks=tasks, vocab_size=vocab_size) + assert len(prompt_tuning_params.prompt_tuning_enabled) == 0 + prompt_tuning_enabled = [True, False] + prompt_tuning_params.prompt_tuning_enabled = prompt_tuning_enabled + assert len(prompt_tuning_params.prompt_tuning_enabled) == 2 + assert prompt_tuning_params.prompt_tuning_enabled == prompt_tuning_enabled + gen_input.prompt_tuning_params = prompt_tuning_params + assert gen_input.prompt_tuning_params is not None + assert torch.equal(gen_input.prompt_tuning_params.embedding_table, + embedding_table) + assert torch.equal(gen_input.prompt_tuning_params.tasks, tasks) + assert torch.equal(gen_input.prompt_tuning_params.vocab_size, vocab_size) + assert gen_input.prompt_tuning_params.prompt_tuning_enabled == prompt_tuning_enabled + + +def test_gpt_session_config(): + kv_cache_config = _tb.KvCacheConfig() + assert kv_cache_config.max_tokens is None + max_tokens = 13 + kv_cache_config.max_tokens = max_tokens + assert kv_cache_config.max_tokens == max_tokens + assert kv_cache_config.free_gpu_memory_fraction is None + free_gpu_memory_fraction = 0.5 + kv_cache_config.free_gpu_memory_fraction = free_gpu_memory_fraction + assert kv_cache_config.free_gpu_memory_fraction == free_gpu_memory_fraction + + max_batch_size = 1000 + max_beam_width = 64 + max_sequence_length = 1 << 20 + gpt_session_config = _tb.GptSessionConfig(max_batch_size, max_beam_width, + max_sequence_length) + assert gpt_session_config.max_batch_size == max_batch_size + assert gpt_session_config.max_beam_width == max_beam_width + assert gpt_session_config.max_sequence_length == max_sequence_length + + assert gpt_session_config.kv_cache_config is not None + assert gpt_session_config.kv_cache_config.max_tokens is None + assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction is None + gpt_session_config.kv_cache_config = kv_cache_config + assert gpt_session_config.kv_cache_config.max_tokens == max_tokens + assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction == free_gpu_memory_fraction + gpt_session_config.kv_cache_config.max_tokens = None + assert gpt_session_config.kv_cache_config.max_tokens is None + gpt_session_config.kv_cache_config.free_gpu_memory_fraction = None + assert gpt_session_config.kv_cache_config.free_gpu_memory_fraction is None + + assert not gpt_session_config.decoder_per_request + gpt_session_config.decoder_per_request = True + assert gpt_session_config.decoder_per_request + + assert not gpt_session_config.cuda_graph_mode + gpt_session_config.cuda_graph_mode = True + assert gpt_session_config.cuda_graph_mode + + assert gpt_session_config.ctx_micro_batch_size is None + ctx_micro_batch_size = 10 + gpt_session_config.ctx_micro_batch_size = ctx_micro_batch_size + assert gpt_session_config.ctx_micro_batch_size == ctx_micro_batch_size + + assert gpt_session_config.gen_micro_batch_size is None + gen_micro_batch_size = 20 + gpt_session_config.gen_micro_batch_size = gen_micro_batch_size + assert gpt_session_config.gen_micro_batch_size == gen_micro_batch_size + + +def test_quant_mode(): + assert _tb.QuantMode.none().value == 0 + assert 
_tb.QuantMode.int4_weights().has_int4_weights + assert _tb.QuantMode.int8_weights().has_int8_weights + assert _tb.QuantMode.activations().has_activations + assert _tb.QuantMode.per_channel_scaling().has_per_channel_scaling + assert _tb.QuantMode.per_token_scaling().has_per_token_scaling + assert _tb.QuantMode.per_group_scaling().has_per_group_scaling + assert _tb.QuantMode.int8_kv_cache().has_int8_kv_cache + assert _tb.QuantMode.fp8_kv_cache().has_fp8_kv_cache + assert _tb.QuantMode.fp8_qdq().has_fp8_qdq + + quant_mode = _tb.QuantMode.from_description(True, True, True, True, True, + True, True, True) + assert quant_mode.has_int4_weights + quant_mode -= _tb.QuantMode.int4_weights() + assert not quant_mode.has_int4_weights + quant_mode += _tb.QuantMode.int4_weights() + assert quant_mode.has_int4_weights + + assert _tb.QuantMode.none() == _tb.QuantMode.none() + + +def test_gpt_model_config(): + vocab_size = 10000 + num_layers = 12 + num_heads = 16 + hidden_size = 768 + data_type = _tb.DataType.FLOAT + gpt_model_config = _tb.GptModelConfig(vocab_size, num_layers, num_heads, + hidden_size, data_type) + assert gpt_model_config.vocab_size == vocab_size + assert gpt_model_config.num_layers() == num_layers + assert gpt_model_config.num_heads == num_heads + assert gpt_model_config.hidden_size == hidden_size + assert gpt_model_config.data_type == data_type + + assert gpt_model_config.vocab_size_padded(1) is not None + assert gpt_model_config.size_per_head == hidden_size // num_heads + + assert gpt_model_config.num_kv_heads == num_heads + num_kv_heads = 1 + gpt_model_config.num_kv_heads = num_kv_heads + assert gpt_model_config.num_kv_heads == num_kv_heads + + assert not gpt_model_config.use_gpt_attention_plugin + gpt_model_config.use_gpt_attention_plugin = True + assert gpt_model_config.use_gpt_attention_plugin + + assert not gpt_model_config.use_packed_input + gpt_model_config.use_packed_input = True + assert gpt_model_config.use_packed_input + + assert not gpt_model_config.use_paged_kv_cache + gpt_model_config.use_paged_kv_cache = True + assert gpt_model_config.use_paged_kv_cache + + assert gpt_model_config.tokens_per_block == 64 + tokens_per_block = 1024 + gpt_model_config.tokens_per_block = tokens_per_block + assert gpt_model_config.tokens_per_block == tokens_per_block + + assert gpt_model_config.quant_mode == _tb.QuantMode.none() + gpt_model_config.quant_mode = _tb.QuantMode.int4_weights() + assert gpt_model_config.quant_mode.has_int4_weights + + assert gpt_model_config.supports_inflight_batching + + assert gpt_model_config.max_batch_size == 0 + max_batch_size = 1000 + gpt_model_config.max_batch_size = max_batch_size + assert gpt_model_config.max_batch_size == max_batch_size + + assert gpt_model_config.max_input_len == 0 + max_input_len = 2048 + gpt_model_config.max_input_len = max_input_len + assert gpt_model_config.max_input_len == max_input_len + + assert gpt_model_config.max_num_tokens is None + max_num_tokens = 10000 + gpt_model_config.max_num_tokens = max_num_tokens + assert gpt_model_config.max_num_tokens == max_num_tokens + + assert not gpt_model_config.compute_context_logits + gpt_model_config.compute_context_logits = True + assert gpt_model_config.compute_context_logits + + assert gpt_model_config.model_variant == _tb.GptModelVariant.GPT + model_variant = _tb.GptModelVariant.GLM + gpt_model_config.model_variant = model_variant + assert gpt_model_config.model_variant == model_variant + + assert not gpt_model_config.use_custom_all_reduce + gpt_model_config.use_custom_all_reduce = True + 
assert gpt_model_config.use_custom_all_reduce + + +def test_world_config(): + tensor_parallelism = 2 + pipeline_parallelism = 4 + rank = 3 + gpus_per_node = 10 + world_config = _tb.WorldConfig(tensor_parallelism, pipeline_parallelism, + rank, gpus_per_node) + assert world_config.tensor_parallelism == tensor_parallelism + assert world_config.pipeline_parallelism == pipeline_parallelism + assert world_config.rank == rank + assert world_config.gpus_per_node == gpus_per_node + assert world_config.size == tensor_parallelism * pipeline_parallelism + assert world_config.is_pipeline_parallel + assert world_config.is_tensor_parallel + assert world_config.device == rank % gpus_per_node + assert world_config.pipeline_parallel_rank == rank // tensor_parallelism + assert world_config.tensor_parallel_rank == rank % tensor_parallelism + + world_config = _tb.WorldConfig.mpi(gpus_per_node) + assert world_config.tensor_parallelism == 1 + assert world_config.pipeline_parallelism == 1 + assert world_config.gpus_per_node == gpus_per_node + assert world_config.rank == 0 + + +def test_sampling_config(): + beam_width = 12 + sampling_config = _tb.SamplingConfig(beam_width) + assert sampling_config.beam_width == 12 + + def check_empty_then_set(member, value): + assert getattr(sampling_config, member) is None + setattr(sampling_config, member, value) + assert getattr(sampling_config, member) == value + + float_array = [1., 2., 3.] + size_t_array = [1, 2, 3] + check_empty_then_set("temperature", float_array) + check_empty_then_set("min_length", size_t_array) + check_empty_then_set("repetition_penalty", float_array) + check_empty_then_set("presence_penalty", float_array) + check_empty_then_set("top_k", size_t_array) + check_empty_then_set("top_p", float_array) + check_empty_then_set("random_seed", size_t_array) + check_empty_then_set("top_p_decay", float_array) + check_empty_then_set("top_p_min", float_array) + check_empty_then_set("top_p_reset_ids", size_t_array) + check_empty_then_set("beam_search_diversity_rate", float_array) + check_empty_then_set("length_penalty", float_array) + + +def test_gpt_json_config(): + model_config = { + "vocab_size": 1000, + "num_layers": 12, + "num_heads": 4, + "hidden_size": 512, + "data_type": _tb.DataType.FLOAT, + } + gpt_model_config = _tb.GptModelConfig(**model_config) + json_config = { + "name": "gpt", + "precision": "float32", + "tensor_parallelism": 1, + "pipeline_parallelism": 1, + "model_config": gpt_model_config + } + + gpt_json_config = _tb.GptJsonConfig(**json_config) + + def check_properties(the_object, properties, model_config): + for property, value in properties.items(): + if isinstance(value, _tb.GptModelConfig): + object_config = getattr(the_object, property) + for subproperty, subvalue in model_config.items(): + member = getattr(object_config, subproperty) + if callable(member): + member = member() + assert member == subvalue + else: + assert getattr(the_object, property) == value + + check_properties(gpt_json_config, json_config, model_config) + + json_dict = { + "builder_config": { + "name": json_config["name"], + "vocab_size": model_config["vocab_size"], + "num_layers": model_config["num_layers"], + "num_heads": model_config["num_heads"], + "hidden_size": model_config["hidden_size"], + "precision": json_config["precision"], + "tensor_parallel": json_config["tensor_parallelism"], + "pipeline_parallel": json_config["pipeline_parallelism"], + }, + "plugin_config": { + "paged_kv_cache": False, + "tokens_per_block": 0, + "gpt_attention_plugin": False, + 
"remove_input_padding": False, + "use_custom_all_reduce": False, + } + } + + gpt_json_config = _tb.GptJsonConfig.parse(json.dumps(json_dict)) + + with tempfile.NamedTemporaryFile("w", delete=False) as fp: + json.dump(json_dict, fp) + fp.close() + + gpt_json_config = _tb.GptJsonConfig.parse_file(fp.name) + Path(fp.name).unlink() + + rank = 3 + gpus_per_node = 10 + world_config = _tb.WorldConfig(json_config["tensor_parallelism"], + json_config["pipeline_parallelism"], rank, + gpus_per_node) + + assert gpt_json_config.engine_filename( + world_config) == json_config["name"] + "_float32_tp1_rank3.engine" + assert gpt_json_config.engine_filename( + world_config, "llama") == "llama_float32_tp1_rank3.engine" diff --git a/tests/model/test_gptneox.py b/tests/model/test_gptneox.py index f5137acd6e..014c99119d 100644 --- a/tests/model/test_gptneox.py +++ b/tests/model/test_gptneox.py @@ -408,48 +408,6 @@ class TestGPTNeoX(unittest.TestCase): compare_max_abs_error(ref, res, "generation logits") - def test_gptneox_noplugin_unsupported(self): - - use_refit = False - apply_query_key_layer_scaling = False - model = 'gptneox' - - log_level = 'error' - dtype = 'float16' - world_size = 1 - rank = 0 - hidden_act = 'gelu' - n_layer = 1 - max_length = 2 - batch_size = 4 - seq_len = 128 - use_attention_plugin = False - use_ln_gemm_plugin = True - beam_width = 1 - - gpt_config, hf_gpt = self._gen_hf_gpt_neox(hidden_act, n_layer, - seq_len + max_length, dtype) - with self.assertRaisesRegex( - ValueError, - ".*GPT-NeoX RoPE is only supported with GPTAttention plugin.*"): - runtime, _ = self._gen_tensorrt_llm_runtime( - log_level, dtype, world_size, rank, gpt_config, hf_gpt, model, - use_attention_plugin, batch_size, beam_width, seq_len, - max_length, use_refit, use_ln_gemm_plugin, - apply_query_key_layer_scaling) - - use_ln_gemm_plugin = False - if trt.__version__[:3] == '8.6': - with self.assertRaisesRegex( - AssertionError, - "You need to enable the LayerNorm plugin for GPT-NeoX with TensorRT" - ): - runtime, _ = self._gen_tensorrt_llm_runtime( - log_level, dtype, world_size, rank, gpt_config, hf_gpt, - model, use_attention_plugin, batch_size, beam_width, - seq_len, max_length, use_refit, use_ln_gemm_plugin, - apply_query_key_layer_scaling) - if __name__ == '__main__': unittest.main() diff --git a/tests/quantization/test_quant.py b/tests/quantization/test_quant.py index c98ac70109..fd287c21db 100644 --- a/tests/quantization/test_quant.py +++ b/tests/quantization/test_quant.py @@ -16,8 +16,7 @@ import unittest from tensorrt_llm.layers import ColumnLinear, RowLinear from tensorrt_llm.mapping import Mapping -from tensorrt_llm.models import (GPTLMHeadModel, smooth_quantize, - weight_only_quantize) +from tensorrt_llm.models import GPTLMHeadModel, quantize_model from tensorrt_llm.quantization import QuantMode from tensorrt_llm.quantization.layers import (SmoothQuantAttention, SmoothQuantLayerNorm, @@ -39,7 +38,7 @@ class TestQuant(unittest.TestCase): max_position_embeddings=1024, dtype='float16') - quant_model = weight_only_quantize(model, mode) + quant_model = quantize_model(model, mode) self.assertTrue(hasattr(quant_model, 'quant_mode')) @@ -82,9 +81,9 @@ class TestQuant(unittest.TestCase): max_position_embeddings=1024, dtype='float16') - quant_model = weight_only_quantize(model, - mode, - exclude_modules=['fc', 'dense']) + quant_model = quantize_model(model, + mode, + exclude_modules=['fc', 'dense']) self.assertTrue(hasattr(quant_model, 'quant_mode')) @@ -111,7 +110,7 @@ class TestQuant(unittest.TestCase): 
mapping=Mapping(world_size=1, rank=0, tp_size=1)) quant_mode = QuantMode.use_smooth_quant() - sq_gpt = smooth_quantize(gpt, quant_mode) + sq_gpt = quantize_model(gpt, quant_mode) for layer in sq_gpt.layers: assert isinstance(layer.input_layernorm, SmoothQuantLayerNorm) assert isinstance(layer.post_layernorm, SmoothQuantLayerNorm) diff --git a/tests/tools/plugin_gen/test_plugin_gen.py b/tests/tools/plugin_gen/test_plugin_gen.py index 59886b1e64..7880d2ffcf 100644 --- a/tests/tools/plugin_gen/test_plugin_gen.py +++ b/tests/tools/plugin_gen/test_plugin_gen.py @@ -23,6 +23,13 @@ def is_triton_installed() -> bool: return os.path.exists(TRITON_COMPILE_BIN) -@pytest.mark.skipif(not is_triton_installed(), reason='triton is not installed') +def is_trt_automation() -> bool: + return os.path.exists("/build/config.yml") + + +@pytest.mark.skipif( + not is_triton_installed() or is_trt_automation(), + reason='triton is not installed, this test is not supported in trt automation' +) def test_end_to_end(): gen_trt_plugins(workspace=WORKSPACE, metas=[KERNEL_META_DATA]) diff --git a/windows/README.md b/windows/README.md index d0542e02e2..52ebd0b2da 100644 --- a/windows/README.md +++ b/windows/README.md @@ -30,7 +30,7 @@ Prerequisites: - [TensorRT 9.1.0.4 for TensorRT-LLM](https://developer.nvidia.com/downloads/compute/machine-learning/tensorrt/secure/9.1.0/tars/tensorrt-9.1.0.4.windows10.x86_64.cuda-12.2.llm.beta.zip) ``` -pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/nightly/cu121 +pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ``` ## Detailed Setup @@ -118,7 +118,7 @@ The above command will generate `build\tensorrt_llm-0.5.0-py3-none-any.whl`. Oth To download and install the wheel, in Powershell, run: ``` -pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/nightly/cu121 +pip install tensorrt_llm --extra-index-url https://pypi.nvidia.com --extra-index-url https://download.pytorch.org/whl/cu121 ``` Alternatively, if you built the wheel from source, run: