open source 315e9f5ccd286e906d4c0d402fefbf2f69a1febe (#2033)

This commit is contained in:
Kaiyu Xie 2024-07-26 16:19:24 +08:00 committed by GitHub
parent 5fa9436e17
commit 93293aa46d
GPG Key ID: B5690EEEBB952194
97 changed files with 3057 additions and 6767 deletions

View File

@ -10,8 +10,6 @@ multiple GPUs or multiple nodes with multiple GPUs using the Python runtime.
The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
* [`benchmarks/python/build.py`](./build.py) to build supported models for benchmarking.
* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmarks.
* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like (LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
@ -25,37 +23,29 @@ python benchmark.py -h
```
### 1. Single GPU benchmark
Take GPT-350M as an example:
Take LLaMA 7B as an example:
```
python benchmark.py \
-m gpt_350m \
--mode plugin \
-m dec \
--engine_dir llama_7b \
--batch_size "1;8;64" \
--input_output_len "60,20;128,20"
```
Expected outputs:
```
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 1 input_length 60 output_length 20 gpu_peak_mem(gb) 4.2 build_time(s) 25.67 tokens_per_sec 483.54 percentile95(ms) 41.537 percentile99(ms) 42.102 latency(ms) 41.362 compute_cap sm80
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 8 input_length 60 output_length 20 gpu_peak_mem(gb) 4.28 build_time(s) 25.67 tokens_per_sec 3477.28 percentile95(ms) 46.129 percentile99(ms) 46.276 latency(ms) 46.013 compute_cap sm80
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 64 input_length 60 output_length 20 gpu_peak_mem(gb) 4.8 build_time(s) 25.67 tokens_per_sec 19698.07 percentile95(ms) 65.739 percentile99(ms) 65.906 latency(ms) 64.981 compute_cap sm80
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 1 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 170.77 percentile95(ms) 117.591 percentile99(ms) 124.262 latency(ms) 117.115 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 110.189 total_generated_tokens 19.0 generation_tokens_per_second 172.43
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 8 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 1478.55 percentile95(ms) 108.641 percentile99(ms) 109.546 latency(ms) 108.214 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 98.194 total_generated_tokens 152.0 generation_tokens_per_second 1547.951
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 64 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 8214.87 percentile95(ms) 156.748 percentile99(ms) 160.203 latency(ms) 155.815 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 111.078 total_generated_tokens 1216.0 generation_tokens_per_second 10947.303
...
```
*Please note that the expected outputs are only for reference; specific performance numbers depend on the GPU you're using.*
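The `--batch_size` and `--input_output_len` options take semicolon-separated lists, with each input/output pair separated by a comma. A minimal sketch of how such option strings decompose (the helper names below are hypothetical, not the benchmark's own parser):
```python
# Hypothetical helpers illustrating the option-string format used above;
# benchmark.py has its own parsing, this is only a sketch.
def parse_batch_sizes(arg: str):
    """'1;8;64' -> [1, 8, 64]"""
    return [int(b) for b in arg.split(";") if b]

def parse_in_out_lens(arg: str):
    """'60,20;128,20' -> [(60, 20), (128, 20)]"""
    pairs = []
    for item in arg.split(";"):
        if item:
            inlen, outlen = item.split(",")
            pairs.append((int(inlen), int(outlen)))
    return pairs

print(parse_batch_sizes("1;8;64"))        # [1, 8, 64]
print(parse_in_out_lens("60,20;128,20"))  # [(60, 20), (128, 20)]
```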
### 2. Multi-GPU benchmark
Take GPT-175B as an example:
Take LLaMA 7B as an example:
```
mpirun -n 8 python benchmark.py \
-m gpt_175b \
--mode plugin \
mpirun -n 2 python benchmark.py \
-m dec \
--engine_dir llama_7b \
--batch_size "1;8;64" \
--input_output_len "60,20;128,20"
```
Note: Building multi-GPU engines in parallel can be a heavy workload for the CPU system. Tuning the `mpirun --map-by <XXX>` option on your system may yield a significant boost in build time, for example:
```
mpirun --map-by socket -n 8 python build.py \
--model gpt_175b \
--mode ootb \
--quantization fp8
```
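Since the benchmark now consumes a prebuilt engine, the number of MPI ranks must match the world size the engine was built with (the sanity checks in `base_benchmark.py` assert this). A small pre-flight sketch, assuming the `config.json` layouts shown later in this commit:
```python
# Sketch only: read the engine's world size from config.json so the
# `mpirun -n <N>` launch above can be matched to it. The two layouts below
# mirror the checks in base_benchmark.py (new build API vs. legacy builder_config).
import json
import os

def engine_world_size(engine_dir: str) -> int:
    with open(os.path.join(engine_dir, "config.json")) as f:
        config = json.load(f)
    if "pretrained_config" in config:  # new build API layout
        return config["pretrained_config"]["mapping"]["world_size"]
    return config["builder_config"]["tensor_parallel"]  # legacy layout

# e.g. only launch `mpirun -n 2 ...` when engine_world_size("llama_7b") == 2
```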

File diff suppressed because it is too large

View File

@ -32,13 +32,13 @@ def get_compute_cap():
return str(int(float(csv_value) * 10))
def get_csv_filename(model, dtype, tp_size, mode, **kwargs):
def get_csv_filename(model, dtype, tp_size, **kwargs):
sm = get_compute_cap()
if len(kwargs) == 0:
kw_pairs = ""
else:
kw_pairs = "_" + "_".join([str(k) + str(v) for k, v in kwargs.items()])
return f'{model}_{dtype}_tp{tp_size}_{mode}{kw_pairs}_sm{sm}.csv'
return f'{model}_{dtype}_tp{tp_size}_{kw_pairs}_sm{sm}.csv'
def get_engine_name(model, dtype, tp_size, rank):
@ -59,13 +59,7 @@ def serialize_engine(engine, path):
class BaseBenchmark(object):
def __init__(self,
engine_dir,
model_name,
dtype,
rank,
world_size,
serial_build: bool = False):
def __init__(self, engine_dir, model_name, dtype, rank, world_size):
self.engine_dir = engine_dir
self.model_name = model_name
self.dtype = dtype
@ -74,73 +68,67 @@ class BaseBenchmark(object):
self.engine_model_name = model_name
self.quant_mode = QuantMode(0)
self.enable_fp8 = False
if engine_dir is not None:
# Read config from engine directory
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
self.config = json.load(f)
# Sanity checks
if 'pretrained_config' in self.config: # new build api branch
config_dtype = self.config['pretrained_config']['dtype']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['pretrained_config']['mapping'][
'world_size']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['pretrained_config'].items():
setattr(self, key, value)
self.quant_mode = QuantMode.from_quant_algo(
quant_algo=self.quantization['quant_algo'],
kv_cache_quant_algo=self.quantization['kv_cache_quant_algo'])
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['build_config'].items():
setattr(self, key, value)
for key, value in self.plugin_config.items():
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = f"rank{self.runtime_rank}.engine"
self.num_kv_heads = self.num_key_value_heads
self.num_layers = self.num_hidden_layers
self.num_heads = self.num_attention_heads
else:
# Read config from engine directory
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
self.config = json.load(f)
# Sanity checks
if 'pretrained_config' in self.config: # new build api branch
config_dtype = self.config['pretrained_config']['dtype']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['pretrained_config']['mapping'][
'world_size']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['pretrained_config'].items():
config_dtype = self.config['builder_config']['precision']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['builder_config']['tensor_parallel']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['builder_config'].items():
if key == "quant_mode":
self.quant_mode = QuantMode(value)
elif key in "name":
self.engine_model_name = value
else:
setattr(self, key, value)
self.quant_mode = QuantMode.from_quant_algo(
quant_algo=self.quantization['quant_algo'],
kv_cache_quant_algo=self.quantization['kv_cache_quant_algo']
)
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['build_config'].items():
setattr(self, key, value)
for key, value in self.plugin_config.items():
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = f"rank{self.runtime_rank}.engine"
self.num_kv_heads = self.num_key_value_heads
self.num_layers = self.num_hidden_layers
self.num_heads = self.num_attention_heads
else:
# Read config from engine directory
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
self.config = json.load(f)
# Sanity checks
config_dtype = self.config['builder_config']['precision']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['builder_config']['tensor_parallel']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['builder_config'].items():
if key == "quant_mode":
self.quant_mode = QuantMode(value)
elif key in "name":
self.engine_model_name = value
else:
setattr(self, key, value)
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['plugin_config'].items():
# Same effect as self.use_foo_plugin = config.json["foo_plugin"]
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = get_engine_name(self.engine_model_name,
self.dtype, self.world_size,
self.runtime_rank)
else:
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['plugin_config'].items():
# Same effect as self.use_foo_plugin = config.json["foo_plugin"]
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = get_engine_name(self.engine_model_name,
self.dtype, self.world_size,
self.runtime_rank)
@ -148,9 +136,9 @@ class BaseBenchmark(object):
self.runtime_mapping = tensorrt_llm.Mapping(world_size=self.world_size,
rank=self.runtime_rank,
tp_size=self.world_size)
if not serial_build:
torch.cuda.set_device(self.runtime_rank %
self.runtime_mapping.gpus_per_node)
torch.cuda.set_device(self.runtime_rank %
self.runtime_mapping.gpus_per_node)
self.csv_filename = "" # lazy init
@ -189,7 +177,6 @@ class BaseBenchmark(object):
self.csv_filename = get_csv_filename(self.model_name,
self.dtype,
self.world_size,
self.mode,
fp8linear=int(self.enable_fp8))
return self.csv_filename
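For reference, a standalone sketch of the new CSV naming scheme with `mode` dropped (the real `get_csv_filename` queries the GPU's compute capability; "90" is hard-coded here); note the double underscore that the leading `_` in `kw_pairs` now produces:
```python
# Standalone sketch of the new get_csv_filename behavior, not the real function.
def csv_filename_sketch(model, dtype, tp_size, sm="90", **kwargs):
    if len(kwargs) == 0:
        kw_pairs = ""
    else:
        kw_pairs = "_" + "_".join(str(k) + str(v) for k, v in kwargs.items())
    return f"{model}_{dtype}_tp{tp_size}_{kw_pairs}_sm{sm}.csv"

print(csv_filename_sketch("dec", "float16", 1, fp8linear=0))
# -> dec_float16_tp1__fp8linear0_sm90.csv
```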

View File

@ -20,26 +20,15 @@ import torch
def parse_arguments():
from allowed_configs import get_allowed_models
parser = argparse.ArgumentParser(
description='Benchmark TensorRT-LLM models.')
parser.add_argument('-m',
'--model',
type=str,
default="gpt_350m",
choices=get_allowed_models(),
help='Specify model you want to benchmark.')
parser.add_argument(
'--mode',
type=str,
default="plugin",
choices=['ootb', 'plugin', 'ootb-except-mha'],
help=
('Choose mode between ootb/plugin. '
'\"ootb\" means the engines will be built without any plugins, '
'\"plugin\" means the engines will be built with tuned recipe of using plugins.'
'\"ootb-except-mha\" means the engines will be built with only attention plugins.'
))
default="dec",
choices=["dec", "enc", "enc-dec"],
help='Specify type of the model you want to benchmark. '
'Choose model between dec/enc/enc-dec.')
parser.add_argument('--batch_size',
type=str,
@ -69,13 +58,6 @@ def parse_arguments():
default='float16',
choices=['float16', 'bfloat16', 'float32'],
help='Choose data type between float16/bfloat16/float32.')
parser.add_argument(
'--refit',
default=False,
action="store_true",
help=
'If this option is specified, a refit flag is added to TensorRT engines.'
)
parser.add_argument('--num_beams',
type=int,
@ -100,14 +82,6 @@ def parse_arguments():
type=str,
default='model.cache',
help='The path to write timing cache')
parser.add_argument(
'--profiling_verbosity',
type=str,
default='layer_names_only',
choices=['layer_names_only', 'detailed', 'none'],
help=
'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.'
)
parser.add_argument(
'--log_level',
type=str,
@ -131,75 +105,14 @@ def parse_arguments():
default=60,
help='Minimal duration of iterations to measure in seconds.')
parser.add_argument(
'--output_dir',
type=str,
default=None,
help=
'If this option is specified, TensorRT engines will be saved to the specified path.'
)
parser.add_argument(
'--engine_dir',
type=str,
default=None,
required=True,
help=
('If this option is specified, instead of building engines on the fly before benchmarking, '
'the engines contained in the engine_dir will be used.'))
parser.add_argument(
'--max_beam_width',
type=int,
default=None,
help=
('If this option is specified, it will override the max beam width of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_input_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max input len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_encoder_input_len',
type=int,
default=None,
help=
('This argument is only for encoder-decoder models'
'If this option is specified, it will override the max encoder input len of TRT engines to the specified value instead of using pre-defined one'
'By default when this option is not used, it will use pre-defined max encoder input len'
))
parser.add_argument(
'--max_decoder_input_len',
type=int,
default=None,
help=
('This argument is only for encoder-decoder models'
'If this option is specified, it will override the max decoder input len of TRT engines to the specified value instead of using pre-defined one'
'By default when this option is not used, it will use pre-defined max decoder input len'
))
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
dest='max_seq_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max sequence len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_batch_size',
type=int,
default=None,
help=
('If this option is specified, it will override the max batch size of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--force_num_layer_1',
default=False,
action='store_true',
help=
'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
)
parser.add_argument(
'--gpu_weights_percent',
type=str,
@ -207,13 +120,6 @@ def parse_arguments():
help='Specify the percentage of weights that reside on GPU (from 0 to 1).'
'Multiple percentages can be separated by \";\", '
'example: \"0;0.5;1\".')
parser.add_argument(
'--multiple_profiles',
default=False,
action='store_true',
help=
'This option will benefit performance, but will increase the engine build time.'
)
parser.add_argument('--csv',
default=False,
@ -234,40 +140,7 @@ def parse_arguments():
'int8_sq_per_channel_ootb'
],
help="Optimize the model with specified quantization recipe")
parser.add_argument(
'--build_only',
default=False,
action='store_true',
help=
"Build engine only and skip inference, this can help to benchmark the build time on single gpu node for multi GPU model, where the inference is not possible"
)
parser.add_argument('--serial_build',
default=False,
action='store_true',
help="Build engines serially")
parser.add_argument(
'--rank',
type=int,
default=None,
help=
"The rank of the model to be built, only used when --build_only and --serial_build is specified"
)
parser.add_argument(
'--world_size',
type=int,
default=None,
help=
"The number of gpus to be used for inference, only used when --build_only and --serial_build is specified"
)
parser.add_argument(
'--debug_memory',
default=False,
action='store_true',
help=
"Check the estimated memory usage against the total GPU memory. Raise error if the estimated memory requirement is bigger than the total GPU memory"
"Warning: only GPT model family is supported for now")
parser.add_argument(
'--dump_profile',
default=False,
@ -281,25 +154,6 @@ def parse_arguments():
help=
"Print layer information of the engine to console (default = disabled)")
parser.add_argument(
'--opt_batch_size',
type=int,
default=None,
help=
"If opt_batch_size option is specified, it will override the opt batch size."
"This flag only takes effect when `--mode=ootb` is added. For other modes, please use --opt_num_tokens to replace it."
)
parser.add_argument(
'--opt_num_tokens',
type=int,
default=None,
help="It equals to max_batch_size*max_beam_width by default, set this "
"value as close as possible to the actual number of tokens on your workload. "
"Note that this argument might be removed in the future."
"This flag only takes effect when `--mode` is not `ootb`. For ootb mode, please use --opt_batch_size to replace it."
)
return parser.parse_args()
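The `--gpu_weights_percent` option above takes a semicolon-separated list of fractions; a minimal sketch of how such a string decomposes and when weight streaming kicks in, mirroring `args.weight_streaming` in `main()` below (the helper name is hypothetical):
```python
# Hypothetical helper: split "0;0.5;1" into floats; weight streaming is
# enabled when any requested fraction is below 1 (see main() below).
def parse_gpu_weights_percents(arg: str):
    percents = [float(p) for p in arg.split(";") if p]
    for p in percents:
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"gpu_weights_percent must be in [0, 1], got {p}")
    return percents

percents = parse_gpu_weights_percents("0;0.5;1")
weight_streaming = any(p != 1 for p in percents)  # True for this example
```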
@ -308,7 +162,6 @@ def main(args):
# tensorrt_llm is imported, but mpi4py does not work well with
# the start method `spawn` of Python multiprocessing,
# so we set the start method first, then initialize MPI.
from allowed_configs import get_allowed_models
from benchmark_profiler import BenchmarkProfiler
from bert_benchmark import BERTBenchmark
from enc_dec_benchmark import EncDecBenchmark
@ -341,17 +194,8 @@ def main(args):
)
args.weight_streaming = any([p != 1 for p in gpu_weights_percents])
if args.serial_build and not args.build_only:
raise Exception(
f"--serial_build must be used with --build_only, always need to parallel build to do inference in the same process"
)
if args.build_only and args.serial_build and args.rank is not None and args.world_size is not None:
rank = args.rank
world_size = args.world_size
else:
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()
# TODO: Re-enable memory monitor for multi-gpu benchmarks.
# Current Mem Monitor will cause benchmark script hang
@ -361,30 +205,25 @@ def main(args):
from mem_monitor import MemoryMonitor
benchmark_profiler = None
if args.model in get_allowed_models(benchmark_type="gpt"):
if args.model == "dec":
benchmark_profiler = BenchmarkProfiler()
benchmarker = GPTBenchmark(args, batch_size_options, in_out_len_options,
gpu_weights_percents, rank, world_size)
elif args.model in get_allowed_models(benchmark_type="bert"):
elif args.model == "enc":
benchmarker = BERTBenchmark(args, batch_size_options, input_len_options,
gpu_weights_percents, rank, world_size)
elif args.model in get_allowed_models(benchmark_type="enc_dec"):
elif args.model == "enc-dec":
benchmarker = EncDecBenchmark(args, batch_size_options,
in_out_len_options, gpu_weights_percents,
rank, world_size)
else:
raise Exception(f'Unexpected model: {args.model}')
if args.build_only:
return
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
benchmarker.print_report_header(args.csv,
benchmark_profiler=benchmark_profiler)
for config in benchmarker.get_config():
if isinstance(benchmarker, GPTBenchmark):
benchmarker.check_memory(config, raise_exception=args.debug_memory)
try:
if args.weight_streaming:
# We pass in config instead of the gpu_weights_percent here to keep this benchmark script

View File

@ -18,9 +18,7 @@ import os
import torch
import tensorrt as trt
#isort: on
from allowed_configs import get_build_config
from base_benchmark import BaseBenchmark
from build import build_bert
import tensorrt_llm
from tensorrt_llm._utils import trt_dtype_to_torch
@ -32,36 +30,17 @@ class BERTBenchmark(BaseBenchmark):
def __init__(self, args, batch_sizes, in_lens, gpu_weights_percents, rank,
world_size):
super().__init__(args.engine_dir, args.model, args.dtype, rank,
world_size, args.serial_build)
world_size)
self.batch_sizes = batch_sizes
self.in_lens = in_lens
self.build_time = 0
self.mode = args.mode
self.gpu_weights_percents = gpu_weights_percents
if args.engine_dir is not None:
# Deserialize engine from engine directory
self.serialize_path = os.path.join(args.engine_dir,
self.engine_name)
with open(self.serialize_path, 'rb') as f:
engine_buffer = f.read()
else:
# Build engine
for key, value in get_build_config(args.model).items():
setattr(self, key, value)
if args.force_num_layer_1:
self.num_layers = 1
if args.max_batch_size is not None:
self.max_batch_size = args.max_batch_size
if args.max_input_len is not None:
self.max_input_len = args.max_input_len
engine_buffer, build_time = build_bert(args)
self.build_time = build_time
assert engine_buffer is not None
if args.build_only:
return
# Deserialize engine from engine directory
self.serialize_path = os.path.join(args.engine_dir, self.engine_name)
with open(self.serialize_path, 'rb') as f:
engine_buffer = f.read()
assert engine_buffer is not None
self.session = tensorrt_llm.runtime.Session.from_serialized_engine(
engine_buffer)

File diff suppressed because it is too large

View File

@ -18,14 +18,13 @@ import os
# isort: off
import torch
#isort: on
from allowed_configs import get_build_config
from base_benchmark import BaseBenchmark, get_engine_name
from build import build_enc_dec
from base_benchmark import BaseBenchmark
import tensorrt_llm
from tensorrt_llm._utils import (trt_dtype_to_torch, str_dtype_to_trt)
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime.session import TensorInfo
from tensorrt_llm.runtime import ModelConfig
class EncDecBenchmark(BaseBenchmark):
@ -34,10 +33,8 @@ class EncDecBenchmark(BaseBenchmark):
rank, world_size):
self.engine_dir = args.engine_dir
self.model_name = args.model
self.mode = args.mode
self.enable_fp8 = False # hardcode for enc-dec models
self.dtype = args.dtype
self.output_dir = args.output_dir
self.runtime_rank = rank
self.world_size = world_size
self.csv_filename = "" # lazy init
@ -63,87 +60,93 @@ class EncDecBenchmark(BaseBenchmark):
"config.json")
with open(config_path, "r") as f:
config = json.load(f)
# Sanity checks
config_dtype = config["builder_config"]["precision"]
assert (
self.dtype == config_dtype
), f"Engine dtype ({config_dtype}) != Runtime dtype ({self.dtype})"
world_size = config["builder_config"]["tensor_parallel"]
assert (
world_size == self.world_size
), f"Engine world size ({world_size}) != Runtime world size ({self.world_size})"
tp_size = config["builder_config"]["tensor_parallel"]
# TP only for benchmarking
assert (
tp_size == self.world_size
), f"Engine tensor parallel size ({tp_size}) should be equal to world size ({self.world_size})"
assert (
config["plugin_config"]["remove_input_padding"] == False
), "remove_input_padding should be False for enc-dec benchmarks"
num_heads = config["builder_config"]["num_heads"]
builder_config = config['build_config']
plugin_config = builder_config['plugin_config']
pretrained_config = config['pretrained_config']
lora_config = builder_config['lora_config']
builder_config['auto_parallel_config']
use_gpt_attention_plugin = plugin_config["gpt_attention_plugin"]
remove_input_padding = plugin_config["remove_input_padding"]
use_lora_plugin = plugin_config["lora_plugin"]
tp_size = pretrained_config['mapping']['tp_size']
pp_size = pretrained_config['mapping']['pp_size']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), \
f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
num_heads = pretrained_config["num_attention_heads"]
hidden_size = pretrained_config["hidden_size"]
head_size = pretrained_config["head_size"]
vocab_size = pretrained_config["vocab_size"]
max_batch_size = builder_config["max_batch_size"]
max_beam_width = builder_config["max_beam_width"]
num_layers = pretrained_config["num_hidden_layers"]
num_kv_heads = pretrained_config.get('num_kv_heads', num_heads)
assert (num_heads % tp_size) == 0
# Get model config
num_heads = num_heads // tp_size
hidden_size = config["builder_config"]["hidden_size"] // tp_size
num_kv_heads = config["builder_config"].get(
"num_kv_heads", config["builder_config"]["num_heads"])
hidden_size = hidden_size // tp_size
num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size
model_config = tensorrt_llm.runtime.ModelConfig(
cross_attention = pretrained_config[
"architecture"] == "DecoderModel"
skip_cross_qkv = pretrained_config.get('skip_cross_qkv', False)
has_position_embedding = pretrained_config[
"has_position_embedding"]
has_token_type_embedding = hasattr(pretrained_config,
"type_vocab_size")
dtype = pretrained_config["dtype"]
paged_kv_cache = plugin_config['paged_kv_cache']
tokens_per_block = plugin_config['tokens_per_block']
gather_context_logits = builder_config.get(
'gather_context_logits', False)
gather_generation_logits = builder_config.get(
'gather_generation_logits', False)
max_prompt_embedding_table_size = builder_config.get(
'max_prompt_embedding_table_size', 0)
self.max_batch_size = config["build_config"]["max_batch_size"]
self.max_input_len = config["build_config"][
"max_encoder_input_len"]
self.max_seq_len = config["build_config"]["max_seq_len"]
model_config = ModelConfig(
num_heads=num_heads,
num_kv_heads=num_kv_heads,
hidden_size=hidden_size,
head_size=config["builder_config"]["head_size"],
max_batch_size=config["builder_config"]["max_batch_size"],
max_beam_width=config["builder_config"]["max_beam_width"],
vocab_size=config["builder_config"]["vocab_size"],
num_layers=config["builder_config"]["num_layers"],
gpt_attention_plugin=config["plugin_config"]
["gpt_attention_plugin"],
remove_input_padding=config["plugin_config"]
["remove_input_padding"],
cross_attention=config["builder_config"]["cross_attention"],
skip_cross_qkv=config["builder_config"]["skip_cross_qkv"],
has_position_embedding=config["builder_config"]
["has_position_embedding"],
has_token_type_embedding=config["builder_config"]
["has_token_type_embedding"],
dtype=config_dtype,
head_size=head_size,
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
vocab_size=vocab_size,
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=remove_input_padding,
paged_kv_cache=paged_kv_cache,
tokens_per_block=tokens_per_block,
cross_attention=cross_attention,
has_position_embedding=has_position_embedding,
has_token_type_embedding=has_token_type_embedding,
dtype=dtype,
gather_context_logits=gather_context_logits,
gather_generation_logits=gather_generation_logits,
max_prompt_embedding_table_size=
max_prompt_embedding_table_size,
lora_plugin=use_lora_plugin,
lora_target_modules=lora_config.get('lora_target_modules'),
trtllm_modules_to_hf_modules=lora_config.get(
'trtllm_modules_to_hf_modules'),
skip_cross_qkv=skip_cross_qkv,
)
self.max_batch_size = config["builder_config"]["max_batch_size"]
self.max_input_len = config["builder_config"][
"max_encoder_input_len"]
self.max_seq_len = config["builder_config"]["max_seq_len"]
self.n_mels = config["builder_config"][
'n_mels'] if 'whisper' in self.model_name else 0
for key, value in config["builder_config"].items():
if key == "name":
engine_model_name = value
break
return engine_model_name, model_config
return model_config
(
self.encoder_engine_model_name,
self.encoder_model_config,
) = read_config("encoder")
(
self.decoder_engine_model_name,
self.decoder_model_config,
) = read_config("decoder")
self.encoder_model_config = read_config("encoder")
self.decoder_model_config = read_config("decoder")
self.encoder_engine_name = get_engine_name(
self.encoder_engine_model_name,
self.dtype,
self.world_size,
self.runtime_rank,
)
self.decoder_engine_name = get_engine_name(
self.decoder_engine_model_name,
self.dtype,
self.world_size,
self.runtime_rank,
)
self.encoder_engine_name = 'rank{}.engine'.format(self.runtime_rank)
self.decoder_engine_name = 'rank{}.engine'.format(self.runtime_rank)
self.encoder_runtime_mapping = tensorrt_llm.Mapping(
world_size=self.world_size,
rank=self.runtime_rank,
@ -155,47 +158,21 @@ class EncDecBenchmark(BaseBenchmark):
tp_size=self.world_size,
)
if not args.serial_build:
torch.cuda.set_device(self.runtime_rank %
self.encoder_runtime_mapping.gpus_per_node)
torch.cuda.set_device(self.runtime_rank %
self.encoder_runtime_mapping.gpus_per_node)
self.device = torch.cuda.current_device()
if self.engine_dir is not None:
# Deserialize engine from engine directory
self.encoder_serialize_path = os.path.join(self.engine_dir,
"encoder",
self.encoder_engine_name)
with open(self.encoder_serialize_path, "rb") as f:
encoder_engine_buffer = f.read()
self.decoder_serialize_path = os.path.join(self.engine_dir,
"decoder",
self.decoder_engine_name)
with open(self.decoder_serialize_path, "rb") as f:
decoder_engine_buffer = f.read()
else:
build_config = get_build_config(self.model_name)
self.max_batch_size = build_config['max_batch_size'] \
if args.max_batch_size is None else args.max_batch_size
self.max_input_len = build_config['max_encoder_input_len'] \
if args.max_input_len is None else args.max_input_len
self.max_seq_len = build_config['max_seq_len'] \
if args.max_seq_len is None else args.max_seq_len
self.n_mels = build_config[
'n_mels'] if 'whisper' in self.model_name else 0
# Build engine
(
encoder_engine_buffer,
decoder_engine_buffer,
self.encoder_model_config,
self.decoder_model_config,
encoder_build_time,
decoder_build_time,
) = build_enc_dec(args)
self.build_time = encoder_build_time + decoder_build_time
assert encoder_engine_buffer is not None
assert decoder_engine_buffer is not None
# Deserialize engine from engine directory
self.encoder_serialize_path = os.path.join(self.engine_dir, "encoder",
self.encoder_engine_name)
with open(self.encoder_serialize_path, "rb") as f:
encoder_engine_buffer = f.read()
assert encoder_engine_buffer is not None
self.decoder_serialize_path = os.path.join(self.engine_dir, "decoder",
self.decoder_engine_name)
with open(self.decoder_serialize_path, "rb") as f:
decoder_engine_buffer = f.read()
assert decoder_engine_buffer is not None
# session setup
self.encoder_session = tensorrt_llm.runtime.Session.from_serialized_engine(
@ -216,11 +193,10 @@ class EncDecBenchmark(BaseBenchmark):
f"[WARNING] whisper benchmark is input_len=1500, no text prompt, output_len=arbitrary"
)
for inlen, outlen in self.in_out_lens:
if (inlen > self.max_input_len
or inlen + outlen > self.max_seq_len):
if (inlen > self.max_input_len or outlen > self.max_seq_len):
print(
f"[WARNING] check inlen({inlen}) <= max_inlen({self.max_input_len}) and "
f"inlen({inlen}) + outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping."
f"outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping."
)
continue
for batch_size in self.batch_sizes:

View File

@ -13,8 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from dataclasses import asdict
from math import ceil
import pandas as pd
@ -22,11 +20,11 @@ import tensorrt as trt
import torch
import tensorrt_llm
from tensorrt_llm.profiler import bytes_to_target_unit
from tensorrt_llm.builder import Engine
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
SamplingConfig)
from allowed_configs import get_build_config, BuildConfig # isort:skip
from base_benchmark import BaseBenchmark # isort:skip
from build import build_gpt, get_quant_config # isort:skip
def element_size(dtype: str):
@ -46,80 +44,26 @@ class GPTBenchmark(BaseBenchmark):
def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
rank, world_size):
super().__init__(args.engine_dir, args.model, args.dtype, rank,
world_size, args.serial_build)
world_size)
self.batch_sizes = batch_sizes
self.in_out_lens = in_out_lens
self.gpu_weights_percents = gpu_weights_percents
self.num_beams = args.num_beams
self.mode = args.mode
self.build_time = 0
self.cuda_graph_mode = args.enable_cuda_graph
self.build_config = None
# this dtype may be modified based on quantization mode later, when the fp8/int8 kv cache is used
self.kv_dtype = args.dtype
# approximate the weights size in the engine by using engine size
# the actual weights size shall be smaller because there are some other data in the engine file.
# for large model, this approximate is close enough.
self.weights_size_approx = 0
self.dump_layer_info = args.dump_layer_info
# change profiling_verbosity to detailed when enabling dump layer info
if self.dump_layer_info:
args.profiling_verbosity = "detailed"
if args.engine_dir is not None:
# Get build configs from engine directory is done in base class
# Deserialize engine from engine directory
self.serialize_path = os.path.join(args.engine_dir,
self.engine_name)
with open(self.serialize_path, 'rb') as f:
engine_buffer = f.read()
self.weights_size_approx = len(engine_buffer)
else:
self.build_config = get_build_config(args.model, return_dict=False)
for key, value in asdict(self.build_config).items():
setattr(self, key, value)
if args.force_num_layer_1:
self.num_layers = 1
if args.max_batch_size is not None:
self.max_batch_size = args.max_batch_size
if args.max_input_len is not None:
self.max_input_len = args.max_input_len
if args.max_seq_len is not None:
self.max_seq_len = args.max_seq_len
self.quant_config = get_quant_config(args.quantization)
self.quant_mode = self.quant_config.quant_mode
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
if self.quant_mode.has_fp8_kv_cache():
self.kv_dtype = 'fp8'
if self.quant_mode.has_int8_kv_cache():
self.kv_dtype = 'int8'
# Plugins
self.use_gpt_attention_plugin = False
self.remove_input_padding = False
self.use_mamba_conv1d_plugin = False
if args.mode == 'plugin':
self.use_gpt_attention_plugin = True
self.remove_input_padding = True
self.use_moe_plugin = True
self.use_mamba_conv1d_plugin = True
elif args.mode == 'ootb-except-mha':
self.use_gpt_attention_plugin = True
self.remove_input_padding = True
engine_buffer, build_time = build_gpt(args)
self.weights_size_approx = engine_buffer.nbytes
self.build_time = build_time
# Get build configs from engine directory is done in base class
# Deserialize engine from engine directory
engine = Engine.from_dir(args.engine_dir, rank)
engine_buffer = engine.engine
assert engine_buffer is not None
if args.build_only:
return
pretrained_config = engine.config.pretrained_config
if pretrained_config.architecture == 'ChatGLMForCausalLM' and pretrained_config.chatglm_version in [
'glm', 'chatglm'
]:
session_cls = ChatGLMGenerationSession
else:
session_cls = GenerationSession
if not hasattr(self, 'num_kv_heads') or self.num_kv_heads is None:
self.num_kv_heads = self.num_heads
@ -155,50 +99,11 @@ class GPTBenchmark(BaseBenchmark):
gpu_weights_percent=list(sorted(gpu_weights_percents))[0],
**rnn_configs_kwargs,
)
if args.model == 'chatglm_6b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=130005,
pad_id=3,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
elif args.model in ['chatglm2_6b', 'chatglm3_6b']:
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=2,
pad_id=0,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config, engine_buffer, self.runtime_mapping)
if args.model == 'glm_10b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=50258,
pad_id=50256,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
else:
end_id = 50256
pad_id = 50256
if "llama" in args.model:
end_id = 2
pad_id = 0
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=end_id,
pad_id=pad_id,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config,
engine_buffer,
self.runtime_mapping,
cuda_graph_mode=self.cuda_graph_mode)
self.sampling_config = SamplingConfig(end_id=2, pad_id=0)
self.decoder = session_cls(model_config,
engine_buffer,
self.runtime_mapping,
cuda_graph_mode=self.cuda_graph_mode)
# Print context memory size for CI/CD to track.
context_mem_size = self.decoder.context_mem_size
@ -260,72 +165,6 @@ class GPTBenchmark(BaseBenchmark):
benchmark_profiler=benchmark_profiler)
torch.cuda.synchronize()
@staticmethod
def kv_cache_elem_per_token(config: BuildConfig, tp_size, pp_size) -> int:
# you need to multiply the size by element size, and multiply by the seq length
# Warning: this function returns the upper bound between different ranks when any one of the following is true:
# num_layer % pp_size !=0, hidden_size % num_kv_heads != 0, num_kv_heads % tp_size != 0
local_nlayers = ceil(config.num_layers / pp_size)
kv_heads = config.num_kv_heads if config.num_kv_heads is not None else config.num_heads
size_per_head = ceil(config.hidden_size / kv_heads)
local_heads = ceil(kv_heads / tp_size)
return 2 * local_nlayers * size_per_head * local_heads
def check_memory(self, io_shapes: list, raise_exception=False):
'''Compare the estimated GPU memory requirements for weights + activations + kv cache with the total GPU memory and log it.
Raise exception when the \p raise_exception parameter is true.
'''
# we don't want to block the test due to this
if self.build_config is None:
tensorrt_llm.logger.warning(
"Didn't have the build config object, skipping check the memory"
)
return
assert isinstance(self.build_config, BuildConfig)
batch_size, inlen, outlen = io_shapes[0], io_shapes[1], io_shapes[2]
kv_cache_size_in_bytes = batch_size*self.num_beams*(inlen + outlen)* \
self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
# when MHA is OOTB, it requires extra KV cache size, because OOTB don't support inplace updating KV cache.
if not self.use_gpt_attention_plugin:
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)
kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
"MiB")
activation_size_in_mb = bytes_to_target_unit(
self.decoder.runtime.engine.device_memory_size, "MiB")
weights_size_in_mb = bytes_to_target_unit(self.weights_size_approx,
"MiB")
total_memory_approx_in_mb = kv_cache_size_in_mb + activation_size_in_mb + weights_size_in_mb
_, _, total = tensorrt_llm.profiler.device_memory_info()
total_in_mb = bytes_to_target_unit(total, 'MiB')
prefix = "[Memory Estimation]"
mem_msg = f"{prefix} activation memory:{activation_size_in_mb:.3f} MiB, kv_cache:{kv_cache_size_in_mb:.3f} MiB, weights approximate:{weights_size_in_mb:.3f} MiB, " \
f"approximate required GPU memory: {total_memory_approx_in_mb:.3f} MiB, total GPU memory: {total_in_mb:.3f} MiB"
tensorrt_llm.logger.info(mem_msg)
build_args = dict(batch_size=batch_size,
num_beams=self.num_beams,
input_length=inlen,
output_length=outlen,
max_batch_size=self.build_config.max_batch_size,
max_input_len=self.build_config.max_input_len,
max_seq_len=self.build_config.max_seq_len,
max_beam_width=self.build_config.max_beam_width)
for k, v in build_args.items():
tensorrt_llm.logger.info(f"{prefix} {k}:{v}")
tensorrt_llm.logger.info(
"grep the \"Total Activation\" and \"Total Weights\" from verbose TRT engine build log to see the precise memory size for those."
)
if raise_exception and total_memory_approx_in_mb >= total_in_mb:
raise Exception(
"Total memory estimation bigger than total gpu memory, the case will likely to OOM, needs enhancement of waive the test case, see logs about the memory usage details"
)
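For context, the per-token estimate in the removed `check_memory` path above works out as follows; a worked sketch with illustrative LLaMA-7B-like values (32 layers, hidden size 4096, 32 KV heads, float16), not numbers taken from this commit:
```python
from math import ceil

# Worked sketch of the removed KV-cache estimate; all model numbers below are
# illustrative (LLaMA-7B-like), not measured values from this commit.
def kv_cache_elem_per_token(num_layers, hidden_size, num_kv_heads, tp_size, pp_size):
    local_nlayers = ceil(num_layers / pp_size)
    size_per_head = ceil(hidden_size / num_kv_heads)
    local_heads = ceil(num_kv_heads / tp_size)
    return 2 * local_nlayers * size_per_head * local_heads  # K and V planes

elems = kv_cache_elem_per_token(32, 4096, 32, tp_size=1, pp_size=1)
bytes_per_token = elems * 2                        # float16 -> 2 bytes per element
kv_bytes = 8 * 1 * (128 + 20) * bytes_per_token    # batch * beams * (inlen + outlen)
print(f"{kv_bytes / 2**20:.0f} MiB")               # ~592 MiB
```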
def report(self,
config,
latency,
@ -348,7 +187,6 @@ class GPTBenchmark(BaseBenchmark):
report_dict["input_length"] = inlen
report_dict["output_length"] = outlen
report_dict["latency(ms)"] = latency
report_dict["build_time(s)"] = self.build_time
report_dict["tokens_per_sec"] = tokens_per_sec
report_dict["percentile95(ms)"] = percentile95
report_dict["percentile99(ms)"] = percentile99

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e25541cdc2aaa48f6a6e4c386d22ca1832c8e120fc6e8c190db4ee066ebfb1f
size 4293186
oid sha256:7eec52cb658f033cf3146017cbaa3ea1554942ee7ece49329ddf7b01361fa080
size 4293100

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3108cd0580f6328bd46238ef708872d9d8030a9c8645b8b52bc750dfe094bc16
size 4395794
oid sha256:cf65778d6469a5a85bf2191fb104094aa4e606b370a25475a16017329e27fd95
size 4395148

View File

@ -1,3 +1,3 @@
50a839e98b31729198870fc99ef2c5a9 libtensorrt_llm_batch_manager_static.a
a39a5bf618c8514725b59aac4513223f libtensorrt_llm_batch_manager_static.pre_cxx11.a
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
08d59f31da00044ae21995c6573a55da libtensorrt_llm_batch_manager_static.a
abdb9b58e0a4587d2d2ce6bc83655f8a libtensorrt_llm_batch_manager_static.pre_cxx11.a
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9600435f1b9ab74c752d1831e1a6684a004927c84ab7c61fc076dbc128ca1521
size 4154674
oid sha256:e339bca2212b46c6227b328fc376db4628a0a96636b5f2b5b3ae387e884b7f01
size 4155892

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8145ecf59dea64448ca0969553d32bc99e119cc5fc703e7b47eccfb5886594a0
size 4133178
oid sha256:7503446c4ef7b959970fc02b33ca81dd0dece0663d9a0f8b881c60ff66006000
size 4136818

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f89f551a880f4c6c1e68ed72b951ac482dec6033e55a336a0ecc401f4e9cf150
size 24009160
oid sha256:51174b20ed939662c92d21cdd5a0fd652a6592947270182ff026eb3a4153e4cf
size 24015602

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33f259b374a02456f2b8d44571d92195b708c2011be4ecabe46267f49ca24c29
size 1426724
oid sha256:19fdeb78169c29492026b62bf147481e2b0d893916d9a20333d83fb61c0abe36
size 1428026

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f44786aee0842bdb260de49b734d2119a0521c650f0b733f5ce6f997e72bfb34
size 1452984
oid sha256:1d7f36c49f24730e4038c2252b966870789d9c9cff698ccd50d0f61ae85fcc9d
size 1455538

View File

@ -1,3 +1,3 @@
0d5e559ebc885794ab9e63086ae7a18a libtensorrt_llm_executor_static.a
f9a3d1bf32f33f88569d4d8635e5445a libtensorrt_llm_executor_static.pre_cxx11.a
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
5bdad7b823b79b1b91439693aa25cff5 libtensorrt_llm_executor_static.a
566734842bb731319971850583fdc9c7 libtensorrt_llm_executor_static.pre_cxx11.a
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19bd908d16990cd11a295fcb71403e2ad285dc2c3b84d55228166d9240acd0d9
size 1476318
oid sha256:58e3e6d7414ab730ba54c8aabdc5f193787b44699e1289279428087cbb2e46d4
size 1478178

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bed0b93d23eef43ce46c01e694f9e578c64fe9b30e1b05d65b7feed1a41e5148
size 1408208
oid sha256:5f6598d6c2dafd9b97edfeb8fc424607374e8791c4e334cfaaf5cae865da15c6
size 1410466

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:473c672353cb813af9ea65250bd79f61f5ea27c369c9f35bc3bace1e22c5e9bb
size 14325956
oid sha256:93e0c81a8d00db0e860cdfdafbae7391e0d2956c2301da1f22ef6419bcb4e02f
size 14321264

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:286c47b52c5955ef4d2b5bd54cf555f6bafdb307a413949e1edafe4db991c887
oid sha256:df3429c2cc6bffe3e3d12fc444426427676a85e281cab4456e5d0a03e4a6828f
size 80318200

View File

@ -1,2 +1,2 @@
28ead889239ca8d558c1e1a93f0485b0 libtensorrt_llm_nvrtc_wrapper.so
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
957f7c6034dca28dff7afe65ed68aa4b libtensorrt_llm_nvrtc_wrapper.so
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f396ee533b289e7326df9061be8abba46ae061a61011c60c19051cbe219461e3
oid sha256:829e6d2ccaed3c0e8ff351a6c418c65a9260433eff6f08feb41b3bab33d84fb4
size 83552896

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20824706210bf184641c92fcb728ab0a3a74a36bc0b13e243c713a84c74a51ac
size 1089536
oid sha256:73ea01f6014e5c11a263f342f8c19f3a1b8bfa824441accd3cb4b7fa699a9d9a
size 1087488

View File

@ -98,6 +98,8 @@ void PenaltyLayer<T>::allocateBuffer()
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
mLogitsPtrsHost = mBufferManager->pinnedPool(ITensor::makeShape({}), TRTDataType<T*>::value);
mLogitsPtrsDevice
= mBufferManager->gpu(ITensor::makeShape({mDecoderDomain.getBatchSize()}), TRTDataType<T*>::value);
auto const batchSizeShape = ITensor::makeShape({mDecoderDomain.getBatchSize()});
mTemperature = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<float>::value);
mRepetitionPenalty = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<float>::value);
@ -233,6 +235,7 @@ void PenaltyLayer<T>::forwardAsync(
mCyclicStep = mCyclicStep % mRuntimeMaxSeqLen;
TensorPtr logitsPtrsHost = ITensor::slice(mLogitsPtrsHost, mCyclicStep, 1);
logitsPtrsHost->squeeze(0);
auto logitsPtrsHostData = bufferCast<T*>(*logitsPtrsHost);
for (SizeType32 bi = 0; bi < localDecoderDomain.getBatchSize(); bi++)
{
@ -274,7 +277,13 @@ void PenaltyLayer<T>::forwardAsync(
auto const tokensPerStep = bufferCastOrNull<SizeType32>(params->curTokensPerStep);
InvokeBatchApplyPenaltyParams<T> penaltyParams;
penaltyParams.inputLogits = reinterpret_cast<T const* const*>(logitsPtrsHostData);
{ // Moving the logits ptrs to device for faster access during kernel execution.
TensorPtr logitsPtrsDeviceSlice = ITensor::slice(mLogitsPtrsDevice, 0, localDecoderDomain.getBatchSize());
TensorPtr logitsPtrsHostSlice = ITensor::slice(logitsPtrsHost, 0, localDecoderDomain.getBatchSize());
mBufferManager->copy(*logitsPtrsHostSlice, *logitsPtrsDeviceSlice);
penaltyParams.inputLogits = reinterpret_cast<T const* const*>(bufferCast<T const*>(*logitsPtrsDeviceSlice));
}
penaltyParams.outputLogits = bufferCast<T>(*mRuntimeLogitsDevice);
penaltyParams.biases = embeddingBias;
penaltyParams.penaltyWorkspace = bufferCastOrNull<TokenIdType>(mPenaltyWorkspaceDevice);

View File

@ -91,6 +91,7 @@ private:
BufferPtr mPenaltyWorkspaceDevice;
BufferPtr mPenaltyWorkspacePrevDevice;
TensorPtr mLogitsPtrsHost;
TensorPtr mLogitsPtrsDevice;
};
} // namespace tensorrt_llm::layers

View File

@ -75,7 +75,7 @@ void _runGemm(int const M, int const N, int const K, bool const transA, bool con
LoraPlugin::LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, int transA, int transB,
int num_lora_modules, nvinfer1::DataType type, LoraPlugin::PluginProfilerPtr const& pluginProfiler,
bool remove_input_padding, int max_context_length, int max_low_rank, int weight_index)
bool remove_input_padding, int max_num_tokens, int max_low_rank, int weight_index)
: mInHiddenSize(in_hidden_size)
, mTransA(transA)
, mTransB(transB)
@ -83,7 +83,7 @@ LoraPlugin::LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, in
, mType(type)
, mPluginProfiler(pluginProfiler)
, mRemoveInputPadding(remove_input_padding)
, mMaxContextLength(max_context_length)
, mMaxNumTokens(max_num_tokens)
, mMaxLowRank(max_low_rank)
, mWeightIndex(weight_index)
{
@ -105,7 +105,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil
read(d, mNumLoraModules);
read(d, mType);
read(d, mRemoveInputPadding);
read(d, mMaxContextLength);
read(d, mMaxNumTokens);
read(d, mMaxLowRank);
read(d, mWeightIndex);
mOutHiddenSizes.resize(mNumLoraModules);
@ -266,10 +266,9 @@ void LoraPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, in
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
int64_t getLowRankWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t typeSize)
int64_t getLowRankWorkSpaceSize(int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t typeSize)
{
return divUp(nbReq * maxContextLength * maxLoraModuleNum * maxLowRank * typeSize, 16) * 16;
return divUp(maxNumTokens * maxLoraModuleNum * maxLowRank * typeSize, 16) * 16;
}
int64_t getGroupedGemmParamsWorkSpaceSize(int64_t nbReq)
@ -278,16 +277,15 @@ int64_t getGroupedGemmParamsWorkSpaceSize(int64_t nbReq)
}
int64_t getSplitkGroupedGemmWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
{
return divUp(nbReq * maxContextLength * maxLoraModuleNum * maxLowRank * sizeof(float) * splitKSlices, 16) * 16;
return divUp(maxNumTokens * maxLoraModuleNum * maxLowRank * sizeof(float) * splitKSlices, 16) * 16;
}
int64_t getGemmWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
int64_t getGemmWorkSpaceSize(int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
{
return std::max((int64_t) CUBLAS_WORKSPACE_SIZE,
getSplitkGroupedGemmWorkSpaceSize(nbReq, maxContextLength, maxLoraModuleNum, maxLowRank, splitKSlices));
getSplitkGroupedGemmWorkSpaceSize(maxNumTokens, maxLoraModuleNum, maxLowRank, splitKSlices));
}
size_t LoraPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs,
@ -298,8 +296,8 @@ size_t LoraPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, in
auto const type = inputs[getInputTensorIdx()].type;
auto const typeSize = tensorrt_llm::runtime::BufferDataType(type).getSize();
return (size_t) getGemmWorkSpaceSize(nbReq, mMaxContextLength, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(nbReq, mMaxContextLength, mNumLoraModules, mMaxLowRank, typeSize)
return (size_t) getGemmWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, typeSize)
+ getGroupedGemmParamsWorkSpaceSize(nbReq * mNumLoraModules);
}
@ -361,13 +359,12 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
= mRemoveInputPadding ? static_cast<int32_t const*>(inputs[getHostContextLengthsIdx()]) : nullptr;
RequestType const* reqTypes = static_cast<RequestType const*>(inputs[getHostRequestTypesIdx()]);
int64_t GemmWorkSpaceSize
= getGemmWorkSpaceSize(batch_size, mMaxContextLength, mNumLoraModules, mMaxLowRank, mSplitKSlices);
int64_t GemmWorkSpaceSize = getGemmWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices);
int64_t groupGemmParamsWorkSpaceSize = getGroupedGemmParamsWorkSpaceSize(batch_size * mNumLoraModules);
void* gemmWorkSpace = workspace; // [gemmWorkSpace, lowrankWorkSpace, groupGemmParamsWorkSpace]
void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + GemmWorkSpaceSize;
void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace)
+ getLowRankWorkSpaceSize(batch_size, mMaxContextLength, mNumLoraModules, mMaxLowRank, typeSize);
+ getLowRankWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, typeSize);
bool isWithLora = isEnableLora(batch_size, mNumLoraModules, &inputs[getLoraRanksIdx()]);
@ -514,21 +511,15 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
ptrB.push_back(
reinterpret_cast<void*>(lora_weights_ptr[batchIdx * 2] + K * N * typeSize * mWeightIndex));
ptrC.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
+ handled_token_num * mMaxLowRank)
* typeSize));
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
ptrD.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
+ handled_token_num * mMaxLowRank)
* typeSize));
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
auto const N2 = outputDesc[loraModuleIdx].dims.d[nbDimsA - 1];
cutlass::gemm::GemmCoord problem_2(M, N2, N);
problem_sizes_2.push_back(problem_2);
ptrA_2.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
+ handled_token_num * mMaxLowRank)
* typeSize));
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
ptrB_2.push_back(
reinterpret_cast<void*>(lora_weights_ptr[batchIdx * 2 + 1] + N2 * N * typeSize * mWeightIndex));
ptrC_2.push_back(static_cast<void*>(
@ -603,7 +594,7 @@ size_t LoraPlugin::getSerializationSize() const noexcept
{
TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
return sizeof(mInHiddenSize) + sizeof(mTransA) + sizeof(mTransB) + sizeof(mNumLoraModules) + sizeof(mType)
+ mPluginProfiler->getSerializationSize(mGemmId) + sizeof(mRemoveInputPadding) + sizeof(mMaxContextLength)
+ mPluginProfiler->getSerializationSize(mGemmId) + sizeof(mRemoveInputPadding) + sizeof(mMaxNumTokens)
+ sizeof(mMaxLowRank) + sizeof(mWeightIndex) + sizeof(int) * mNumLoraModules; // selected tactics container size
}
@ -617,7 +608,7 @@ void LoraPlugin::serialize(void* buffer) const noexcept
write(d, mNumLoraModules);
write(d, mType);
write(d, mRemoveInputPadding);
write(d, mMaxContextLength);
write(d, mMaxNumTokens);
write(d, mMaxLowRank);
write(d, mWeightIndex);
for (int i = 0; i < mNumLoraModules; i++)
@ -674,7 +665,7 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
int num_lora_modules;
int in_hidden_size, transA, transB;
bool remove_input_padding;
int max_context_length;
int max_num_tokens;
int max_low_rank;
int weight_index;
// Read configurations from each fields
@ -706,10 +697,10 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
TLLM_CHECK(fields[i].type == PluginFieldType::kINT8);
remove_input_padding = static_cast<bool>(*(static_cast<int8_t const*>(fields[i].data)));
}
else if (!strcmp(attrName, "max_context_length"))
else if (!strcmp(attrName, "max_num_tokens"))
{
TLLM_CHECK(fields[i].type == PluginFieldType::kINT32);
max_context_length = *(static_cast<int const*>(fields[i].data));
max_num_tokens = *(static_cast<int const*>(fields[i].data));
}
else if (!strcmp(attrName, "max_low_rank"))
{
@ -748,7 +739,7 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
// FIXME enable tactic profiler
auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false, /* skip */ true);
auto* obj = new LoraPlugin(in_hidden_size, out_hidden_sizes, transA, transB, num_lora_modules, type,
pluginProfiler, remove_input_padding, max_context_length, max_low_rank, weight_index);
pluginProfiler, remove_input_padding, max_num_tokens, max_low_rank, weight_index);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
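For reference, the reworked low-rank workspace sizing above can be reproduced with a short sketch; it is now keyed on `max_num_tokens` rather than `nbReq * max_context_length` (the example values are illustrative assumptions only):
```python
# Sketch of getLowRankWorkSpaceSize after this change; sizes are rounded up to
# a 16-byte multiple. The example values below are assumptions for illustration.
def div_up(x: int, y: int) -> int:
    return (x + y - 1) // y

def low_rank_workspace_bytes(max_num_tokens, max_lora_module_num, max_low_rank, type_size):
    return div_up(max_num_tokens * max_lora_module_num * max_low_rank * type_size, 16) * 16

# e.g. 8192 tokens, 7 LoRA modules, rank 64, fp16 weights (2 bytes):
print(low_rank_workspace_bytes(8192, 7, 64, 2))  # 7340032 bytes (~7 MiB)
```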

View File

@ -39,7 +39,7 @@ public:
LoraPlugin() = delete;
LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, int transA, int transB, int num_lora_modules,
nvinfer1::DataType type, PluginProfilerPtr const& profiler, bool remove_input_padding, int max_context_length,
nvinfer1::DataType type, PluginProfilerPtr const& profiler, bool remove_input_padding, int max_num_tokens,
int max_low_rank, int weight_index);
LoraPlugin(void const* data, size_t length, PluginProfilerPtr const& profiler);
@ -121,7 +121,7 @@ private:
int mTransB;
nvinfer1::DataType mType;
bool mRemoveInputPadding;
int mMaxContextLength;
int mMaxNumTokens;
int mMaxLowRank;
int mNumLoraModules;
int mWeightIndex;

View File

@ -522,12 +522,6 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
timeout=timeout) # expecting ~ 1200s
cpp_blocking_env = copy.copy(cpp_env)
cpp_blocking_env["CUDA_LAUNCH_BLOCKING"] = '1'
run_command(trt_model_test,
cwd=tests_dir,
env=cpp_blocking_env,
timeout=timeout) # expecting ~ 1200s
#Executor test in leader mode
new_env = copy.copy(cpp_env)

View File

@ -41,20 +41,14 @@ python3 examples/summarize.py \
We can also benchmark the efficiency of Weight Streaming. Here is an example:
```bash
python3 benchmarks/python/benchmark.py \
-m opt_30b \
--mode ootb \
--engine_dir /tmp/llama_7b/trt_engines/fp16/1-gpu/ \
--batch_size "1;32" \
--max_batch_size "32" \
--input_output_len "256,32" \
--max_input_len 256 \
--max_seq_len 288 \
--gpu_weights_percent "0.0;0.3;0.6;1.0" \
--dtype float16 \
--csv \
--log_level verbose
```
Here we use `ootb` mode so that the GEMM operators won't use plugins. `ootb-except-mha` mode is also valid.
### API Changes

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -241,17 +241,19 @@ In `benchmarks/python/`:
```bash
# Example 1: Single-GPU benchmark
python benchmark.py \
-m t5_small \
-m enc-dec \
--batch_size "1;8" \
--input_output_len "60,20;128,20" \
--engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
--dtype float32 \
--csv # optional
# Example 2: Multi-GPU benchmark
mpirun --allow-run-as-root -np 4 python benchmark.py \
-m t5_small \
-m enc-dec \
--batch_size "1;8" \
--input_output_len "60,20;128,20" \
--engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
--dtype float32 \
--csv # optional
```

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

File diff suppressed because it is too large

View File

@ -22,10 +22,10 @@ from pathlib import Path
import numpy as np
import torch
import yaml
from convert_checkpoint import cpu_map_location, unpack_nemo_ckpt
from tensorrt_llm._utils import str_dtype_to_torch, to_json_file, torch_to_numpy
from tensorrt_llm.lora_manager import LoraManager, get_all_nemo_lora_weights
from tensorrt_llm.models.gpt.convert import cpu_map_location, unpack_nemo_ckpt
log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
logging.basicConfig(format=log_format)

View File

@ -22,9 +22,9 @@ from pathlib import Path
import numpy as np
import torch
import yaml
from convert_checkpoint import cpu_map_location, unpack_nemo_ckpt
from tensorrt_llm._utils import torch_to_numpy
from tensorrt_llm.models.gpt.convert import cpu_map_location, unpack_nemo_ckpt
log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
logging.basicConfig(format=log_format)

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,17 +1,15 @@
import argparse
import json
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Optional, Tuple
import safetensors
import torch
from transformers import AutoModelForCausalLM, GPTJConfig, GPTJForCausalLM
from transformers import AutoModelForCausalLM
import tensorrt_llm
from tensorrt_llm.hlapi import QuantConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTJConfig, GPTJForCausalLM
from tensorrt_llm.quantization import QuantAlgo
@ -68,298 +66,44 @@ def parse_arguments():
return args
def load_gptj_config(model_dir: str) -> GPTJConfig:
""" Helper utility to load GPTJConfig.
A pretrained checkpoint from modeling_RW.py has a different structure
and is not compatible with `transformers.GPTJConfig` and
`transformers.GPTJModel`. We need to manually set the config values.
"""
config = GPTJConfig.from_pretrained(model_dir)
return config
def split(weight: torch.Tensor,
tp_size: int,
rank: int = 0,
dim: int = 0) -> torch.Tensor:
if tp_size == 1:
return weight
elif weight.ndim == 1:
return torch.chunk(weight, tp_size)[rank].contiguous()
else:
return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()
def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
dim: int) -> torch.Tensor:
return split(weight, tp_size, rank, dim=dim)
def get_weight(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.weight' not in params:
return None
return params[f'{prefix}.weight'].to(dtype).detach().cpu()
def get_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.bias' not in params:
return None
return params[f'{prefix}.bias'].to(dtype).detach().cpu()
def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> Tuple[torch.Tensor]:
return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)
def get_tllm_linear_weight(
weight: torch.Tensor,
prefix: str,
bias: Optional[torch.Tensor] = None,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if use_weight_only:
v = weight.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[f'{prefix}.weight'] = processed_torch_weights
results[f'{prefix}.per_channel_scale'] = torch_weight_scales
else:
results[f'{prefix}.weight'] = weight.contiguous()
if bias is not None:
results[f'{prefix}.bias'] = bias
return results
def get_tllm_param(
param: torch.Tensor,
name: str,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if name.endswith('.weight') and use_weight_only:
v = param.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[name] = processed_torch_weights
results[name.replace('weight',
'per_channel_scale')] = torch_weight_scales
else:
results[name] = param
return results
def convert_hf_gptj(hf_model: GPTJForCausalLM,
hf_config: GPTJConfig,
mapping: Mapping,
dtype: str = 'float32',
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8):
weights = {}
tik = time.time()
model_params = dict(hf_model.named_parameters())
dtype = getattr(torch, dtype)
num_hidden_layers = hf_config.num_hidden_layers
layers_range = mapping.pp_layers(num_hidden_layers)
for l in layers_range:
prefix = f'transformer.h.{l}'
tllm_prex = f'transformer.layers.{l-layers_range[0]}'
# Attention QKV (no bias)
q_weight = get_weight(model_params, f'{prefix}.attn.q_proj', dtype)
k_weight = get_weight(model_params, f'{prefix}.attn.k_proj', dtype)
v_weight = get_weight(model_params, f'{prefix}.attn.v_proj', dtype)
q_w = split_matrix(q_weight, mapping.tp_size, mapping.tp_rank, dim=0)
k_w = split_matrix(k_weight, mapping.tp_size, mapping.tp_rank, dim=0)
v_w = split_matrix(v_weight, mapping.tp_size, mapping.tp_rank, dim=0)
qkv_w = torch.concatenate([q_w, k_w, v_w], dim=0)
weights.update(
get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', None,
use_weight_only,
plugin_weight_only_quant_type))
# Attention dense (not bias)
attn_dense_weight = get_weight(model_params, f'{prefix}.attn.out_proj',
dtype)
attn_dense_w = split_matrix(attn_dense_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_tllm_linear_weight(attn_dense_w, f'{tllm_prex}.attention.dense',
None, use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_in (with bias)
mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_in', dtype)
mlp_fc_w = split_matrix(mlp_fc_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
mlp_fc_b = split_matrix(mlp_fc_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc', mlp_fc_b,
use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_out (with bias)
mlp_proj_weight, mlp_proj_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_out', dtype)
mlp_proj_w = split_matrix(mlp_proj_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
# Only rank0 will get bias
if mapping.tp_size > 1 and mapping.tp_rank > 0:
mlp_proj_bias = torch.zeros(mlp_proj_weight.shape[0],
dtype=mlp_proj_weight.dtype)
weights.update(
get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj',
mlp_proj_bias, use_weight_only,
plugin_weight_only_quant_type))
input_ln_weight, input_ln_bias = get_weight_and_bias(
model_params, f'{prefix}.ln_1', dtype)
weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight
weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias
if mapping.is_first_pp_rank():
# Embedding
embed_w = get_weight(model_params, 'transformer.wte', dtype)
weights['transformer.vocab_embedding.weight'] = embed_w
if mapping.is_last_pp_rank():
# lm_head weight and bias
lm_head_w, ln_head_bias = get_weight_and_bias(model_params, 'lm_head',
dtype)
weights['lm_head.weight'] = split_matrix(lm_head_w,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights['lm_head.bias'] = split_matrix(ln_head_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f',
dtype)
# ln_f weight and bias
weights['transformer.ln_f.weight'] = ln_f_w
if ln_f_b is not None:
weights['transformer.ln_f.bias'] = ln_f_b
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Weights loaded. Total time: {t}')
return weights
def main():
# TODO(qijun): Currently, the convert script depends on a torch op:
# torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix,
# which is included in tensorrt_llm Python package. Otherwise, the convert
# script does not need to import tensorrt_llm. Will remove it after reimplementing
# the op with PyTorch.
print(tensorrt_llm.__version__)
args = parse_arguments()
world_size = args.tp_size * args.pp_size
tik = time.time()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
def args_to_quant_config(args):
quant_algo = None
plugin_weight_only_quant_type = None
if args.use_weight_only and args.weight_only_precision == 'int8':
plugin_weight_only_quant_type = torch.int8
quant_algo = QuantAlgo.W8A16
elif args.use_weight_only and args.weight_only_precision == 'int4':
plugin_weight_only_quant_type = torch.quint4x2
quant_algo = QuantAlgo.W4A16
return QuantConfig(quant_algo=quant_algo)
if args.model_dir is not None:
hf_config = load_gptj_config(args.model_dir)
architecture = hf_config.architectures[0]
args.vocab_size = hf_config.vocab_size
args.n_positions = hf_config.max_position_embeddings
args.n_layer = hf_config.num_hidden_layers
args.n_head = hf_config.num_attention_heads
args.n_embd = hf_config.hidden_size
args.norm_eps = hf_config.layer_norm_epsilon
args.rotary_dim = hf_config.rotary_dim
else:
architecture = "GPTJForCausalLM"
config = {
'architecture': architecture,
'dtype': args.dtype,
'num_hidden_layers': args.n_layer,
'num_attention_heads': args.n_head,
'hidden_size': args.n_embd,
'norm_epsilon': args.norm_eps,
'vocab_size': args.vocab_size,
'position_embedding_type': 'rope_gptj',
'max_position_embeddings': args.n_positions,
'hidden_act': 'gelu',
'quantization': {
'quant_algo': quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
},
'rotary_dim': args.rotary_dim,
}
def convert_and_save_hf(args):
model_dir = args.model_dir
world_size = args.tp_size * args.pp_size
quant_config = args_to_quant_config(args)
with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype='auto',
trust_remote_code=True)
if args.model_dir is None:
return
hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
trust_remote_code=True,
torch_dtype="auto")
def covert_and_save(rank):
def convert_and_save_rank(args, rank):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=args.tp_size,
pp_size=args.pp_size)
weights = convert_hf_gptj(
hf_model,
hf_config,
mapping,
dtype=args.dtype,
use_weight_only=args.use_weight_only,
plugin_weight_only_quant_type=plugin_weight_only_quant_type)
safetensors.torch.save_file(
weights, os.path.join(args.output_dir, f'rank{rank}.safetensors'))
model = GPTJForCausalLM.from_hugging_face(hf_model,
args.dtype,
mapping=mapping,
quant_config=quant_config)
model.save_checkpoint(args.output_dir, save_config=(rank == 0))
del model
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
convert_and_save_rank(args, rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank) for rank in range(world_size)
p.submit(convert_and_save_rank, args, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
@ -373,6 +117,38 @@ def main():
) == 0, "Checkpoint conversion failed, please check error log."
del hf_model
def main():
print(tensorrt_llm.__version__)
args = parse_arguments()
tik = time.time()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if args.model_dir is None:
config = GPTJConfig(architecture='GPTJForCausalLM',
dtype=args.dtype,
num_hidden_layers=args.n_layer,
num_attention_heads=args.n_head,
hidden_size=args.n_embd,
norm_epsilon=args.norm_eps,
vocab_size=args.vocab_size,
position_embedding_type='rope_gptj',
max_position_embeddings=args.n_positions,
hidden_act='gelu',
rotary_dim=args.rotary_dim,
mapping=Mapping(world_size=args.tp_size *
args.pp_size,
tp_size=args.tp_size,
pp_size=args.pp_size),
quantization=args_to_quant_config(args))
config.to_json_file(os.path.join(args.output_dir, 'config.json'))
else:
convert_and_save_hf(args)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Total time of converting checkpoints: {t}')
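# Hypothetical invocation of this script (flag names are inferred from the
# `args` attributes used above; `parse_arguments()` is not shown in this diff,
# so treat the exact spellings as assumptions):
#
#   python convert_checkpoint.py --model_dir ./gpt-j-6b \
#       --output_dir ./tllm_checkpoint_2gpu --dtype float16 --tp_size 2
#
# With --model_dir set, the HF weights are converted per rank via
# GPTJForCausalLM.from_hugging_face; without it, only a config.json is written.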

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,14 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,273 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for exporting a model to our custom format.
"""
import numpy as np
import torch
from tensorrt_llm._utils import torch_to_numpy
def cpu_map_location(storage, loc):
return storage.cpu()
def gpu_map_location(storage, loc):
if loc.startswith("cuda"):
training_gpu_idx = int(loc.split(":")[1])
inference_gpu_idx = training_gpu_idx % torch.cuda.device_count()
return storage.cuda(inference_gpu_idx)
elif loc.startswith("cpu"):
return storage.cpu()
else:
raise ValueError(f"Not handled {loc}")
def save_val(val, dir, key, tp_num=None):
suffix = "bin" if tp_num is None else f"{tp_num}.bin"
val.tofile(dir / f"model.{key}.{suffix}")
def save_split(split_vals, dir, key, i, split_factor):
for j, val in enumerate(split_vals):
save_val(val, dir, key, i * split_factor + j)
def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False):
"""
This function has two purposes:
- compute quantized weights, scaled either per-tensor or per-column
- compute scaling factors
Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ.
CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W.
CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor.
Here is the list of what we need (T means per-tensor, C per-column):
- scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T)
- scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T)
- scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C)
- scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32)
to quant range (int8) (used for CUBLAS) (T, C)
Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too,
but then the model would change depending on the number of GPUs used.
For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it
as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V.
"""
# compute weight scaling factors for fp->int8 and int8->fp
if is_qkv and not multi_query_mode:
scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max(
dim=-1, keepdims=True)[0].cpu().numpy()
scale_w_orig_quant_c = 127. / act_range["w"].reshape(3,
-1).cpu().numpy()
elif is_qkv and multi_query_mode:
raise ValueError(
f"Multi-query w/ int8 quant has not been supported yet")
else:
scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy()
scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy()
scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t
scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c
# compute the rest of needed scaling factors
scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item())
scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item())
scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.)
scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t *
scale_w_orig_quant_t)
scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t *
scale_w_orig_quant_c)
if is_qkv:
scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t,
scale_w_orig_quant_c.shape)
scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t,
scale_w_orig_quant_c.shape)
to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8)
return {
"weight.int8": to_i8(weights * scale_w_orig_quant_t),
"weight.int8.col": to_i8(weights * scale_w_orig_quant_c),
"scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32),
"scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32),
"scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32),
"scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32),
"scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32),
"scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32),
}
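# Illustrative sketch (not part of the original file): a toy numeric example of
# the per-tensor weight scaling described in the docstring above. The variable
# names below are hypothetical and exist only for this illustration; the real
# code derives the maxima from the calibrated `act_range` statistics.
_toy_w = np.array([[0.5, -2.0], [1.0, 0.25]], dtype=np.float32)
_toy_scale_t = 127.0 / np.abs(_toy_w).max()                    # per-tensor: 63.5
_toy_w_int8 = (_toy_w * _toy_scale_t).round().clip(-127, 127).astype(np.int8)
# _toy_w_int8 == [[32, -127], [64, 16]]; multiplying by 1.0 / _toy_scale_t maps
# the quantized values back to the floating-point range (scale_w_quant_orig).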
def write_int8(vals,
dir,
base_key,
split_dim,
tp_rank,
split_factor,
kv_cache_only=False):
if not kv_cache_only:
save_split(np.split(vals["weight.int8"], split_factor, axis=split_dim),
dir, f"{base_key}.weight.int8", tp_rank, split_factor)
save_split(
np.split(vals["weight.int8.col"], split_factor, axis=split_dim),
dir, f"{base_key}.weight.int8.col", tp_rank, split_factor)
saved_keys_once = ["scale_y_quant_orig"]
if not kv_cache_only:
saved_keys_once += [
"scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant"
]
# per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1)
if not kv_cache_only:
if split_dim == -1:
save_split(
np.split(vals["scale_w_quant_orig.col"],
split_factor,
axis=split_dim), dir,
f"{base_key}.scale_w_quant_orig.col", tp_rank, split_factor)
save_split(
np.split(vals["scale_y_accum_quant.col"],
split_factor,
axis=split_dim), dir,
f"{base_key}.scale_y_accum_quant.col", tp_rank, split_factor)
else:
saved_keys_once += [
"scale_w_quant_orig.col", "scale_y_accum_quant.col"
]
if tp_rank == 0:
for save_key in saved_keys_once:
save_val(vals[save_key], dir, f"{base_key}.{save_key}")
# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head
# are not split as there is only one head per key/value.
@torch.no_grad()
def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals,
storage_type, act_range, config):
use_attention_nemo_shape = config.get("use_attention_nemo_shape", False)
split_gated_activation = config.get("split_gated_activation", False)
num_attention_heads = config.get("num_attention_heads", 0)
tp_size = config.get("tp_size", 1)
int8_outputs = config.get("int8_outputs", None)
multi_query_mode = config.get("multi_query_mode", False)
local_dim = config.get("local_dim", None)
save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only"
if not isinstance(vals, list):
vals = [vals]
if config.get("transpose_weights", False) and vals[0].ndim == 2:
vals = [val.T for val in vals]
if "layernorm.weight" in key and config.get("apply_layernorm_1p", False):
vals = [val + 1.0 for val in vals]
vals = [torch_to_numpy(val.cpu().to(storage_type)) for val in vals]
if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \
"final_layernorm.weight" in key or "final_layernorm.bias" in key or \
"mlp.dense_4h_to_h.bias" in key:
# shared weights, only need to convert the weights of rank 0
if tp_rank == 0:
save_val(vals[0], saved_dir, key)
elif "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key:
cat_dim = 0
val = np.concatenate(vals, axis=cat_dim)
split_vals = np.split(val, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
if act_range is not None and int8_outputs == "all":
base_key = key.replace(".weight", "")
vals_i8 = generate_int8(val,
act_range,
multi_query_mode=multi_query_mode)
write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank,
split_factor)
elif "mlp.dense_h_to_4h.weight" in key or "mlp.dense_h_to_4h.bias" in key:
if split_gated_activation:
splits = [np.split(val, 2, axis=-1) for val in vals]
vals, gates = list(zip(*splits))
cat_dim = -1
val = np.concatenate(vals, axis=cat_dim)
split_vals = np.split(val, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
if act_range is not None and int8_outputs == "all":
base_key = key.replace(".weight", "")
vals_i8 = generate_int8(val,
act_range,
multi_query_mode=multi_query_mode)
write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank,
split_factor)
if split_gated_activation:
assert not save_int8
prefix, dot, suffix = key.rpartition(".")
key = prefix + ".gate" + dot + suffix
gate = np.concatenate(gates, axis=cat_dim)
split_vals = np.split(gate, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
elif "attention.query_key_value.weight" in key:
hidden_dim = vals[0].shape[0]
if local_dim is None:
local_dim = vals[0].shape[-1] // 3
if multi_query_mode:
val = vals[0]
# out_feature = local_dim + 2 * head_size; assumes local_dim equals to hidden_dim
head_size = (val.shape[-1] - local_dim) // 2
val = val.reshape(hidden_dim, local_dim + 2 * head_size)
w_q, w_kv = np.split(val, [local_dim], axis=-1)
w_q_split = np.split(w_q, split_factor, axis=-1)
split_vals = [np.concatenate((i, w_kv), axis=-1) for i in w_q_split]
else:
if use_attention_nemo_shape:
head_num = num_attention_heads // tp_size
size_per_head = hidden_dim // num_attention_heads
vals = [
val.reshape(hidden_dim, head_num, 3, size_per_head)
for val in vals
]
vals = [val.transpose(0, 2, 1, 3) for val in vals]
vals = [val.reshape(hidden_dim, 3, local_dim) for val in vals]
cat_dim = -1
val = np.concatenate(vals, axis=cat_dim)
split_vals = np.split(val, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
if save_int8:
base_key = key.replace(".weight", "")
vals_i8 = generate_int8(val,
act_range,
is_qkv=True,
multi_query_mode=multi_query_mode)
write_int8(vals_i8,
saved_dir,
base_key,
cat_dim,
tp_rank,
split_factor,
kv_cache_only=int8_outputs == "kv_cache_only")
elif ("attention.query.weight" in key or "attention.query.bias" in key
or "attention.key_value.weight" in key
or "attention.key_value.bias" in key):
pass
else:
assert False, f"[ERROR] {key} not handled by converter"

View File

@ -1,178 +0,0 @@
"""Byte pair encoding utilities"""
# Modified MIT License
# Software Copyright (c) 2019 OpenAI
# We dont claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
# We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
# The above copyright notice and this permission notice need not be included
# with content created by the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from functools import lru_cache
import regex as re
@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"),
ord("~") + 1)) + list(range(
ord("¡"),
ord("¬") + 1)) + list(range(ord("®"),
ord("ÿ") + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
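# Illustrative sketch (not part of the original file): the mapping above shifts
# bytes that are not printable/safe into higher code points, e.g. the space
# byte (0x20) becomes 'Ġ' (U+0120), which is why GPT-2 BPE vocab entries show a
# leading 'Ġ' for tokens that begin with a space.
_b2u = bytes_to_unicode()
assert _b2u[ord('A')] == 'A'                    # printable bytes map to themselves
assert _b2u[ord(' ')] == chr(256 + ord(' '))    # 0x20 -> 'Ġ'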
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class Encoder:
def __init__(self, encoder, bpe_merges, errors='replace'):
self.encoder = encoder
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i +
1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def encode(self, text):
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token]
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c]
for c in text]).decode('utf-8', errors=self.errors)
return text
def batch_decode(self, output):
ret = []
for tokens in output:
ret.append(self.decode(tokens))
return ret
def get_encoder(vocab_file, bpe_file):
with open(vocab_file, 'r', encoding="utf-8") as f:
encoder = json.load(f)
with open(bpe_file, 'r', encoding="utf-8") as f:
bpe_data = f.read()
bpe_merges = [
tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]
]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,
)

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,6 +1,6 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,2 +1,2 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers==4.40.2
# https://github.com/NVIDIA/NeMo/issues/9793
huggingface_hub==0.23.5

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets>=2.14.4
nemo-toolkit[all]<=1.20.0,>=1.18.0
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
git+https://github.com/google-deepmind/recurrentgemma.git
flax>=0.8.2
jax~=0.4.23

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -182,7 +182,7 @@ def main(args):
input_ids = tokenizer.encode(curr_text,
return_tensors='pt').squeeze(0)
input_ids = input_ids[:test_token_num]
elif model_name == 'QWenForCausalLM' and model_version == 'qwen':
elif 'qwen' in model_name.lower() and model_version == 'qwen':
# use make_context to generate prompt
system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user."
_, input_id_list = make_context(
@ -194,7 +194,7 @@ def main(args):
)
input_ids = torch.tensor(input_id_list)
else:
if model_name == 'QWenForCausalLM' and 'qwen2' in model_version:
if 'qwen' in model_name.lower() and 'qwen2' in model_version:
messages = [{
"role": "system",
"content": "You are a helpful assistant."
@ -527,7 +527,7 @@ def main(args):
ite_count += 1
del runner
if test_hf:
if test_hf and runtime_rank == 0:
profiler.start('load HF model')
dtype_alias_mapping = {
'fp32': 'float32',

View File

@ -37,7 +37,10 @@ DEFAULT_HF_MODEL_DIRS = {
'MPTForCausalLM': 'mosaicml/mpt-7b',
'PhiForCausalLM': 'microsoft/phi-2',
'OPTForCausalLM': 'facebook/opt-350m',
'QWenLMHeadModel': 'Qwen/Qwen-7B',
'QWenForCausalLM': 'Qwen/Qwen-7B',
'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B',
'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B',
'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b',
}
@ -46,14 +49,16 @@ INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
"""
QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n"
DEFAULT_PROMPT_TEMPLATES = {
'InternLMForCausalLM':
"<|User|>:{input_text}<eoh>\n<|Bot|>:",
'InternLM2ForCausalLM':
"<|im_start|>system\n" + INTERNLM_META_INSTRUCTION +
'InternLMForCausalLM': "<|User|>:{input_text}<eoh>\n<|Bot|>:",
'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION +
"<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
'QWenForCausalLM':
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE,
'QWenForCausalLM': QWEN_PROMPT_TEMPLATE,
'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE,
'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE,
}
@ -83,7 +88,7 @@ def read_model_name(engine_dir: str):
model_version = None
if 'GLM' in model_arch:
model_version = config['pretrained_config']['chatglm_version']
if model_arch == 'QWenForCausalLM':
if 'qwen' in model_arch.lower():
model_version = config['pretrained_config']['qwen_type']
return model_arch, model_version
@ -134,7 +139,7 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None,
padding_side='left',
truncation_side='left',
legacy=False)
if model_name == 'QWenForCausalLM' and model_version == 'qwen':
if 'qwen' in model_name.lower() and model_version == 'qwen':
with open(Path(tokenizer_dir) / "generation_config.json") as f:
gen_config = json.load(f)
pad_id = gen_config['pad_token_id']

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
tiktoken
datasets
kaldialign

View File

@ -28,6 +28,7 @@ from ._common import _is_building, check_max_num_tokens, serialize_engine
from ._utils import str_dtype_to_trt, to_json_file
from .auto_parallel import auto_parallel
from .auto_parallel.config import AutoParallelConfig
from .functional import PositionEmbeddingType
from .graph_rewriting import optimize
from .logger import logger
from .lora_manager import LoraConfig
@ -466,34 +467,6 @@ class BuildConfig:
dry_run: bool = False
visualize_network: bool = False
def __post_init__(self):
"""
Check and may modify max_num_tokens and opt_num_tokens after instantiation
"""
max_num_tokens, opt_num_tokens = check_max_num_tokens(
max_num_tokens=self.max_num_tokens,
opt_num_tokens=self.opt_num_tokens,
max_batch_size=self.max_batch_size,
max_input_len=self.max_input_len,
max_seq_len=self.max_seq_len,
max_beam_width=self.max_beam_width,
remove_input_padding=self.plugin_config.remove_input_padding,
enable_context_fmha=self.plugin_config.context_fmha,
tokens_per_block=self.plugin_config.tokens_per_block,
multiple_profiles=self.plugin_config.multiple_profiles,
)
self.max_num_tokens, self.opt_num_tokens = max_num_tokens, opt_num_tokens
if self.plugin_config.remove_input_padding and self.plugin_config.context_fmha:
if self.max_input_len:
logger.warning(
'padding removal and fMHA are both enabled, max_input_len is not required and will be ignored'
)
else:
assert self.max_input_len is not None, 'padding removal and fMHA aren\'t both enabled, max_input_len is required'
if self.max_seq_len:
assert self.max_input_len <= self.max_seq_len, 'max_input_len should not be larger than max_seq_len'
@classmethod
def from_dict(cls, config, plugin_config=None):
max_input_len = config.pop('max_input_len')
@ -507,7 +480,7 @@ class BuildConfig:
'max_prompt_embedding_table_size', 0)
gather_context_logits = config.pop('gather_context_logits', False)
gather_generation_logits = config.pop('gather_generation_logits', False)
strongly_typed = config.pop('strongly_typed', False)
strongly_typed = config.pop('strongly_typed', True)
builder_opt = config.pop('builder_opt', None)
force_num_profiles = config.pop('force_num_profiles', None)
weight_sparsity = config.pop('weight_sparsity', False)
@ -730,6 +703,79 @@ def optimize_model_with_config(model: PretrainedModel,
return model
def _init_max_seq_len(model_config, build_config):
"""
If max_seq_len is not specified, set it to max_position_embeddings * rotary_factor
Additional checks to ensure max_seq_len, max_input_len, and max_num_tokens have valid values.
"""
# Extract rotary scaling which will be used for checks and default value of max_seq_len
rotary_scaling = getattr(model_config, "rotary_scaling", None)
if rotary_scaling is not None:
rotary_type = rotary_scaling.get('type',
rotary_scaling.get('rope_type'))
rotary_factor = rotary_scaling.get('factor',
1.0) if rotary_type != 'su' else 1
else:
rotary_factor = 1
if build_config.max_seq_len is None:
# Step 1: Find the upper bound of max_seq_len
deduced_max_seq_len = 2048
if model_config.max_position_embeddings is not None:
deduced_max_seq_len = model_config.max_position_embeddings
# Step 2: Scale max_seq_len with rotary scaling
if rotary_factor != 1:
deduced_max_seq_len *= rotary_factor
logger.warning(
f'max_seq_len is scaled to {deduced_max_seq_len} by rotary scaling {rotary_factor}'
)
# Step 3: Assign the new max_seq_len
build_config.max_seq_len = deduced_max_seq_len
logger.info(
f'max_seq_len is not specified, using deduced value {deduced_max_seq_len}'
)
else:
if not build_config.plugin_config.streamingllm and model_config.max_position_embeddings is not None \
and model_config.position_embedding_type != PositionEmbeddingType.relative:
if build_config.max_seq_len > model_config.max_position_embeddings * rotary_factor:
logger.warning(
f'max_seq_len {build_config.max_seq_len} is larger than max_position_embeddings {model_config.max_position_embeddings} * rotary scaling {rotary_factor}, '
'the model accuracy might be affected')
if build_config.max_input_len > build_config.max_seq_len:
logger.warning(
f'max_input_len {build_config.max_input_len} is larger than max_seq_len {build_config.max_seq_len}, clipping it to max_seq_len'
)
build_config.max_input_len = build_config.max_seq_len
# Check and may modify max_num_tokens and opt_num_tokens (need to happen after max_seq_len is deduced)
max_num_tokens, opt_num_tokens = check_max_num_tokens(
max_num_tokens=build_config.max_num_tokens,
opt_num_tokens=build_config.opt_num_tokens,
max_batch_size=build_config.max_batch_size,
max_input_len=build_config.max_input_len,
max_seq_len=build_config.max_seq_len,
max_beam_width=build_config.max_beam_width,
remove_input_padding=build_config.plugin_config.remove_input_padding,
enable_context_fmha=build_config.plugin_config.context_fmha,
tokens_per_block=build_config.plugin_config.tokens_per_block,
multiple_profiles=build_config.plugin_config.multiple_profiles,
)
build_config.max_num_tokens, build_config.opt_num_tokens = max_num_tokens, opt_num_tokens
if build_config.plugin_config.remove_input_padding and build_config.plugin_config.context_fmha:
if build_config.max_input_len:
logger.warning(
'padding removal and fMHA are both enabled, max_input_len is not required and will be ignored'
)
else:
assert build_config.max_input_len is not None, 'padding removal and fMHA aren\'t both enabled, max_input_len is required'
if build_config.max_seq_len:
assert build_config.max_input_len <= build_config.max_seq_len, 'max_input_len should not be larger than max_seq_len'
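# Worked example (hypothetical values, for illustration only): with
# model_config.max_position_embeddings == 4096 and
# rotary_scaling == {'type': 'linear', 'factor': 2.0}, an unset max_seq_len is
# deduced as 4096 * 2.0 == 8192. A user-supplied max_seq_len above that product
# only triggers the accuracy warning, while a max_input_len larger than
# max_seq_len is clipped down to max_seq_len.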
def build(model: PretrainedModel,
build_config: BuildConfig,
return_build_config: bool = False) -> Engine | BuildConfig:
@ -743,6 +789,8 @@ def build(model: PretrainedModel,
build_config = copy.deepcopy(build_config)
build_config.plugin_config.dtype = model.config.dtype
_init_max_seq_len(model.config, build_config)
if model.config.quantization.quant_algo == QuantAlgo.FP8 or \
model.config.quantization.kv_cache_quant_algo == QuantAlgo.FP8:
build_config.strongly_typed = True

View File

@ -27,7 +27,6 @@ import torch
from tensorrt_llm.auto_parallel import infer_cluster_config
from tensorrt_llm.auto_parallel.cluster_info import cluster_infos
from tensorrt_llm.builder import BuildConfig, Engine, build
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_manager import LoraConfig, LoraManager
from tensorrt_llm.models import MODEL_MAP, PretrainedConfig
@ -444,48 +443,6 @@ def main():
else:
cluster_config = infer_cluster_config()
# Extract rotary scaling which will be used for checks and default value of max_seq_len
rotary_scaling = getattr(model_config, "rotary_scaling", None)
if rotary_scaling is not None:
rotary_type = rotary_scaling.get('type',
rotary_scaling.get('rope_type'))
rotary_factor = rotary_scaling.get(
'factor', 1.0) if rotary_type != 'su' else 1
else:
rotary_factor = 1
if args.max_seq_len is None:
# Step 1: Find the upper bound of max_seq_len
deduced_max_seq_len = 2048
if model_config.max_position_embeddings is not None:
deduced_max_seq_len = model_config.max_position_embeddings
# Step 2: Scale max_seq_len with rotary scaling
if rotary_factor != 1:
deduced_max_seq_len *= rotary_factor
logger.warning(
f'max_seq_len is scaled to {deduced_max_seq_len} by rotary scaling {rotary_factor}'
)
# Step 3: Assign the new max_seq_len
args.max_seq_len = deduced_max_seq_len
logger.info(
f'max_seq_len is not specified, using value {deduced_max_seq_len}'
)
else:
if not plugin_config.streamingllm and model_config.max_position_embeddings is not None \
and model_config.position_embedding_type != PositionEmbeddingType.relative:
if args.max_seq_len > model_config.max_position_embeddings * rotary_factor:
logger.warning(
f'max_seq_len {args.max_seq_len} is larger than max_position_embeddings {model_config.max_position_embeddings} * rotary scaling {rotary_factor}, '
'the model accuracy might be affected')
if args.max_input_len > args.max_seq_len:
logger.warning(
f'max_input_len is {args.max_input_len} is larger than max_seq_len {args.max_seq_len}, clipping it to max_seq_len'
)
args.max_input_len = args.max_seq_len
build_config = BuildConfig.from_dict(
{
'max_input_len': args.max_input_len,

View File

@ -4975,6 +4975,7 @@ def gpt_attention(
])
attn_plug = attn_plg_creator.create_plugin("causal_attn", pfc)
assert attn_plug
plug_inputs = [*qkv] if is_unfuse_qkv_gemm else [qkv]
if use_cache:
plug_inputs += [
@ -5510,7 +5511,7 @@ def lora_plugin(
transa: bool = False,
transb: bool = False,
host_context_lengths: Tensor = None, # for pad-free input mode
max_context_length: int = 0,
max_num_tokens: int = 0,
max_low_rank: int = 0,
lora_ranks: List[Tensor] = None,
lora_weights_pointers: List[Tensor] = None,
@ -5541,8 +5542,8 @@ def lora_plugin(
host_context_lengths: cpu Tensor = None
A host tensor that contains the lengths of the different inputs,
max_context_length : int
Maximum length during context phase, used to determine the workspace size.
max_num_tokens : int
Maximum number of tokens, used to determine the workspace size.
max_low_rank : int
Maximum low_rank, used to determine the workspace size.
@ -5591,8 +5592,8 @@ def lora_plugin(
"remove_input_padding",
np.array(np.int8(default_net().plugin_config.remove_input_padding),
dtype=np.int8), trt.PluginFieldType.INT8)
max_context_length_field = trt.PluginField(
"max_context_length", np.array(max_context_length, dtype=np.int32),
max_num_tokens_field = trt.PluginField(
"max_num_tokens", np.array(max_num_tokens, dtype=np.int32),
trt.PluginFieldType.INT32)
max_low_rank_field = trt.PluginField("max_low_rank",
np.array(max_low_rank, dtype=np.int32),
@ -5607,7 +5608,7 @@ def lora_plugin(
pfc = trt.PluginFieldCollection([
in_hidden_size_field, transa, transb, num_lora_modules_field, pf_type,
remove_input_padding, max_context_length_field, max_low_rank_field,
remove_input_padding, max_num_tokens_field, max_low_rank_field,
weight_index_field
] + out_hidden_size_field_list)
lora_plug = plg_creator.create_plugin("lora", pfc)

View File

@ -288,6 +288,12 @@ class LlmArgs:
else:
self.tokenizer = tokenizer_factory(self.tokenizer)
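# bfloat16 requires compute capability 8.0 (Ampere) or newer, so on older
# GPUs 'auto' falls back to float16 and an explicit bfloat16 request is
# rejected below.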
if torch.cuda.get_device_properties(0).major < 8:
if self.dtype == 'auto':
self.dtype = 'float16'
if self.dtype == 'bfloat16':
raise RuntimeError("Pre SM 80 GPUs do not support bfloat16")
self._engine_config: Optional[EngineConfig] = None
self.auto_parallel_config = AutoParallelConfig(
@ -1021,7 +1027,10 @@ class ModelLoader:
raise NotImplementedError(
f"Unsupported model architecture in HLAPI: {architecture}")
if self.llm_args.quant_config.quant_mode.has_any_quant():
use_weight_only = self.llm_args.quant_config.quant_algo in (
QuantAlgo.W4A16, QuantAlgo.W8A16)
if self.llm_args.quant_config.quant_mode.has_any_quant(
) and not use_weight_only:
assert self.workspace is not None
checkpoint_dir = f"{self.workspace}/quantized-checkpoint"
if self.rank == 0:

View File

@ -612,7 +612,7 @@ class Attention(Module):
],
host_request_types=q_lora_params.host_request_types,
host_context_lengths=q_lora_params.host_context_lengths,
max_context_length=q_lora_params.max_context_length,
max_num_tokens=q_lora_params.max_num_tokens,
max_encoder_context_length=q_lora_params.
max_encoder_context_length,
host_encoder_input_lengths=q_lora_params.
@ -1337,7 +1337,7 @@ class BertAttention(Module):
],
host_request_types=q_lora_params.host_request_types,
host_context_lengths=q_lora_params.host_context_lengths,
max_context_length=q_lora_params.max_context_length)
max_num_tokens=q_lora_params.max_num_tokens)
q_lora, k_lora, v_lora = self.qkv_lora(hidden_states,
qkv_lora_params)

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import List, Optional
from .._common import default_net
from ..functional import Tensor, lora_plugin
@ -28,7 +28,7 @@ class LoraRuntimeParams(object):
lora_weights_pointers: List[Tensor] = None,
host_request_types: Tensor = None,
host_context_lengths: Tensor = None,
max_context_length: Tensor = None,
max_num_tokens: Optional[int] = None,
max_encoder_context_length: Tensor = None,
host_encoder_input_lengths: Tensor = None,
weight_index: int = 0,
@ -38,7 +38,7 @@ class LoraRuntimeParams(object):
self.lora_weights_pointers = lora_weights_pointers
self.host_request_types = host_request_types
self.host_context_lengths = host_context_lengths
self.max_context_length = max_context_length
self.max_num_tokens = max_num_tokens
self.max_encoder_context_length = max_encoder_context_length
self.host_encoder_input_lengths = host_encoder_input_lengths
self.weight_index = weight_index
@ -71,8 +71,8 @@ class Lora(Module):
host_context_lengths=lora_runtime_params.host_context_lengths
if not is_cross_attention else
lora_runtime_params.host_encoder_input_lengths,
# For cross attention, max_encoder_context_length should be used instead of max_context_length
max_context_length=lora_runtime_params.max_context_length
# For cross attention, max_encoder_context_length should be used instead of max_num_tokens
max_num_tokens=lora_runtime_params.max_num_tokens
if not is_cross_attention else
lora_runtime_params.max_encoder_context_length,
max_low_rank=self.max_low_rank,
@ -93,7 +93,7 @@ class LoraParams(object):
lora_ranks=None, # : List[dict[Tensor]]
lora_weights_pointers=None, # : List[dict[Tensor]]
host_context_lengths: Tensor = None,
max_context_length: Tensor = None,
max_num_tokens: Optional[int] = None,
max_encoder_context_length: Tensor = None, # For cross attention
host_request_types: Tensor = None,
host_encoder_input_lengths: Tensor = None, # For cross attention
@ -104,7 +104,7 @@ class LoraParams(object):
self.lora_weights_pointers = lora_weights_pointers
self.host_context_lengths = host_context_lengths
self.max_context_length = max_context_length
self.max_num_tokens = max_num_tokens
self.max_encoder_context_length = max_encoder_context_length
self.host_request_types = host_request_types
self.host_encoder_input_lengths = host_encoder_input_lengths
@ -115,7 +115,7 @@ class LoraParams(object):
lora_ranks=[self.lora_ranks[layer_idx]],
lora_weights_pointers=[self.lora_weights_pointers[layer_idx]],
host_context_lengths=self.host_context_lengths,
max_context_length=self.max_context_length,
max_num_tokens=self.max_num_tokens,
max_encoder_context_length=self.max_encoder_context_length,
host_request_types=self.host_request_types,
host_encoder_input_lengths=self.host_encoder_input_lengths,
@ -133,7 +133,7 @@ class LoraParams(object):
[f"{lora_module}_lora_weights_pointers"]
],
host_context_lengths=self.host_context_lengths,
max_context_length=self.max_context_length,
max_num_tokens=self.max_num_tokens,
max_encoder_context_length=self.max_encoder_context_length,
host_request_types=self.host_request_types,
host_encoder_input_lengths=self.host_encoder_input_lengths,

View File

@ -47,7 +47,7 @@ def fc_gate_lora(hidden_states, lora, lora_layer_params):
],
host_request_types=mlp_fc_lora_params.host_request_types,
host_context_lengths=mlp_fc_lora_params.host_context_lengths,
max_context_length=mlp_fc_lora_params.max_context_length)
max_num_tokens=mlp_fc_lora_params.max_num_tokens)
mlp_fc_lora, mlp_gate_lora = lora(hidden_states, mlp_in_lora_params)
mlp_in_result = concat([mlp_gate_lora, mlp_fc_lora],

View File

@ -24,12 +24,11 @@ from tensorrt_llm.layers.lora import LoraParams
from .._common import default_net, default_trtnet
from .._utils import int32_array
from ..functional import (AllReduceFusionParams, AllReduceStrategy,
_add_plugin_info, _create_tensor, allreduce, cast,
concat, constant, div, expand, gather_nd,
is_gated_activation, non_gated_version, nonzero,
repeat_interleave, scatter_nd, shape, softmax, split,
sum, topk)
from ..functional import (AllReduceFusionParams, _add_plugin_info,
_create_tensor, allreduce, cast, concat, constant,
div, expand, gather_nd, is_gated_activation,
non_gated_version, nonzero, repeat_interleave,
scatter_nd, shape, softmax, split, sum, topk)
from ..layers import MLP, GatedMLP
from ..mapping import Mapping
from ..module import Module, ModuleList
@ -531,7 +530,7 @@ class MoeOOTB(MOE):
gate_lora_weights_pointers,
}],
host_context_lengths=lora_layer_params.host_context_lengths,
max_context_length=lora_layer_params.max_context_length,
max_num_tokens=lora_layer_params.max_num_tokens,
max_encoder_context_length=lora_layer_params.
max_encoder_context_length,
host_request_types=lora_layer_params.host_request_types,
@ -603,6 +602,10 @@ class MoeOOTB(MOE):
expert_weights = split(experts_weights, 1, dim=0)
for i, expert in enumerate(self.experts):
if self.mapping.has_moe_ep():
index = i + self.experts_per_node * self.mapping.moe_ep_rank
else:
index = i
# get mask token index
non_zero_index = nonzero(experts_mask[i].view(
concat([-1, hidden_size])))
@ -627,16 +630,9 @@ class MoeOOTB(MOE):
output = output.view(shape(hidden_states))
need_ep_reduce = self.mapping.has_moe_ep(
) and self.mapping.moe_ep_group is not None
need_tp_reduce = self.mapping.has_moe_tp(
) and self.mapping.moe_tp_group is not None
if need_tp_reduce or need_ep_reduce:
group = self.mapping.moe_ep_group if need_ep_reduce else self.mapping.moe_tp_group
# TODO: remove this NCCL strategy WAR after fixed https://nvbugspro.nvidia.com/bug/4740067
if self.tp_size > 1 and self.tp_group is not None:
output = allreduce(output,
group,
strategy=AllReduceStrategy.NCCL,
self.mapping.tp_group,
reduce_fusion_params=reduce_fusion_params)
return output

View File

@ -27,6 +27,7 @@ from .falcon.model import FalconForCausalLM, FalconModel
from .gemma.model import GemmaForCausalLM
from .gpt.config import GPTConfig
from .gpt.model import GPTForCausalLM, GPTModel
from .gptj.config import GPTJConfig
from .gptj.model import GPTJForCausalLM, GPTJModel
from .gptneox.model import GPTNeoXForCausalLM, GPTNeoXModel
from .grok.model import GrokForCausalLM
@ -65,6 +66,7 @@ __all__ = [
'MedusaConfig',
'MedusaForCausalLm',
'ReDrafterForCausalLM',
'GPTJConfig',
'GPTJModel',
'GPTJForCausalLM',
'GPTNeoXModel',

View File

@ -667,6 +667,7 @@ class EncoderModel(PretrainedModel):
def prepare_inputs(self,
max_batch_size,
max_input_len,
max_num_tokens,
prompt_embedding_table_size: int = 0,
lora_target_modules: List[str] = None,
*args,
@ -889,7 +890,7 @@ class EncoderModel(PretrainedModel):
lora_params = LoraParams(
lora_ranks=lora_ranks,
lora_weights_pointers=lora_weights_pointers,
max_context_length=max_input_len,
max_num_tokens=max_num_tokens,
host_request_types=host_request_types,
host_context_lengths=host_context_lengths,
)
@ -1225,6 +1226,7 @@ class DecoderModel(PretrainedModel):
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_num_tokens,
max_encoder_input_len,
gather_context_logits: bool = False,
gather_generation_logits: bool = False,
@ -1594,7 +1596,7 @@ class DecoderModel(PretrainedModel):
lora_ranks=lora_ranks,
lora_weights_pointers=lora_weights_pointers,
host_context_lengths=host_context_lengths,
max_context_length=max_decoder_input_len,
max_num_tokens=max_num_tokens,
max_encoder_context_length=max_encoder_input_len,
host_request_types=host_request_types,
host_encoder_input_lengths=host_encoder_input_lengths,


@ -15,14 +15,20 @@
from typing import Optional, Union
import torch
from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ..modeling_utils import PretrainedConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
class GPTConfig(PretrainedConfig):
def __init__(self,
*,
gpt_variant: str = 'gpt2',
bias: bool = True,
q_scaling: float = 1.0,
embedding_scale: Optional[float] = None,
@ -30,8 +36,11 @@ class GPTConfig(PretrainedConfig):
rotary_pct: float = 1.0,
rotary_base: float = 10000.0,
rotary_scaling: Optional[dict] = None,
inner_layernorm: bool = False,
norm_before_bmm1: bool = False,
moe: Optional[Union[MoeConfig, dict]] = None,
**kwargs):
self.gpt_variant = gpt_variant
self.bias = bias
self.q_scaling = q_scaling
self.embedding_scale = embedding_scale
@ -39,6 +48,8 @@ class GPTConfig(PretrainedConfig):
self.rotary_pct = rotary_pct
self.rotary_base = rotary_base
self.rotary_scaling = rotary_scaling
self.inner_layernorm = inner_layernorm
self.norm_before_bmm1 = norm_before_bmm1
if moe is None:
# Legacy MOE config fields
moe = MoeConfig(
@ -57,6 +68,7 @@ class GPTConfig(PretrainedConfig):
def to_dict(self):
output = super().to_dict()
# Serialize the fields added in GPTConfig
output['gpt_variant'] = self.gpt_variant
output['bias'] = self.bias
output['q_scaling'] = self.q_scaling
output['embedding_scale'] = self.embedding_scale
@ -65,5 +77,244 @@ class GPTConfig(PretrainedConfig):
output['rotary_pct'] = self.rotary_pct
output['rotary_base'] = self.rotary_base
output['rotary_scaling'] = self.rotary_scaling
output['inner_layernorm'] = self.inner_layernorm
output['norm_before_bmm1'] = self.norm_before_bmm1
output['moe'] = self.moe.to_dict()
return output
@classmethod
def from_hugging_face(
cls,
hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
import transformers
from .convert import get_needed_padding
if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
hf_config = hf_config_or_dir
else:
hf_config = transformers.AutoConfig.from_pretrained(
hf_config_or_dir, trust_remote_code=True)
gpt_variant = kwargs.pop('gpt_variant', None)
if gpt_variant is None:
logger.info("Inferring gpt variant from path...")
for v in [
'starcoder2', 'starcoder', 'santacoder', 'gpt2',
'persimmon', 'fuyu', 'kosmos-2', 'jais'
]:
if v in hf_config._name_or_path:
gpt_variant = v
break
if gpt_variant == 'fuyu':
gpt_variant = 'persimmon'
assert gpt_variant in [
'gpt2', 'santacoder', 'starcoder', 'starcoder2', 'persimmon',
'kosmos-2', 'jais'
]
logger.info(f"Gpt variant: {gpt_variant}")
if gpt_variant in ['starcoder2', 'persimmon']:
hf_config.n_embd = hf_config.hidden_size
hf_config.n_inner = hf_config.intermediate_size
hf_config.n_head = hf_config.num_attention_heads
hf_config.n_kv_head = hf_config.num_key_value_heads if hasattr(
hf_config, 'num_key_value_heads') else hf_config.n_head
hf_config.n_layer = hf_config.num_hidden_layers
hf_config.n_positions = hf_config.max_position_embeddings
hf_config.activation_function = 'gelu' if gpt_variant == 'starcoder2' else 'squared-relu'
hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps
hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else True
hf_config.position_embedding_type = 'rope_gpt_neox'
hf_config.rotary_base = hf_config.rope_theta
hf_config.rotary_pct = getattr(hf_config, 'partial_rotary_factor',
1.0)
elif gpt_variant == "kosmos-2":
hf_config.n_embd = hf_config.text_config.embed_dim
hf_config.n_inner = hf_config.text_config.ffn_dim
hf_config.n_head = hf_config.text_config.attention_heads
hf_config.n_kv_head = hf_config.n_head
hf_config.n_layer = hf_config.text_config.layers
hf_config.n_positions = hf_config.text_config.max_position_embeddings
hf_config.activation_function = hf_config.text_config.activation_function
hf_config.layer_norm_epsilon = hf_config.text_config.layer_norm_eps
hf_config.bias = True
hf_config.vocab_size = hf_config.text_config.vocab_size
else:
if hf_config.n_inner is None:
hf_config.n_inner = hf_config.n_embd * 4
if gpt_variant in ['santacoder', 'starcoder']:
hf_config.n_kv_head = 1
else:
hf_config.n_kv_head = hf_config.n_head
if gpt_variant == 'jais':
hf_config.q_scaling = (hf_config.n_embd // hf_config.n_head)**0.5
if hasattr(hf_config, 'width_scale'):
hf_config.logits_scale = hf_config.width_scale
else:
hf_config.logits_scale = hf_config.mup_output_alpha * hf_config.mup_width_scale
if hasattr(hf_config, 'mup_embeddings_scale'):
hf_config.embeddings_scale = hf_config.mup_embeddings_scale
else:
assert hasattr(hf_config, 'embeddings_scale')
hf_config.n_inner += get_needed_padding(hf_config.n_inner,
mapping.tp_size)
if gpt_variant == 'kosmos-2':
if hf_config.text_config.scale_embedding:
hf_config.embeddings_scale = hf_config.n_embd**0.5
if dtype == 'auto':
dtype = getattr(hf_config, 'torch_dtype', None)
if dtype is None:
dtype = 'float16'
if isinstance(dtype, torch.dtype):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
return cls(architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.n_layer,
num_attention_heads=hf_config.n_head,
num_key_value_heads=hf_config.n_kv_head,
hidden_size=hf_config.n_embd,
intermediate_size=hf_config.n_inner,
norm_epsilon=hf_config.layer_norm_epsilon,
vocab_size=hf_config.vocab_size,
position_embedding_type=getattr(hf_config,
'position_embedding_type',
'learned_absolute'),
max_position_embeddings=hf_config.n_positions,
hidden_act=hf_config.activation_function,
gpt_variant=gpt_variant,
bias=getattr(hf_config, 'bias', True),
apply_query_key_layer_scaling=getattr(
hf_config, 'apply_query_key_layer_scaling', False),
rotary_pct=getattr(hf_config, 'rotary_pct', 1.0),
rotary_base=getattr(hf_config, 'rotary_base', 10000.0),
rotary_scaling=getattr(hf_config, 'rotary_scaling', None),
qk_layernorm=gpt_variant == 'persimmon',
inner_layernorm=gpt_variant == 'kosmos-2',
norm_before_bmm1=gpt_variant == 'kosmos-2',
q_scaling=getattr(hf_config, 'q_scaling', 1),
embedding_scale=getattr(hf_config, 'embeddings_scale', None),
mapping=mapping,
quantization=quant_config,
**kwargs)
@classmethod
def from_nemo(cls,
nemo_ckpt_dir: str,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
import transformers
from .convert import (UnpackedNemoCheckpointDir, cpu_map_location,
gpu_map_location, rename_keys)
load_model_on_cpu = kwargs.pop('load_model_on_cpu', False)
nemo_rename_key = kwargs.pop('nemo_rename_key', [])
layer_rename_config = {
pattern.split(':')[0]: pattern.split(':')[1]
for pattern in nemo_rename_key
}
unpacked_checkpoints_dir = UnpackedNemoCheckpointDir(
nemo_ckpt_dir, load_checkpoints_to_cpu=load_model_on_cpu)
nemo_model_config = unpacked_checkpoints_dir.model_config
training_tp_size = nemo_model_config.get("tensor_model_parallel_size",
1)
training_pp_size = nemo_model_config.get("pipeline_model_parallel_size",
1)
checkpoints_paths = unpacked_checkpoints_dir.get_checkpoints_paths(
training_tp_size,
training_pp_size,
)
if unpacked_checkpoints_dir._load_checkpoints_to_cpu:
map_location_fn = cpu_map_location
else:
map_location_fn = gpu_map_location
model_00 = torch.load(checkpoints_paths[0][0],
map_location=map_location_fn)
model_00 = rename_keys(model_00, layer_rename_config)
vocab_size = model_00[
"model.language_model.embedding.word_embeddings.weight"].shape[
0] * training_tp_size
del model_00
hf_config = transformers.GPT2Config(
vocab_size=vocab_size,
n_positions=nemo_model_config['max_position_embeddings'],
n_embd=nemo_model_config['hidden_size'],
n_layer=nemo_model_config['num_layers'],
n_head=nemo_model_config['num_attention_heads'],
n_inner=nemo_model_config['ffn_hidden_size'],
activation_function=nemo_model_config['activation'],
layer_norm_epsilon=nemo_model_config['layernorm_epsilon'],
)
hf_config.n_kv_head = hf_config.n_head
hf_config.bias = nemo_model_config['bias']
hf_config.apply_query_key_layer_scaling = False
hf_config.position_embedding_type = nemo_model_config.get(
'position_embedding_type', 'learned_absolute')
if hf_config.position_embedding_type == 'rope':
hf_config.position_embedding_type = 'rope_gpt_neox'
hf_config.rotary_base = nemo_model_config.get('rotary_base', 10000.0)
hf_config.rotary_pct = nemo_model_config.get('rotary_percentage', 1.0)
assert hf_config.rotary_pct >= 0 and hf_config.rotary_pct <= 1
rotary_scaling_factor = nemo_model_config.get(
'seq_len_interpolation_factor', None)
if rotary_scaling_factor is None:
hf_config.rotary_scaling = None
else:
assert rotary_scaling_factor > 1
hf_config.rotary_scaling = {
'type': 'linear',
'factor': rotary_scaling_factor
}
if dtype == 'auto':
dtype = nemo_model_config['precision']
if dtype is None:
dtype = 'float16'
elif 'bf16' in dtype or 'bfloat16' in dtype:
dtype = 'bfloat16'
else:
dtype = 'float16'
return cls(architecture='GPTForCausalLM',
dtype=dtype,
num_hidden_layers=hf_config.n_layer,
num_attention_heads=hf_config.n_head,
num_key_value_heads=hf_config.n_kv_head,
hidden_size=hf_config.n_embd,
intermediate_size=hf_config.n_inner,
norm_epsilon=hf_config.layer_norm_epsilon,
vocab_size=hf_config.vocab_size,
position_embedding_type=hf_config.position_embedding_type,
max_position_embeddings=hf_config.n_positions,
hidden_act=hf_config.activation_function,
bias=hf_config.bias,
apply_query_key_layer_scaling=hf_config.
apply_query_key_layer_scaling,
rotary_pct=hf_config.rotary_pct,
rotary_base=hf_config.rotary_base,
rotary_scaling=hf_config.rotary_scaling,
mapping=mapping,
quantization=quant_config,
**kwargs)
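# Illustrative usage sketch for the factory methods added above (not part of
# this diff); 'gpt2' is a placeholder assumed to resolve to a Hugging Face
# GPT-2 checkpoint or a local directory.
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTConfig

demo_config = GPTConfig.from_hugging_face('gpt2',
                                          dtype='auto',
                                          mapping=Mapping(world_size=1))
print(demo_config.gpt_variant, demo_config.dtype)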

File diff suppressed because it is too large.


@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
from ..._utils import pad_vocab_size
from ...functional import (Tensor, is_gated_activation, non_gated_version, recv,
send)
@ -22,9 +24,12 @@ from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear,
from ...lora_manager import LoraConfig, use_lora
from ...mapping import Mapping
from ...module import Module
from ...quantization import QuantMode
from ..modeling_utils import DecoderLayerList, DecoderModelForCausalLM
from ...quantization import W8A8_SQ_PLUGIN_LIST, QuantAlgo, QuantMode
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
QuantConfig, check_share_embedding)
from .config import GPTConfig
from .convert import (load_hf_gpt, load_weights_from_hf_model,
load_weights_from_nemo)
def MLPFactory(hidden_size,
@ -276,5 +281,123 @@ class GPTForCausalLM(DecoderModelForCausalLM):
}
super().__init__(config, transformer, lm_head)
@classmethod
def from_hugging_face(
cls,
hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
''' Create a GPTForCausalLM object from the given parameters
'''
import transformers
load_model_on_cpu = kwargs.pop('load_model_on_cpu', False)
assert hf_model_or_dir is not None
use_preloading = isinstance(hf_model_or_dir,
transformers.PreTrainedModel)
if use_preloading:
hf_model = hf_model_or_dir
hf_config_or_dir = hf_model.config
else:
hf_model_dir = hf_model_or_dir
hf_config_or_dir = hf_model_or_dir
config = GPTConfig.from_hugging_face(hf_config_or_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
if not use_preloading:
hf_model = load_hf_gpt(hf_model_dir, load_model_on_cpu)
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = cls(config)
model.load(weights)
return model
@classmethod
def quantize(
cls,
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
):
DEFAULT_MODELOPT_FLOW = [
QuantAlgo.W4A16_AWQ, QuantAlgo.FP8, QuantAlgo.W8A8_SQ_PER_CHANNEL,
QuantAlgo.W4A8_AWQ
]
config = GPTConfig.from_hugging_face(hf_model_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
if quant_config.quant_algo in DEFAULT_MODELOPT_FLOW:
super().quantize(hf_model_dir,
output_dir,
dtype=config.dtype,
mapping=config.mapping,
quant_config=config.quantization,
device=device,
calib_dataset=calib_dataset,
calib_batches=calib_batches,
calib_batch_size=calib_batch_size,
calib_max_seq_length=calib_max_seq_length,
random_seed=random_seed,
tokenizer_max_seq_length=tokenizer_max_seq_length)
else:
# Non-Modelopt, i.e. the legacy TRT-LLM native quantization algorithms:
# SmoothQuant, INT4/INT8 weight-only, INT8 KV cache
NATIVE_QUANT_FLOW = [QuantAlgo.W4A16, QuantAlgo.W8A16, None
] + W8A8_SQ_PLUGIN_LIST
is_valid_native_quant = (quant_config.quant_algo in NATIVE_QUANT_FLOW) and \
(quant_config.kv_cache_quant_algo in [QuantAlgo.INT8, None])
assert quant_config.quant_algo is not None or quant_config.kv_cache_quant_algo is not None, \
"There is no point to call the quantize function if both quant_algo and kv_cache_quant_algo is None"
assert is_valid_native_quant, f"Internal error: shall call Modelopt for this quantization {quant_config}"
from . import convert
convert.quantize(hf_model_dir,
output_dir,
config=config,
device=device,
calib_dataset=calib_dataset)
@classmethod
def from_nemo(cls,
nemo_ckpt_dir: str,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
config = GPTConfig.from_nemo(nemo_ckpt_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
weights = load_weights_from_nemo(nemo_ckpt_dir, config, **kwargs)
check_share_embedding(weights, config)
model = cls(config)
model.load(weights)
return model
def use_lora(self, lora_config: LoraConfig):
use_lora(self, lora_config, self.trtllm_modules_to_hf_modules)
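# Hypothetical end-to-end conversion flow built on the classmethods above; the
# model name, output directory and TP size are placeholders, and
# save_checkpoint is assumed to be the usual PretrainedModel API, which this
# diff does not change.
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTForCausalLM

tp_size = 2
for rank in range(tp_size):
    shard = GPTForCausalLM.from_hugging_face(
        'gpt2',
        dtype='float16',
        mapping=Mapping(world_size=tp_size, tp_size=tp_size, rank=rank))
    shard.save_checkpoint('/tmp/trtllm_gpt2_tp2', save_config=(rank == 0))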


@ -0,0 +1,63 @@
from typing import Optional, Union
import torch
from ..._utils import torch_dtype_to_str
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
class GPTJConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a GPT-J model.
"""
def __init__(self, *, rotary_dim: int = 64, **kwargs):
self.rotary_dim = rotary_dim
super().__init__(**kwargs)
def to_dict(self):
output = super().to_dict()
output.update(rotary_dim=self.rotary_dim)
return output
@classmethod
def from_hugging_face(
cls,
hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
import transformers
if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
hf_config = hf_config_or_dir
else:
hf_config_dir = str(hf_config_or_dir)
hf_config = transformers.AutoConfig.from_pretrained(
hf_config_dir, trust_remote_code=True)
if dtype == 'auto':
dtype = getattr(hf_config, 'torch_dtype', None)
if dtype is None:
dtype = 'float16'
if isinstance(dtype, torch.dtype):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
return cls(architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
hidden_size=hf_config.hidden_size,
norm_epsilon=hf_config.layer_norm_epsilon,
vocab_size=hf_config.vocab_size,
position_embedding_type='rope_gptj',
max_position_embeddings=hf_config.max_position_embeddings,
hidden_act='gelu',
rotary_dim=hf_config.rotary_dim,
mapping=mapping,
quantization=quant_config,
**kwargs)


@ -0,0 +1,205 @@
import time
from typing import Dict, Optional, Tuple
import torch
from tensorrt_llm.quantization import QuantAlgo
from .config import GPTJConfig
def split(weight: torch.Tensor,
tp_size: int,
rank: int = 0,
dim: int = 0) -> torch.Tensor:
if tp_size == 1:
return weight
elif weight.ndim == 1:
return torch.chunk(weight, tp_size)[rank].contiguous()
else:
return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()
def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
dim: int) -> torch.Tensor:
return split(weight, tp_size, rank, dim=dim)
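# Quick sanity sketch of the tensor-parallel split helpers above (hypothetical
# shapes; torch is already imported at the top of this file):
demo_weight = torch.arange(12.0).reshape(4, 3)
demo_rank0 = split_matrix(demo_weight, tp_size=2, rank=0, dim=0)  # rows 0-1
demo_rank1 = split_matrix(demo_weight, tp_size=2, rank=1, dim=0)  # rows 2-3
assert demo_rank0.shape == (2, 3) and torch.equal(demo_rank1, demo_weight[2:])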
def get_weight(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.weight' not in params:
return None
return params[f'{prefix}.weight'].to(dtype).detach().cpu()
def get_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.bias' not in params:
return None
return params[f'{prefix}.bias'].to(dtype).detach().cpu()
def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> Tuple[torch.Tensor]:
return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)
def get_tllm_linear_weight(
weight: torch.Tensor,
prefix: str,
bias: Optional[torch.Tensor] = None,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if use_weight_only:
v = weight.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[f'{prefix}.weight'] = processed_torch_weights
results[f'{prefix}.per_channel_scale'] = torch_weight_scales
else:
results[f'{prefix}.weight'] = weight.contiguous()
if bias is not None:
results[f'{prefix}.bias'] = bias
return results
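# Minimal check of the non-quantized path above (made-up shapes; the
# weight-only branch additionally requires the TRT-LLM custom torch ops):
demo_out = get_tllm_linear_weight(torch.randn(8, 4),
                                  'transformer.layers.0.mlp.fc',
                                  bias=torch.randn(8),
                                  use_weight_only=False)
assert set(demo_out) == {'transformer.layers.0.mlp.fc.weight',
                         'transformer.layers.0.mlp.fc.bias'}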
def get_tllm_param(
param: torch.Tensor,
name: str,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if name.endswith('.weight') and use_weight_only:
v = param.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[name] = processed_torch_weights
results[name.replace('weight',
'per_channel_scale')] = torch_weight_scales
else:
results[name] = param
return results
def load_weights_from_hf_model(hf_model, config: GPTJConfig):
quant_algo = config.quantization.quant_algo
use_weight_only = quant_algo in [QuantAlgo.W8A16, QuantAlgo.W4A16]
if quant_algo == QuantAlgo.W8A16:
plugin_weight_only_quant_type = torch.int8
elif quant_algo == QuantAlgo.W4A16:
plugin_weight_only_quant_type = torch.quint4x2
else:
plugin_weight_only_quant_type = None
weights = {}
tik = time.time()
model_params = dict(hf_model.named_parameters())
dtype = getattr(torch, config.dtype)
num_hidden_layers = config.num_hidden_layers
mapping = config.mapping
layers_range = mapping.pp_layers(num_hidden_layers)
for l in layers_range:
prefix = f'transformer.h.{l}'
tllm_prex = f'transformer.layers.{l-layers_range[0]}'
# Attention QKV (no bias)
q_weight = get_weight(model_params, f'{prefix}.attn.q_proj', dtype)
k_weight = get_weight(model_params, f'{prefix}.attn.k_proj', dtype)
v_weight = get_weight(model_params, f'{prefix}.attn.v_proj', dtype)
q_w = split_matrix(q_weight, mapping.tp_size, mapping.tp_rank, dim=0)
k_w = split_matrix(k_weight, mapping.tp_size, mapping.tp_rank, dim=0)
v_w = split_matrix(v_weight, mapping.tp_size, mapping.tp_rank, dim=0)
qkv_w = torch.concatenate([q_w, k_w, v_w], dim=0)
weights.update(
get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', None,
use_weight_only,
plugin_weight_only_quant_type))
# Attention dense (no bias)
attn_dense_weight = get_weight(model_params, f'{prefix}.attn.out_proj',
dtype)
attn_dense_w = split_matrix(attn_dense_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_tllm_linear_weight(attn_dense_w, f'{tllm_prex}.attention.dense',
None, use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_in (with bias)
mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_in', dtype)
mlp_fc_w = split_matrix(mlp_fc_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
mlp_fc_b = split_matrix(mlp_fc_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc', mlp_fc_b,
use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_out (with bias)
mlp_proj_weight, mlp_proj_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_out', dtype)
mlp_proj_w = split_matrix(mlp_proj_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
# Only rank 0 keeps the original bias; other TP ranks get zeros
if mapping.tp_size > 1 and mapping.tp_rank > 0:
mlp_proj_bias = torch.zeros(mlp_proj_weight.shape[0],
dtype=mlp_proj_weight.dtype)
weights.update(
get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj',
mlp_proj_bias, use_weight_only,
plugin_weight_only_quant_type))
input_ln_weight, input_ln_bias = get_weight_and_bias(
model_params, f'{prefix}.ln_1', dtype)
weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight
weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias
if mapping.is_first_pp_rank():
# Embedding
embed_w = get_weight(model_params, 'transformer.wte', dtype)
if config.use_parallel_embedding:
embed_w = split_matrix(embed_w,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights['transformer.vocab_embedding.weight'] = embed_w
if mapping.is_last_pp_rank():
# lm_head weight and bias
lm_head_w, ln_head_bias = get_weight_and_bias(model_params, 'lm_head',
dtype)
weights['lm_head.weight'] = split_matrix(lm_head_w,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights['lm_head.bias'] = split_matrix(ln_head_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f',
dtype)
# ln_f weight and bias
weights['transformer.ln_f.weight'] = ln_f_w
if ln_f_b is not None:
weights['transformer.ln_f.bias'] = ln_f_b
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Weights loaded. Total time: {t}')
return weights


@ -13,18 +13,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
from ..._utils import pad_vocab_size
from ...functional import PositionEmbeddingType, Tensor, allreduce
from ...layers import (MLP, Attention, AttentionMaskType, ColumnLinear,
Embedding, LayerNorm)
from ...mapping import Mapping
from ...module import Module
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
check_share_embedding)
from .config import GPTJConfig
from .convert import load_weights_from_hf_model
class GPTJDecoderLayer(Module):
def __init__(self, config: PretrainedConfig, layer_idx: int):
def __init__(self, config: GPTJConfig, layer_idx: int):
super().__init__()
self.layer_idx = layer_idx
self.config = config
@ -104,7 +109,7 @@ class GPTJDecoderLayer(Module):
class GPTJModel(Module):
def __init__(self, config: PretrainedConfig):
def __init__(self, config: GPTJConfig):
super().__init__()
self.config = config
@ -144,9 +149,9 @@ class GPTJModel(Module):
class GPTJForCausalLM(DecoderModelForCausalLM):
config_class = GPTJConfig
def __init__(self, config: PretrainedConfig):
self.check_config(config)
def __init__(self, config: GPTJConfig):
transformer = GPTJModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
@ -162,5 +167,36 @@ class GPTJForCausalLM(DecoderModelForCausalLM):
lm_head = None
super().__init__(config, transformer, lm_head)
def check_config(self, config):
config.set_if_not_exist('rotary_dim', 64)
@classmethod
def from_hugging_face(
cls,
hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config=None,
**kwargs):
import transformers
use_preloading = isinstance(hf_model_or_dir,
transformers.PreTrainedModel)
if use_preloading:
hf_model = hf_model_or_dir
hf_config_or_dir = hf_model.config
else:
hf_model_dir = hf_model_or_dir
hf_config_or_dir = hf_model_or_dir
config = GPTJConfig.from_hugging_face(hf_config_or_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
if not use_preloading:
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir, torch_dtype='auto', trust_remote_code=True)
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = GPTJForCausalLM(config)
model.load(weights)
return model
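# Hypothetical conversion sketch using the classmethod above; 'gpt-j-6b' is a
# placeholder for a local GPT-J checkpoint directory, the output path is made
# up, and save_checkpoint is assumed to be the standard PretrainedModel API.
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTJForCausalLM
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

gptj = GPTJForCausalLM.from_hugging_face(
    'gpt-j-6b',
    dtype='float16',
    mapping=Mapping(world_size=1),
    quant_config=QuantConfig(quant_algo=QuantAlgo.W4A16))
gptj.save_checkpoint('/tmp/trtllm_gptj_int4_ckpt', save_config=True)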


@ -21,7 +21,6 @@ import torch
from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
@ -146,14 +145,9 @@ class LLaMAConfig(PretrainedConfig):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
return cls(
architecture='LlamaForCausalLM',
architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
@ -208,11 +202,6 @@ class LLaMAConfig(PretrainedConfig):
if dtype == 'auto':
dtype = 'bfloat16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
if meta_config.get('use_scaled_rope'):
rotary_scaling = {"type": "llama3"}


@ -14,7 +14,6 @@
# limitations under the License.
import copy
import functools
import json
import os
import sys
import time
@ -473,8 +472,8 @@ def fp8_per_channel_quant_weight_gpu(weight, clamp_val, rank=0):
xmax = x.abs().max(-1, keepdim=True).values
# minimum scaling factor.
torch_weight_scales = (xmax / 448.0).clamp(min=1.0 / (448.0 * 512.0))
out = x / torch_weight_scales
torch_weight_scales = torch_weight_scales.reshape(-1)
out = x * 448.0 / xmax
out = torch.clamp(out, -448, 448)
processed_torch_weights = out.to(torch.float8_e4m3fn)
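# Numeric sketch of the per-channel FP8 mapping above (made-up values; torch is
# already imported in this module). Each row is scaled so its absolute maximum
# lands on 448, the E4M3 limit, and the per-channel scale recovers the original
# magnitude on dequantization.
demo_x = torch.tensor([[0.5, -2.0, 4.0]])
demo_xmax = demo_x.abs().max(-1, keepdim=True).values             # 4.0
demo_scale = (demo_xmax / 448.0).clamp(min=1.0 / (448.0 * 512.0))
demo_q = torch.clamp(demo_x * 448.0 / demo_xmax, -448, 448)
demo_deq = demo_q.to(torch.float8_e4m3fn).to(torch.float32) * demo_scale  # ~= demo_x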
@ -1315,13 +1314,12 @@ def quantize(hf_model_dir: str,
'''
# TODO: currently only SmoothQuant and KV-cache quantization are supported; more quant algorithms need to be supported by calling Modelopt
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config.to_dict(), f, indent=4)
config.to_json_file(os.path.join(output_dir, 'config.json'))
mapping = config.mapping
assert mapping.rank == -1, "quantize should be called only once rather than once per rank; rank == -1 is asserted as a precaution"
quant_config = config.quantization
quant_config = config.quantization
use_smooth_quant = quant_config.use_plugin_sq
int8_kv_cache = quant_config.kv_cache_quant_algo == QuantAlgo.INT8


@ -14,6 +14,8 @@
# limitations under the License.
from typing import Optional, Union
import transformers
from ..._common import default_net
from ..._utils import pad_vocab_size
from ...functional import (AllReduceFusionOp, AllReduceFusionParams, Tensor,
@ -323,7 +325,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = LLaMAForCausalLM(config)
model = cls(config)
model.load(weights)
return model
@ -349,7 +351,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
weights = load_weights_from_meta_ckpt(meta_ckpt_dir, config)
check_share_embedding(weights, config)
model = LLaMAForCausalLM(config)
model = cls(config)
model.load(weights)
return model


@ -615,7 +615,7 @@ class PretrainedModel(Module,
model_inputs['lora_ranks'],
model_inputs['lora_weights_pointers'],
host_context_lengths=model_inputs['host_context_lengths'],
max_context_length=max_input_len,
max_num_tokens=max_num_tokens,
host_request_types=model_inputs['host_request_types'])
if model_inputs['spec_decoding_params'] is not None:
result['spec_decoding_params'] = model_inputs[
@ -757,6 +757,10 @@ def fuse_gate_mlp(
from ..quantization.quantize import fp8_quantize
quant_algo = model.config.quantization.quant_algo
if quant_algo != QuantAlgo.FP8 and quant_algo is not None:
logger.warning("fuse_gate_mlp cannot be done for this model. Skipping.")
return model
for name, mlp, layer in model.named_modules_with_parent():
if isinstance(mlp, GatedMLP):
init_params = get_init_params(mlp)


@ -18,7 +18,6 @@ import torch
from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
@ -123,14 +122,9 @@ class QWenConfig(PretrainedConfig):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
return cls(
architecture='QWenForCausalLM',
architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,


@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.12.0.dev2024072301"
__version__ = "0.12.0.dev2024072302"


@ -21,6 +21,10 @@ from utils.util import force_ampere, similar
from tensorrt_llm.models.llama.model import LLaMAForCausalLM
skip_single_gpu = pytest.mark.skipif(
torch.cuda.device_count() < 2,
reason="The test needs at least 2 GPUs, skipping")
# The unittests are based on tiny-llama, which is fast to build and run.
# There are other tests based on the llama-7B model, such as the end-to-end tests in test_e2e.py and the
# parallel tests in test_llm_multi_gpu.py.


@ -0,0 +1,114 @@
from typing import List, Optional
import pytest
import torch
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.hlapi import QuantAlgo, QuantConfig
try:
from .test_llm import get_model_path
except ImportError:
from test_llm import get_model_path
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import force_ampere, similar, skip_pre_hopper
gptj_model_path = get_model_path('gpt-j-6b')
gpt2_model_path = get_model_path('gpt2-medium')
starcoder2_model_path = get_model_path('starcoder2-3b')
sampling_params = SamplingParams(max_new_tokens=10)
def llm_test_harness(model_dir: str,
prompts: List[str],
references: List[str],
*,
sampling_params: Optional[SamplingParams] = None,
similar_threshold: float = 0.8,
**llm_kwargs):
# skip if there are not enough GPUs
tp_size = llm_kwargs.get('tensor_parallel_size', 1)
pp_size = llm_kwargs.get('pipeline_parallel_size', 1)
world_size = tp_size * pp_size
if world_size > torch.cuda.device_count():
pytest.skip(
f"world_size ({world_size}) is greater than available GPUs ({torch.cuda.device_count()})"
)
llm = LLM(model_dir, tokenizer=model_dir, **llm_kwargs)
outputs = llm.generate(prompts, sampling_params=sampling_params)
for out, ref in zip(outputs, references):
assert similar(out.outputs[0].text, ref, threshold=similar_threshold)
@force_ampere
def test_llm_gptj():
llm_test_harness(gptj_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params)
@force_ampere
def test_llm_gptj_int4_weight_only():
quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16)
llm_test_harness(gptj_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params,
quant_config=quant_config)
@force_ampere
def test_llm_gptj_tp2():
llm_test_harness(gptj_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params,
tensor_parallel_size=2)
@force_ampere
def test_llm_gpt2():
llm_test_harness(gpt2_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params)
@skip_pre_hopper
def test_llm_gpt2_fp8():
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
llm_test_harness(gpt2_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params,
quant_config=quant_config)
@force_ampere
def test_llm_starcoder2():
llm_test_harness(starcoder2_model_path,
prompts=["def print_hello_world():"],
references=['\n print("Hello World")\n\ndef print'],
sampling_params=sampling_params)
@skip_pre_hopper
def test_llm_starcoder2_fp8():
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
llm_test_harness(starcoder2_model_path,
prompts=["def print_hello_world():"],
references=['\n print("Hello World")\n\ndef print'],
sampling_params=sampling_params,
quant_config=quant_config)
if __name__ == '__main__':
test_llm_gpt2()


@ -22,15 +22,11 @@ from tensorrt_llm.models.llama.model import LLaMAForCausalLM
try:
from .test_llm import (_test_llm_generate_async, default_model_name,
get_model_path, llama_model_path, mixtral_model_name,
prompts)
prompts, skip_single_gpu)
except ImportError:
from test_llm import (_test_llm_generate_async, default_model_name,
get_model_path, llama_model_path, mixtral_model_name,
prompts)
skip_single_gpu = pytest.mark.skipif(
torch.cuda.device_count() < 2,
reason="The test needs at least 2 GPUs, skipping")
prompts, skip_single_gpu)
@pytest.fixture(scope="module")


@ -33,6 +33,7 @@ from tensorrt_llm import Builder
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.functional import RotaryScalingType
from tensorrt_llm.layers import PositionEmbeddingType
from tensorrt_llm.models.gpt.convert import load_weights_from_hf_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.runtime import ModelConfig, SamplingConfig
@ -40,9 +41,6 @@ from tensorrt_llm.runtime.generation import _prepare_attention_mask
from tensorrt_llm.runtime.kv_cache_manager import (GenerationSequence,
KVCacheManager)
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from examples.gpt.convert_checkpoint import convert_hf_gpt
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func
@ -86,12 +84,9 @@ class TestGPT(unittest.TestCase):
'bias': getattr(gpt_config, 'bias', True),
'apply_query_key_layer_scaling': apply_query_key_layer_scaling,
}
config = tensorrt_llm.models.PretrainedConfig.from_dict(config)
weights = convert_hf_gpt(hf_gpt,
gpt_config,
"gpt2",
config.mapping,
dtype=dtype)
config = tensorrt_llm.models.GPTConfig.from_dict(config)
weights = load_weights_from_hf_model(hf_gpt, config)
tensorrt_llm_gpt = tensorrt_llm.models.GPTForCausalLM(config)
tensorrt_llm_gpt.load(weights)


@ -29,13 +29,10 @@ from transformers import GPTJConfig, GPTJForCausalLM
import tensorrt_llm
from tensorrt_llm import Builder
from tensorrt_llm.models.gptj.convert import load_weights_from_hf_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from examples.gptj.convert_checkpoint import convert_hf_gptj
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func
@ -82,10 +79,7 @@ class TestGPTJ(unittest.TestCase):
}
config = tensorrt_llm.models.PretrainedConfig.from_dict(config)
config.set_rank(rank)
weights = convert_hf_gptj(hf_gpt,
gpt_config,
config.mapping,
dtype=dtype)
weights = load_weights_from_hf_model(hf_gpt, config)
trtllm_model = tensorrt_llm.models.GPTJForCausalLM(config)
trtllm_model.load(weights)


@ -86,8 +86,9 @@ class TestLLaMA(unittest.TestCase):
# Initialize model
config = tensorrt_llm.models.LLaMAConfig.from_dict(config)
tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM(config)
weights = load_weights_from_hf_model(hf_llama, config)
tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM(config)
tensorrt_llm_llama.load(weights)
optimize_model(tensorrt_llm_llama, **opt_flags)


@ -12,16 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
from pathlib import Path
import numpy as np
import torch
sys.path.append(str(Path(__file__).parent.resolve() /
"../examples/gpt")) # more precise, avoid confusion
from convert_checkpoint import generate_int8
from tensorrt_llm.models.gpt.convert import generate_int8
def dist(x, y):