open source 315e9f5ccd286e906d4c0d402fefbf2f69a1febe (#2033)

This commit is contained in:
Kaiyu Xie 2024-07-26 16:19:24 +08:00 committed by GitHub
parent 5fa9436e17
commit 93293aa46d
GPG Key ID: B5690EEEBB952194
97 changed files with 3057 additions and 6767 deletions

View File

@ -10,8 +10,6 @@ multiple GPUs or multiple nodes with multiple GPUs using the Python runtime.
The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:
* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define configuration for each supported model.
* [`benchmarks/python/build.py`](./build.py) to build supported models for benchmarking.
* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmarks.
* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like (LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.
@ -25,37 +23,29 @@ python benchmark.py -h
```
### 1. Single GPU benchmark
Take GPT-350M as an example:
Take LLaMA 7B as an example:
```
python benchmark.py \
-m gpt_350m \
--mode plugin \
-m dec \
--engine_dir llama_7b \
--batch_size "1;8;64" \
--input_output_len "60,20;128,20"
```
Expected outputs:
```
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 1 input_length 60 output_length 20 gpu_peak_mem(gb) 4.2 build_time(s) 25.67 tokens_per_sec 483.54 percentile95(ms) 41.537 percentile99(ms) 42.102 latency(ms) 41.362 compute_cap sm80
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 8 input_length 60 output_length 20 gpu_peak_mem(gb) 4.28 build_time(s) 25.67 tokens_per_sec 3477.28 percentile95(ms) 46.129 percentile99(ms) 46.276 latency(ms) 46.013 compute_cap sm80
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 64 input_length 60 output_length 20 gpu_peak_mem(gb) 4.8 build_time(s) 25.67 tokens_per_sec 19698.07 percentile95(ms) 65.739 percentile99(ms) 65.906 latency(ms) 64.981 compute_cap sm80
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 1 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 170.77 percentile95(ms) 117.591 percentile99(ms) 124.262 latency(ms) 117.115 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 110.189 total_generated_tokens 19.0 generation_tokens_per_second 172.43
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 8 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 1478.55 percentile95(ms) 108.641 percentile99(ms) 109.546 latency(ms) 108.214 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 98.194 total_generated_tokens 152.0 generation_tokens_per_second 1547.951
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 64 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 8214.87 percentile95(ms) 156.748 percentile99(ms) 160.203 latency(ms) 155.815 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 111.078 total_generated_tokens 1216.0 generation_tokens_per_second 10947.303
...
```
*Please note that the expected outputs are only for reference; specific performance numbers depend on the GPU you're using.*
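The `--batch_size` and `--input_output_len` options take semicolon-separated lists, with each input/output pair separated by a comma. A minimal sketch of how such option strings decompose (the helper names below are hypothetical, not the benchmark's own parser):
```python
# Hypothetical helpers illustrating the option-string format used above;
# benchmark.py has its own parsing, this is only a sketch.
def parse_batch_sizes(arg: str):
    """'1;8;64' -> [1, 8, 64]"""
    return [int(b) for b in arg.split(";") if b]

def parse_in_out_lens(arg: str):
    """'60,20;128,20' -> [(60, 20), (128, 20)]"""
    pairs = []
    for item in arg.split(";"):
        if item:
            inlen, outlen = item.split(",")
            pairs.append((int(inlen), int(outlen)))
    return pairs

print(parse_batch_sizes("1;8;64"))        # [1, 8, 64]
print(parse_in_out_lens("60,20;128,20"))  # [(60, 20), (128, 20)]
```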
### 2. Multi-GPU benchmark
Take GPT-175B as an example:
Take LLaMA 7B as an example:
```
mpirun -n 8 python benchmark.py \
-m gpt_175b \
--mode plugin \
mpirun -n 2 python benchmark.py \
-m dec \
--engine_dir llama_7b \
--batch_size "1;8;64" \
--input_output_len "60,20;128,20"
```
Note: Building multi-GPU engines in parallel can be a heavy workload for the CPU system. Tuning the `mpirun --map-by <XXX>` option on your system may yield a significant boost in build time, for example:
```
mpirun --map-by socket -n 8 python build.py \
--model gpt_175b \
--mode ootb \
--quantization fp8
```
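Since the benchmark now consumes a prebuilt engine, the number of MPI ranks must match the world size the engine was built with (the sanity checks in `base_benchmark.py` assert this). A small pre-flight sketch, assuming the `config.json` layouts shown later in this commit:
```python
# Sketch only: read the engine's world size from config.json so the
# `mpirun -n <N>` launch above can be matched to it. The two layouts below
# mirror the checks in base_benchmark.py (new build API vs. legacy builder_config).
import json
import os

def engine_world_size(engine_dir: str) -> int:
    with open(os.path.join(engine_dir, "config.json")) as f:
        config = json.load(f)
    if "pretrained_config" in config:  # new build API layout
        return config["pretrained_config"]["mapping"]["world_size"]
    return config["builder_config"]["tensor_parallel"]  # legacy layout

# e.g. only launch `mpirun -n 2 ...` when engine_world_size("llama_7b") == 2
```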

File diff suppressed because it is too large

View File

@ -32,13 +32,13 @@ def get_compute_cap():
return str(int(float(csv_value) * 10))
def get_csv_filename(model, dtype, tp_size, mode, **kwargs):
def get_csv_filename(model, dtype, tp_size, **kwargs):
sm = get_compute_cap()
if len(kwargs) == 0:
kw_pairs = ""
else:
kw_pairs = "_" + "_".join([str(k) + str(v) for k, v in kwargs.items()])
return f'{model}_{dtype}_tp{tp_size}_{mode}{kw_pairs}_sm{sm}.csv'
return f'{model}_{dtype}_tp{tp_size}_{kw_pairs}_sm{sm}.csv'
def get_engine_name(model, dtype, tp_size, rank):
@ -59,13 +59,7 @@ def serialize_engine(engine, path):
class BaseBenchmark(object):
def __init__(self,
engine_dir,
model_name,
dtype,
rank,
world_size,
serial_build: bool = False):
def __init__(self, engine_dir, model_name, dtype, rank, world_size):
self.engine_dir = engine_dir
self.model_name = model_name
self.dtype = dtype
@ -74,73 +68,67 @@ class BaseBenchmark(object):
self.engine_model_name = model_name
self.quant_mode = QuantMode(0)
self.enable_fp8 = False
if engine_dir is not None:
# Read config from engine directory
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
self.config = json.load(f)
# Sanity checks
if 'pretrained_config' in self.config: # new build api branch
config_dtype = self.config['pretrained_config']['dtype']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['pretrained_config']['mapping'][
'world_size']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['pretrained_config'].items():
setattr(self, key, value)
self.quant_mode = QuantMode.from_quant_algo(
quant_algo=self.quantization['quant_algo'],
kv_cache_quant_algo=self.quantization['kv_cache_quant_algo'])
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['build_config'].items():
setattr(self, key, value)
for key, value in self.plugin_config.items():
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = f"rank{self.runtime_rank}.engine"
self.num_kv_heads = self.num_key_value_heads
self.num_layers = self.num_hidden_layers
self.num_heads = self.num_attention_heads
else:
# Read config from engine directory
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
self.config = json.load(f)
# Sanity checks
if 'pretrained_config' in self.config: # new build api branch
config_dtype = self.config['pretrained_config']['dtype']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['pretrained_config']['mapping'][
'world_size']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['pretrained_config'].items():
config_dtype = self.config['builder_config']['precision']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['builder_config']['tensor_parallel']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['builder_config'].items():
if key == "quant_mode":
self.quant_mode = QuantMode(value)
elif key in "name":
self.engine_model_name = value
else:
setattr(self, key, value)
self.quant_mode = QuantMode.from_quant_algo(
quant_algo=self.quantization['quant_algo'],
kv_cache_quant_algo=self.quantization['kv_cache_quant_algo']
)
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['build_config'].items():
setattr(self, key, value)
for key, value in self.plugin_config.items():
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = f"rank{self.runtime_rank}.engine"
self.num_kv_heads = self.num_key_value_heads
self.num_layers = self.num_hidden_layers
self.num_heads = self.num_attention_heads
else:
# Read config from engine directory
config_path = os.path.join(engine_dir, 'config.json')
with open(config_path, 'r') as f:
self.config = json.load(f)
# Sanity checks
config_dtype = self.config['builder_config']['precision']
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
world_size = self.config['builder_config']['tensor_parallel']
assert world_size == self.world_size, \
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
# Load config into self
for key, value in self.config['builder_config'].items():
if key == "quant_mode":
self.quant_mode = QuantMode(value)
elif key in "name":
self.engine_model_name = value
else:
setattr(self, key, value)
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['plugin_config'].items():
# Same effect as self.use_foo_plugin = config.json["foo_plugin"]
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = get_engine_name(self.engine_model_name,
self.dtype, self.world_size,
self.runtime_rank)
else:
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
for key, value in self.config['plugin_config'].items():
# Same effect as self.use_foo_plugin = config.json["foo_plugin"]
if "plugin" in key:
key = "use_" + key
setattr(self, key, value)
self.engine_name = get_engine_name(self.engine_model_name,
self.dtype, self.world_size,
self.runtime_rank)
@ -148,9 +136,9 @@ class BaseBenchmark(object):
self.runtime_mapping = tensorrt_llm.Mapping(world_size=self.world_size,
rank=self.runtime_rank,
tp_size=self.world_size)
if not serial_build:
torch.cuda.set_device(self.runtime_rank %
self.runtime_mapping.gpus_per_node)
torch.cuda.set_device(self.runtime_rank %
self.runtime_mapping.gpus_per_node)
self.csv_filename = "" # lazy init
@ -189,7 +177,6 @@ class BaseBenchmark(object):
self.csv_filename = get_csv_filename(self.model_name,
self.dtype,
self.world_size,
self.mode,
fp8linear=int(self.enable_fp8))
return self.csv_filename
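For reference, a standalone sketch of the new CSV naming scheme with `mode` dropped (the real `get_csv_filename` queries the GPU's compute capability; "90" is hard-coded here); note the double underscore that the leading `_` in `kw_pairs` now produces:
```python
# Standalone sketch of the new get_csv_filename behavior, not the real function.
def csv_filename_sketch(model, dtype, tp_size, sm="90", **kwargs):
    if len(kwargs) == 0:
        kw_pairs = ""
    else:
        kw_pairs = "_" + "_".join(str(k) + str(v) for k, v in kwargs.items())
    return f"{model}_{dtype}_tp{tp_size}_{kw_pairs}_sm{sm}.csv"

print(csv_filename_sketch("dec", "float16", 1, fp8linear=0))
# -> dec_float16_tp1__fp8linear0_sm90.csv
```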

View File

@ -20,26 +20,15 @@ import torch
def parse_arguments():
from allowed_configs import get_allowed_models
parser = argparse.ArgumentParser(
description='Benchmark TensorRT-LLM models.')
parser.add_argument('-m',
'--model',
type=str,
default="gpt_350m",
choices=get_allowed_models(),
help='Specify model you want to benchmark.')
parser.add_argument(
'--mode',
type=str,
default="plugin",
choices=['ootb', 'plugin', 'ootb-except-mha'],
help=
('Choose mode between ootb/plugin. '
'\"ootb\" means the engines will be built without any plugins, '
'\"plugin\" means the engines will be built with tuned recipe of using plugins.'
'\"ootb-except-mha\" means the engines will be built with only attention plugins.'
))
default="dec",
choices=["dec", "enc", "enc-dec"],
help='Specify type of the model you want to benchmark. '
'Choose model between dec/enc/enc-dec.')
parser.add_argument('--batch_size',
type=str,
@ -69,13 +58,6 @@ def parse_arguments():
default='float16',
choices=['float16', 'bfloat16', 'float32'],
help='Choose data type between float16/bfloat16/float32.')
parser.add_argument(
'--refit',
default=False,
action="store_true",
help=
'If this option is specified, a refit flag is added to TensorRT engines.'
)
parser.add_argument('--num_beams',
type=int,
@ -100,14 +82,6 @@ def parse_arguments():
type=str,
default='model.cache',
help='The path to write timing cache')
parser.add_argument(
'--profiling_verbosity',
type=str,
default='layer_names_only',
choices=['layer_names_only', 'detailed', 'none'],
help=
'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.'
)
parser.add_argument(
'--log_level',
type=str,
@ -131,75 +105,14 @@ def parse_arguments():
default=60,
help='Minimal duration of iterations to measure in seconds.')
parser.add_argument(
'--output_dir',
type=str,
default=None,
help=
'If this option is specified, TensorRT engines will be saved to the specified path.'
)
parser.add_argument(
'--engine_dir',
type=str,
default=None,
required=True,
help=
('If this option is specified, instead of building engines on the fly before benchmarking, '
'the engines contained in the engine_dir will be used.'))
parser.add_argument(
'--max_beam_width',
type=int,
default=None,
help=
('If this option is specified, it will override the max beam width of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_input_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max input len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_encoder_input_len',
type=int,
default=None,
help=
('This argument is only for encoder-decoder models'
'If this option is specified, it will override the max encoder input len of TRT engines to the specified value instead of using pre-defined one'
'By default when this option is not used, it will use pre-defined max encoder input len'
))
parser.add_argument(
'--max_decoder_input_len',
type=int,
default=None,
help=
('This argument is only for encoder-decoder models'
'If this option is specified, it will override the max decoder input len of TRT engines to the specified value instead of using pre-defined one'
'By default when this option is not used, it will use pre-defined max decoder input len'
))
parser.add_argument(
'--max_seq_len',
'--max_decoder_seq_len',
dest='max_seq_len',
type=int,
default=None,
help=
('If this option is specified, it will override the max sequence len of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--max_batch_size',
type=int,
default=None,
help=
('If this option is specified, it will override the max batch size of '
'TRT engines to the specified value instead of using pre-defined one'))
parser.add_argument(
'--force_num_layer_1',
default=False,
action='store_true',
help=
'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
)
parser.add_argument(
'--gpu_weights_percent',
type=str,
@ -207,13 +120,6 @@ def parse_arguments():
help='Specify the percentage of weights that reside on GPU (from 0 to 1).'
'Multiple percentages can be separated by \";\", '
'example: \"0;0.5;1\".')
parser.add_argument(
'--multiple_profiles',
default=False,
action='store_true',
help=
'This option will benefit performance, but will increase the engine build time.'
)
parser.add_argument('--csv',
default=False,
@ -234,40 +140,7 @@ def parse_arguments():
'int8_sq_per_channel_ootb'
],
help="Optimize the model with specified quantization recipe")
parser.add_argument(
'--build_only',
default=False,
action='store_true',
help=
"Build engine only and skip inference, this can help to benchmark the build time on single gpu node for multi GPU model, where the inference is not possible"
)
parser.add_argument('--serial_build',
default=False,
action='store_true',
help="Build engines serially")
parser.add_argument(
'--rank',
type=int,
default=None,
help=
"The rank of the model to be built, only used when --build_only and --serial_build is specified"
)
parser.add_argument(
'--world_size',
type=int,
default=None,
help=
"The number of gpus to be used for inference, only used when --build_only and --serial_build is specified"
)
parser.add_argument(
'--debug_memory',
default=False,
action='store_true',
help=
"Check the estimated memory usage against the total GPU memory. Raise error if the estimated memory requirement is bigger than the total GPU memory"
"Warning: only GPT model family is supported for now")
parser.add_argument(
'--dump_profile',
default=False,
@ -281,25 +154,6 @@ def parse_arguments():
help=
"Print layer information of the engine to console (default = disabled)")
parser.add_argument(
'--opt_batch_size',
type=int,
default=None,
help=
"If opt_batch_size option is specified, it will override the opt batch size."
"This flag only takes effect when `--mode=ootb` is added. For other modes, please use --opt_num_tokens to replace it."
)
parser.add_argument(
'--opt_num_tokens',
type=int,
default=None,
help="It equals to max_batch_size*max_beam_width by default, set this "
"value as close as possible to the actual number of tokens on your workload. "
"Note that this argument might be removed in the future."
"This flag only takes effect when `--mode` is not `ootb`. For ootb mode, please use --opt_batch_size to replace it."
)
return parser.parse_args()
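The `--gpu_weights_percent` option above takes a semicolon-separated list of fractions; a minimal sketch of how such a string decomposes and when weight streaming kicks in, mirroring `args.weight_streaming` in `main()` below (the helper name is hypothetical):
```python
# Hypothetical helper: split "0;0.5;1" into floats; weight streaming is
# enabled when any requested fraction is below 1 (see main() below).
def parse_gpu_weights_percents(arg: str):
    percents = [float(p) for p in arg.split(";") if p]
    for p in percents:
        if not 0.0 <= p <= 1.0:
            raise ValueError(f"gpu_weights_percent must be in [0, 1], got {p}")
    return percents

percents = parse_gpu_weights_percents("0;0.5;1")
weight_streaming = any(p != 1 for p in percents)  # True for this example
```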
@ -308,7 +162,6 @@ def main(args):
# tensorrt_llm is imported, but mpi4py does not work well with
# the start method `spawn` of Python multiprocessing,
# so we set the start method first, then initialize MPI.
from allowed_configs import get_allowed_models
from benchmark_profiler import BenchmarkProfiler
from bert_benchmark import BERTBenchmark
from enc_dec_benchmark import EncDecBenchmark
@ -341,17 +194,8 @@ def main(args):
)
args.weight_streaming = any([p != 1 for p in gpu_weights_percents])
if args.serial_build and not args.build_only:
raise Exception(
f"--serial_build must be used with --build_only, always need to parallel build to do inference in the same process"
)
if args.build_only and args.serial_build and args.rank is not None and args.world_size is not None:
rank = args.rank
world_size = args.world_size
else:
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()
rank = tensorrt_llm.mpi_rank()
world_size = tensorrt_llm.mpi_world_size()
# TODO: Re-enable memory monitor for multi-gpu benchmarks.
# Current Mem Monitor will cause benchmark script hang
@ -361,30 +205,25 @@ def main(args):
from mem_monitor import MemoryMonitor
benchmark_profiler = None
if args.model in get_allowed_models(benchmark_type="gpt"):
if args.model == "dec":
benchmark_profiler = BenchmarkProfiler()
benchmarker = GPTBenchmark(args, batch_size_options, in_out_len_options,
gpu_weights_percents, rank, world_size)
elif args.model in get_allowed_models(benchmark_type="bert"):
elif args.model == "enc":
benchmarker = BERTBenchmark(args, batch_size_options, input_len_options,
gpu_weights_percents, rank, world_size)
elif args.model in get_allowed_models(benchmark_type="enc_dec"):
elif args.model == "enc-dec":
benchmarker = EncDecBenchmark(args, batch_size_options,
in_out_len_options, gpu_weights_percents,
rank, world_size)
else:
raise Exception(f'Unexpected model: {args.model}')
if args.build_only:
return
start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)
benchmarker.print_report_header(args.csv,
benchmark_profiler=benchmark_profiler)
for config in benchmarker.get_config():
if isinstance(benchmarker, GPTBenchmark):
benchmarker.check_memory(config, raise_exception=args.debug_memory)
try:
if args.weight_streaming:
# We pass in config instead of the gpu_weights_percent here to keep this benchmark script

View File

@ -18,9 +18,7 @@ import os
import torch
import tensorrt as trt
#isort: on
from allowed_configs import get_build_config
from base_benchmark import BaseBenchmark
from build import build_bert
import tensorrt_llm
from tensorrt_llm._utils import trt_dtype_to_torch
@ -32,36 +30,17 @@ class BERTBenchmark(BaseBenchmark):
def __init__(self, args, batch_sizes, in_lens, gpu_weights_percents, rank,
world_size):
super().__init__(args.engine_dir, args.model, args.dtype, rank,
world_size, args.serial_build)
world_size)
self.batch_sizes = batch_sizes
self.in_lens = in_lens
self.build_time = 0
self.mode = args.mode
self.gpu_weights_percents = gpu_weights_percents
if args.engine_dir is not None:
# Deserialize engine from engine directory
self.serialize_path = os.path.join(args.engine_dir,
self.engine_name)
with open(self.serialize_path, 'rb') as f:
engine_buffer = f.read()
else:
# Build engine
for key, value in get_build_config(args.model).items():
setattr(self, key, value)
if args.force_num_layer_1:
self.num_layers = 1
if args.max_batch_size is not None:
self.max_batch_size = args.max_batch_size
if args.max_input_len is not None:
self.max_input_len = args.max_input_len
engine_buffer, build_time = build_bert(args)
self.build_time = build_time
assert engine_buffer is not None
if args.build_only:
return
# Deserialize engine from engine directory
self.serialize_path = os.path.join(args.engine_dir, self.engine_name)
with open(self.serialize_path, 'rb') as f:
engine_buffer = f.read()
assert engine_buffer is not None
self.session = tensorrt_llm.runtime.Session.from_serialized_engine(
engine_buffer)

File diff suppressed because it is too large

View File

@ -18,14 +18,13 @@ import os
# isort: off
import torch
#isort: on
from allowed_configs import get_build_config
from base_benchmark import BaseBenchmark, get_engine_name
from build import build_enc_dec
from base_benchmark import BaseBenchmark
import tensorrt_llm
from tensorrt_llm._utils import (trt_dtype_to_torch, str_dtype_to_trt)
from tensorrt_llm.quantization import QuantMode
from tensorrt_llm.runtime.session import TensorInfo
from tensorrt_llm.runtime import ModelConfig
class EncDecBenchmark(BaseBenchmark):
@ -34,10 +33,8 @@ class EncDecBenchmark(BaseBenchmark):
rank, world_size):
self.engine_dir = args.engine_dir
self.model_name = args.model
self.mode = args.mode
self.enable_fp8 = False # hardcode for enc-dec models
self.dtype = args.dtype
self.output_dir = args.output_dir
self.runtime_rank = rank
self.world_size = world_size
self.csv_filename = "" # lazy init
@ -63,87 +60,93 @@ class EncDecBenchmark(BaseBenchmark):
"config.json")
with open(config_path, "r") as f:
config = json.load(f)
# Sanity checks
config_dtype = config["builder_config"]["precision"]
assert (
self.dtype == config_dtype
), f"Engine dtype ({config_dtype}) != Runtime dtype ({self.dtype})"
world_size = config["builder_config"]["tensor_parallel"]
assert (
world_size == self.world_size
), f"Engine world size ({world_size}) != Runtime world size ({self.world_size})"
tp_size = config["builder_config"]["tensor_parallel"]
# TP only for benchmarking
assert (
tp_size == self.world_size
), f"Engine tensor parallel size ({tp_size}) should be equal to world size ({self.world_size})"
assert (
config["plugin_config"]["remove_input_padding"] == False
), "remove_input_padding should be False for enc-dec benchmarks"
num_heads = config["builder_config"]["num_heads"]
builder_config = config['build_config']
plugin_config = builder_config['plugin_config']
pretrained_config = config['pretrained_config']
lora_config = builder_config['lora_config']
builder_config['auto_parallel_config']
use_gpt_attention_plugin = plugin_config["gpt_attention_plugin"]
remove_input_padding = plugin_config["remove_input_padding"]
use_lora_plugin = plugin_config["lora_plugin"]
tp_size = pretrained_config['mapping']['tp_size']
pp_size = pretrained_config['mapping']['pp_size']
world_size = tp_size * pp_size
assert world_size == tensorrt_llm.mpi_world_size(), \
f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
num_heads = pretrained_config["num_attention_heads"]
hidden_size = pretrained_config["hidden_size"]
head_size = pretrained_config["head_size"]
vocab_size = pretrained_config["vocab_size"]
max_batch_size = builder_config["max_batch_size"]
max_beam_width = builder_config["max_beam_width"]
num_layers = pretrained_config["num_hidden_layers"]
num_kv_heads = pretrained_config.get('num_kv_heads', num_heads)
assert (num_heads % tp_size) == 0
# Get model config
num_heads = num_heads // tp_size
hidden_size = config["builder_config"]["hidden_size"] // tp_size
num_kv_heads = config["builder_config"].get(
"num_kv_heads", config["builder_config"]["num_heads"])
hidden_size = hidden_size // tp_size
num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size
model_config = tensorrt_llm.runtime.ModelConfig(
cross_attention = pretrained_config[
"architecture"] == "DecoderModel"
skip_cross_qkv = pretrained_config.get('skip_cross_qkv', False)
has_position_embedding = pretrained_config[
"has_position_embedding"]
has_token_type_embedding = hasattr(pretrained_config,
"type_vocab_size")
dtype = pretrained_config["dtype"]
paged_kv_cache = plugin_config['paged_kv_cache']
tokens_per_block = plugin_config['tokens_per_block']
gather_context_logits = builder_config.get(
'gather_context_logits', False)
gather_generation_logits = builder_config.get(
'gather_generation_logits', False)
max_prompt_embedding_table_size = builder_config.get(
'max_prompt_embedding_table_size', 0)
self.max_batch_size = config["build_config"]["max_batch_size"]
self.max_input_len = config["build_config"][
"max_encoder_input_len"]
self.max_seq_len = config["build_config"]["max_seq_len"]
model_config = ModelConfig(
num_heads=num_heads,
num_kv_heads=num_kv_heads,
hidden_size=hidden_size,
head_size=config["builder_config"]["head_size"],
max_batch_size=config["builder_config"]["max_batch_size"],
max_beam_width=config["builder_config"]["max_beam_width"],
vocab_size=config["builder_config"]["vocab_size"],
num_layers=config["builder_config"]["num_layers"],
gpt_attention_plugin=config["plugin_config"]
["gpt_attention_plugin"],
remove_input_padding=config["plugin_config"]
["remove_input_padding"],
cross_attention=config["builder_config"]["cross_attention"],
skip_cross_qkv=config["builder_config"]["skip_cross_qkv"],
has_position_embedding=config["builder_config"]
["has_position_embedding"],
has_token_type_embedding=config["builder_config"]
["has_token_type_embedding"],
dtype=config_dtype,
head_size=head_size,
max_batch_size=max_batch_size,
max_beam_width=max_beam_width,
vocab_size=vocab_size,
num_layers=num_layers,
gpt_attention_plugin=use_gpt_attention_plugin,
remove_input_padding=remove_input_padding,
paged_kv_cache=paged_kv_cache,
tokens_per_block=tokens_per_block,
cross_attention=cross_attention,
has_position_embedding=has_position_embedding,
has_token_type_embedding=has_token_type_embedding,
dtype=dtype,
gather_context_logits=gather_context_logits,
gather_generation_logits=gather_generation_logits,
max_prompt_embedding_table_size=
max_prompt_embedding_table_size,
lora_plugin=use_lora_plugin,
lora_target_modules=lora_config.get('lora_target_modules'),
trtllm_modules_to_hf_modules=lora_config.get(
'trtllm_modules_to_hf_modules'),
skip_cross_qkv=skip_cross_qkv,
)
self.max_batch_size = config["builder_config"]["max_batch_size"]
self.max_input_len = config["builder_config"][
"max_encoder_input_len"]
self.max_seq_len = config["builder_config"]["max_seq_len"]
self.n_mels = config["builder_config"][
'n_mels'] if 'whisper' in self.model_name else 0
for key, value in config["builder_config"].items():
if key == "name":
engine_model_name = value
break
return engine_model_name, model_config
return model_config
(
self.encoder_engine_model_name,
self.encoder_model_config,
) = read_config("encoder")
(
self.decoder_engine_model_name,
self.decoder_model_config,
) = read_config("decoder")
self.encoder_model_config = read_config("encoder")
self.decoder_model_config = read_config("decoder")
self.encoder_engine_name = get_engine_name(
self.encoder_engine_model_name,
self.dtype,
self.world_size,
self.runtime_rank,
)
self.decoder_engine_name = get_engine_name(
self.decoder_engine_model_name,
self.dtype,
self.world_size,
self.runtime_rank,
)
self.encoder_engine_name = 'rank{}.engine'.format(self.runtime_rank)
self.decoder_engine_name = 'rank{}.engine'.format(self.runtime_rank)
self.encoder_runtime_mapping = tensorrt_llm.Mapping(
world_size=self.world_size,
rank=self.runtime_rank,
@ -155,47 +158,21 @@ class EncDecBenchmark(BaseBenchmark):
tp_size=self.world_size,
)
if not args.serial_build:
torch.cuda.set_device(self.runtime_rank %
self.encoder_runtime_mapping.gpus_per_node)
torch.cuda.set_device(self.runtime_rank %
self.encoder_runtime_mapping.gpus_per_node)
self.device = torch.cuda.current_device()
if self.engine_dir is not None:
# Deserialize engine from engine directory
self.encoder_serialize_path = os.path.join(self.engine_dir,
"encoder",
self.encoder_engine_name)
with open(self.encoder_serialize_path, "rb") as f:
encoder_engine_buffer = f.read()
self.decoder_serialize_path = os.path.join(self.engine_dir,
"decoder",
self.decoder_engine_name)
with open(self.decoder_serialize_path, "rb") as f:
decoder_engine_buffer = f.read()
else:
build_config = get_build_config(self.model_name)
self.max_batch_size = build_config['max_batch_size'] \
if args.max_batch_size is None else args.max_batch_size
self.max_input_len = build_config['max_encoder_input_len'] \
if args.max_input_len is None else args.max_input_len
self.max_seq_len = build_config['max_seq_len'] \
if args.max_seq_len is None else args.max_seq_len
self.n_mels = build_config[
'n_mels'] if 'whisper' in self.model_name else 0
# Build engine
(
encoder_engine_buffer,
decoder_engine_buffer,
self.encoder_model_config,
self.decoder_model_config,
encoder_build_time,
decoder_build_time,
) = build_enc_dec(args)
self.build_time = encoder_build_time + decoder_build_time
assert encoder_engine_buffer is not None
assert decoder_engine_buffer is not None
# Deserialize engine from engine directory
self.encoder_serialize_path = os.path.join(self.engine_dir, "encoder",
self.encoder_engine_name)
with open(self.encoder_serialize_path, "rb") as f:
encoder_engine_buffer = f.read()
assert encoder_engine_buffer is not None
self.decoder_serialize_path = os.path.join(self.engine_dir, "decoder",
self.decoder_engine_name)
with open(self.decoder_serialize_path, "rb") as f:
decoder_engine_buffer = f.read()
assert decoder_engine_buffer is not None
# session setup
self.encoder_session = tensorrt_llm.runtime.Session.from_serialized_engine(
@ -216,11 +193,10 @@ class EncDecBenchmark(BaseBenchmark):
f"[WARNING] whisper benchmark is input_len=1500, no text prompt, output_len=arbitrary"
)
for inlen, outlen in self.in_out_lens:
if (inlen > self.max_input_len
or inlen + outlen > self.max_seq_len):
if (inlen > self.max_input_len or outlen > self.max_seq_len):
print(
f"[WARNING] check inlen({inlen}) <= max_inlen({self.max_input_len}) and "
f"inlen({inlen}) + outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping."
f"outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping."
)
continue
for batch_size in self.batch_sizes:

View File

@ -13,8 +13,6 @@
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import os
from dataclasses import asdict
from math import ceil
import pandas as pd
@ -22,11 +20,11 @@ import tensorrt as trt
import torch
import tensorrt_llm
from tensorrt_llm.profiler import bytes_to_target_unit
from tensorrt_llm.builder import Engine
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
SamplingConfig)
from allowed_configs import get_build_config, BuildConfig # isort:skip
from base_benchmark import BaseBenchmark # isort:skip
from build import build_gpt, get_quant_config # isort:skip
def element_size(dtype: str):
@ -46,80 +44,26 @@ class GPTBenchmark(BaseBenchmark):
def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
rank, world_size):
super().__init__(args.engine_dir, args.model, args.dtype, rank,
world_size, args.serial_build)
world_size)
self.batch_sizes = batch_sizes
self.in_out_lens = in_out_lens
self.gpu_weights_percents = gpu_weights_percents
self.num_beams = args.num_beams
self.mode = args.mode
self.build_time = 0
self.cuda_graph_mode = args.enable_cuda_graph
self.build_config = None
# this dtype may be modified based on quantization mode later, when the fp8/int8 kv cache is used
self.kv_dtype = args.dtype
# approximate the weights size in the engine by using engine size
# the actual weights size shall be smaller because there are some other data in the engine file.
# for large model, this approximate is close enough.
self.weights_size_approx = 0
self.dump_layer_info = args.dump_layer_info
# change profiling_verbosity to detailed when enabling dump layer info
if self.dump_layer_info:
args.profiling_verbosity = "detailed"
if args.engine_dir is not None:
# Get build configs from engine directory is done in base class
# Deserialize engine from engine directory
self.serialize_path = os.path.join(args.engine_dir,
self.engine_name)
with open(self.serialize_path, 'rb') as f:
engine_buffer = f.read()
self.weights_size_approx = len(engine_buffer)
else:
self.build_config = get_build_config(args.model, return_dict=False)
for key, value in asdict(self.build_config).items():
setattr(self, key, value)
if args.force_num_layer_1:
self.num_layers = 1
if args.max_batch_size is not None:
self.max_batch_size = args.max_batch_size
if args.max_input_len is not None:
self.max_input_len = args.max_input_len
if args.max_seq_len is not None:
self.max_seq_len = args.max_seq_len
self.quant_config = get_quant_config(args.quantization)
self.quant_mode = self.quant_config.quant_mode
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
if self.quant_mode.has_fp8_kv_cache():
self.kv_dtype = 'fp8'
if self.quant_mode.has_int8_kv_cache():
self.kv_dtype = 'int8'
# Plugins
self.use_gpt_attention_plugin = False
self.remove_input_padding = False
self.use_mamba_conv1d_plugin = False
if args.mode == 'plugin':
self.use_gpt_attention_plugin = True
self.remove_input_padding = True
self.use_moe_plugin = True
self.use_mamba_conv1d_plugin = True
elif args.mode == 'ootb-except-mha':
self.use_gpt_attention_plugin = True
self.remove_input_padding = True
engine_buffer, build_time = build_gpt(args)
self.weights_size_approx = engine_buffer.nbytes
self.build_time = build_time
# Get build configs from engine directory is done in base class
# Deserialize engine from engine directory
engine = Engine.from_dir(args.engine_dir, rank)
engine_buffer = engine.engine
assert engine_buffer is not None
if args.build_only:
return
pretrained_config = engine.config.pretrained_config
if pretrained_config.architecture == 'ChatGLMForCausalLM' and pretrained_config.chatglm_version in [
'glm', 'chatglm'
]:
session_cls = ChatGLMGenerationSession
else:
session_cls = GenerationSession
if not hasattr(self, 'num_kv_heads') or self.num_kv_heads is None:
self.num_kv_heads = self.num_heads
@ -155,50 +99,11 @@ class GPTBenchmark(BaseBenchmark):
gpu_weights_percent=list(sorted(gpu_weights_percents))[0],
**rnn_configs_kwargs,
)
if args.model == 'chatglm_6b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=130005,
pad_id=3,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
elif args.model in ['chatglm2_6b', 'chatglm3_6b']:
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=2,
pad_id=0,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config, engine_buffer, self.runtime_mapping)
if args.model == 'glm_10b':
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=50258,
pad_id=50256,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
model_config, engine_buffer, self.runtime_mapping)
else:
end_id = 50256
pad_id = 50256
if "llama" in args.model:
end_id = 2
pad_id = 0
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
end_id=end_id,
pad_id=pad_id,
num_beams=self.num_beams,
top_k=args.top_k,
top_p=args.top_p)
self.decoder = tensorrt_llm.runtime.GenerationSession(
model_config,
engine_buffer,
self.runtime_mapping,
cuda_graph_mode=self.cuda_graph_mode)
self.sampling_config = SamplingConfig(end_id=2, pad_id=0)
self.decoder = session_cls(model_config,
engine_buffer,
self.runtime_mapping,
cuda_graph_mode=self.cuda_graph_mode)
# Print context memory size for CI/CD to track.
context_mem_size = self.decoder.context_mem_size
@ -260,72 +165,6 @@ class GPTBenchmark(BaseBenchmark):
benchmark_profiler=benchmark_profiler)
torch.cuda.synchronize()
@staticmethod
def kv_cache_elem_per_token(config: BuildConfig, tp_size, pp_size) -> int:
# you need to multiply the size by element size, and multiply by the seq length
# Warning: this function returns the upper bound between different ranks when any one of the following is true:
# num_layer % pp_size !=0, hidden_size % num_kv_heads != 0, num_kv_heads % tp_size != 0
local_nlayers = ceil(config.num_layers / pp_size)
kv_heads = config.num_kv_heads if config.num_kv_heads is not None else config.num_heads
size_per_head = ceil(config.hidden_size / kv_heads)
local_heads = ceil(kv_heads / tp_size)
return 2 * local_nlayers * size_per_head * local_heads
def check_memory(self, io_shapes: list, raise_exception=False):
'''Compare the estimated GPU memory requirements for weights + activations + kv cache with the total GPU memory and log it.
Raise exception when the \p raise_exception parameter is true.
'''
# we don't want to block the test due to this
if self.build_config is None:
tensorrt_llm.logger.warning(
"Didn't have the build config object, skipping check the memory"
)
return
assert isinstance(self.build_config, BuildConfig)
batch_size, inlen, outlen = io_shapes[0], io_shapes[1], io_shapes[2]
kv_cache_size_in_bytes = batch_size*self.num_beams*(inlen + outlen)* \
self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
# when MHA is OOTB, it requires extra KV cache size, because OOTB don't support inplace updating KV cache.
if not self.use_gpt_attention_plugin:
local_n_layer = ceil(self.build_config.num_layers /
self.runtime_mapping.pp_size)
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
local_n_layer + 1)
kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
"MiB")
activation_size_in_mb = bytes_to_target_unit(
self.decoder.runtime.engine.device_memory_size, "MiB")
weights_size_in_mb = bytes_to_target_unit(self.weights_size_approx,
"MiB")
total_memory_approx_in_mb = kv_cache_size_in_mb + activation_size_in_mb + weights_size_in_mb
_, _, total = tensorrt_llm.profiler.device_memory_info()
total_in_mb = bytes_to_target_unit(total, 'MiB')
prefix = "[Memory Estimation]"
mem_msg = f"{prefix} activation memory:{activation_size_in_mb:.3f} MiB, kv_cache:{kv_cache_size_in_mb:.3f} MiB, weights approximate:{weights_size_in_mb:.3f} MiB, " \
f"approximate required GPU memory: {total_memory_approx_in_mb:.3f} MiB, total GPU memory: {total_in_mb:.3f} MiB"
tensorrt_llm.logger.info(mem_msg)
build_args = dict(batch_size=batch_size,
num_beams=self.num_beams,
input_length=inlen,
output_length=outlen,
max_batch_size=self.build_config.max_batch_size,
max_input_len=self.build_config.max_input_len,
max_seq_len=self.build_config.max_seq_len,
max_beam_width=self.build_config.max_beam_width)
for k, v in build_args.items():
tensorrt_llm.logger.info(f"{prefix} {k}:{v}")
tensorrt_llm.logger.info(
"grep the \"Total Activation\" and \"Total Weights\" from verbose TRT engine build log to see the precise memory size for those."
)
if raise_exception and total_memory_approx_in_mb >= total_in_mb:
raise Exception(
"Total memory estimation bigger than total gpu memory, the case will likely to OOM, needs enhancement of waive the test case, see logs about the memory usage details"
)
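For context, the per-token estimate in the removed `check_memory` path above works out as follows; a worked sketch with illustrative LLaMA-7B-like values (32 layers, hidden size 4096, 32 KV heads, float16), not numbers taken from this commit:
```python
from math import ceil

# Worked sketch of the removed KV-cache estimate; all model numbers below are
# illustrative (LLaMA-7B-like), not measured values from this commit.
def kv_cache_elem_per_token(num_layers, hidden_size, num_kv_heads, tp_size, pp_size):
    local_nlayers = ceil(num_layers / pp_size)
    size_per_head = ceil(hidden_size / num_kv_heads)
    local_heads = ceil(num_kv_heads / tp_size)
    return 2 * local_nlayers * size_per_head * local_heads  # K and V planes

elems = kv_cache_elem_per_token(32, 4096, 32, tp_size=1, pp_size=1)
bytes_per_token = elems * 2                        # float16 -> 2 bytes per element
kv_bytes = 8 * 1 * (128 + 20) * bytes_per_token    # batch * beams * (inlen + outlen)
print(f"{kv_bytes / 2**20:.0f} MiB")               # ~592 MiB
```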
def report(self,
config,
latency,
@ -348,7 +187,6 @@ class GPTBenchmark(BaseBenchmark):
report_dict["input_length"] = inlen
report_dict["output_length"] = outlen
report_dict["latency(ms)"] = latency
report_dict["build_time(s)"] = self.build_time
report_dict["tokens_per_sec"] = tokens_per_sec
report_dict["percentile95(ms)"] = percentile95
report_dict["percentile99(ms)"] = percentile99

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3e25541cdc2aaa48f6a6e4c386d22ca1832c8e120fc6e8c190db4ee066ebfb1f
size 4293186
oid sha256:7eec52cb658f033cf3146017cbaa3ea1554942ee7ece49329ddf7b01361fa080
size 4293100

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3108cd0580f6328bd46238ef708872d9d8030a9c8645b8b52bc750dfe094bc16
size 4395794
oid sha256:cf65778d6469a5a85bf2191fb104094aa4e606b370a25475a16017329e27fd95
size 4395148

View File

@ -1,3 +1,3 @@
50a839e98b31729198870fc99ef2c5a9 libtensorrt_llm_batch_manager_static.a
a39a5bf618c8514725b59aac4513223f libtensorrt_llm_batch_manager_static.pre_cxx11.a
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
08d59f31da00044ae21995c6573a55da libtensorrt_llm_batch_manager_static.a
abdb9b58e0a4587d2d2ce6bc83655f8a libtensorrt_llm_batch_manager_static.pre_cxx11.a
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9600435f1b9ab74c752d1831e1a6684a004927c84ab7c61fc076dbc128ca1521
size 4154674
oid sha256:e339bca2212b46c6227b328fc376db4628a0a96636b5f2b5b3ae387e884b7f01
size 4155892

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:8145ecf59dea64448ca0969553d32bc99e119cc5fc703e7b47eccfb5886594a0
size 4133178
oid sha256:7503446c4ef7b959970fc02b33ca81dd0dece0663d9a0f8b881c60ff66006000
size 4136818

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f89f551a880f4c6c1e68ed72b951ac482dec6033e55a336a0ecc401f4e9cf150
size 24009160
oid sha256:51174b20ed939662c92d21cdd5a0fd652a6592947270182ff026eb3a4153e4cf
size 24015602

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33f259b374a02456f2b8d44571d92195b708c2011be4ecabe46267f49ca24c29
size 1426724
oid sha256:19fdeb78169c29492026b62bf147481e2b0d893916d9a20333d83fb61c0abe36
size 1428026

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f44786aee0842bdb260de49b734d2119a0521c650f0b733f5ce6f997e72bfb34
size 1452984
oid sha256:1d7f36c49f24730e4038c2252b966870789d9c9cff698ccd50d0f61ae85fcc9d
size 1455538

View File

@ -1,3 +1,3 @@
0d5e559ebc885794ab9e63086ae7a18a libtensorrt_llm_executor_static.a
f9a3d1bf32f33f88569d4d8635e5445a libtensorrt_llm_executor_static.pre_cxx11.a
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
5bdad7b823b79b1b91439693aa25cff5 libtensorrt_llm_executor_static.a
566734842bb731319971850583fdc9c7 libtensorrt_llm_executor_static.pre_cxx11.a
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:19bd908d16990cd11a295fcb71403e2ad285dc2c3b84d55228166d9240acd0d9
size 1476318
oid sha256:58e3e6d7414ab730ba54c8aabdc5f193787b44699e1289279428087cbb2e46d4
size 1478178

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:bed0b93d23eef43ce46c01e694f9e578c64fe9b30e1b05d65b7feed1a41e5148
size 1408208
oid sha256:5f6598d6c2dafd9b97edfeb8fc424607374e8791c4e334cfaaf5cae865da15c6
size 1410466

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:473c672353cb813af9ea65250bd79f61f5ea27c369c9f35bc3bace1e22c5e9bb
size 14325956
oid sha256:93e0c81a8d00db0e860cdfdafbae7391e0d2956c2301da1f22ef6419bcb4e02f
size 14321264

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:286c47b52c5955ef4d2b5bd54cf555f6bafdb307a413949e1edafe4db991c887
oid sha256:df3429c2cc6bffe3e3d12fc444426427676a85e281cab4456e5d0a03e4a6828f
size 80318200

View File

@ -1,2 +1,2 @@
28ead889239ca8d558c1e1a93f0485b0 libtensorrt_llm_nvrtc_wrapper.so
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
957f7c6034dca28dff7afe65ed68aa4b libtensorrt_llm_nvrtc_wrapper.so
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f396ee533b289e7326df9061be8abba46ae061a61011c60c19051cbe219461e3
oid sha256:829e6d2ccaed3c0e8ff351a6c418c65a9260433eff6f08feb41b3bab33d84fb4
size 83552896

View File

@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:20824706210bf184641c92fcb728ab0a3a74a36bc0b13e243c713a84c74a51ac
size 1089536
oid sha256:73ea01f6014e5c11a263f342f8c19f3a1b8bfa824441accd3cb4b7fa699a9d9a
size 1087488

View File

@ -98,6 +98,8 @@ void PenaltyLayer<T>::allocateBuffer()
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
mLogitsPtrsHost = mBufferManager->pinnedPool(ITensor::makeShape({}), TRTDataType<T*>::value);
mLogitsPtrsDevice
= mBufferManager->gpu(ITensor::makeShape({mDecoderDomain.getBatchSize()}), TRTDataType<T*>::value);
auto const batchSizeShape = ITensor::makeShape({mDecoderDomain.getBatchSize()});
mTemperature = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<float>::value);
mRepetitionPenalty = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<float>::value);
@ -233,6 +235,7 @@ void PenaltyLayer<T>::forwardAsync(
mCyclicStep = mCyclicStep % mRuntimeMaxSeqLen;
TensorPtr logitsPtrsHost = ITensor::slice(mLogitsPtrsHost, mCyclicStep, 1);
logitsPtrsHost->squeeze(0);
auto logitsPtrsHostData = bufferCast<T*>(*logitsPtrsHost);
for (SizeType32 bi = 0; bi < localDecoderDomain.getBatchSize(); bi++)
{
@ -274,7 +277,13 @@ void PenaltyLayer<T>::forwardAsync(
auto const tokensPerStep = bufferCastOrNull<SizeType32>(params->curTokensPerStep);
InvokeBatchApplyPenaltyParams<T> penaltyParams;
penaltyParams.inputLogits = reinterpret_cast<T const* const*>(logitsPtrsHostData);
{ // Moving the logits ptrs to device for faster access during kernel execution.
TensorPtr logitsPtrsDeviceSlice = ITensor::slice(mLogitsPtrsDevice, 0, localDecoderDomain.getBatchSize());
TensorPtr logitsPtrsHostSlice = ITensor::slice(logitsPtrsHost, 0, localDecoderDomain.getBatchSize());
mBufferManager->copy(*logitsPtrsHostSlice, *logitsPtrsDeviceSlice);
penaltyParams.inputLogits = reinterpret_cast<T const* const*>(bufferCast<T const*>(*logitsPtrsDeviceSlice));
}
penaltyParams.outputLogits = bufferCast<T>(*mRuntimeLogitsDevice);
penaltyParams.biases = embeddingBias;
penaltyParams.penaltyWorkspace = bufferCastOrNull<TokenIdType>(mPenaltyWorkspaceDevice);

View File

@ -91,6 +91,7 @@ private:
BufferPtr mPenaltyWorkspaceDevice;
BufferPtr mPenaltyWorkspacePrevDevice;
TensorPtr mLogitsPtrsHost;
TensorPtr mLogitsPtrsDevice;
};
} // namespace tensorrt_llm::layers

View File

@ -75,7 +75,7 @@ void _runGemm(int const M, int const N, int const K, bool const transA, bool con
LoraPlugin::LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, int transA, int transB,
int num_lora_modules, nvinfer1::DataType type, LoraPlugin::PluginProfilerPtr const& pluginProfiler,
bool remove_input_padding, int max_context_length, int max_low_rank, int weight_index)
bool remove_input_padding, int max_num_tokens, int max_low_rank, int weight_index)
: mInHiddenSize(in_hidden_size)
, mTransA(transA)
, mTransB(transB)
@ -83,7 +83,7 @@ LoraPlugin::LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, in
, mType(type)
, mPluginProfiler(pluginProfiler)
, mRemoveInputPadding(remove_input_padding)
, mMaxContextLength(max_context_length)
, mMaxNumTokens(max_num_tokens)
, mMaxLowRank(max_low_rank)
, mWeightIndex(weight_index)
{
@ -105,7 +105,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil
read(d, mNumLoraModules);
read(d, mType);
read(d, mRemoveInputPadding);
read(d, mMaxContextLength);
read(d, mMaxNumTokens);
read(d, mMaxLowRank);
read(d, mWeightIndex);
mOutHiddenSizes.resize(mNumLoraModules);
@ -266,10 +266,9 @@ void LoraPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, in
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
}
int64_t getLowRankWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t typeSize)
int64_t getLowRankWorkSpaceSize(int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t typeSize)
{
return divUp(nbReq * maxContextLength * maxLoraModuleNum * maxLowRank * typeSize, 16) * 16;
return divUp(maxNumTokens * maxLoraModuleNum * maxLowRank * typeSize, 16) * 16;
}
int64_t getGroupedGemmParamsWorkSpaceSize(int64_t nbReq)
@ -278,16 +277,15 @@ int64_t getGroupedGemmParamsWorkSpaceSize(int64_t nbReq)
}
int64_t getSplitkGroupedGemmWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
{
return divUp(nbReq * maxContextLength * maxLoraModuleNum * maxLowRank * sizeof(float) * splitKSlices, 16) * 16;
return divUp(maxNumTokens * maxLoraModuleNum * maxLowRank * sizeof(float) * splitKSlices, 16) * 16;
}
int64_t getGemmWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
int64_t getGemmWorkSpaceSize(int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
{
return std::max((int64_t) CUBLAS_WORKSPACE_SIZE,
getSplitkGroupedGemmWorkSpaceSize(nbReq, maxContextLength, maxLoraModuleNum, maxLowRank, splitKSlices));
getSplitkGroupedGemmWorkSpaceSize(maxNumTokens, maxLoraModuleNum, maxLowRank, splitKSlices));
}
size_t LoraPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs,
@ -298,8 +296,8 @@ size_t LoraPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, in
auto const type = inputs[getInputTensorIdx()].type;
auto const typeSize = tensorrt_llm::runtime::BufferDataType(type).getSize();
return (size_t) getGemmWorkSpaceSize(nbReq, mMaxContextLength, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(nbReq, mMaxContextLength, mNumLoraModules, mMaxLowRank, typeSize)
return (size_t) getGemmWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, typeSize)
+ getGroupedGemmParamsWorkSpaceSize(nbReq * mNumLoraModules);
}
@ -361,13 +359,12 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
= mRemoveInputPadding ? static_cast<int32_t const*>(inputs[getHostContextLengthsIdx()]) : nullptr;
RequestType const* reqTypes = static_cast<RequestType const*>(inputs[getHostRequestTypesIdx()]);
int64_t GemmWorkSpaceSize
= getGemmWorkSpaceSize(batch_size, mMaxContextLength, mNumLoraModules, mMaxLowRank, mSplitKSlices);
int64_t GemmWorkSpaceSize = getGemmWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices);
int64_t groupGemmParamsWorkSpaceSize = getGroupedGemmParamsWorkSpaceSize(batch_size * mNumLoraModules);
void* gemmWorkSpace = workspace; // [gemmWorkSpace, lowrankWorkSpace, groupGemmParamsWorkSpace]
void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + GemmWorkSpaceSize;
void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace)
+ getLowRankWorkSpaceSize(batch_size, mMaxContextLength, mNumLoraModules, mMaxLowRank, typeSize);
+ getLowRankWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, typeSize);
bool isWithLora = isEnableLora(batch_size, mNumLoraModules, &inputs[getLoraRanksIdx()]);
@ -514,21 +511,15 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
ptrB.push_back(
reinterpret_cast<void*>(lora_weights_ptr[batchIdx * 2] + K * N * typeSize * mWeightIndex));
ptrC.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
+ handled_token_num * mMaxLowRank)
* typeSize));
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
ptrD.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
+ handled_token_num * mMaxLowRank)
* typeSize));
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
auto const N2 = outputDesc[loraModuleIdx].dims.d[nbDimsA - 1];
cutlass::gemm::GemmCoord problem_2(M, N2, N);
problem_sizes_2.push_back(problem_2);
ptrA_2.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
+ handled_token_num * mMaxLowRank)
* typeSize));
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
ptrB_2.push_back(
reinterpret_cast<void*>(lora_weights_ptr[batchIdx * 2 + 1] + N2 * N * typeSize * mWeightIndex));
ptrC_2.push_back(static_cast<void*>(
@ -603,7 +594,7 @@ size_t LoraPlugin::getSerializationSize() const noexcept
{
TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
return sizeof(mInHiddenSize) + sizeof(mTransA) + sizeof(mTransB) + sizeof(mNumLoraModules) + sizeof(mType)
+ mPluginProfiler->getSerializationSize(mGemmId) + sizeof(mRemoveInputPadding) + sizeof(mMaxContextLength)
+ mPluginProfiler->getSerializationSize(mGemmId) + sizeof(mRemoveInputPadding) + sizeof(mMaxNumTokens)
+ sizeof(mMaxLowRank) + sizeof(mWeightIndex) + sizeof(int) * mNumLoraModules; // selected tactics container size
}
@ -617,7 +608,7 @@ void LoraPlugin::serialize(void* buffer) const noexcept
write(d, mNumLoraModules);
write(d, mType);
write(d, mRemoveInputPadding);
write(d, mMaxContextLength);
write(d, mMaxNumTokens);
write(d, mMaxLowRank);
write(d, mWeightIndex);
for (int i = 0; i < mNumLoraModules; i++)
@ -674,7 +665,7 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
int num_lora_modules;
int in_hidden_size, transA, transB;
bool remove_input_padding;
int max_context_length;
int max_num_tokens;
int max_low_rank;
int weight_index;
// Read configurations from each fields
@ -706,10 +697,10 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
TLLM_CHECK(fields[i].type == PluginFieldType::kINT8);
remove_input_padding = static_cast<bool>(*(static_cast<int8_t const*>(fields[i].data)));
}
else if (!strcmp(attrName, "max_context_length"))
else if (!strcmp(attrName, "max_num_tokens"))
{
TLLM_CHECK(fields[i].type == PluginFieldType::kINT32);
max_context_length = *(static_cast<int const*>(fields[i].data));
max_num_tokens = *(static_cast<int const*>(fields[i].data));
}
else if (!strcmp(attrName, "max_low_rank"))
{
@ -748,7 +739,7 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
// FIXME enable tactic profiler
auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false, /* skip */ true);
auto* obj = new LoraPlugin(in_hidden_size, out_hidden_sizes, transA, transB, num_lora_modules, type,
pluginProfiler, remove_input_padding, max_context_length, max_low_rank, weight_index);
pluginProfiler, remove_input_padding, max_num_tokens, max_low_rank, weight_index);
obj->setPluginNamespace(mNamespace.c_str());
return obj;
}
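For reference, the reworked low-rank workspace sizing above can be reproduced with a short sketch; it is now keyed on `max_num_tokens` rather than `nbReq * max_context_length` (the example values are illustrative assumptions only):
```python
# Sketch of getLowRankWorkSpaceSize after this change; sizes are rounded up to
# a 16-byte multiple. The example values below are assumptions for illustration.
def div_up(x: int, y: int) -> int:
    return (x + y - 1) // y

def low_rank_workspace_bytes(max_num_tokens, max_lora_module_num, max_low_rank, type_size):
    return div_up(max_num_tokens * max_lora_module_num * max_low_rank * type_size, 16) * 16

# e.g. 8192 tokens, 7 LoRA modules, rank 64, fp16 weights (2 bytes):
print(low_rank_workspace_bytes(8192, 7, 64, 2))  # 7340032 bytes (~7 MiB)
```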

View File

@ -39,7 +39,7 @@ public:
LoraPlugin() = delete;
LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, int transA, int transB, int num_lora_modules,
nvinfer1::DataType type, PluginProfilerPtr const& profiler, bool remove_input_padding, int max_context_length,
nvinfer1::DataType type, PluginProfilerPtr const& profiler, bool remove_input_padding, int max_num_tokens,
int max_low_rank, int weight_index);
LoraPlugin(void const* data, size_t length, PluginProfilerPtr const& profiler);
@ -121,7 +121,7 @@ private:
int mTransB;
nvinfer1::DataType mType;
bool mRemoveInputPadding;
int mMaxContextLength;
int mMaxNumTokens;
int mMaxLowRank;
int mNumLoraModules;
int mWeightIndex;

View File

@ -522,12 +522,6 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
timeout=timeout) # expecting ~ 1200s
cpp_blocking_env = copy.copy(cpp_env)
cpp_blocking_env["CUDA_LAUNCH_BLOCKING"] = '1'
run_command(trt_model_test,
cwd=tests_dir,
env=cpp_blocking_env,
timeout=timeout) # expecting ~ 1200s
#Executor test in leader mode
new_env = copy.copy(cpp_env)

View File

@ -41,20 +41,14 @@ python3 examples/summarize.py \
We can also benchmark the efficiency of Weight Streaming. Here is an example:
```bash
python3 benchmarks/python/benchmark.py \
-m opt_30b \
--mode ootb \
--engine_dir /tmp/llama_7b/trt_engines/fp16/1-gpu/ \
--batch_size "1;32" \
--max_batch_size "32" \
--input_output_len "256,32" \
--max_input_len 256 \
--max_seq_len 288 \
--gpu_weights_percent "0.0;0.3;0.6;1.0" \
--dtype float16 \
--csv \
--log_level verbose
```
Here we use `ootb` mode so that the GEMM operators won't use plugins. `ootb-except-mha` mode is also valid.
### API Changes

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
protobuf

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -241,17 +241,19 @@ In `benchmarks/python/`:
```bash
# Example 1: Single-GPU benchmark
python benchmark.py \
-m t5_small \
-m enc-dec \
--batch_size "1;8" \
--input_output_len "60,20;128,20" \
--engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
--dtype float32 \
--csv # optional
# Example 2: Multi-GPU benchmark
mpirun --allow-run-as-root -np 4 python benchmark.py \
-m t5_small \
-m enc-dec \
--batch_size "1;8" \
--input_output_len "60,20;128,20" \
--engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
--dtype float32 \
--csv # optional
```

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

View File

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

File diff suppressed because it is too large

View File

@ -22,10 +22,10 @@ from pathlib import Path
import numpy as np
import torch
import yaml
from convert_checkpoint import cpu_map_location, unpack_nemo_ckpt
from tensorrt_llm._utils import str_dtype_to_torch, to_json_file, torch_to_numpy
from tensorrt_llm.lora_manager import LoraManager, get_all_nemo_lora_weights
from tensorrt_llm.models.gpt.convert import cpu_map_location, unpack_nemo_ckpt
log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
logging.basicConfig(format=log_format)

View File

@ -22,9 +22,9 @@ from pathlib import Path
import numpy as np
import torch
import yaml
from convert_checkpoint import cpu_map_location, unpack_nemo_ckpt
from tensorrt_llm._utils import torch_to_numpy
from tensorrt_llm.models.gpt.convert import cpu_map_location, unpack_nemo_ckpt
log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
logging.basicConfig(format=log_format)

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,17 +1,15 @@
import argparse
import json
import os
import time
import traceback
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import Dict, Optional, Tuple
import safetensors
import torch
from transformers import AutoModelForCausalLM, GPTJConfig, GPTJForCausalLM
from transformers import AutoModelForCausalLM
import tensorrt_llm
from tensorrt_llm.hlapi import QuantConfig
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTJConfig, GPTJForCausalLM
from tensorrt_llm.quantization import QuantAlgo
@ -68,298 +66,44 @@ def parse_arguments():
return args
def load_gptj_config(model_dir: str) -> GPTJConfig:
""" Helper utility to load GPTJConfig.
A pretrained checkpoint from modeling_RW.py has a different structure
and is not compatible with `transformers.GPTJConfig` and
`transformers.GPTJModel`. We need to manually set the config values.
"""
config = GPTJConfig.from_pretrained(model_dir)
return config
def split(weight: torch.Tensor,
tp_size: int,
rank: int = 0,
dim: int = 0) -> torch.Tensor:
if tp_size == 1:
return weight
elif weight.ndim == 1:
return torch.chunk(weight, tp_size)[rank].contiguous()
else:
return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()
def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
dim: int) -> torch.Tensor:
return split(weight, tp_size, rank, dim=dim)
def get_weight(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.weight' not in params:
return None
return params[f'{prefix}.weight'].to(dtype).detach().cpu()
def get_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.bias' not in params:
return None
return params[f'{prefix}.bias'].to(dtype).detach().cpu()
def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> Tuple[torch.Tensor]:
return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)
def get_tllm_linear_weight(
weight: torch.Tensor,
prefix: str,
bias: Optional[torch.Tensor] = None,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if use_weight_only:
v = weight.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[f'{prefix}.weight'] = processed_torch_weights
results[f'{prefix}.per_channel_scale'] = torch_weight_scales
else:
results[f'{prefix}.weight'] = weight.contiguous()
if bias is not None:
results[f'{prefix}.bias'] = bias
return results
def get_tllm_param(
param: torch.Tensor,
name: str,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if name.endswith('.weight') and use_weight_only:
v = param.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[name] = processed_torch_weights
results[name.replace('weight',
'per_channel_scale')] = torch_weight_scales
else:
results[name] = param
return results
def convert_hf_gptj(hf_model: GPTJForCausalLM,
hf_config: GPTJConfig,
mapping: Mapping,
dtype: str = 'float32',
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8):
weights = {}
tik = time.time()
model_params = dict(hf_model.named_parameters())
dtype = getattr(torch, dtype)
num_hidden_layers = hf_config.num_hidden_layers
layers_range = mapping.pp_layers(num_hidden_layers)
for l in layers_range:
prefix = f'transformer.h.{l}'
tllm_prex = f'transformer.layers.{l-layers_range[0]}'
# Attention QKV (no bias)
q_weight = get_weight(model_params, f'{prefix}.attn.q_proj', dtype)
k_weight = get_weight(model_params, f'{prefix}.attn.k_proj', dtype)
v_weight = get_weight(model_params, f'{prefix}.attn.v_proj', dtype)
q_w = split_matrix(q_weight, mapping.tp_size, mapping.tp_rank, dim=0)
k_w = split_matrix(k_weight, mapping.tp_size, mapping.tp_rank, dim=0)
v_w = split_matrix(v_weight, mapping.tp_size, mapping.tp_rank, dim=0)
qkv_w = torch.concatenate([q_w, k_w, v_w], dim=0)
weights.update(
get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', None,
use_weight_only,
plugin_weight_only_quant_type))
# Attention dense (not bias)
attn_dense_weight = get_weight(model_params, f'{prefix}.attn.out_proj',
dtype)
attn_dense_w = split_matrix(attn_dense_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_tllm_linear_weight(attn_dense_w, f'{tllm_prex}.attention.dense',
None, use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_in (with bias)
mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_in', dtype)
mlp_fc_w = split_matrix(mlp_fc_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
mlp_fc_b = split_matrix(mlp_fc_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc', mlp_fc_b,
use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_out (with bias)
mlp_proj_weight, mlp_proj_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_out', dtype)
mlp_proj_w = split_matrix(mlp_proj_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
# Only rank0 will get bias
if mapping.tp_size > 1 and mapping.tp_rank > 0:
mlp_proj_bias = torch.zeros(mlp_proj_weight.shape[0],
dtype=mlp_proj_weight.dtype)
weights.update(
get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj',
mlp_proj_bias, use_weight_only,
plugin_weight_only_quant_type))
input_ln_weight, input_ln_bias = get_weight_and_bias(
model_params, f'{prefix}.ln_1', dtype)
weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight
weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias
if mapping.is_first_pp_rank():
# Embedding
embed_w = get_weight(model_params, 'transformer.wte', dtype)
weights['transformer.vocab_embedding.weight'] = embed_w
if mapping.is_last_pp_rank():
# lm_head weight and bias
lm_head_w, ln_head_bias = get_weight_and_bias(model_params, 'lm_head',
dtype)
weights['lm_head.weight'] = split_matrix(lm_head_w,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights['lm_head.bias'] = split_matrix(ln_head_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f',
dtype)
# ln_f weight and bias
weights['transformer.ln_f.weight'] = ln_f_w
if ln_f_b is not None:
weights['transformer.ln_f.bias'] = ln_f_b
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Weights loaded. Total time: {t}')
return weights
def main():
# TODO(qijun): Currently, the convert script depends on a torch op:
# torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix,
# which is included in tensorrt_llm Python package. Otherwise, the convert
# script does not need to import tensorrt_llm. Will remove it after reimplementing
# the op with PyTorch.
print(tensorrt_llm.__version__)
args = parse_arguments()
world_size = args.tp_size * args.pp_size
tik = time.time()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
def args_to_quant_config(args):
quant_algo = None
plugin_weight_only_quant_type = None
if args.use_weight_only and args.weight_only_precision == 'int8':
plugin_weight_only_quant_type = torch.int8
quant_algo = QuantAlgo.W8A16
elif args.use_weight_only and args.weight_only_precision == 'int4':
plugin_weight_only_quant_type = torch.quint4x2
quant_algo = QuantAlgo.W4A16
return QuantConfig(quant_algo=quant_algo)
if args.model_dir is not None:
hf_config = load_gptj_config(args.model_dir)
architecture = hf_config.architectures[0]
args.vocab_size = hf_config.vocab_size
args.n_positions = hf_config.max_position_embeddings
args.n_layer = hf_config.num_hidden_layers
args.n_head = hf_config.num_attention_heads
args.n_embd = hf_config.hidden_size
args.norm_eps = hf_config.layer_norm_epsilon
args.rotary_dim = hf_config.rotary_dim
else:
architecture = "GPTJForCausalLM"
config = {
'architecture': architecture,
'dtype': args.dtype,
'num_hidden_layers': args.n_layer,
'num_attention_heads': args.n_head,
'hidden_size': args.n_embd,
'norm_epsilon': args.norm_eps,
'vocab_size': args.vocab_size,
'position_embedding_type': 'rope_gptj',
'max_position_embeddings': args.n_positions,
'hidden_act': 'gelu',
'quantization': {
'quant_algo': quant_algo
},
'mapping': {
'world_size': world_size,
'tp_size': args.tp_size,
'pp_size': args.pp_size,
},
'rotary_dim': args.rotary_dim,
}
def convert_and_save_hf(args):
model_dir = args.model_dir
world_size = args.tp_size * args.pp_size
quant_config = args_to_quant_config(args)
with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
json.dump(config, f, indent=4)
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
torch_dtype='auto',
trust_remote_code=True)
if args.model_dir is None:
return
hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
trust_remote_code=True,
torch_dtype="auto")
def covert_and_save(rank):
def convert_and_save_rank(args, rank):
mapping = Mapping(world_size=world_size,
rank=rank,
tp_size=args.tp_size,
pp_size=args.pp_size)
weights = convert_hf_gptj(
hf_model,
hf_config,
mapping,
dtype=args.dtype,
use_weight_only=args.use_weight_only,
plugin_weight_only_quant_type=plugin_weight_only_quant_type)
safetensors.torch.save_file(
weights, os.path.join(args.output_dir, f'rank{rank}.safetensors'))
model = GPTJForCausalLM.from_hugging_face(hf_model,
args.dtype,
mapping=mapping,
quant_config=quant_config)
model.save_checkpoint(args.output_dir, save_config=(rank == 0))
del model
if args.workers == 1:
for rank in range(world_size):
covert_and_save(rank)
convert_and_save_rank(args, rank)
else:
with ThreadPoolExecutor(max_workers=args.workers) as p:
futures = [
p.submit(covert_and_save, rank) for rank in range(world_size)
p.submit(convert_and_save_rank, args, rank)
for rank in range(world_size)
]
exceptions = []
for future in as_completed(futures):
@ -373,6 +117,38 @@ def main():
) == 0, "Checkpoint conversion failed, please check error log."
del hf_model
def main():
print(tensorrt_llm.__version__)
args = parse_arguments()
tik = time.time()
if not os.path.exists(args.output_dir):
os.makedirs(args.output_dir)
if args.model_dir is None:
config = GPTJConfig(architecture='GPTJForCausalLM',
dtype=args.dtype,
num_hidden_layers=args.n_layer,
num_attention_heads=args.n_head,
hidden_size=args.n_embd,
norm_epsilon=args.norm_eps,
vocab_size=args.vocab_size,
position_embedding_type='rope_gptj',
max_position_embeddings=args.n_positions,
hidden_act='gelu',
rotary_dim=args.rotary_dim,
mapping=Mapping(world_size=args.tp_size *
args.pp_size,
tp_size=args.tp_size,
pp_size=args.pp_size),
quantization=args_to_quant_config(args))
config.to_json_file(os.path.join(args.output_dir, 'config.json'))
else:
convert_and_save_hf(args)
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Total time of converting checkpoints: {t}')
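# Hypothetical invocation of this script (flag names are inferred from the
# `args` attributes used above; `parse_arguments()` is not shown in this diff,
# so treat the exact spellings as assumptions):
#
#   python convert_checkpoint.py --model_dir ./gpt-j-6b \
#       --output_dir ./tllm_checkpoint_2gpu --dtype float16 --tp_size 2
#
# With --model_dir set, the HF weights are converted per rank via
# GPTJForCausalLM.from_hugging_face; without it, only a config.json is written.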

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,14 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -1,273 +0,0 @@
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Utilities for exporting a model to our custom format.
"""
import numpy as np
import torch
from tensorrt_llm._utils import torch_to_numpy
def cpu_map_location(storage, loc):
return storage.cpu()
def gpu_map_location(storage, loc):
if loc.startswith("cuda"):
training_gpu_idx = int(loc.split(":")[1])
inference_gpu_idx = training_gpu_idx % torch.cuda.device_count()
return storage.cuda(inference_gpu_idx)
elif loc.startswith("cpu"):
return storage.cpu()
else:
raise ValueError(f"Not handled {loc}")
def save_val(val, dir, key, tp_num=None):
suffix = "bin" if tp_num is None else f"{tp_num}.bin"
val.tofile(dir / f"model.{key}.{suffix}")
def save_split(split_vals, dir, key, i, split_factor):
for j, val in enumerate(split_vals):
save_val(val, dir, key, i * split_factor + j)
def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False):
"""
This function has two purposes:
- compute quantized weights, scaled either per-tensor or per-column
- compute scaling factors
Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ.
CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W.
CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor.
Here is the list of what we need (T means per-tensor, C per-column):
- scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T)
- scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T)
- scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C)
- scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32)
to quant range (int8) (used for CUBLAS) (T, C)
Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too,
but then the model would change depending on the number of GPUs used.
For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it
as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V.
"""
# compute weight scaling factors for fp->int8 and int8->fp
if is_qkv and not multi_query_mode:
scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max(
dim=-1, keepdims=True)[0].cpu().numpy()
scale_w_orig_quant_c = 127. / act_range["w"].reshape(3,
-1).cpu().numpy()
elif is_qkv and multi_query_mode:
raise ValueError(
f"Multi-query w/ int8 quant has not been supported yet")
else:
scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy()
scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy()
scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t
scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c
# compute the rest of needed scaling factors
scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item())
scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item())
scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.)
scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t *
scale_w_orig_quant_t)
scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t *
scale_w_orig_quant_c)
if is_qkv:
scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t,
scale_w_orig_quant_c.shape)
scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t,
scale_w_orig_quant_c.shape)
to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8)
return {
"weight.int8": to_i8(weights * scale_w_orig_quant_t),
"weight.int8.col": to_i8(weights * scale_w_orig_quant_c),
"scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32),
"scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32),
"scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32),
"scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32),
"scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32),
"scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32),
}
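# Illustrative sketch (not part of the original file): a toy numeric example of
# the per-tensor weight scaling described in the docstring above. The variable
# names below are hypothetical and exist only for this illustration; the real
# code derives the maxima from the calibrated `act_range` statistics.
_toy_w = np.array([[0.5, -2.0], [1.0, 0.25]], dtype=np.float32)
_toy_scale_t = 127.0 / np.abs(_toy_w).max()                    # per-tensor: 63.5
_toy_w_int8 = (_toy_w * _toy_scale_t).round().clip(-127, 127).astype(np.int8)
# _toy_w_int8 == [[32, -127], [64, 16]]; multiplying by 1.0 / _toy_scale_t maps
# the quantized values back to the floating-point range (scale_w_quant_orig).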
def write_int8(vals,
dir,
base_key,
split_dim,
tp_rank,
split_factor,
kv_cache_only=False):
if not kv_cache_only:
save_split(np.split(vals["weight.int8"], split_factor, axis=split_dim),
dir, f"{base_key}.weight.int8", tp_rank, split_factor)
save_split(
np.split(vals["weight.int8.col"], split_factor, axis=split_dim),
dir, f"{base_key}.weight.int8.col", tp_rank, split_factor)
saved_keys_once = ["scale_y_quant_orig"]
if not kv_cache_only:
saved_keys_once += [
"scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant"
]
# per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1)
if not kv_cache_only:
if split_dim == -1:
save_split(
np.split(vals["scale_w_quant_orig.col"],
split_factor,
axis=split_dim), dir,
f"{base_key}.scale_w_quant_orig.col", tp_rank, split_factor)
save_split(
np.split(vals["scale_y_accum_quant.col"],
split_factor,
axis=split_dim), dir,
f"{base_key}.scale_y_accum_quant.col", tp_rank, split_factor)
else:
saved_keys_once += [
"scale_w_quant_orig.col", "scale_y_accum_quant.col"
]
if tp_rank == 0:
for save_key in saved_keys_once:
save_val(vals[save_key], dir, f"{base_key}.{save_key}")
# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head
# are not split as there is only one head per key/value.
@torch.no_grad()
def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals,
storage_type, act_range, config):
use_attention_nemo_shape = config.get("use_attention_nemo_shape", False)
split_gated_activation = config.get("split_gated_activation", False)
num_attention_heads = config.get("num_attention_heads", 0)
tp_size = config.get("tp_size", 1)
int8_outputs = config.get("int8_outputs", None)
multi_query_mode = config.get("multi_query_mode", False)
local_dim = config.get("local_dim", None)
save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only"
if not isinstance(vals, list):
vals = [vals]
if config.get("transpose_weights", False) and vals[0].ndim == 2:
vals = [val.T for val in vals]
if "layernorm.weight" in key and config.get("apply_layernorm_1p", False):
vals = [val + 1.0 for val in vals]
vals = [torch_to_numpy(val.cpu().to(storage_type)) for val in vals]
if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \
"final_layernorm.weight" in key or "final_layernorm.bias" in key or \
"mlp.dense_4h_to_h.bias" in key:
# shared weights, only need to convert the weights of rank 0
if tp_rank == 0:
save_val(vals[0], saved_dir, key)
elif "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key:
cat_dim = 0
val = np.concatenate(vals, axis=cat_dim)
split_vals = np.split(val, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
if act_range is not None and int8_outputs == "all":
base_key = key.replace(".weight", "")
vals_i8 = generate_int8(val,
act_range,
multi_query_mode=multi_query_mode)
write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank,
split_factor)
elif "mlp.dense_h_to_4h.weight" in key or "mlp.dense_h_to_4h.bias" in key:
if split_gated_activation:
splits = [np.split(val, 2, axis=-1) for val in vals]
vals, gates = list(zip(*splits))
cat_dim = -1
val = np.concatenate(vals, axis=cat_dim)
split_vals = np.split(val, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
if act_range is not None and int8_outputs == "all":
base_key = key.replace(".weight", "")
vals_i8 = generate_int8(val,
act_range,
multi_query_mode=multi_query_mode)
write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank,
split_factor)
if split_gated_activation:
assert not save_int8
prefix, dot, suffix = key.rpartition(".")
key = prefix + ".gate" + dot + suffix
gate = np.concatenate(gates, axis=cat_dim)
split_vals = np.split(gate, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
elif "attention.query_key_value.weight" in key:
hidden_dim = vals[0].shape[0]
if local_dim is None:
local_dim = vals[0].shape[-1] // 3
if multi_query_mode:
val = vals[0]
# out_feature = local_dim + 2 * head_size; assumes local_dim equals to hidden_dim
head_size = (val.shape[-1] - local_dim) // 2
val = val.reshape(hidden_dim, local_dim + 2 * head_size)
w_q, w_kv = np.split(val, [local_dim], axis=-1)
w_q_split = np.split(w_q, split_factor, axis=-1)
split_vals = [np.concatenate((i, w_kv), axis=-1) for i in w_q_split]
else:
if use_attention_nemo_shape:
head_num = num_attention_heads // tp_size
size_per_head = hidden_dim // num_attention_heads
vals = [
val.reshape(hidden_dim, head_num, 3, size_per_head)
for val in vals
]
vals = [val.transpose(0, 2, 1, 3) for val in vals]
vals = [val.reshape(hidden_dim, 3, local_dim) for val in vals]
cat_dim = -1
val = np.concatenate(vals, axis=cat_dim)
split_vals = np.split(val, split_factor, axis=cat_dim)
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
if save_int8:
base_key = key.replace(".weight", "")
vals_i8 = generate_int8(val,
act_range,
is_qkv=True,
multi_query_mode=multi_query_mode)
write_int8(vals_i8,
saved_dir,
base_key,
cat_dim,
tp_rank,
split_factor,
kv_cache_only=int8_outputs == "kv_cache_only")
elif ("attention.query.weight" in key or "attention.query.bias" in key
or "attention.key_value.weight" in key
or "attention.key_value.bias" in key):
pass
else:
assert False, f"[ERROR] {key} not handled by converter"

View File

@ -1,178 +0,0 @@
"""Byte pair encoding utilities"""
# Modified MIT License
# Software Copyright (c) 2019 OpenAI
# We dont claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
# We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
# associated documentation files (the "Software"), to deal in the Software without restriction,
# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
# The above copyright notice and this permission notice shall be included
# in all copies or substantial portions of the Software.
# The above copyright notice and this permission notice need not be included
# with content created by the Software.
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
# OR OTHER DEALINGS IN THE SOFTWARE.
# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
from functools import lru_cache
import regex as re
@lru_cache()
def bytes_to_unicode():
"""
Returns list of utf-8 byte and a corresponding list of unicode strings.
The reversible bpe codes work on unicode strings.
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
This is a significant percentage of your normal, say, 32K bpe vocab.
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
And avoids mapping to whitespace/control characters the bpe code barfs on.
"""
bs = list(range(ord("!"),
ord("~") + 1)) + list(range(
ord("¡"),
ord("¬") + 1)) + list(range(ord("®"),
ord("ÿ") + 1))
cs = bs[:]
n = 0
for b in range(2**8):
if b not in bs:
bs.append(b)
cs.append(2**8 + n)
n += 1
cs = [chr(n) for n in cs]
return dict(zip(bs, cs))
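# Illustrative sketch (not part of the original file): the mapping above shifts
# bytes that are not printable/safe into higher code points, e.g. the space
# byte (0x20) becomes 'Ġ' (U+0120), which is why GPT-2 BPE vocab entries show a
# leading 'Ġ' for tokens that begin with a space.
_b2u = bytes_to_unicode()
assert _b2u[ord('A')] == 'A'                    # printable bytes map to themselves
assert _b2u[ord(' ')] == chr(256 + ord(' '))    # 0x20 -> 'Ġ'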
def get_pairs(word):
"""Return set of symbol pairs in a word.
Word is represented as tuple of symbols (symbols being variable-length strings).
"""
pairs = set()
prev_char = word[0]
for char in word[1:]:
pairs.add((prev_char, char))
prev_char = char
return pairs
class Encoder:
def __init__(self, encoder, bpe_merges, errors='replace'):
self.encoder = encoder
self.decoder = {v: k for k, v in self.encoder.items()}
self.errors = errors # how to handle errors in decoding
self.byte_encoder = bytes_to_unicode()
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
)
def bpe(self, token):
if token in self.cache:
return self.cache[token]
word = tuple(token)
pairs = get_pairs(word)
if not pairs:
return token
while True:
bigram = min(
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
if bigram not in self.bpe_ranks:
break
first, second = bigram
new_word = []
i = 0
while i < len(word):
try:
j = word.index(first, i)
new_word.extend(word[i:j])
i = j
except:
new_word.extend(word[i:])
break
if word[i] == first and i < len(word) - 1 and word[i +
1] == second:
new_word.append(first + second)
i += 2
else:
new_word.append(word[i])
i += 1
new_word = tuple(new_word)
word = new_word
if len(word) == 1:
break
else:
pairs = get_pairs(word)
word = ' '.join(word)
self.cache[token] = word
return word
def encode(self, text):
bpe_tokens = []
for token in re.findall(self.pat, text):
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
bpe_tokens.extend(self.encoder[bpe_token]
for bpe_token in self.bpe(token).split(' '))
return bpe_tokens
def decode(self, tokens):
text = ''.join([self.decoder[token] for token in tokens])
text = bytearray([self.byte_decoder[c]
for c in text]).decode('utf-8', errors=self.errors)
return text
def batch_decode(self, output):
ret = []
for tokens in output:
ret.append(self.decode(tokens))
return ret
def get_encoder(vocab_file, bpe_file):
with open(vocab_file, 'r', encoding="utf-8") as f:
encoder = json.load(f)
with open(bpe_file, 'r', encoding="utf-8") as f:
bpe_data = f.read()
bpe_merges = [
tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]
]
return Encoder(
encoder=encoder,
bpe_merges=bpe_merges,
)

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
rouge_score~=0.1.2
evaluate~=0.4.1

View File

@ -1,6 +1,6 @@
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,2 +1,2 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers>=4.39.0
datasets~=2.14.5
evaluate

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
rouge_score~=0.1.2
sentencepiece~=0.1.99

View File

@ -1,4 +1,4 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers==4.38.2
accelerate==0.25.0

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers==4.40.2
# https://github.com/NVIDIA/NeMo/issues/9793
huggingface_hub==0.23.5

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets>=2.14.4
nemo-toolkit[all]<=1.20.0,>=1.18.0
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.16.0
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
git+https://github.com/google-deepmind/recurrentgemma.git
flax>=0.8.2
jax~=0.4.23

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.16.1
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets==2.14.6
evaluate~=0.4.1
rouge_score~=0.1.2

View File

@ -182,7 +182,7 @@ def main(args):
input_ids = tokenizer.encode(curr_text,
return_tensors='pt').squeeze(0)
input_ids = input_ids[:test_token_num]
elif model_name == 'QWenForCausalLM' and model_version == 'qwen':
elif 'qwen' in model_name.lower() and model_version == 'qwen':
# use make_context to generate prompt
system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user."
_, input_id_list = make_context(
@ -194,7 +194,7 @@ def main(args):
)
input_ids = torch.tensor(input_id_list)
else:
if model_name == 'QWenForCausalLM' and 'qwen2' in model_version:
if 'qwen' in model_name.lower() and 'qwen2' in model_version:
messages = [{
"role": "system",
"content": "You are a helpful assistant."
@ -527,7 +527,7 @@ def main(args):
ite_count += 1
del runner
if test_hf:
if test_hf and runtime_rank == 0:
profiler.start('load HF model')
dtype_alias_mapping = {
'fp32': 'float32',

View File

@ -37,7 +37,10 @@ DEFAULT_HF_MODEL_DIRS = {
'MPTForCausalLM': 'mosaicml/mpt-7b',
'PhiForCausalLM': 'microsoft/phi-2',
'OPTForCausalLM': 'facebook/opt-350m',
'QWenLMHeadModel': 'Qwen/Qwen-7B',
'QWenForCausalLM': 'Qwen/Qwen-7B',
'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B',
'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B',
'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b',
}
@ -46,14 +49,16 @@ INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
"""
QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n"
DEFAULT_PROMPT_TEMPLATES = {
'InternLMForCausalLM':
"<|User|>:{input_text}<eoh>\n<|Bot|>:",
'InternLM2ForCausalLM':
"<|im_start|>system\n" + INTERNLM_META_INSTRUCTION +
'InternLMForCausalLM': "<|User|>:{input_text}<eoh>\n<|Bot|>:",
'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION +
"<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
'QWenForCausalLM':
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE,
'QWenForCausalLM': QWEN_PROMPT_TEMPLATE,
'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE,
'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE,
}
@ -83,7 +88,7 @@ def read_model_name(engine_dir: str):
model_version = None
if 'GLM' in model_arch:
model_version = config['pretrained_config']['chatglm_version']
if model_arch == 'QWenForCausalLM':
if 'qwen' in model_arch.lower():
model_version = config['pretrained_config']['qwen_type']
return model_arch, model_version
@ -134,7 +139,7 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None,
padding_side='left',
truncation_side='left',
legacy=False)
if model_name == 'QWenForCausalLM' and model_version == 'qwen':
if 'qwen' in model_name.lower() and model_version == 'qwen':
with open(Path(tokenizer_dir) / "generation_config.json") as f:
gen_config = json.load(f)
pad_id = gen_config['pad_token_id']

View File

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
tiktoken
datasets
kaldialign

View File

@ -28,6 +28,7 @@ from ._common import _is_building, check_max_num_tokens, serialize_engine
from ._utils import str_dtype_to_trt, to_json_file
from .auto_parallel import auto_parallel
from .auto_parallel.config import AutoParallelConfig
from .functional import PositionEmbeddingType
from .graph_rewriting import optimize
from .logger import logger
from .lora_manager import LoraConfig
@ -466,34 +467,6 @@ class BuildConfig:
dry_run: bool = False
visualize_network: bool = False
def __post_init__(self):
"""
Check and may modify max_num_tokens and opt_num_tokens after instantiation
"""
max_num_tokens, opt_num_tokens = check_max_num_tokens(
max_num_tokens=self.max_num_tokens,
opt_num_tokens=self.opt_num_tokens,
max_batch_size=self.max_batch_size,
max_input_len=self.max_input_len,
max_seq_len=self.max_seq_len,
max_beam_width=self.max_beam_width,
remove_input_padding=self.plugin_config.remove_input_padding,
enable_context_fmha=self.plugin_config.context_fmha,
tokens_per_block=self.plugin_config.tokens_per_block,
multiple_profiles=self.plugin_config.multiple_profiles,
)
self.max_num_tokens, self.opt_num_tokens = max_num_tokens, opt_num_tokens
if self.plugin_config.remove_input_padding and self.plugin_config.context_fmha:
if self.max_input_len:
logger.warning(
'padding removal and fMHA are both enabled, max_input_len is not required and will be ignored'
)
else:
assert self.max_input_len is not None, 'padding removal and fMHA aren\'t both enabled, max_input_len is required'
if self.max_seq_len:
assert self.max_input_len <= self.max_seq_len, 'max_input_len should not be larger than max_seq_len'
@classmethod
def from_dict(cls, config, plugin_config=None):
max_input_len = config.pop('max_input_len')
@ -507,7 +480,7 @@ class BuildConfig:
'max_prompt_embedding_table_size', 0)
gather_context_logits = config.pop('gather_context_logits', False)
gather_generation_logits = config.pop('gather_generation_logits', False)
strongly_typed = config.pop('strongly_typed', False)
strongly_typed = config.pop('strongly_typed', True)
builder_opt = config.pop('builder_opt', None)
force_num_profiles = config.pop('force_num_profiles', None)
weight_sparsity = config.pop('weight_sparsity', False)
@ -730,6 +703,79 @@ def optimize_model_with_config(model: PretrainedModel,
return model
def _init_max_seq_len(model_config, build_config):
"""
If max_seq_len is not specified, set it to max_position_embeddings * rotary_factor
Additional checks to ensure max_seq_len, max_input_len, and max_num_tokens have valid values.
"""
# Extract rotary scaling which will be used for checks and default value of max_seq_len
rotary_scaling = getattr(model_config, "rotary_scaling", None)
if rotary_scaling is not None:
rotary_type = rotary_scaling.get('type',
rotary_scaling.get('rope_type'))
rotary_factor = rotary_scaling.get('factor',
1.0) if rotary_type != 'su' else 1
else:
rotary_factor = 1
if build_config.max_seq_len is None:
# Step 1: Find the upper bound of max_seq_len
deduced_max_seq_len = 2048
if model_config.max_position_embeddings is not None:
deduced_max_seq_len = model_config.max_position_embeddings
# Step 2: Scale max_seq_len with rotary scaling
if rotary_factor != 1:
deduced_max_seq_len *= rotary_factor
logger.warning(
f'max_seq_len is scaled to {deduced_max_seq_len} by rotary scaling {rotary_factor}'
)
# Step 3: Assign the new max_seq_len
build_config.max_seq_len = deduced_max_seq_len
logger.info(
f'max_seq_len is not specified, using deduced value {deduced_max_seq_len}'
)
else:
if not build_config.plugin_config.streamingllm and model_config.max_position_embeddings is not None \
and model_config.position_embedding_type != PositionEmbeddingType.relative:
if build_config.max_seq_len > model_config.max_position_embeddings * rotary_factor:
logger.warning(
f'max_seq_len {build_config.max_seq_len} is larger than max_position_embeddings {model_config.max_position_embeddings} * rotary scaling {rotary_factor}, '
'the model accuracy might be affected')
if build_config.max_input_len > build_config.max_seq_len:
logger.warning(
f'max_input_len {build_config.max_input_len} is larger than max_seq_len {build_config.max_seq_len}, clipping it to max_seq_len'
)
build_config.max_input_len = build_config.max_seq_len
# Check and may modify max_num_tokens and opt_num_tokens (need to happen after max_seq_len is deduced)
max_num_tokens, opt_num_tokens = check_max_num_tokens(
max_num_tokens=build_config.max_num_tokens,
opt_num_tokens=build_config.opt_num_tokens,
max_batch_size=build_config.max_batch_size,
max_input_len=build_config.max_input_len,
max_seq_len=build_config.max_seq_len,
max_beam_width=build_config.max_beam_width,
remove_input_padding=build_config.plugin_config.remove_input_padding,
enable_context_fmha=build_config.plugin_config.context_fmha,
tokens_per_block=build_config.plugin_config.tokens_per_block,
multiple_profiles=build_config.plugin_config.multiple_profiles,
)
build_config.max_num_tokens, build_config.opt_num_tokens = max_num_tokens, opt_num_tokens
if build_config.plugin_config.remove_input_padding and build_config.plugin_config.context_fmha:
if build_config.max_input_len:
logger.warning(
'padding removal and fMHA are both enabled, max_input_len is not required and will be ignored'
)
else:
assert build_config.max_input_len is not None, 'padding removal and fMHA aren\'t both enabled, max_input_len is required'
if build_config.max_seq_len:
assert build_config.max_input_len <= build_config.max_seq_len, 'max_input_len should not be larger than max_seq_len'
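# Worked example (hypothetical values, for illustration only): with
# model_config.max_position_embeddings == 4096 and
# rotary_scaling == {'type': 'linear', 'factor': 2.0}, an unset max_seq_len is
# deduced as 4096 * 2.0 == 8192. A user-supplied max_seq_len above that product
# only triggers the accuracy warning, while a max_input_len larger than
# max_seq_len is clipped down to max_seq_len.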
def build(model: PretrainedModel,
build_config: BuildConfig,
return_build_config: bool = False) -> Engine | BuildConfig:
@ -743,6 +789,8 @@ def build(model: PretrainedModel,
build_config = copy.deepcopy(build_config)
build_config.plugin_config.dtype = model.config.dtype
_init_max_seq_len(model.config, build_config)
if model.config.quantization.quant_algo == QuantAlgo.FP8 or \
model.config.quantization.kv_cache_quant_algo == QuantAlgo.FP8:
build_config.strongly_typed = True

View File

@ -27,7 +27,6 @@ import torch
from tensorrt_llm.auto_parallel import infer_cluster_config
from tensorrt_llm.auto_parallel.cluster_info import cluster_infos
from tensorrt_llm.builder import BuildConfig, Engine, build
from tensorrt_llm.functional import PositionEmbeddingType
from tensorrt_llm.logger import logger
from tensorrt_llm.lora_manager import LoraConfig, LoraManager
from tensorrt_llm.models import MODEL_MAP, PretrainedConfig
@ -444,48 +443,6 @@ def main():
else:
cluster_config = infer_cluster_config()
# Extract rotary scaling which will be used for checks and default value of max_seq_len
rotary_scaling = getattr(model_config, "rotary_scaling", None)
if rotary_scaling is not None:
rotary_type = rotary_scaling.get('type',
rotary_scaling.get('rope_type'))
rotary_factor = rotary_scaling.get(
'factor', 1.0) if rotary_type != 'su' else 1
else:
rotary_factor = 1
if args.max_seq_len is None:
# Step 1: Find the upper bound of max_seq_len
deduced_max_seq_len = 2048
if model_config.max_position_embeddings is not None:
deduced_max_seq_len = model_config.max_position_embeddings
# Step 2: Scale max_seq_len with rotary scaling
if rotary_factor != 1:
deduced_max_seq_len *= rotary_factor
logger.warning(
f'max_seq_len is scaled to {deduced_max_seq_len} by rotary scaling {rotary_factor}'
)
# Step 3: Assign the new max_seq_len
args.max_seq_len = deduced_max_seq_len
logger.info(
f'max_seq_len is not specified, using value {deduced_max_seq_len}'
)
else:
if not plugin_config.streamingllm and model_config.max_position_embeddings is not None \
and model_config.position_embedding_type != PositionEmbeddingType.relative:
if args.max_seq_len > model_config.max_position_embeddings * rotary_factor:
logger.warning(
f'max_seq_len {args.max_seq_len} is larger than max_position_embeddings {model_config.max_position_embeddings} * rotary scaling {rotary_factor}, '
'the model accuracy might be affected')
if args.max_input_len > args.max_seq_len:
logger.warning(
f'max_input_len is {args.max_input_len} is larger than max_seq_len {args.max_seq_len}, clipping it to max_seq_len'
)
args.max_input_len = args.max_seq_len
build_config = BuildConfig.from_dict(
{
'max_input_len': args.max_input_len,

View File

@ -4975,6 +4975,7 @@ def gpt_attention(
])
attn_plug = attn_plg_creator.create_plugin("causal_attn", pfc)
assert attn_plug
plug_inputs = [*qkv] if is_unfuse_qkv_gemm else [qkv]
if use_cache:
plug_inputs += [
@ -5510,7 +5511,7 @@ def lora_plugin(
transa: bool = False,
transb: bool = False,
host_context_lengths: Tensor = None, # for pad-free input mode
max_context_length: int = 0,
max_num_tokens: int = 0,
max_low_rank: int = 0,
lora_ranks: List[Tensor] = None,
lora_weights_pointers: List[Tensor] = None,
@ -5541,8 +5542,8 @@ def lora_plugin(
host_context_lengths: cpu Tensor = None
A host tensor that contains the lengths of the different inputs,
max_context_length : int
Maximum length during context phase, used to determine the workspace size.
max_num_tokens : int
Maximum number of tokens, used to determine the workspace size.
max_low_rank : int
Maximum low_rank, used to determine the workspace size.
@ -5591,8 +5592,8 @@ def lora_plugin(
"remove_input_padding",
np.array(np.int8(default_net().plugin_config.remove_input_padding),
dtype=np.int8), trt.PluginFieldType.INT8)
max_context_length_field = trt.PluginField(
"max_context_length", np.array(max_context_length, dtype=np.int32),
max_num_tokens_field = trt.PluginField(
"max_num_tokens", np.array(max_num_tokens, dtype=np.int32),
trt.PluginFieldType.INT32)
max_low_rank_field = trt.PluginField("max_low_rank",
np.array(max_low_rank, dtype=np.int32),
@ -5607,7 +5608,7 @@ def lora_plugin(
pfc = trt.PluginFieldCollection([
in_hidden_size_field, transa, transb, num_lora_modules_field, pf_type,
remove_input_padding, max_context_length_field, max_low_rank_field,
remove_input_padding, max_num_tokens_field, max_low_rank_field,
weight_index_field
] + out_hidden_size_field_list)
lora_plug = plg_creator.create_plugin("lora", pfc)

View File

@ -288,6 +288,12 @@ class LlmArgs:
else:
self.tokenizer = tokenizer_factory(self.tokenizer)
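# bfloat16 requires compute capability 8.0 (Ampere) or newer, so on older
# GPUs 'auto' falls back to float16 and an explicit bfloat16 request is
# rejected below.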
if torch.cuda.get_device_properties(0).major < 8:
if self.dtype == 'auto':
self.dtype = 'float16'
if self.dtype == 'bfloat16':
raise RuntimeError("Pre SM 80 GPUs do not support bfloat16")
self._engine_config: Optional[EngineConfig] = None
self.auto_parallel_config = AutoParallelConfig(
@ -1021,7 +1027,10 @@ class ModelLoader:
raise NotImplementedError(
f"Unsupported model architecture in HLAPI: {architecture}")
if self.llm_args.quant_config.quant_mode.has_any_quant():
use_weight_only = self.llm_args.quant_config.quant_algo in (
QuantAlgo.W4A16, QuantAlgo.W8A16)
if self.llm_args.quant_config.quant_mode.has_any_quant(
) and not use_weight_only:
assert self.workspace is not None
checkpoint_dir = f"{self.workspace}/quantized-checkpoint"
if self.rank == 0:

View File

@ -612,7 +612,7 @@ class Attention(Module):
],
host_request_types=q_lora_params.host_request_types,
host_context_lengths=q_lora_params.host_context_lengths,
max_context_length=q_lora_params.max_context_length,
max_num_tokens=q_lora_params.max_num_tokens,
max_encoder_context_length=q_lora_params.
max_encoder_context_length,
host_encoder_input_lengths=q_lora_params.
@ -1337,7 +1337,7 @@ class BertAttention(Module):
],
host_request_types=q_lora_params.host_request_types,
host_context_lengths=q_lora_params.host_context_lengths,
max_context_length=q_lora_params.max_context_length)
max_num_tokens=q_lora_params.max_num_tokens)
q_lora, k_lora, v_lora = self.qkv_lora(hidden_states,
qkv_lora_params)

View File

@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import List
from typing import List, Optional
from .._common import default_net
from ..functional import Tensor, lora_plugin
@ -28,7 +28,7 @@ class LoraRuntimeParams(object):
lora_weights_pointers: List[Tensor] = None,
host_request_types: Tensor = None,
host_context_lengths: Tensor = None,
max_context_length: Tensor = None,
max_num_tokens: Optional[int] = None,
max_encoder_context_length: Tensor = None,
host_encoder_input_lengths: Tensor = None,
weight_index: int = 0,
@ -38,7 +38,7 @@ class LoraRuntimeParams(object):
self.lora_weights_pointers = lora_weights_pointers
self.host_request_types = host_request_types
self.host_context_lengths = host_context_lengths
self.max_context_length = max_context_length
self.max_num_tokens = max_num_tokens
self.max_encoder_context_length = max_encoder_context_length
self.host_encoder_input_lengths = host_encoder_input_lengths
self.weight_index = weight_index
@ -71,8 +71,8 @@ class Lora(Module):
host_context_lengths=lora_runtime_params.host_context_lengths
if not is_cross_attention else
lora_runtime_params.host_encoder_input_lengths,
# For cross attention, max_encoder_context_length should be used instead of max_context_length
max_context_length=lora_runtime_params.max_context_length
# For cross attention, max_encoder_context_length should be used instead of max_num_tokens
max_num_tokens=lora_runtime_params.max_num_tokens
if not is_cross_attention else
lora_runtime_params.max_encoder_context_length,
max_low_rank=self.max_low_rank,
@ -93,7 +93,7 @@ class LoraParams(object):
lora_ranks=None, # : List[dict[Tensor]]
lora_weights_pointers=None, # : List[dict[Tensor]]
host_context_lengths: Tensor = None,
max_context_length: Tensor = None,
max_num_tokens: Optional[int] = None,
max_encoder_context_length: Tensor = None, # For cross attention
host_request_types: Tensor = None,
host_encoder_input_lengths: Tensor = None, # For cross attention
@ -104,7 +104,7 @@ class LoraParams(object):
self.lora_weights_pointers = lora_weights_pointers
self.host_context_lengths = host_context_lengths
self.max_context_length = max_context_length
self.max_num_tokens = max_num_tokens
self.max_encoder_context_length = max_encoder_context_length
self.host_request_types = host_request_types
self.host_encoder_input_lengths = host_encoder_input_lengths
@ -115,7 +115,7 @@ class LoraParams(object):
lora_ranks=[self.lora_ranks[layer_idx]],
lora_weights_pointers=[self.lora_weights_pointers[layer_idx]],
host_context_lengths=self.host_context_lengths,
max_context_length=self.max_context_length,
max_num_tokens=self.max_num_tokens,
max_encoder_context_length=self.max_encoder_context_length,
host_request_types=self.host_request_types,
host_encoder_input_lengths=self.host_encoder_input_lengths,
@ -133,7 +133,7 @@ class LoraParams(object):
[f"{lora_module}_lora_weights_pointers"]
],
host_context_lengths=self.host_context_lengths,
max_context_length=self.max_context_length,
max_num_tokens=self.max_num_tokens,
max_encoder_context_length=self.max_encoder_context_length,
host_request_types=self.host_request_types,
host_encoder_input_lengths=self.host_encoder_input_lengths,

View File

@ -47,7 +47,7 @@ def fc_gate_lora(hidden_states, lora, lora_layer_params):
],
host_request_types=mlp_fc_lora_params.host_request_types,
host_context_lengths=mlp_fc_lora_params.host_context_lengths,
max_context_length=mlp_fc_lora_params.max_context_length)
max_num_tokens=mlp_fc_lora_params.max_num_tokens)
mlp_fc_lora, mlp_gate_lora = lora(hidden_states, mlp_in_lora_params)
mlp_in_result = concat([mlp_gate_lora, mlp_fc_lora],

View File

@ -24,12 +24,11 @@ from tensorrt_llm.layers.lora import LoraParams
from .._common import default_net, default_trtnet
from .._utils import int32_array
from ..functional import (AllReduceFusionParams, AllReduceStrategy,
_add_plugin_info, _create_tensor, allreduce, cast,
concat, constant, div, expand, gather_nd,
is_gated_activation, non_gated_version, nonzero,
repeat_interleave, scatter_nd, shape, softmax, split,
sum, topk)
from ..functional import (AllReduceFusionParams, _add_plugin_info,
_create_tensor, allreduce, cast, concat, constant,
div, expand, gather_nd, is_gated_activation,
non_gated_version, nonzero, repeat_interleave,
scatter_nd, shape, softmax, split, sum, topk)
from ..layers import MLP, GatedMLP
from ..mapping import Mapping
from ..module import Module, ModuleList
@ -531,7 +530,7 @@ class MoeOOTB(MOE):
gate_lora_weights_pointers,
}],
host_context_lengths=lora_layer_params.host_context_lengths,
max_context_length=lora_layer_params.max_context_length,
max_num_tokens=lora_layer_params.max_num_tokens,
max_encoder_context_length=lora_layer_params.
max_encoder_context_length,
host_request_types=lora_layer_params.host_request_types,
@ -603,6 +602,10 @@ class MoeOOTB(MOE):
expert_weights = split(experts_weights, 1, dim=0)
for i, expert in enumerate(self.experts):
if self.mapping.has_moe_ep():
index = i + self.experts_per_node * self.mapping.moe_ep_rank
else:
index = i
# get mask token index
non_zero_index = nonzero(experts_mask[i].view(
concat([-1, hidden_size])))
@ -627,16 +630,9 @@ class MoeOOTB(MOE):
output = output.view(shape(hidden_states))
need_ep_reduce = self.mapping.has_moe_ep(
) and self.mapping.moe_ep_group is not None
need_tp_reduce = self.mapping.has_moe_tp(
) and self.mapping.moe_tp_group is not None
if need_tp_reduce or need_ep_reduce:
group = self.mapping.moe_ep_group if need_ep_reduce else self.mapping.moe_tp_group
# TODO: remove this NCCL strategy WAR after fixed https://nvbugspro.nvidia.com/bug/4740067
if self.tp_size > 1 and self.tp_group is not None:
output = allreduce(output,
group,
strategy=AllReduceStrategy.NCCL,
self.mapping.tp_group,
reduce_fusion_params=reduce_fusion_params)
return output

View File

@ -27,6 +27,7 @@ from .falcon.model import FalconForCausalLM, FalconModel
from .gemma.model import GemmaForCausalLM
from .gpt.config import GPTConfig
from .gpt.model import GPTForCausalLM, GPTModel
from .gptj.config import GPTJConfig
from .gptj.model import GPTJForCausalLM, GPTJModel
from .gptneox.model import GPTNeoXForCausalLM, GPTNeoXModel
from .grok.model import GrokForCausalLM
@ -65,6 +66,7 @@ __all__ = [
'MedusaConfig',
'MedusaForCausalLm',
'ReDrafterForCausalLM',
'GPTJConfig',
'GPTJModel',
'GPTJForCausalLM',
'GPTNeoXModel',

View File

@ -667,6 +667,7 @@ class EncoderModel(PretrainedModel):
def prepare_inputs(self,
max_batch_size,
max_input_len,
max_num_tokens,
prompt_embedding_table_size: int = 0,
lora_target_modules: List[str] = None,
*args,
@ -889,7 +890,7 @@ class EncoderModel(PretrainedModel):
lora_params = LoraParams(
lora_ranks=lora_ranks,
lora_weights_pointers=lora_weights_pointers,
max_context_length=max_input_len,
max_num_tokens=max_num_tokens,
host_request_types=host_request_types,
host_context_lengths=host_context_lengths,
)
@ -1225,6 +1226,7 @@ class DecoderModel(PretrainedModel):
max_beam_width,
max_decoder_input_len,
max_seq_len,
max_num_tokens,
max_encoder_input_len,
gather_context_logits: bool = False,
gather_generation_logits: bool = False,
@ -1594,7 +1596,7 @@ class DecoderModel(PretrainedModel):
lora_ranks=lora_ranks,
lora_weights_pointers=lora_weights_pointers,
host_context_lengths=host_context_lengths,
max_context_length=max_decoder_input_len,
max_num_tokens=max_num_tokens,
max_encoder_context_length=max_encoder_input_len,
host_request_types=host_request_types,
host_encoder_input_lengths=host_encoder_input_lengths,


@ -15,14 +15,20 @@
from typing import Optional, Union
import torch
from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ..modeling_utils import PretrainedConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
class GPTConfig(PretrainedConfig):
def __init__(self,
*,
gpt_variant: str = 'gpt2',
bias: bool = True,
q_scaling: float = 1.0,
embedding_scale: Optional[float] = None,
@ -30,8 +36,11 @@ class GPTConfig(PretrainedConfig):
rotary_pct: float = 1.0,
rotary_base: float = 10000.0,
rotary_scaling: Optional[dict] = None,
inner_layernorm: bool = False,
norm_before_bmm1: bool = False,
moe: Optional[Union[MoeConfig, dict]] = None,
**kwargs):
self.gpt_variant = gpt_variant
self.bias = bias
self.q_scaling = q_scaling
self.embedding_scale = embedding_scale
@ -39,6 +48,8 @@ class GPTConfig(PretrainedConfig):
self.rotary_pct = rotary_pct
self.rotary_base = rotary_base
self.rotary_scaling = rotary_scaling
self.inner_layernorm = inner_layernorm
self.norm_before_bmm1 = norm_before_bmm1
if moe is None:
# Legacy MOE config fields
moe = MoeConfig(
@ -57,6 +68,7 @@ class GPTConfig(PretrainedConfig):
def to_dict(self):
output = super().to_dict()
# Serialize the fields added in GPTConfig
output['gpt_variant'] = self.gpt_variant
output['bias'] = self.bias
output['q_scaling'] = self.q_scaling
output['embedding_scale'] = self.embedding_scale
@ -65,5 +77,244 @@ class GPTConfig(PretrainedConfig):
output['rotary_pct'] = self.rotary_pct
output['rotary_base'] = self.rotary_base
output['rotary_scaling'] = self.rotary_scaling
output['inner_layernorm'] = self.inner_layernorm
output['norm_before_bmm1'] = self.norm_before_bmm1
output['moe'] = self.moe.to_dict()
return output
@classmethod
def from_hugging_face(
cls,
hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
import transformers
from .convert import get_needed_padding
if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
hf_config = hf_config_or_dir
else:
hf_config = transformers.AutoConfig.from_pretrained(
hf_config_or_dir, trust_remote_code=True)
gpt_variant = kwargs.pop('gpt_variant', None)
if gpt_variant is None:
logger.info("Inferring gpt variant from path...")
for v in [
'starcoder2', 'starcoder', 'santacoder', 'gpt2',
'persimmon', 'fuyu', 'kosmos-2', 'jais'
]:
if v in hf_config._name_or_path:
gpt_variant = v
break
if gpt_variant == 'fuyu':
gpt_variant = 'persimmon'
assert gpt_variant in [
'gpt2', 'santacoder', 'starcoder', 'starcoder2', 'persimmon',
'kosmos-2', 'jais'
]
logger.info(f"Gpt variant: {gpt_variant}")
if gpt_variant in ['starcoder2', 'persimmon']:
hf_config.n_embd = hf_config.hidden_size
hf_config.n_inner = hf_config.intermediate_size
hf_config.n_head = hf_config.num_attention_heads
hf_config.n_kv_head = hf_config.num_key_value_heads if hasattr(
hf_config, 'num_key_value_heads') else hf_config.n_head
hf_config.n_layer = hf_config.num_hidden_layers
hf_config.n_positions = hf_config.max_position_embeddings
hf_config.activation_function = 'gelu' if gpt_variant == 'starcoder2' else 'squared-relu'
hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps
hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else True
hf_config.position_embedding_type = 'rope_gpt_neox'
hf_config.rotary_base = hf_config.rope_theta
hf_config.rotary_pct = getattr(hf_config, 'partial_rotary_factor',
1.0)
elif gpt_variant == "kosmos-2":
hf_config.n_embd = hf_config.text_config.embed_dim
hf_config.n_inner = hf_config.text_config.ffn_dim
hf_config.n_head = hf_config.text_config.attention_heads
hf_config.n_kv_head = hf_config.n_head
hf_config.n_layer = hf_config.text_config.layers
hf_config.n_positions = hf_config.text_config.max_position_embeddings
hf_config.activation_function = hf_config.text_config.activation_function
hf_config.layer_norm_epsilon = hf_config.text_config.layer_norm_eps
hf_config.bias = True
hf_config.vocab_size = hf_config.text_config.vocab_size
else:
if hf_config.n_inner is None:
hf_config.n_inner = hf_config.n_embd * 4
if gpt_variant in ['santacoder', 'starcoder']:
hf_config.n_kv_head = 1
else:
hf_config.n_kv_head = hf_config.n_head
if gpt_variant == 'jais':
hf_config.q_scaling = (hf_config.n_embd // hf_config.n_head)**0.5
if hasattr(hf_config, 'width_scale'):
hf_config.logits_scale = hf_config.width_scale
else:
hf_config.logits_scale = hf_config.mup_output_alpha * hf_config.mup_width_scale
if hasattr(hf_config, 'mup_embeddings_scale'):
hf_config.embeddings_scale = hf_config.mup_embeddings_scale
else:
assert hasattr(hf_config, 'embeddings_scale')
hf_config.n_inner += get_needed_padding(hf_config.n_inner,
mapping.tp_size)
if gpt_variant == 'kosmos-2':
if hf_config.text_config.scale_embedding:
hf_config.embeddings_scale = hf_config.n_embd**0.5
if dtype == 'auto':
dtype = getattr(hf_config, 'torch_dtype', None)
if dtype is None:
dtype = 'float16'
if isinstance(dtype, torch.dtype):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
return cls(architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.n_layer,
num_attention_heads=hf_config.n_head,
num_key_value_heads=hf_config.n_kv_head,
hidden_size=hf_config.n_embd,
intermediate_size=hf_config.n_inner,
norm_epsilon=hf_config.layer_norm_epsilon,
vocab_size=hf_config.vocab_size,
position_embedding_type=getattr(hf_config,
'position_embedding_type',
'learned_absolute'),
max_position_embeddings=hf_config.n_positions,
hidden_act=hf_config.activation_function,
gpt_variant=gpt_variant,
bias=getattr(hf_config, 'bias', True),
apply_query_key_layer_scaling=getattr(
hf_config, 'apply_query_key_layer_scaling', False),
rotary_pct=getattr(hf_config, 'rotary_pct', 1.0),
rotary_base=getattr(hf_config, 'rotary_base', 10000.0),
rotary_scaling=getattr(hf_config, 'rotary_scaling', None),
qk_layernorm=gpt_variant == 'persimmon',
inner_layernorm=gpt_variant == 'kosmos-2',
norm_before_bmm1=gpt_variant == 'kosmos-2',
q_scaling=getattr(hf_config, 'q_scaling', 1),
embedding_scale=getattr(hf_config, 'embeddings_scale', None),
mapping=mapping,
quantization=quant_config,
**kwargs)
@classmethod
def from_nemo(cls,
nemo_ckpt_dir: str,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
import transformers
from .convert import (UnpackedNemoCheckpointDir, cpu_map_location,
gpu_map_location, rename_keys)
load_model_on_cpu = kwargs.pop('load_model_on_cpu', False)
nemo_rename_key = kwargs.pop('nemo_rename_key', [])
layer_rename_config = {
pattern.split(':')[0]: pattern.split(':')[1]
for pattern in nemo_rename_key
}
unpacked_checkpoints_dir = UnpackedNemoCheckpointDir(
nemo_ckpt_dir, load_checkpoints_to_cpu=load_model_on_cpu)
nemo_model_config = unpacked_checkpoints_dir.model_config
training_tp_size = nemo_model_config.get("tensor_model_parallel_size",
1)
training_pp_size = nemo_model_config.get("pipeline_model_parallel_size",
1)
checkpoints_paths = unpacked_checkpoints_dir.get_checkpoints_paths(
training_tp_size,
training_pp_size,
)
if unpacked_checkpoints_dir._load_checkpoints_to_cpu:
map_location_fn = cpu_map_location
else:
map_location_fn = gpu_map_location
model_00 = torch.load(checkpoints_paths[0][0],
map_location=map_location_fn)
model_00 = rename_keys(model_00, layer_rename_config)
vocab_size = model_00[
"model.language_model.embedding.word_embeddings.weight"].shape[
0] * training_tp_size
del model_00
hf_config = transformers.GPT2Config(
vocab_size=vocab_size,
n_positions=nemo_model_config['max_position_embeddings'],
n_embd=nemo_model_config['hidden_size'],
n_layer=nemo_model_config['num_layers'],
n_head=nemo_model_config['num_attention_heads'],
n_inner=nemo_model_config['ffn_hidden_size'],
activation_function=nemo_model_config['activation'],
layer_norm_epsilon=nemo_model_config['layernorm_epsilon'],
)
hf_config.n_kv_head = hf_config.n_head
hf_config.bias = nemo_model_config['bias']
hf_config.apply_query_key_layer_scaling = False
hf_config.position_embedding_type = nemo_model_config.get(
'position_embedding_type', 'learned_absolute')
if hf_config.position_embedding_type == 'rope':
hf_config.position_embedding_type = 'rope_gpt_neox'
hf_config.rotary_base = nemo_model_config.get('rotary_base', 10000.0)
hf_config.rotary_pct = nemo_model_config.get('rotary_percentage', 1.0)
assert hf_config.rotary_pct >= 0 and hf_config.rotary_pct <= 1
rotary_scaling_factor = nemo_model_config.get(
'seq_len_interpolation_factor', None)
if rotary_scaling_factor is None:
hf_config.rotary_scaling = None
else:
assert rotary_scaling_factor > 1
hf_config.rotary_scaling = {
'type': 'linear',
'factor': rotary_scaling_factor
}
if dtype == 'auto':
dtype = nemo_model_config['precision']
if dtype is None:
dtype = 'float16'
elif 'bf16' in dtype or 'bfloat16' in dtype:
dtype = 'bfloat16'
else:
dtype = 'float16'
return cls(architecture='GPTForCausalLM',
dtype=dtype,
num_hidden_layers=hf_config.n_layer,
num_attention_heads=hf_config.n_head,
num_key_value_heads=hf_config.n_kv_head,
hidden_size=hf_config.n_embd,
intermediate_size=hf_config.n_inner,
norm_epsilon=hf_config.layer_norm_epsilon,
vocab_size=hf_config.vocab_size,
position_embedding_type=hf_config.position_embedding_type,
max_position_embeddings=hf_config.n_positions,
hidden_act=hf_config.activation_function,
bias=hf_config.bias,
apply_query_key_layer_scaling=hf_config.
apply_query_key_layer_scaling,
rotary_pct=hf_config.rotary_pct,
rotary_base=hf_config.rotary_base,
rotary_scaling=hf_config.rotary_scaling,
mapping=mapping,
quantization=quant_config,
**kwargs)
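# Illustrative usage sketch for the factory methods added above (not part of
# this diff); 'gpt2' is a placeholder assumed to resolve to a Hugging Face
# GPT-2 checkpoint or a local directory.
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTConfig

demo_config = GPTConfig.from_hugging_face('gpt2',
                                          dtype='auto',
                                          mapping=Mapping(world_size=1))
print(demo_config.gpt_variant, demo_config.dtype)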

File diff suppressed because it is too large.


@ -13,6 +13,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
from ..._utils import pad_vocab_size
from ...functional import (Tensor, is_gated_activation, non_gated_version, recv,
send)
@ -22,9 +24,12 @@ from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear,
from ...lora_manager import LoraConfig, use_lora
from ...mapping import Mapping
from ...module import Module
from ...quantization import QuantMode
from ..modeling_utils import DecoderLayerList, DecoderModelForCausalLM
from ...quantization import W8A8_SQ_PLUGIN_LIST, QuantAlgo, QuantMode
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
QuantConfig, check_share_embedding)
from .config import GPTConfig
from .convert import (load_hf_gpt, load_weights_from_hf_model,
load_weights_from_nemo)
def MLPFactory(hidden_size,
@ -276,5 +281,123 @@ class GPTForCausalLM(DecoderModelForCausalLM):
}
super().__init__(config, transformer, lm_head)
@classmethod
def from_hugging_face(
cls,
hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
''' Create a GPTForCausalLM object from the given parameters
'''
import transformers
load_model_on_cpu = kwargs.pop('load_model_on_cpu', False)
assert hf_model_or_dir is not None
use_preloading = isinstance(hf_model_or_dir,
transformers.PreTrainedModel)
if use_preloading:
hf_model = hf_model_or_dir
hf_config_or_dir = hf_model.config
else:
hf_model_dir = hf_model_or_dir
hf_config_or_dir = hf_model_or_dir
config = GPTConfig.from_hugging_face(hf_config_or_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
if not use_preloading:
hf_model = load_hf_gpt(hf_model_dir, load_model_on_cpu)
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = cls(config)
model.load(weights)
return model
@classmethod
def quantize(
cls,
hf_model_dir: str,
output_dir: str,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
*,
device: str = 'cuda',
calib_dataset: str = 'cnn_dailymail',
calib_batches: int = 512,
calib_batch_size: int = 1,
calib_max_seq_length: int = 512,
random_seed: int = 1234,
tokenizer_max_seq_length: int = 2048,
**kwargs,
):
DEFAULT_MODELOPT_FLOW = [
QuantAlgo.W4A16_AWQ, QuantAlgo.FP8, QuantAlgo.W8A8_SQ_PER_CHANNEL,
QuantAlgo.W4A8_AWQ
]
config = GPTConfig.from_hugging_face(hf_model_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
if quant_config.quant_algo in DEFAULT_MODELOPT_FLOW:
super().quantize(hf_model_dir,
output_dir,
dtype=config.dtype,
mapping=config.mapping,
quant_config=config.quantization,
device=device,
calib_dataset=calib_dataset,
calib_batches=calib_batches,
calib_batch_size=calib_batch_size,
calib_max_seq_length=calib_max_seq_length,
random_seed=random_seed,
tokenizer_max_seq_length=tokenizer_max_seq_length)
else:
# Non-Modelopt, i.e. the legacy TRT-LLM native quantization algorithms:
# SmoothQuant, INT4/INT8 weight-only, INT8 KV cache
NATIVE_QUANT_FLOW = [QuantAlgo.W4A16, QuantAlgo.W8A16, None
] + W8A8_SQ_PLUGIN_LIST
is_valid_native_quant = (quant_config.quant_algo in NATIVE_QUANT_FLOW) and \
(quant_config.kv_cache_quant_algo in [QuantAlgo.INT8, None])
assert quant_config.quant_algo is not None or quant_config.kv_cache_quant_algo is not None, \
"There is no point to call the quantize function if both quant_algo and kv_cache_quant_algo is None"
assert is_valid_native_quant, f"Internal error: shall call Modelopt for this quantization {quant_config}"
from . import convert
convert.quantize(hf_model_dir,
output_dir,
config=config,
device=device,
calib_dataset=calib_dataset)
@classmethod
def from_nemo(cls,
nemo_ckpt_dir: str,
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
config = GPTConfig.from_nemo(nemo_ckpt_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
weights = load_weights_from_nemo(nemo_ckpt_dir, config, **kwargs)
check_share_embedding(weights, config)
model = cls(config)
model.load(weights)
return model
def use_lora(self, lora_config: LoraConfig):
use_lora(self, lora_config, self.trtllm_modules_to_hf_modules)
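# Hypothetical end-to-end conversion flow built on the classmethods above; the
# model name, output directory and TP size are placeholders, and
# save_checkpoint is assumed to be the usual PretrainedModel API, which this
# diff does not change.
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTForCausalLM

tp_size = 2
for rank in range(tp_size):
    shard = GPTForCausalLM.from_hugging_face(
        'gpt2',
        dtype='float16',
        mapping=Mapping(world_size=tp_size, tp_size=tp_size, rank=rank))
    shard.save_checkpoint('/tmp/trtllm_gpt2_tp2', save_config=(rank == 0))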


@ -0,0 +1,63 @@
from typing import Optional, Union
import torch
from ..._utils import torch_dtype_to_str
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
class GPTJConfig(PretrainedConfig):
"""
This is the configuration class to store the configuration of a GPT-J model.
"""
def __init__(self, *, rotary_dim: int = 64, **kwargs):
self.rotary_dim = rotary_dim
super().__init__(**kwargs)
def to_dict(self):
output = super().to_dict()
output.update(rotary_dim=self.rotary_dim)
return output
@classmethod
def from_hugging_face(
cls,
hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config: Optional[QuantConfig] = None,
**kwargs):
import transformers
if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
hf_config = hf_config_or_dir
else:
hf_config_dir = str(hf_config_or_dir)
hf_config = transformers.AutoConfig.from_pretrained(
hf_config_dir, trust_remote_code=True)
if dtype == 'auto':
dtype = getattr(hf_config, 'torch_dtype', None)
if dtype is None:
dtype = 'float16'
if isinstance(dtype, torch.dtype):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
return cls(architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
hidden_size=hf_config.hidden_size,
norm_epsilon=hf_config.layer_norm_epsilon,
vocab_size=hf_config.vocab_size,
position_embedding_type='rope_gptj',
max_position_embeddings=hf_config.max_position_embeddings,
hidden_act='gelu',
rotary_dim=hf_config.rotary_dim,
mapping=mapping,
quantization=quant_config,
**kwargs)


@ -0,0 +1,205 @@
import time
from typing import Dict, Optional, Tuple
import torch
from tensorrt_llm.quantization import QuantAlgo
from .config import GPTJConfig
def split(weight: torch.Tensor,
tp_size: int,
rank: int = 0,
dim: int = 0) -> torch.Tensor:
if tp_size == 1:
return weight
elif weight.ndim == 1:
return torch.chunk(weight, tp_size)[rank].contiguous()
else:
return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()
def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
dim: int) -> torch.Tensor:
return split(weight, tp_size, rank, dim=dim)
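# Quick sanity sketch of the tensor-parallel split helpers above (hypothetical
# shapes; torch is already imported at the top of this file):
demo_weight = torch.arange(12.0).reshape(4, 3)
demo_rank0 = split_matrix(demo_weight, tp_size=2, rank=0, dim=0)  # rows 0-1
demo_rank1 = split_matrix(demo_weight, tp_size=2, rank=1, dim=0)  # rows 2-3
assert demo_rank0.shape == (2, 3) and torch.equal(demo_rank1, demo_weight[2:])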
def get_weight(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.weight' not in params:
return None
return params[f'{prefix}.weight'].to(dtype).detach().cpu()
def get_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> torch.Tensor:
if f'{prefix}.bias' not in params:
return None
return params[f'{prefix}.bias'].to(dtype).detach().cpu()
def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str,
dtype: torch.dtype) -> Tuple[torch.Tensor]:
return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)
def get_tllm_linear_weight(
weight: torch.Tensor,
prefix: str,
bias: Optional[torch.Tensor] = None,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if use_weight_only:
v = weight.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[f'{prefix}.weight'] = processed_torch_weights
results[f'{prefix}.per_channel_scale'] = torch_weight_scales
else:
results[f'{prefix}.weight'] = weight.contiguous()
if bias is not None:
results[f'{prefix}.bias'] = bias
return results
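# Minimal check of the non-quantized path above (made-up shapes; the
# weight-only branch additionally requires the TRT-LLM custom torch ops):
demo_out = get_tllm_linear_weight(torch.randn(8, 4),
                                  'transformer.layers.0.mlp.fc',
                                  bias=torch.randn(8),
                                  use_weight_only=False)
assert set(demo_out) == {'transformer.layers.0.mlp.fc.weight',
                         'transformer.layers.0.mlp.fc.bias'}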
def get_tllm_param(
param: torch.Tensor,
name: str,
use_weight_only: bool = False,
plugin_weight_only_quant_type: torch.dtype = torch.int8
) -> Dict[str, torch.Tensor]:
results = {}
if name.endswith('.weight') and use_weight_only:
v = param.t().contiguous()
processed_torch_weights, torch_weight_scales = \
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
v, plugin_weight_only_quant_type)
results[name] = processed_torch_weights
results[name.replace('weight',
'per_channel_scale')] = torch_weight_scales
else:
results[name] = param
return results
def load_weights_from_hf_model(hf_model, config: GPTJConfig):
quant_algo = config.quantization.quant_algo
use_weight_only = quant_algo in [QuantAlgo.W8A16, QuantAlgo.W4A16]
if quant_algo == QuantAlgo.W8A16:
plugin_weight_only_quant_type = torch.int8
elif quant_algo == QuantAlgo.W4A16:
plugin_weight_only_quant_type = torch.quint4x2
else:
plugin_weight_only_quant_type = None
weights = {}
tik = time.time()
model_params = dict(hf_model.named_parameters())
dtype = getattr(torch, config.dtype)
num_hidden_layers = config.num_hidden_layers
mapping = config.mapping
layers_range = mapping.pp_layers(num_hidden_layers)
for l in layers_range:
prefix = f'transformer.h.{l}'
tllm_prex = f'transformer.layers.{l-layers_range[0]}'
# Attention QKV (no bias)
q_weight = get_weight(model_params, f'{prefix}.attn.q_proj', dtype)
k_weight = get_weight(model_params, f'{prefix}.attn.k_proj', dtype)
v_weight = get_weight(model_params, f'{prefix}.attn.v_proj', dtype)
q_w = split_matrix(q_weight, mapping.tp_size, mapping.tp_rank, dim=0)
k_w = split_matrix(k_weight, mapping.tp_size, mapping.tp_rank, dim=0)
v_w = split_matrix(v_weight, mapping.tp_size, mapping.tp_rank, dim=0)
qkv_w = torch.concatenate([q_w, k_w, v_w], dim=0)
weights.update(
get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', None,
use_weight_only,
plugin_weight_only_quant_type))
# Attention dense (no bias)
attn_dense_weight = get_weight(model_params, f'{prefix}.attn.out_proj',
dtype)
attn_dense_w = split_matrix(attn_dense_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
weights.update(
get_tllm_linear_weight(attn_dense_w, f'{tllm_prex}.attention.dense',
None, use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_in (with bias)
mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_in', dtype)
mlp_fc_w = split_matrix(mlp_fc_weight,
mapping.tp_size,
mapping.tp_rank,
dim=0)
mlp_fc_b = split_matrix(mlp_fc_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights.update(
get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc', mlp_fc_b,
use_weight_only,
plugin_weight_only_quant_type))
# MLP fc_out (with bias)
mlp_proj_weight, mlp_proj_bias = get_weight_and_bias(
model_params, f'{prefix}.mlp.fc_out', dtype)
mlp_proj_w = split_matrix(mlp_proj_weight,
mapping.tp_size,
mapping.tp_rank,
dim=1)
# Only rank 0 keeps the original bias; other TP ranks get zeros
if mapping.tp_size > 1 and mapping.tp_rank > 0:
mlp_proj_bias = torch.zeros(mlp_proj_weight.shape[0],
dtype=mlp_proj_weight.dtype)
weights.update(
get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj',
mlp_proj_bias, use_weight_only,
plugin_weight_only_quant_type))
input_ln_weight, input_ln_bias = get_weight_and_bias(
model_params, f'{prefix}.ln_1', dtype)
weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight
weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias
if mapping.is_first_pp_rank():
# Embedding
embed_w = get_weight(model_params, 'transformer.wte', dtype)
if config.use_parallel_embedding:
embed_w = split_matrix(embed_w,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights['transformer.vocab_embedding.weight'] = embed_w
if mapping.is_last_pp_rank():
# lm_head weight and bias
lm_head_w, ln_head_bias = get_weight_and_bias(model_params, 'lm_head',
dtype)
weights['lm_head.weight'] = split_matrix(lm_head_w,
mapping.tp_size,
mapping.tp_rank,
dim=0)
weights['lm_head.bias'] = split_matrix(ln_head_bias,
mapping.tp_size,
mapping.tp_rank,
dim=0)
ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f',
dtype)
# ln_f weight and bias
weights['transformer.ln_f.weight'] = ln_f_w
if ln_f_b is not None:
weights['transformer.ln_f.bias'] = ln_f_b
tok = time.time()
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
print(f'Weights loaded. Total time: {t}')
return weights


@ -13,18 +13,23 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from typing import Optional, Union
from ..._utils import pad_vocab_size
from ...functional import PositionEmbeddingType, Tensor, allreduce
from ...layers import (MLP, Attention, AttentionMaskType, ColumnLinear,
Embedding, LayerNorm)
from ...mapping import Mapping
from ...module import Module
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
PretrainedConfig)
check_share_embedding)
from .config import GPTJConfig
from .convert import load_weights_from_hf_model
class GPTJDecoderLayer(Module):
def __init__(self, config: PretrainedConfig, layer_idx: int):
def __init__(self, config: GPTJConfig, layer_idx: int):
super().__init__()
self.layer_idx = layer_idx
self.config = config
@ -104,7 +109,7 @@ class GPTJDecoderLayer(Module):
class GPTJModel(Module):
def __init__(self, config: PretrainedConfig):
def __init__(self, config: GPTJConfig):
super().__init__()
self.config = config
@ -144,9 +149,9 @@ class GPTJModel(Module):
class GPTJForCausalLM(DecoderModelForCausalLM):
config_class = GPTJConfig
def __init__(self, config: PretrainedConfig):
self.check_config(config)
def __init__(self, config: GPTJConfig):
transformer = GPTJModel(config)
vocab_size_padded = pad_vocab_size(config.vocab_size,
config.mapping.tp_size)
@ -162,5 +167,36 @@ class GPTJForCausalLM(DecoderModelForCausalLM):
lm_head = None
super().__init__(config, transformer, lm_head)
def check_config(self, config):
config.set_if_not_exist('rotary_dim', 64)
@classmethod
def from_hugging_face(
cls,
hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'],
dtype: str = 'auto',
mapping: Optional[Mapping] = None,
quant_config=None,
**kwargs):
import transformers
use_preloading = isinstance(hf_model_or_dir,
transformers.PreTrainedModel)
if use_preloading:
hf_model = hf_model_or_dir
hf_config_or_dir = hf_model.config
else:
hf_model_dir = hf_model_or_dir
hf_config_or_dir = hf_model_or_dir
config = GPTJConfig.from_hugging_face(hf_config_or_dir,
dtype=dtype,
mapping=mapping,
quant_config=quant_config,
**kwargs)
if not use_preloading:
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
hf_model_dir, torch_dtype='auto', trust_remote_code=True)
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = GPTJForCausalLM(config)
model.load(weights)
return model
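# Hypothetical conversion sketch using the classmethod above; 'gpt-j-6b' is a
# placeholder for a local GPT-J checkpoint directory, the output path is made
# up, and save_checkpoint is assumed to be the standard PretrainedModel API.
from tensorrt_llm.mapping import Mapping
from tensorrt_llm.models import GPTJForCausalLM
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

gptj = GPTJForCausalLM.from_hugging_face(
    'gpt-j-6b',
    dtype='float16',
    mapping=Mapping(world_size=1),
    quant_config=QuantConfig(quant_algo=QuantAlgo.W4A16))
gptj.save_checkpoint('/tmp/trtllm_gptj_int4_ckpt', save_config=True)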


@ -21,7 +21,6 @@ import torch
from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
@ -146,14 +145,9 @@ class LLaMAConfig(PretrainedConfig):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
return cls(
architecture='LlamaForCausalLM',
architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,
@ -208,11 +202,6 @@ class LLaMAConfig(PretrainedConfig):
if dtype == 'auto':
dtype = 'bfloat16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
if meta_config.get('use_scaled_rope'):
rotary_scaling = {"type": "llama3"}


@ -14,7 +14,6 @@
# limitations under the License.
import copy
import functools
import json
import os
import sys
import time
@ -473,8 +472,8 @@ def fp8_per_channel_quant_weight_gpu(weight, clamp_val, rank=0):
xmax = x.abs().max(-1, keepdim=True).values
# minimum scaling factor.
torch_weight_scales = (xmax / 448.0).clamp(min=1.0 / (448.0 * 512.0))
out = x / torch_weight_scales
torch_weight_scales = torch_weight_scales.reshape(-1)
out = x * 448.0 / xmax
out = torch.clamp(out, -448, 448)
processed_torch_weights = out.to(torch.float8_e4m3fn)
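# Numeric sketch of the per-channel FP8 mapping above (made-up values; torch is
# already imported in this module). Each row is scaled so its absolute maximum
# lands on 448, the E4M3 limit, and the per-channel scale recovers the original
# magnitude on dequantization.
demo_x = torch.tensor([[0.5, -2.0, 4.0]])
demo_xmax = demo_x.abs().max(-1, keepdim=True).values             # 4.0
demo_scale = (demo_xmax / 448.0).clamp(min=1.0 / (448.0 * 512.0))
demo_q = torch.clamp(demo_x * 448.0 / demo_xmax, -448, 448)
demo_deq = demo_q.to(torch.float8_e4m3fn).to(torch.float32) * demo_scale  # ~= demo_x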
@ -1315,13 +1314,12 @@ def quantize(hf_model_dir: str,
'''
# TODO: currently only SmoothQuant and KV-cache quantization are supported; more quant algorithms need to be supported by calling Modelopt
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
json.dump(config.to_dict(), f, indent=4)
config.to_json_file(os.path.join(output_dir, 'config.json'))
mapping = config.mapping
assert mapping.rank == -1, "quantize should be called only once rather than once per rank; rank == -1 is asserted as a precaution"
quant_config = config.quantization
quant_config = config.quantization
use_smooth_quant = quant_config.use_plugin_sq
int8_kv_cache = quant_config.kv_cache_quant_algo == QuantAlgo.INT8


@ -14,6 +14,8 @@
# limitations under the License.
from typing import Optional, Union
import transformers
from ..._common import default_net
from ..._utils import pad_vocab_size
from ...functional import (AllReduceFusionOp, AllReduceFusionParams, Tensor,
@ -323,7 +325,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
weights = load_weights_from_hf_model(hf_model, config)
check_share_embedding(weights, config)
model = LLaMAForCausalLM(config)
model = cls(config)
model.load(weights)
return model
@ -349,7 +351,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
weights = load_weights_from_meta_ckpt(meta_ckpt_dir, config)
check_share_embedding(weights, config)
model = LLaMAForCausalLM(config)
model = cls(config)
model.load(weights)
return model


@ -615,7 +615,7 @@ class PretrainedModel(Module,
model_inputs['lora_ranks'],
model_inputs['lora_weights_pointers'],
host_context_lengths=model_inputs['host_context_lengths'],
max_context_length=max_input_len,
max_num_tokens=max_num_tokens,
host_request_types=model_inputs['host_request_types'])
if model_inputs['spec_decoding_params'] is not None:
result['spec_decoding_params'] = model_inputs[
@ -757,6 +757,10 @@ def fuse_gate_mlp(
from ..quantization.quantize import fp8_quantize
quant_algo = model.config.quantization.quant_algo
if quant_algo != QuantAlgo.FP8 and quant_algo is not None:
logger.warning("fuse_gate_mlp cannot be done for this model. Skipping.")
return model
for name, mlp, layer in model.named_modules_with_parent():
if isinstance(mlp, GatedMLP):
init_params = get_init_params(mlp)


@ -18,7 +18,6 @@ import torch
from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig
@ -123,14 +122,9 @@ class QWenConfig(PretrainedConfig):
dtype = torch_dtype_to_str(dtype)
if dtype == 'float32':
dtype = 'float16'
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
0).major < 8:
logger.warning(
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
dtype = 'float16'
return cls(
architecture='QWenForCausalLM',
architecture=hf_config.architectures[0],
dtype=dtype,
num_hidden_layers=hf_config.num_hidden_layers,
num_attention_heads=hf_config.num_attention_heads,


@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.12.0.dev2024072301"
__version__ = "0.12.0.dev2024072302"


@ -21,6 +21,10 @@ from utils.util import force_ampere, similar
from tensorrt_llm.models.llama.model import LLaMAForCausalLM
skip_single_gpu = pytest.mark.skipif(
torch.cuda.device_count() < 2,
reason="The test needs at least 2 GPUs, skipping")
# The unittests are based on tiny-llama, which is fast to build and run.
# There are other tests based on the llama-7B model, such as the end-to-end tests in test_e2e.py and the
# parallel tests in test_llm_multi_gpu.py.


@ -0,0 +1,114 @@
from typing import List, Optional
import pytest
import torch
from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.hlapi import QuantAlgo, QuantConfig
try:
from .test_llm import get_model_path
except ImportError:
from test_llm import get_model_path
import os
import sys
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import force_ampere, similar, skip_pre_hopper
gptj_model_path = get_model_path('gpt-j-6b')
gpt2_model_path = get_model_path('gpt2-medium')
starcoder2_model_path = get_model_path('starcoder2-3b')
sampling_params = SamplingParams(max_new_tokens=10)
def llm_test_harness(model_dir: str,
prompts: List[str],
references: List[str],
*,
sampling_params: Optional[SamplingParams] = None,
similar_threshold: float = 0.8,
**llm_kwargs):
# skip if there are not enough GPUs
tp_size = llm_kwargs.get('tensor_parallel_size', 1)
pp_size = llm_kwargs.get('pipeline_parallel_size', 1)
world_size = tp_size * pp_size
if world_size > torch.cuda.device_count():
pytest.skip(
f"world_size ({world_size}) is greater than available GPUs ({torch.cuda.device_count()})"
)
llm = LLM(model_dir, tokenizer=model_dir, **llm_kwargs)
outputs = llm.generate(prompts, sampling_params=sampling_params)
for out, ref in zip(outputs, references):
assert similar(out.outputs[0].text, ref, threshold=similar_threshold)
@force_ampere
def test_llm_gptj():
llm_test_harness(gptj_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params)
@force_ampere
def test_llm_gptj_int4_weight_only():
quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16)
llm_test_harness(gptj_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params,
quant_config=quant_config)
@force_ampere
def test_llm_gptj_tp2():
llm_test_harness(gptj_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params,
tensor_parallel_size=2)
@force_ampere
def test_llm_gpt2():
llm_test_harness(gpt2_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params)
@skip_pre_hopper
def test_llm_gpt2_fp8():
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
llm_test_harness(gpt2_model_path,
prompts=["A B C"],
references=["D E F G H I J K L M"],
sampling_params=sampling_params,
quant_config=quant_config)
@force_ampere
def test_llm_starcoder2():
llm_test_harness(starcoder2_model_path,
prompts=["def print_hello_world():"],
references=['\n print("Hello World")\n\ndef print'],
sampling_params=sampling_params)
@skip_pre_hopper
def test_llm_starcoder2_fp8():
quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
llm_test_harness(starcoder2_model_path,
prompts=["def print_hello_world():"],
references=['\n print("Hello World")\n\ndef print'],
sampling_params=sampling_params,
quant_config=quant_config)
if __name__ == '__main__':
test_llm_gpt2()


@ -22,15 +22,11 @@ from tensorrt_llm.models.llama.model import LLaMAForCausalLM
try:
from .test_llm import (_test_llm_generate_async, default_model_name,
get_model_path, llama_model_path, mixtral_model_name,
prompts)
prompts, skip_single_gpu)
except ImportError:
from test_llm import (_test_llm_generate_async, default_model_name,
get_model_path, llama_model_path, mixtral_model_name,
prompts)
skip_single_gpu = pytest.mark.skipif(
torch.cuda.device_count() < 2,
reason="The test needs at least 2 GPUs, skipping")
prompts, skip_single_gpu)
@pytest.fixture(scope="module")


@ -33,6 +33,7 @@ from tensorrt_llm import Builder
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.functional import RotaryScalingType
from tensorrt_llm.layers import PositionEmbeddingType
from tensorrt_llm.models.gpt.convert import load_weights_from_hf_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.runtime import ModelConfig, SamplingConfig
@ -40,9 +41,6 @@ from tensorrt_llm.runtime.generation import _prepare_attention_mask
from tensorrt_llm.runtime.kv_cache_manager import (GenerationSequence,
KVCacheManager)
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from examples.gpt.convert_checkpoint import convert_hf_gpt
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func
@ -86,12 +84,9 @@ class TestGPT(unittest.TestCase):
'bias': getattr(gpt_config, 'bias', True),
'apply_query_key_layer_scaling': apply_query_key_layer_scaling,
}
config = tensorrt_llm.models.PretrainedConfig.from_dict(config)
weights = convert_hf_gpt(hf_gpt,
gpt_config,
"gpt2",
config.mapping,
dtype=dtype)
config = tensorrt_llm.models.GPTConfig.from_dict(config)
weights = load_weights_from_hf_model(hf_gpt, config)
tensorrt_llm_gpt = tensorrt_llm.models.GPTForCausalLM(config)
tensorrt_llm_gpt.load(weights)


@ -29,13 +29,10 @@ from transformers import GPTJConfig, GPTJForCausalLM
import tensorrt_llm
from tensorrt_llm import Builder
from tensorrt_llm.models.gptj.convert import load_weights_from_hf_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from examples.gptj.convert_checkpoint import convert_hf_gptj
sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func
@ -82,10 +79,7 @@ class TestGPTJ(unittest.TestCase):
}
config = tensorrt_llm.models.PretrainedConfig.from_dict(config)
config.set_rank(rank)
weights = convert_hf_gptj(hf_gpt,
gpt_config,
config.mapping,
dtype=dtype)
weights = load_weights_from_hf_model(hf_gpt, config)
trtllm_model = tensorrt_llm.models.GPTJForCausalLM(config)
trtllm_model.load(weights)


@ -86,8 +86,9 @@ class TestLLaMA(unittest.TestCase):
# Initialize model
config = tensorrt_llm.models.LLaMAConfig.from_dict(config)
tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM(config)
weights = load_weights_from_hf_model(hf_llama, config)
tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM(config)
tensorrt_llm_llama.load(weights)
optimize_model(tensorrt_llm_llama, **opt_flags)


@ -12,16 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
from pathlib import Path
import numpy as np
import torch
sys.path.append(str(Path(__file__).parent.resolve() /
"../examples/gpt")) # more precise, avoid confusion
from convert_checkpoint import generate_int8
from tensorrt_llm.models.gpt.convert import generate_int8
def dist(x, y):