Mirror of https://github.com/NVIDIA/TensorRT-LLM.git (synced 2026-01-13 22:18:36 +08:00)

open source 315e9f5ccd286e906d4c0d402fefbf2f69a1febe (#2033)

This commit is contained in:
parent 5fa9436e17
commit 93293aa46d
@@ -10,8 +10,6 @@ multiple GPUs or multiple nodes with multiple GPUs using the Python runtime.

The benchmark implementation and entrypoint can be found in [`benchmarks/python/benchmark.py`](./benchmark.py). There are some other scripts in the directory:

* [`benchmarks/python/allowed_configs.py`](./allowed_configs.py) to define the configuration for each supported model.
* [`benchmarks/python/build.py`](./build.py) to build supported models for benchmarking.
* [`benchmarks/python/base_benchmark.py`](./base_benchmark.py) to implement the base class for benchmarks.
* [`benchmarks/python/gpt_benchmark.py`](./gpt_benchmark.py) to implement benchmark scripts for GPT and GPT-like (LLaMA/OPT/GPT-J/SmoothQuant-GPT) models.
* [`benchmarks/python/bert_benchmark.py`](./bert_benchmark.py) to implement benchmark scripts for BERT models.

@@ -25,37 +23,29 @@ python benchmark.py -h
```
### 1. Single GPU benchmark

Take GPT-350M as an example:
Take LLaMA 7B as an example:
```
python benchmark.py \
    -m gpt_350m \
    --mode plugin \
    -m dec \
    --engine_dir llama_7b \
    --batch_size "1;8;64" \
    --input_output_len "60,20;128,20"
```
Expected outputs:
```
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 1 input_length 60 output_length 20 gpu_peak_mem(gb) 4.2 build_time(s) 25.67 tokens_per_sec 483.54 percentile95(ms) 41.537 percentile99(ms) 42.102 latency(ms) 41.362 compute_cap sm80
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 8 input_length 60 output_length 20 gpu_peak_mem(gb) 4.28 build_time(s) 25.67 tokens_per_sec 3477.28 percentile95(ms) 46.129 percentile99(ms) 46.276 latency(ms) 46.013 compute_cap sm80
[BENCHMARK] model_name gpt_350m world_size 1 num_heads 16 num_kv_heads 16 num_layers 24 hidden_size 1024 vocab_size 51200 precision float16 batch_size 64 input_length 60 output_length 20 gpu_peak_mem(gb) 4.8 build_time(s) 25.67 tokens_per_sec 19698.07 percentile95(ms) 65.739 percentile99(ms) 65.906 latency(ms) 64.981 compute_cap sm80
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 1 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 170.77 percentile95(ms) 117.591 percentile99(ms) 124.262 latency(ms) 117.115 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 110.189 total_generated_tokens 19.0 generation_tokens_per_second 172.43
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 8 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 1478.55 percentile95(ms) 108.641 percentile99(ms) 109.546 latency(ms) 108.214 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 98.194 total_generated_tokens 152.0 generation_tokens_per_second 1547.951
[BENCHMARK] model_name dec world_size 2 num_heads 32 num_kv_heads 32 num_layers 32 hidden_size 4096 vocab_size 32000 precision float16 batch_size 64 gpu_weights_percent 1.0 input_length 60 output_length 20 gpu_peak_mem(gb) 0.0 build_time(s) None tokens_per_sec 8214.87 percentile95(ms) 156.748 percentile99(ms) 160.203 latency(ms) 155.815 compute_cap sm90 quantization QuantMode.FP8_QDQ|FP8_KV_CACHE generation_time(ms) 111.078 total_generated_tokens 1216.0 generation_tokens_per_second 10947.303
...
```
*Please note that the expected outputs are for reference only; the actual performance numbers depend on the GPU you're using.*
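As a quick sanity check on how to read these rows, `tokens_per_sec` is consistent with `batch_size * output_length / latency`. This is an observation from the columns above, not a documented formula:

```
batch_size, output_length, latency_ms = 8, 20, 46.013
tokens_per_sec = batch_size * output_length / (latency_ms / 1e3)
print(round(tokens_per_sec, 2))  # 3477.28, matching the batch_size 8 row above
```
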
### 2. Multi-GPU benchmark

Take GPT-175B as an example:
Take LLaMA 7B as an example:
```
mpirun -n 8 python benchmark.py \
    -m gpt_175b \
    --mode plugin \
mpirun -n 2 python benchmark.py \
    -m dec \
    --engine_dir llama_7b \
    --batch_size "1;8;64" \
    --input_output_len "60,20;128,20"
```

Note: Building multi-GPU engines in parallel can be a heavy workload for the host CPUs. Tuning the `mpirun --map-by <XXX>` option for your system may yield a significant reduction in build time, for example:
```
mpirun --map-by socket -n 8 python build.py \
    --model gpt_175b \
    --mode ootb \
    --quantization fp8
```
File diff suppressed because it is too large
@@ -32,13 +32,13 @@ def get_compute_cap():
    return str(int(float(csv_value) * 10))


def get_csv_filename(model, dtype, tp_size, mode, **kwargs):
def get_csv_filename(model, dtype, tp_size, **kwargs):
    sm = get_compute_cap()
    if len(kwargs) == 0:
        kw_pairs = ""
    else:
        kw_pairs = "_" + "_".join([str(k) + str(v) for k, v in kwargs.items()])
    return f'{model}_{dtype}_tp{tp_size}_{mode}{kw_pairs}_sm{sm}.csv'
    return f'{model}_{dtype}_tp{tp_size}_{kw_pairs}_sm{sm}.csv'


def get_engine_name(model, dtype, tp_size, rank):
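For reference, a minimal stand-alone sketch of the new, mode-free filename scheme (the `sm` value is hard-coded here purely for illustration; the real helper queries `nvidia-smi` through `get_compute_cap()`). Note that dropping the `mode` component leaves a double underscore before the keyword pairs:

```
def csv_filename_sketch(model, dtype, tp_size, sm="90", **kwargs):
    # Mirrors the new get_csv_filename() body above; not the actual helper.
    kw_pairs = "" if not kwargs else "_" + "_".join(
        str(k) + str(v) for k, v in kwargs.items())
    return f"{model}_{dtype}_tp{tp_size}_{kw_pairs}_sm{sm}.csv"

print(csv_filename_sketch("dec", "float16", 2, fp8linear=1))
# -> dec_float16_tp2__fp8linear1_sm90.csv
```
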
@ -59,13 +59,7 @@ def serialize_engine(engine, path):
|
||||
|
||||
class BaseBenchmark(object):
|
||||
|
||||
def __init__(self,
|
||||
engine_dir,
|
||||
model_name,
|
||||
dtype,
|
||||
rank,
|
||||
world_size,
|
||||
serial_build: bool = False):
|
||||
def __init__(self, engine_dir, model_name, dtype, rank, world_size):
|
||||
self.engine_dir = engine_dir
|
||||
self.model_name = model_name
|
||||
self.dtype = dtype
|
||||
@ -74,73 +68,67 @@ class BaseBenchmark(object):
|
||||
self.engine_model_name = model_name
|
||||
self.quant_mode = QuantMode(0)
|
||||
self.enable_fp8 = False
|
||||
if engine_dir is not None:
|
||||
# Read config from engine directory
|
||||
config_path = os.path.join(engine_dir, 'config.json')
|
||||
with open(config_path, 'r') as f:
|
||||
self.config = json.load(f)
|
||||
# Sanity checks
|
||||
if 'pretrained_config' in self.config: # new build api branch
|
||||
config_dtype = self.config['pretrained_config']['dtype']
|
||||
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
|
||||
world_size = self.config['pretrained_config']['mapping'][
|
||||
'world_size']
|
||||
assert world_size == self.world_size, \
|
||||
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
|
||||
# Load config into self
|
||||
for key, value in self.config['pretrained_config'].items():
|
||||
setattr(self, key, value)
|
||||
|
||||
self.quant_mode = QuantMode.from_quant_algo(
|
||||
quant_algo=self.quantization['quant_algo'],
|
||||
kv_cache_quant_algo=self.quantization['kv_cache_quant_algo'])
|
||||
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
|
||||
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
|
||||
|
||||
for key, value in self.config['build_config'].items():
|
||||
setattr(self, key, value)
|
||||
|
||||
for key, value in self.plugin_config.items():
|
||||
if "plugin" in key:
|
||||
key = "use_" + key
|
||||
setattr(self, key, value)
|
||||
|
||||
self.engine_name = f"rank{self.runtime_rank}.engine"
|
||||
|
||||
self.num_kv_heads = self.num_key_value_heads
|
||||
self.num_layers = self.num_hidden_layers
|
||||
self.num_heads = self.num_attention_heads
|
||||
else:
|
||||
# Read config from engine directory
|
||||
config_path = os.path.join(engine_dir, 'config.json')
|
||||
with open(config_path, 'r') as f:
|
||||
self.config = json.load(f)
|
||||
# Sanity checks
|
||||
if 'pretrained_config' in self.config: # new build api branch
|
||||
config_dtype = self.config['pretrained_config']['dtype']
|
||||
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
|
||||
world_size = self.config['pretrained_config']['mapping'][
|
||||
'world_size']
|
||||
assert world_size == self.world_size, \
|
||||
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
|
||||
# Load config into self
|
||||
for key, value in self.config['pretrained_config'].items():
|
||||
config_dtype = self.config['builder_config']['precision']
|
||||
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
|
||||
world_size = self.config['builder_config']['tensor_parallel']
|
||||
assert world_size == self.world_size, \
|
||||
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
|
||||
# Load config into self
|
||||
for key, value in self.config['builder_config'].items():
|
||||
if key == "quant_mode":
|
||||
self.quant_mode = QuantMode(value)
|
||||
elif key in "name":
|
||||
self.engine_model_name = value
|
||||
else:
|
||||
setattr(self, key, value)
|
||||
|
||||
self.quant_mode = QuantMode.from_quant_algo(
|
||||
quant_algo=self.quantization['quant_algo'],
|
||||
kv_cache_quant_algo=self.quantization['kv_cache_quant_algo']
|
||||
)
|
||||
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
|
||||
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
|
||||
|
||||
for key, value in self.config['build_config'].items():
|
||||
setattr(self, key, value)
|
||||
|
||||
for key, value in self.plugin_config.items():
|
||||
if "plugin" in key:
|
||||
key = "use_" + key
|
||||
setattr(self, key, value)
|
||||
|
||||
self.engine_name = f"rank{self.runtime_rank}.engine"
|
||||
|
||||
self.num_kv_heads = self.num_key_value_heads
|
||||
self.num_layers = self.num_hidden_layers
|
||||
self.num_heads = self.num_attention_heads
|
||||
else:
|
||||
# Read config from engine directory
|
||||
config_path = os.path.join(engine_dir, 'config.json')
|
||||
with open(config_path, 'r') as f:
|
||||
self.config = json.load(f)
|
||||
# Sanity checks
|
||||
config_dtype = self.config['builder_config']['precision']
|
||||
assert dtype == config_dtype, f"Engine dtype ({config_dtype}) != Runtime dtype ({dtype})"
|
||||
world_size = self.config['builder_config']['tensor_parallel']
|
||||
assert world_size == self.world_size, \
|
||||
(f'Engine world size ({world_size}) != Runtime world size ({self.world_size})')
|
||||
# Load config into self
|
||||
for key, value in self.config['builder_config'].items():
|
||||
if key == "quant_mode":
|
||||
self.quant_mode = QuantMode(value)
|
||||
elif key in "name":
|
||||
self.engine_model_name = value
|
||||
else:
|
||||
setattr(self, key, value)
|
||||
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
|
||||
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
|
||||
for key, value in self.config['plugin_config'].items():
|
||||
# Same effect as self.use_foo_plugin = config.json["foo_plugin"]
|
||||
if "plugin" in key:
|
||||
key = "use_" + key
|
||||
setattr(self, key, value)
|
||||
self.engine_name = get_engine_name(self.engine_model_name,
|
||||
self.dtype, self.world_size,
|
||||
self.runtime_rank)
|
||||
else:
|
||||
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
|
||||
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
|
||||
for key, value in self.config['plugin_config'].items():
|
||||
# Same effect as self.use_foo_plugin = config.json["foo_plugin"]
|
||||
if "plugin" in key:
|
||||
key = "use_" + key
|
||||
setattr(self, key, value)
|
||||
self.engine_name = get_engine_name(self.engine_model_name,
|
||||
self.dtype, self.world_size,
|
||||
self.runtime_rank)
|
||||
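To make the two branches above easier to follow, here is a rough sketch of the `config.json` fields the new-build-API path reads. The field names come from the accesses above; the values are placeholders rather than the output of a real build:

```
example_config = {
    "pretrained_config": {
        "dtype": "float16",
        "mapping": {"world_size": 2},
        "quantization": {"quant_algo": "FP8", "kv_cache_quant_algo": "FP8"},
        "num_attention_heads": 32,
        "num_key_value_heads": 32,
        "num_hidden_layers": 32,
    },
    "build_config": {
        "max_batch_size": 64,
        # keys containing "plugin" are re-exposed as use_<key> attributes
        "plugin_config": {"gpt_attention_plugin": "float16"},
    },
}
```

Engines built before the new build API instead expose top-level `builder_config` and `plugin_config` sections, which the legacy branch below handles.
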
@ -148,9 +136,9 @@ class BaseBenchmark(object):
|
||||
self.runtime_mapping = tensorrt_llm.Mapping(world_size=self.world_size,
|
||||
rank=self.runtime_rank,
|
||||
tp_size=self.world_size)
|
||||
if not serial_build:
|
||||
torch.cuda.set_device(self.runtime_rank %
|
||||
self.runtime_mapping.gpus_per_node)
|
||||
|
||||
torch.cuda.set_device(self.runtime_rank %
|
||||
self.runtime_mapping.gpus_per_node)
|
||||
|
||||
self.csv_filename = "" # lazy init
|
||||
|
||||
@ -189,7 +177,6 @@ class BaseBenchmark(object):
|
||||
self.csv_filename = get_csv_filename(self.model_name,
|
||||
self.dtype,
|
||||
self.world_size,
|
||||
self.mode,
|
||||
fp8linear=int(self.enable_fp8))
|
||||
return self.csv_filename
|
||||
|
||||
|
||||
@ -20,26 +20,15 @@ import torch
|
||||
|
||||
|
||||
def parse_arguments():
|
||||
from allowed_configs import get_allowed_models
|
||||
parser = argparse.ArgumentParser(
|
||||
description='Benchmark TensorRT-LLM models.')
|
||||
parser.add_argument('-m',
|
||||
'--model',
|
||||
type=str,
|
||||
default="gpt_350m",
|
||||
choices=get_allowed_models(),
|
||||
help='Specify model you want to benchmark.')
|
||||
parser.add_argument(
|
||||
'--mode',
|
||||
type=str,
|
||||
default="plugin",
|
||||
choices=['ootb', 'plugin', 'ootb-except-mha'],
|
||||
help=
|
||||
('Choose mode between ootb/plugin. '
|
||||
'\"ootb\" means the engines will be built without any plugins, '
|
||||
'\"plugin\" means the engines will be built with tuned recipe of using plugins.'
|
||||
'\"ootb-except-mha\" means the engines will be built with only attention plugins.'
|
||||
))
|
||||
default="dec",
|
||||
choices=["dec", "enc", "enc-dec"],
|
||||
help='Specify type of the model you want to benchmark. '
|
||||
'Choose model between dec/enc/enc-dec.')
|
||||
|
||||
parser.add_argument('--batch_size',
|
||||
type=str,
|
||||
@ -69,13 +58,6 @@ def parse_arguments():
|
||||
default='float16',
|
||||
choices=['float16', 'bfloat16', 'float32'],
|
||||
help='Choose data type between float16/bfloat16/float32.')
|
||||
parser.add_argument(
|
||||
'--refit',
|
||||
default=False,
|
||||
action="store_true",
|
||||
help=
|
||||
'If this option is specified, a refit flag is added to TensorRT engines.'
|
||||
)
|
||||
|
||||
parser.add_argument('--num_beams',
|
||||
type=int,
|
||||
@ -100,14 +82,6 @@ def parse_arguments():
|
||||
type=str,
|
||||
default='model.cache',
|
||||
help='The path to write timing cache')
|
||||
parser.add_argument(
|
||||
'--profiling_verbosity',
|
||||
type=str,
|
||||
default='layer_names_only',
|
||||
choices=['layer_names_only', 'detailed', 'none'],
|
||||
help=
|
||||
'The profiling verbosity for the generated TRT engine. Set to detailed can inspect tactic choices and kernel parameters.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--log_level',
|
||||
type=str,
|
||||
@ -131,75 +105,14 @@ def parse_arguments():
|
||||
default=60,
|
||||
help='Minimal duration of iterations to measure in seconds.')
|
||||
|
||||
parser.add_argument(
|
||||
'--output_dir',
|
||||
type=str,
|
||||
default=None,
|
||||
help=
|
||||
'If this option is specified, TensorRT engines will be saved to the specified path.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--engine_dir',
|
||||
type=str,
|
||||
default=None,
|
||||
required=True,
|
||||
help=
|
||||
('If this option is specified, instead of building engines on-air before benchmarking, '
|
||||
'the engines contained in the engine_dir will be used.'))
|
||||
parser.add_argument(
|
||||
'--max_beam_width',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
('If this option is specified, it will override the max beam width of '
|
||||
'TRT engines to the specified value instead of using pre-defined one'))
|
||||
parser.add_argument(
|
||||
'--max_input_len',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
('If this option is specified, it will override the max input len of '
|
||||
'TRT engines to the specified value instead of using pre-defined one'))
|
||||
parser.add_argument(
|
||||
'--max_encoder_input_len',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
('This argument is only for encoder-decoder models. '
'If this option is specified, it will override the max encoder input len of TRT engines to the specified value instead of using the pre-defined one. '
'By default, when this option is not used, the pre-defined max encoder input len is used.'
))
|
||||
parser.add_argument(
|
||||
'--max_decoder_input_len',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
('This argument is only for encoder-decoder models. '
'If this option is specified, it will override the max decoder input len of TRT engines to the specified value instead of using the pre-defined one. '
'By default, when this option is not used, the pre-defined max decoder input len is used.'
))
|
||||
parser.add_argument(
|
||||
'--max_seq_len',
|
||||
'--max_decoder_seq_len',
|
||||
dest='max_seq_len',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
('If this option is specified, it will override the max sequence len of '
|
||||
'TRT engines to the specified value instead of using pre-defined one'))
|
||||
parser.add_argument(
|
||||
'--max_batch_size',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
('If this option is specified, it will override the max batch size of '
|
||||
'TRT engines to the specified value instead of using pre-defined one'))
|
||||
parser.add_argument(
|
||||
'--force_num_layer_1',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help=
|
||||
'Quick sanity check with num_layer=1; will be silently ignored if --engine_dir is specified.'
|
||||
)
|
||||
parser.add_argument(
|
||||
'--gpu_weights_percent',
|
||||
type=str,
|
||||
@ -207,13 +120,6 @@ def parse_arguments():
|
||||
help='Specify the percentage of weights that reside on GPU (from 0 to 1).'
|
||||
'Multiple percentages can be separated by \";\", '
|
||||
'example: \"0;0.5;1\".')
|
||||
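A small sketch of how such a `;`-separated list can be consumed (the split itself is an assumption here; the `weight_streaming` derivation mirrors the line in `main()` further down):

```
gpu_weights_percent = "0;0.5;1"
gpu_weights_percents = [float(p) for p in gpu_weights_percent.split(";")]
# Any value below 1.0 implies weight streaming, as main() derives later.
weight_streaming = any(p != 1 for p in gpu_weights_percents)
print(gpu_weights_percents, weight_streaming)  # [0.0, 0.5, 1.0] True
```
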
parser.add_argument(
|
||||
'--multiple_profiles',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help=
|
||||
'This option will benefit performance, but will increase the engine build time.'
|
||||
)
|
||||
|
||||
parser.add_argument('--csv',
|
||||
default=False,
|
||||
@ -234,40 +140,7 @@ def parse_arguments():
|
||||
'int8_sq_per_channel_ootb'
|
||||
],
|
||||
help="Optimize the model with specified quantization recipe")
|
||||
parser.add_argument(
|
||||
'--build_only',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help=
|
||||
"Build engine only and skip inference, this can help to benchmark the build time on single gpu node for multi GPU model, where the inference is not possible"
|
||||
)
|
||||
|
||||
parser.add_argument('--serial_build',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help="Build engines serially")
|
||||
|
||||
parser.add_argument(
|
||||
'--rank',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
"The rank of the model to be built, only used when --build_only and --serial_build is specified"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--world_size',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
"The number of gpus to be used for inference, only used when --build_only and --serial_build is specified"
|
||||
)
|
||||
parser.add_argument(
|
||||
'--debug_memory',
|
||||
default=False,
|
||||
action='store_true',
|
||||
help=
|
||||
"Check the estimated memory usage against the total GPU memory. Raise error if the estimated memory requirement is bigger than the total GPU memory"
|
||||
"Warning: only GPT model family is supported for now")
|
||||
parser.add_argument(
|
||||
'--dump_profile',
|
||||
default=False,
|
||||
@ -281,25 +154,6 @@ def parse_arguments():
|
||||
help=
|
||||
"Print layer information of the engine to console (default = disabled)")
|
||||
|
||||
parser.add_argument(
|
||||
'--opt_batch_size',
|
||||
type=int,
|
||||
default=None,
|
||||
help=
|
||||
"If opt_batch_size option is specified, it will override the opt batch size."
|
||||
"This flag only takes effect when `--mode=ootb` is added. For other modes, please use --opt_num_tokens to replace it."
|
||||
)
|
||||
|
||||
parser.add_argument(
|
||||
'--opt_num_tokens',
|
||||
type=int,
|
||||
default=None,
|
||||
help="It equals to max_batch_size*max_beam_width by default, set this "
|
||||
"value as close as possible to the actual number of tokens on your workload. "
|
||||
"Note that this argument might be removed in the future."
|
||||
"This flag only takes effect when `--mode` is not `ootb`. For ootb mode, please use --opt_batch_size to replace it."
|
||||
)
|
||||
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
@ -308,7 +162,6 @@ def main(args):
|
||||
# tensorrt_llm is imported, but mpi4py does not work well with
|
||||
# the start method `spawn` of Python multiprocessing,
|
||||
# so we set the start method first, then initialize MPI.
|
||||
from allowed_configs import get_allowed_models
|
||||
from benchmark_profiler import BenchmarkProfiler
|
||||
from bert_benchmark import BERTBenchmark
|
||||
from enc_dec_benchmark import EncDecBenchmark
|
||||
@ -341,17 +194,8 @@ def main(args):
|
||||
)
|
||||
args.weight_streaming = any([p != 1 for p in gpu_weights_percents])
|
||||
|
||||
if args.serial_build and not args.build_only:
|
||||
raise Exception(
|
||||
f"--serial_build must be used with --build_only, always need to parallel build to do inference in the same process"
|
||||
)
|
||||
|
||||
if args.build_only and args.serial_build and args.rank is not None and args.world_size is not None:
|
||||
rank = args.rank
|
||||
world_size = args.world_size
|
||||
else:
|
||||
rank = tensorrt_llm.mpi_rank()
|
||||
world_size = tensorrt_llm.mpi_world_size()
|
||||
rank = tensorrt_llm.mpi_rank()
|
||||
world_size = tensorrt_llm.mpi_world_size()
|
||||
|
||||
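With the `--serial_build`/`--rank`/`--world_size` escape hatch removed, every invocation resolves its identity through MPI. A minimal sketch of what each process sees under the README's `mpirun -n 2` example:

```
import tensorrt_llm

# Launched as: mpirun -n 2 python benchmark.py ...
rank = tensorrt_llm.mpi_rank()              # 0 on the first process, 1 on the second
world_size = tensorrt_llm.mpi_world_size()  # 2
```
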
# TODO: Re-enable memory monitor for multi-gpu benchmarks.
|
||||
# Current Mem Monitor will cause benchmark script hang
|
||||
@ -361,30 +205,25 @@ def main(args):
|
||||
from mem_monitor import MemoryMonitor
|
||||
|
||||
benchmark_profiler = None
|
||||
if args.model in get_allowed_models(benchmark_type="gpt"):
|
||||
if args.model == "dec":
|
||||
benchmark_profiler = BenchmarkProfiler()
|
||||
benchmarker = GPTBenchmark(args, batch_size_options, in_out_len_options,
|
||||
gpu_weights_percents, rank, world_size)
|
||||
elif args.model in get_allowed_models(benchmark_type="bert"):
|
||||
elif args.model == "enc":
|
||||
benchmarker = BERTBenchmark(args, batch_size_options, input_len_options,
|
||||
gpu_weights_percents, rank, world_size)
|
||||
elif args.model in get_allowed_models(benchmark_type="enc_dec"):
|
||||
elif args.model == "enc-dec":
|
||||
benchmarker = EncDecBenchmark(args, batch_size_options,
|
||||
in_out_len_options, gpu_weights_percents,
|
||||
rank, world_size)
|
||||
else:
|
||||
raise Exception(f'Unexpected model: {args.model}')
|
||||
|
||||
if args.build_only:
|
||||
return
|
||||
|
||||
start = torch.cuda.Event(enable_timing=True)
|
||||
end = torch.cuda.Event(enable_timing=True)
|
||||
benchmarker.print_report_header(args.csv,
|
||||
benchmark_profiler=benchmark_profiler)
|
||||
for config in benchmarker.get_config():
|
||||
if isinstance(benchmarker, GPTBenchmark):
|
||||
benchmarker.check_memory(config, raise_exception=args.debug_memory)
|
||||
try:
|
||||
if args.weight_streaming:
|
||||
# We pass in config instead of the gpu_weights_percent here to keep this benchmark script
|
||||
|
||||
@ -18,9 +18,7 @@ import os
|
||||
import torch
|
||||
import tensorrt as trt
|
||||
#isort: on
|
||||
from allowed_configs import get_build_config
|
||||
from base_benchmark import BaseBenchmark
|
||||
from build import build_bert
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm._utils import trt_dtype_to_torch
|
||||
@ -32,36 +30,17 @@ class BERTBenchmark(BaseBenchmark):
|
||||
def __init__(self, args, batch_sizes, in_lens, gpu_weights_percents, rank,
|
||||
world_size):
|
||||
super().__init__(args.engine_dir, args.model, args.dtype, rank,
|
||||
world_size, args.serial_build)
|
||||
world_size)
|
||||
self.batch_sizes = batch_sizes
|
||||
self.in_lens = in_lens
|
||||
self.build_time = 0
|
||||
self.mode = args.mode
|
||||
self.gpu_weights_percents = gpu_weights_percents
|
||||
|
||||
if args.engine_dir is not None:
|
||||
# Deserialize engine from engine directory
|
||||
self.serialize_path = os.path.join(args.engine_dir,
|
||||
self.engine_name)
|
||||
with open(self.serialize_path, 'rb') as f:
|
||||
engine_buffer = f.read()
|
||||
else:
|
||||
# Build engine
|
||||
for key, value in get_build_config(args.model).items():
|
||||
setattr(self, key, value)
|
||||
if args.force_num_layer_1:
|
||||
self.num_layers = 1
|
||||
if args.max_batch_size is not None:
|
||||
self.max_batch_size = args.max_batch_size
|
||||
if args.max_input_len is not None:
|
||||
self.max_input_len = args.max_input_len
|
||||
|
||||
engine_buffer, build_time = build_bert(args)
|
||||
self.build_time = build_time
|
||||
|
||||
assert engine_buffer is not None
|
||||
if args.build_only:
|
||||
return
|
||||
# Deserialize engine from engine directory
|
||||
self.serialize_path = os.path.join(args.engine_dir, self.engine_name)
|
||||
with open(self.serialize_path, 'rb') as f:
|
||||
engine_buffer = f.read()
|
||||
assert engine_buffer is not None
|
||||
|
||||
self.session = tensorrt_llm.runtime.Session.from_serialized_engine(
|
||||
engine_buffer)
|
||||
|
||||
File diff suppressed because it is too large
@ -18,14 +18,13 @@ import os
|
||||
# isort: off
|
||||
import torch
|
||||
#isort: on
|
||||
from allowed_configs import get_build_config
|
||||
from base_benchmark import BaseBenchmark, get_engine_name
|
||||
from build import build_enc_dec
|
||||
from base_benchmark import BaseBenchmark
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm._utils import (trt_dtype_to_torch, str_dtype_to_trt)
|
||||
from tensorrt_llm.quantization import QuantMode
|
||||
from tensorrt_llm.runtime.session import TensorInfo
|
||||
from tensorrt_llm.runtime import ModelConfig
|
||||
|
||||
|
||||
class EncDecBenchmark(BaseBenchmark):
|
||||
@ -34,10 +33,8 @@ class EncDecBenchmark(BaseBenchmark):
|
||||
rank, world_size):
|
||||
self.engine_dir = args.engine_dir
|
||||
self.model_name = args.model
|
||||
self.mode = args.mode
|
||||
self.enable_fp8 = False # hardcode for enc-dec models
|
||||
self.dtype = args.dtype
|
||||
self.output_dir = args.output_dir
|
||||
self.runtime_rank = rank
|
||||
self.world_size = world_size
|
||||
self.csv_filename = "" # lazy init
|
||||
@ -63,87 +60,93 @@ class EncDecBenchmark(BaseBenchmark):
|
||||
"config.json")
|
||||
with open(config_path, "r") as f:
|
||||
config = json.load(f)
|
||||
# Sanity checks
|
||||
config_dtype = config["builder_config"]["precision"]
|
||||
assert (
|
||||
self.dtype == config_dtype
|
||||
), f"Engine dtype ({config_dtype}) != Runtime dtype ({self.dtype})"
|
||||
world_size = config["builder_config"]["tensor_parallel"]
|
||||
assert (
|
||||
world_size == self.world_size
|
||||
), f"Engine world size ({world_size}) != Runtime world size ({self.world_size})"
|
||||
tp_size = config["builder_config"]["tensor_parallel"]
|
||||
# TP only for benchmarking
|
||||
assert (
|
||||
tp_size == self.world_size
|
||||
), f"Engine tensor parallel size ({tp_size}) should be equal to world size ({self.world_size})"
|
||||
assert (
|
||||
config["plugin_config"]["remove_input_padding"] == False
|
||||
), "remove_input_padding should be False for enc-dec benchmarks"
|
||||
num_heads = config["builder_config"]["num_heads"]
|
||||
|
||||
builder_config = config['build_config']
|
||||
plugin_config = builder_config['plugin_config']
|
||||
pretrained_config = config['pretrained_config']
|
||||
lora_config = builder_config['lora_config']
|
||||
builder_config['auto_parallel_config']
|
||||
use_gpt_attention_plugin = plugin_config["gpt_attention_plugin"]
|
||||
remove_input_padding = plugin_config["remove_input_padding"]
|
||||
use_lora_plugin = plugin_config["lora_plugin"]
|
||||
tp_size = pretrained_config['mapping']['tp_size']
|
||||
pp_size = pretrained_config['mapping']['pp_size']
|
||||
world_size = tp_size * pp_size
|
||||
assert world_size == tensorrt_llm.mpi_world_size(), \
|
||||
f'Engine world size ({world_size}) != Runtime world size ({tensorrt_llm.mpi_world_size()})'
|
||||
num_heads = pretrained_config["num_attention_heads"]
|
||||
hidden_size = pretrained_config["hidden_size"]
|
||||
head_size = pretrained_config["head_size"]
|
||||
vocab_size = pretrained_config["vocab_size"]
|
||||
max_batch_size = builder_config["max_batch_size"]
|
||||
max_beam_width = builder_config["max_beam_width"]
|
||||
num_layers = pretrained_config["num_hidden_layers"]
|
||||
num_kv_heads = pretrained_config.get('num_kv_heads', num_heads)
|
||||
|
||||
assert (num_heads % tp_size) == 0
|
||||
# Get model config
|
||||
num_heads = num_heads // tp_size
|
||||
hidden_size = config["builder_config"]["hidden_size"] // tp_size
|
||||
num_kv_heads = config["builder_config"].get(
|
||||
"num_kv_heads", config["builder_config"]["num_heads"])
|
||||
hidden_size = hidden_size // tp_size
|
||||
num_kv_heads = (num_kv_heads + tp_size - 1) // tp_size
|
||||
|
||||
model_config = tensorrt_llm.runtime.ModelConfig(
|
||||
cross_attention = pretrained_config[
|
||||
"architecture"] == "DecoderModel"
|
||||
skip_cross_qkv = pretrained_config.get('skip_cross_qkv', False)
|
||||
has_position_embedding = pretrained_config[
|
||||
"has_position_embedding"]
|
||||
has_token_type_embedding = hasattr(pretrained_config,
|
||||
"type_vocab_size")
|
||||
dtype = pretrained_config["dtype"]
|
||||
|
||||
paged_kv_cache = plugin_config['paged_kv_cache']
|
||||
tokens_per_block = plugin_config['tokens_per_block']
|
||||
|
||||
gather_context_logits = builder_config.get(
|
||||
'gather_context_logits', False)
|
||||
gather_generation_logits = builder_config.get(
|
||||
'gather_generation_logits', False)
|
||||
max_prompt_embedding_table_size = builder_config.get(
|
||||
'max_prompt_embedding_table_size', 0)
|
||||
|
||||
self.max_batch_size = config["build_config"]["max_batch_size"]
|
||||
self.max_input_len = config["build_config"][
|
||||
"max_encoder_input_len"]
|
||||
self.max_seq_len = config["build_config"]["max_seq_len"]
|
||||
|
||||
model_config = ModelConfig(
|
||||
num_heads=num_heads,
|
||||
num_kv_heads=num_kv_heads,
|
||||
hidden_size=hidden_size,
|
||||
head_size=config["builder_config"]["head_size"],
|
||||
max_batch_size=config["builder_config"]["max_batch_size"],
|
||||
max_beam_width=config["builder_config"]["max_beam_width"],
|
||||
vocab_size=config["builder_config"]["vocab_size"],
|
||||
num_layers=config["builder_config"]["num_layers"],
|
||||
gpt_attention_plugin=config["plugin_config"]
|
||||
["gpt_attention_plugin"],
|
||||
remove_input_padding=config["plugin_config"]
|
||||
["remove_input_padding"],
|
||||
cross_attention=config["builder_config"]["cross_attention"],
|
||||
skip_cross_qkv=config["builder_config"]["skip_cross_qkv"],
|
||||
has_position_embedding=config["builder_config"]
|
||||
["has_position_embedding"],
|
||||
has_token_type_embedding=config["builder_config"]
|
||||
["has_token_type_embedding"],
|
||||
dtype=config_dtype,
|
||||
head_size=head_size,
|
||||
max_batch_size=max_batch_size,
|
||||
max_beam_width=max_beam_width,
|
||||
vocab_size=vocab_size,
|
||||
num_layers=num_layers,
|
||||
gpt_attention_plugin=use_gpt_attention_plugin,
|
||||
remove_input_padding=remove_input_padding,
|
||||
paged_kv_cache=paged_kv_cache,
|
||||
tokens_per_block=tokens_per_block,
|
||||
cross_attention=cross_attention,
|
||||
has_position_embedding=has_position_embedding,
|
||||
has_token_type_embedding=has_token_type_embedding,
|
||||
dtype=dtype,
|
||||
gather_context_logits=gather_context_logits,
|
||||
gather_generation_logits=gather_generation_logits,
|
||||
max_prompt_embedding_table_size=
|
||||
max_prompt_embedding_table_size,
|
||||
lora_plugin=use_lora_plugin,
|
||||
lora_target_modules=lora_config.get('lora_target_modules'),
|
||||
trtllm_modules_to_hf_modules=lora_config.get(
|
||||
'trtllm_modules_to_hf_modules'),
|
||||
skip_cross_qkv=skip_cross_qkv,
|
||||
)
|
||||
self.max_batch_size = config["builder_config"]["max_batch_size"]
|
||||
self.max_input_len = config["builder_config"][
|
||||
"max_encoder_input_len"]
|
||||
self.max_seq_len = config["builder_config"]["max_seq_len"]
|
||||
self.n_mels = config["builder_config"][
|
||||
'n_mels'] if 'whisper' in self.model_name else 0
|
||||
|
||||
for key, value in config["builder_config"].items():
|
||||
if key == "name":
|
||||
engine_model_name = value
|
||||
break
|
||||
return engine_model_name, model_config
|
||||
return model_config
|
||||
|
||||
(
|
||||
self.encoder_engine_model_name,
|
||||
self.encoder_model_config,
|
||||
) = read_config("encoder")
|
||||
(
|
||||
self.decoder_engine_model_name,
|
||||
self.decoder_model_config,
|
||||
) = read_config("decoder")
|
||||
self.encoder_model_config = read_config("encoder")
|
||||
self.decoder_model_config = read_config("decoder")
|
||||
|
||||
self.encoder_engine_name = get_engine_name(
|
||||
self.encoder_engine_model_name,
|
||||
self.dtype,
|
||||
self.world_size,
|
||||
self.runtime_rank,
|
||||
)
|
||||
self.decoder_engine_name = get_engine_name(
|
||||
self.decoder_engine_model_name,
|
||||
self.dtype,
|
||||
self.world_size,
|
||||
self.runtime_rank,
|
||||
)
|
||||
self.encoder_engine_name = 'rank{}.engine'.format(self.runtime_rank)
|
||||
self.decoder_engine_name = 'rank{}.engine'.format(self.runtime_rank)
|
||||
self.encoder_runtime_mapping = tensorrt_llm.Mapping(
|
||||
world_size=self.world_size,
|
||||
rank=self.runtime_rank,
|
||||
@ -155,47 +158,21 @@ class EncDecBenchmark(BaseBenchmark):
|
||||
tp_size=self.world_size,
|
||||
)
|
||||
|
||||
if not args.serial_build:
|
||||
torch.cuda.set_device(self.runtime_rank %
|
||||
self.encoder_runtime_mapping.gpus_per_node)
|
||||
torch.cuda.set_device(self.runtime_rank %
|
||||
self.encoder_runtime_mapping.gpus_per_node)
|
||||
self.device = torch.cuda.current_device()
|
||||
|
||||
if self.engine_dir is not None:
|
||||
# Deserialize engine from engine directory
|
||||
self.encoder_serialize_path = os.path.join(self.engine_dir,
|
||||
"encoder",
|
||||
self.encoder_engine_name)
|
||||
with open(self.encoder_serialize_path, "rb") as f:
|
||||
encoder_engine_buffer = f.read()
|
||||
self.decoder_serialize_path = os.path.join(self.engine_dir,
|
||||
"decoder",
|
||||
self.decoder_engine_name)
|
||||
with open(self.decoder_serialize_path, "rb") as f:
|
||||
decoder_engine_buffer = f.read()
|
||||
else:
|
||||
build_config = get_build_config(self.model_name)
|
||||
self.max_batch_size = build_config['max_batch_size'] \
|
||||
if args.max_batch_size is None else args.max_batch_size
|
||||
self.max_input_len = build_config['max_encoder_input_len'] \
|
||||
if args.max_input_len is None else args.max_input_len
|
||||
self.max_seq_len = build_config['max_seq_len'] \
|
||||
if args.max_seq_len is None else args.max_seq_len
|
||||
self.n_mels = build_config[
|
||||
'n_mels'] if 'whisper' in self.model_name else 0
|
||||
# Build engine
|
||||
(
|
||||
encoder_engine_buffer,
|
||||
decoder_engine_buffer,
|
||||
self.encoder_model_config,
|
||||
self.decoder_model_config,
|
||||
encoder_build_time,
|
||||
decoder_build_time,
|
||||
) = build_enc_dec(args)
|
||||
|
||||
self.build_time = encoder_build_time + decoder_build_time
|
||||
|
||||
assert encoder_engine_buffer is not None
|
||||
assert decoder_engine_buffer is not None
|
||||
# Deserialize engine from engine directory
|
||||
self.encoder_serialize_path = os.path.join(self.engine_dir, "encoder",
|
||||
self.encoder_engine_name)
|
||||
with open(self.encoder_serialize_path, "rb") as f:
|
||||
encoder_engine_buffer = f.read()
|
||||
assert encoder_engine_buffer is not None
|
||||
self.decoder_serialize_path = os.path.join(self.engine_dir, "decoder",
|
||||
self.decoder_engine_name)
|
||||
with open(self.decoder_serialize_path, "rb") as f:
|
||||
decoder_engine_buffer = f.read()
|
||||
assert decoder_engine_buffer is not None
|
||||
|
||||
# session setup
|
||||
self.encoder_session = tensorrt_llm.runtime.Session.from_serialized_engine(
|
||||
@ -216,11 +193,10 @@ class EncDecBenchmark(BaseBenchmark):
|
||||
f"[WARNING] whisper benchmark is input_len=1500, no text prompt, output_len=arbitrary"
|
||||
)
|
||||
for inlen, outlen in self.in_out_lens:
|
||||
if (inlen > self.max_input_len
|
||||
or inlen + outlen > self.max_seq_len):
|
||||
if (inlen > self.max_input_len or outlen > self.max_seq_len):
|
||||
print(
|
||||
f"[WARNING] check inlen({inlen}) <= max_inlen({self.max_input_len}) and "
|
||||
f"inlen({inlen}) + outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping."
|
||||
f"outlen({outlen}) <= max_seqlen({self.max_seq_len}) failed, skipping."
|
||||
)
|
||||
continue
|
||||
for batch_size in self.batch_sizes:
|
||||
|
||||
@ -13,8 +13,6 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import json
|
||||
import os
|
||||
from dataclasses import asdict
|
||||
from math import ceil
|
||||
|
||||
import pandas as pd
|
||||
@ -22,11 +20,11 @@ import tensorrt as trt
|
||||
import torch
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm.profiler import bytes_to_target_unit
|
||||
from tensorrt_llm.builder import Engine
|
||||
from tensorrt_llm.runtime import (ChatGLMGenerationSession, GenerationSession,
|
||||
SamplingConfig)
|
||||
|
||||
from allowed_configs import get_build_config, BuildConfig # isort:skip
|
||||
from base_benchmark import BaseBenchmark # isort:skip
|
||||
from build import build_gpt, get_quant_config # isort:skip
|
||||
|
||||
|
||||
def element_size(dtype: str):
|
||||
@ -46,80 +44,26 @@ class GPTBenchmark(BaseBenchmark):
|
||||
def __init__(self, args, batch_sizes, in_out_lens, gpu_weights_percents,
|
||||
rank, world_size):
|
||||
super().__init__(args.engine_dir, args.model, args.dtype, rank,
|
||||
world_size, args.serial_build)
|
||||
world_size)
|
||||
self.batch_sizes = batch_sizes
|
||||
self.in_out_lens = in_out_lens
|
||||
self.gpu_weights_percents = gpu_weights_percents
|
||||
self.num_beams = args.num_beams
|
||||
self.mode = args.mode
|
||||
self.build_time = 0
|
||||
|
||||
self.cuda_graph_mode = args.enable_cuda_graph
|
||||
self.build_config = None
|
||||
# this dtype may be modified based on quantization mode later, when the fp8/int8 kv cache is used
|
||||
self.kv_dtype = args.dtype
|
||||
|
||||
# approximate the weights size in the engine by using engine size
|
||||
# the actual weights size shall be smaller because there are some other data in the engine file.
|
||||
# for large model, this approximate is close enough.
|
||||
self.weights_size_approx = 0
|
||||
|
||||
self.dump_layer_info = args.dump_layer_info
|
||||
# change profiling_verbosity to detailed when enabling dump layer info
|
||||
if self.dump_layer_info:
|
||||
args.profiling_verbosity = "detailed"
|
||||
|
||||
if args.engine_dir is not None:
|
||||
# Get build configs from engine directory is done in base class
|
||||
# Deserialize engine from engine directory
|
||||
self.serialize_path = os.path.join(args.engine_dir,
|
||||
self.engine_name)
|
||||
with open(self.serialize_path, 'rb') as f:
|
||||
engine_buffer = f.read()
|
||||
self.weights_size_approx = len(engine_buffer)
|
||||
else:
|
||||
self.build_config = get_build_config(args.model, return_dict=False)
|
||||
|
||||
for key, value in asdict(self.build_config).items():
|
||||
setattr(self, key, value)
|
||||
if args.force_num_layer_1:
|
||||
self.num_layers = 1
|
||||
if args.max_batch_size is not None:
|
||||
self.max_batch_size = args.max_batch_size
|
||||
if args.max_input_len is not None:
|
||||
self.max_input_len = args.max_input_len
|
||||
if args.max_seq_len is not None:
|
||||
self.max_seq_len = args.max_seq_len
|
||||
|
||||
self.quant_config = get_quant_config(args.quantization)
|
||||
self.quant_mode = self.quant_config.quant_mode
|
||||
self.enable_fp8 = self.quant_mode.has_fp8_qdq()
|
||||
self.fp8_kv_cache = self.quant_mode.has_fp8_kv_cache()
|
||||
if self.quant_mode.has_fp8_kv_cache():
|
||||
self.kv_dtype = 'fp8'
|
||||
if self.quant_mode.has_int8_kv_cache():
|
||||
self.kv_dtype = 'int8'
|
||||
|
||||
# Plugins
|
||||
self.use_gpt_attention_plugin = False
|
||||
self.remove_input_padding = False
|
||||
self.use_mamba_conv1d_plugin = False
|
||||
if args.mode == 'plugin':
|
||||
self.use_gpt_attention_plugin = True
|
||||
self.remove_input_padding = True
|
||||
self.use_moe_plugin = True
|
||||
self.use_mamba_conv1d_plugin = True
|
||||
elif args.mode == 'ootb-except-mha':
|
||||
self.use_gpt_attention_plugin = True
|
||||
self.remove_input_padding = True
|
||||
|
||||
engine_buffer, build_time = build_gpt(args)
|
||||
self.weights_size_approx = engine_buffer.nbytes
|
||||
self.build_time = build_time
|
||||
|
||||
# Get build configs from engine directory is done in base class
|
||||
# Deserialize engine from engine directory
|
||||
engine = Engine.from_dir(args.engine_dir, rank)
|
||||
engine_buffer = engine.engine
|
||||
assert engine_buffer is not None
|
||||
if args.build_only:
|
||||
return
|
||||
pretrained_config = engine.config.pretrained_config
|
||||
if pretrained_config.architecture == 'ChatGLMForCausalLM' and pretrained_config.chatglm_version in [
|
||||
'glm', 'chatglm'
|
||||
]:
|
||||
session_cls = ChatGLMGenerationSession
|
||||
else:
|
||||
session_cls = GenerationSession
|
||||
|
||||
if not hasattr(self, 'num_kv_heads') or self.num_kv_heads is None:
|
||||
self.num_kv_heads = self.num_heads
|
||||
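In short, the new load path goes through the builder's `Engine` object instead of reading `rank<N>.engine` by hand. A minimal sketch, with the engine directory and rank as placeholders:

```
from tensorrt_llm.builder import Engine

engine = Engine.from_dir("llama_7b", 0)              # placeholder engine dir and rank
engine_buffer = engine.engine                        # serialized TRT engine bytes
pretrained_config = engine.config.pretrained_config  # architecture, dtype, mapping, ...
```
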
@ -155,50 +99,11 @@ class GPTBenchmark(BaseBenchmark):
|
||||
gpu_weights_percent=list(sorted(gpu_weights_percents))[0],
|
||||
**rnn_configs_kwargs,
|
||||
)
|
||||
if args.model == 'chatglm_6b':
|
||||
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
|
||||
end_id=130005,
|
||||
pad_id=3,
|
||||
num_beams=self.num_beams,
|
||||
top_k=args.top_k,
|
||||
top_p=args.top_p)
|
||||
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
|
||||
model_config, engine_buffer, self.runtime_mapping)
|
||||
elif args.model in ['chatglm2_6b', 'chatglm3_6b']:
|
||||
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
|
||||
end_id=2,
|
||||
pad_id=0,
|
||||
num_beams=self.num_beams,
|
||||
top_k=args.top_k,
|
||||
top_p=args.top_p)
|
||||
self.decoder = tensorrt_llm.runtime.GenerationSession(
|
||||
model_config, engine_buffer, self.runtime_mapping)
|
||||
if args.model == 'glm_10b':
|
||||
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
|
||||
end_id=50258,
|
||||
pad_id=50256,
|
||||
num_beams=self.num_beams,
|
||||
top_k=args.top_k,
|
||||
top_p=args.top_p)
|
||||
self.decoder = tensorrt_llm.runtime.ChatGLMGenerationSession(
|
||||
model_config, engine_buffer, self.runtime_mapping)
|
||||
else:
|
||||
end_id = 50256
|
||||
pad_id = 50256
|
||||
if "llama" in args.model:
|
||||
end_id = 2
|
||||
pad_id = 0
|
||||
self.sampling_config = tensorrt_llm.runtime.SamplingConfig(
|
||||
end_id=end_id,
|
||||
pad_id=pad_id,
|
||||
num_beams=self.num_beams,
|
||||
top_k=args.top_k,
|
||||
top_p=args.top_p)
|
||||
self.decoder = tensorrt_llm.runtime.GenerationSession(
|
||||
model_config,
|
||||
engine_buffer,
|
||||
self.runtime_mapping,
|
||||
cuda_graph_mode=self.cuda_graph_mode)
|
||||
self.sampling_config = SamplingConfig(end_id=2, pad_id=0)
|
||||
self.decoder = session_cls(model_config,
|
||||
engine_buffer,
|
||||
self.runtime_mapping,
|
||||
cuda_graph_mode=self.cuda_graph_mode)
|
||||
|
||||
# Print context memory size for CI/CD to track.
|
||||
context_mem_size = self.decoder.context_mem_size
|
||||
@ -260,72 +165,6 @@ class GPTBenchmark(BaseBenchmark):
|
||||
benchmark_profiler=benchmark_profiler)
|
||||
torch.cuda.synchronize()
|
||||
|
||||
@staticmethod
|
||||
def kv_cache_elem_per_token(config: BuildConfig, tp_size, pp_size) -> int:
|
||||
# you need to multiply the size by element size, and multiply by the seq length
|
||||
# Warning: this function returns the upper bound between different ranks when any one of the following is true:
|
||||
# num_layer % pp_size !=0, hidden_size % num_kv_heads != 0, num_kv_heads % tp_size != 0
|
||||
local_nlayers = ceil(config.num_layers / pp_size)
|
||||
kv_heads = config.num_kv_heads if config.num_kv_heads is not None else config.num_heads
|
||||
size_per_head = ceil(config.hidden_size / kv_heads)
|
||||
local_heads = ceil(kv_heads / tp_size)
|
||||
return 2 * local_nlayers * size_per_head * local_heads
|
||||
|
||||
def check_memory(self, io_shapes: list, raise_exception=False):
|
||||
'''Compare the estimated GPU memory requirements for weights + activations + kv cache with the total GPU memory and log it.
|
||||
Raise exception when the \p raise_exception parameter is true.
|
||||
'''
|
||||
# we don't want to block the test due to this
|
||||
if self.build_config is None:
|
||||
tensorrt_llm.logger.warning(
|
||||
"Didn't have the build config object, skipping check the memory"
|
||||
)
|
||||
return
|
||||
assert isinstance(self.build_config, BuildConfig)
|
||||
batch_size, inlen, outlen = io_shapes[0], io_shapes[1], io_shapes[2]
|
||||
kv_cache_size_in_bytes = batch_size*self.num_beams*(inlen + outlen)* \
|
||||
self.kv_cache_elem_per_token(self.build_config, self.runtime_mapping.tp_size, self.runtime_mapping.pp_size) * element_size(self.kv_dtype)
|
||||
# when MHA is OOTB, it requires extra KV cache size, because OOTB don't support inplace updating KV cache.
|
||||
if not self.use_gpt_attention_plugin:
|
||||
local_n_layer = ceil(self.build_config.num_layers /
|
||||
self.runtime_mapping.pp_size)
|
||||
kv_cache_size_in_bytes = kv_cache_size_in_bytes / local_n_layer * (
|
||||
local_n_layer + 1)
|
||||
|
||||
kv_cache_size_in_mb = bytes_to_target_unit(kv_cache_size_in_bytes,
|
||||
"MiB")
|
||||
activation_size_in_mb = bytes_to_target_unit(
|
||||
self.decoder.runtime.engine.device_memory_size, "MiB")
|
||||
weights_size_in_mb = bytes_to_target_unit(self.weights_size_approx,
|
||||
"MiB")
|
||||
total_memory_approx_in_mb = kv_cache_size_in_mb + activation_size_in_mb + weights_size_in_mb
|
||||
_, _, total = tensorrt_llm.profiler.device_memory_info()
|
||||
total_in_mb = bytes_to_target_unit(total, 'MiB')
|
||||
prefix = "[Memory Estimation]"
|
||||
|
||||
mem_msg = f"{prefix} activation memory:{activation_size_in_mb:.3f} MiB, kv_cache:{kv_cache_size_in_mb:.3f} MiB, weights approximate:{weights_size_in_mb:.3f} MiB, " \
|
||||
f"approximate required GPU memory: {total_memory_approx_in_mb:.3f} MiB, total GPU memory: {total_in_mb:.3f} MiB"
|
||||
tensorrt_llm.logger.info(mem_msg)
|
||||
|
||||
build_args = dict(batch_size=batch_size,
|
||||
num_beams=self.num_beams,
|
||||
input_length=inlen,
|
||||
output_length=outlen,
|
||||
max_batch_size=self.build_config.max_batch_size,
|
||||
max_input_len=self.build_config.max_input_len,
|
||||
max_seq_len=self.build_config.max_seq_len,
|
||||
max_beam_width=self.build_config.max_beam_width)
|
||||
for k, v in build_args.items():
|
||||
tensorrt_llm.logger.info(f"{prefix} {k}:{v}")
|
||||
|
||||
tensorrt_llm.logger.info(
|
||||
"grep the \"Total Activation\" and \"Total Weights\" from verbose TRT engine build log to see the precise memory size for those."
|
||||
)
|
||||
if raise_exception and total_memory_approx_in_mb >= total_in_mb:
|
||||
raise Exception(
|
||||
"Total memory estimation bigger than total gpu memory, the case will likely to OOM, needs enhancement of waive the test case, see logs about the memory usage details"
|
||||
)
|
||||
|
||||
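To make the estimate concrete, here is a worked example with assumed LLaMA-7B-like shapes (32 layers, hidden size 4096, 32 KV heads, fp16 KV cache, no tensor or pipeline parallelism):

```
from math import ceil

def kv_cache_bytes(batch, beams, inlen, outlen, num_layers, hidden_size,
                   num_kv_heads, tp_size, pp_size, elem_size):
    # 2x for K and V, following the kv_cache_elem_per_token() upper bound above.
    elems_per_token = (2 * ceil(num_layers / pp_size) *
                       ceil(hidden_size / num_kv_heads) * ceil(num_kv_heads / tp_size))
    return batch * beams * (inlen + outlen) * elems_per_token * elem_size

# batch 8, beam 1, 60 input + 20 output tokens, fp16 (2 bytes per element)
print(kv_cache_bytes(8, 1, 60, 20, 32, 4096, 32, 1, 1, 2) / (1 << 20))  # 320.0 MiB
```

When attention runs out of the box (no GPT attention plugin), `check_memory()` additionally scales this estimate by `(local_n_layer + 1) / local_n_layer`, since OOTB MHA cannot update the KV cache in place.
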
def report(self,
|
||||
config,
|
||||
latency,
|
||||
@ -348,7 +187,6 @@ class GPTBenchmark(BaseBenchmark):
|
||||
report_dict["input_length"] = inlen
|
||||
report_dict["output_length"] = outlen
|
||||
report_dict["latency(ms)"] = latency
|
||||
report_dict["build_time(s)"] = self.build_time
|
||||
report_dict["tokens_per_sec"] = tokens_per_sec
|
||||
report_dict["percentile95(ms)"] = percentile95
|
||||
report_dict["percentile99(ms)"] = percentile99
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3e25541cdc2aaa48f6a6e4c386d22ca1832c8e120fc6e8c190db4ee066ebfb1f
|
||||
size 4293186
|
||||
oid sha256:7eec52cb658f033cf3146017cbaa3ea1554942ee7ece49329ddf7b01361fa080
|
||||
size 4293100
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:3108cd0580f6328bd46238ef708872d9d8030a9c8645b8b52bc750dfe094bc16
|
||||
size 4395794
|
||||
oid sha256:cf65778d6469a5a85bf2191fb104094aa4e606b370a25475a16017329e27fd95
|
||||
size 4395148
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
50a839e98b31729198870fc99ef2c5a9 libtensorrt_llm_batch_manager_static.a
|
||||
a39a5bf618c8514725b59aac4513223f libtensorrt_llm_batch_manager_static.pre_cxx11.a
|
||||
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
|
||||
08d59f31da00044ae21995c6573a55da libtensorrt_llm_batch_manager_static.a
|
||||
abdb9b58e0a4587d2d2ce6bc83655f8a libtensorrt_llm_batch_manager_static.pre_cxx11.a
|
||||
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:9600435f1b9ab74c752d1831e1a6684a004927c84ab7c61fc076dbc128ca1521
|
||||
size 4154674
|
||||
oid sha256:e339bca2212b46c6227b328fc376db4628a0a96636b5f2b5b3ae387e884b7f01
|
||||
size 4155892
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:8145ecf59dea64448ca0969553d32bc99e119cc5fc703e7b47eccfb5886594a0
|
||||
size 4133178
|
||||
oid sha256:7503446c4ef7b959970fc02b33ca81dd0dece0663d9a0f8b881c60ff66006000
|
||||
size 4136818
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f89f551a880f4c6c1e68ed72b951ac482dec6033e55a336a0ecc401f4e9cf150
|
||||
size 24009160
|
||||
oid sha256:51174b20ed939662c92d21cdd5a0fd652a6592947270182ff026eb3a4153e4cf
|
||||
size 24015602
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:33f259b374a02456f2b8d44571d92195b708c2011be4ecabe46267f49ca24c29
|
||||
size 1426724
|
||||
oid sha256:19fdeb78169c29492026b62bf147481e2b0d893916d9a20333d83fb61c0abe36
|
||||
size 1428026
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f44786aee0842bdb260de49b734d2119a0521c650f0b733f5ce6f997e72bfb34
|
||||
size 1452984
|
||||
oid sha256:1d7f36c49f24730e4038c2252b966870789d9c9cff698ccd50d0f61ae85fcc9d
|
||||
size 1455538
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
0d5e559ebc885794ab9e63086ae7a18a libtensorrt_llm_executor_static.a
|
||||
f9a3d1bf32f33f88569d4d8635e5445a libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
|
||||
5bdad7b823b79b1b91439693aa25cff5 libtensorrt_llm_executor_static.a
|
||||
566734842bb731319971850583fdc9c7 libtensorrt_llm_executor_static.pre_cxx11.a
|
||||
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:19bd908d16990cd11a295fcb71403e2ad285dc2c3b84d55228166d9240acd0d9
|
||||
size 1476318
|
||||
oid sha256:58e3e6d7414ab730ba54c8aabdc5f193787b44699e1289279428087cbb2e46d4
|
||||
size 1478178
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:bed0b93d23eef43ce46c01e694f9e578c64fe9b30e1b05d65b7feed1a41e5148
|
||||
size 1408208
|
||||
oid sha256:5f6598d6c2dafd9b97edfeb8fc424607374e8791c4e334cfaaf5cae865da15c6
|
||||
size 1410466
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:473c672353cb813af9ea65250bd79f61f5ea27c369c9f35bc3bace1e22c5e9bb
|
||||
size 14325956
|
||||
oid sha256:93e0c81a8d00db0e860cdfdafbae7391e0d2956c2301da1f22ef6419bcb4e02f
|
||||
size 14321264
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:286c47b52c5955ef4d2b5bd54cf555f6bafdb307a413949e1edafe4db991c887
|
||||
oid sha256:df3429c2cc6bffe3e3d12fc444426427676a85e281cab4456e5d0a03e4a6828f
|
||||
size 80318200
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
28ead889239ca8d558c1e1a93f0485b0 libtensorrt_llm_nvrtc_wrapper.so
|
||||
3511a2653f2ba73f6f827aca6d2850b3d3e8e543 commit
|
||||
957f7c6034dca28dff7afe65ed68aa4b libtensorrt_llm_nvrtc_wrapper.so
|
||||
315e9f5ccd286e906d4c0d402fefbf2f69a1febe commit
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:f396ee533b289e7326df9061be8abba46ae061a61011c60c19051cbe219461e3
|
||||
oid sha256:829e6d2ccaed3c0e8ff351a6c418c65a9260433eff6f08feb41b3bab33d84fb4
|
||||
size 83552896
|
||||
|
||||
@ -1,3 +1,3 @@
|
||||
version https://git-lfs.github.com/spec/v1
|
||||
oid sha256:20824706210bf184641c92fcb728ab0a3a74a36bc0b13e243c713a84c74a51ac
|
||||
size 1089536
|
||||
oid sha256:73ea01f6014e5c11a263f342f8c19f3a1b8bfa824441accd3cb4b7fa699a9d9a
|
||||
size 1087488
|
||||
|
||||
@ -98,6 +98,8 @@ void PenaltyLayer<T>::allocateBuffer()
|
||||
TLLM_LOG_TRACE("%s start", __PRETTY_FUNCTION__);
|
||||
|
||||
mLogitsPtrsHost = mBufferManager->pinnedPool(ITensor::makeShape({}), TRTDataType<T*>::value);
|
||||
mLogitsPtrsDevice
|
||||
= mBufferManager->gpu(ITensor::makeShape({mDecoderDomain.getBatchSize()}), TRTDataType<T*>::value);
|
||||
auto const batchSizeShape = ITensor::makeShape({mDecoderDomain.getBatchSize()});
|
||||
mTemperature = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<float>::value);
|
||||
mRepetitionPenalty = mBufferManager->pinnedPool(batchSizeShape, TRTDataType<float>::value);
|
||||
@ -233,6 +235,7 @@ void PenaltyLayer<T>::forwardAsync(
|
||||
mCyclicStep = mCyclicStep % mRuntimeMaxSeqLen;
|
||||
|
||||
TensorPtr logitsPtrsHost = ITensor::slice(mLogitsPtrsHost, mCyclicStep, 1);
|
||||
logitsPtrsHost->squeeze(0);
|
||||
auto logitsPtrsHostData = bufferCast<T*>(*logitsPtrsHost);
|
||||
for (SizeType32 bi = 0; bi < localDecoderDomain.getBatchSize(); bi++)
|
||||
{
|
||||
@ -274,7 +277,13 @@ void PenaltyLayer<T>::forwardAsync(
|
||||
auto const tokensPerStep = bufferCastOrNull<SizeType32>(params->curTokensPerStep);
|
||||
|
||||
InvokeBatchApplyPenaltyParams<T> penaltyParams;
|
||||
penaltyParams.inputLogits = reinterpret_cast<T const* const*>(logitsPtrsHostData);
|
||||
|
||||
{ // Moving the logits ptrs to device for faster access during kernel execution.
|
||||
TensorPtr logitsPtrsDeviceSlice = ITensor::slice(mLogitsPtrsDevice, 0, localDecoderDomain.getBatchSize());
|
||||
TensorPtr logitsPtrsHostSlice = ITensor::slice(logitsPtrsHost, 0, localDecoderDomain.getBatchSize());
|
||||
mBufferManager->copy(*logitsPtrsHostSlice, *logitsPtrsDeviceSlice);
|
||||
penaltyParams.inputLogits = reinterpret_cast<T const* const*>(bufferCast<T const*>(*logitsPtrsDeviceSlice));
|
||||
}
|
||||
penaltyParams.outputLogits = bufferCast<T>(*mRuntimeLogitsDevice);
|
||||
penaltyParams.biases = embeddingBias;
|
||||
penaltyParams.penaltyWorkspace = bufferCastOrNull<TokenIdType>(mPenaltyWorkspaceDevice);
|
||||
|
||||
@ -91,6 +91,7 @@ private:
|
||||
BufferPtr mPenaltyWorkspaceDevice;
|
||||
BufferPtr mPenaltyWorkspacePrevDevice;
|
||||
TensorPtr mLogitsPtrsHost;
|
||||
TensorPtr mLogitsPtrsDevice;
|
||||
};
|
||||
|
||||
} // namespace tensorrt_llm::layers
|
||||
|
||||
@ -75,7 +75,7 @@ void _runGemm(int const M, int const N, int const K, bool const transA, bool con
|
||||
|
||||
LoraPlugin::LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, int transA, int transB,
|
||||
int num_lora_modules, nvinfer1::DataType type, LoraPlugin::PluginProfilerPtr const& pluginProfiler,
|
||||
bool remove_input_padding, int max_context_length, int max_low_rank, int weight_index)
|
||||
bool remove_input_padding, int max_num_tokens, int max_low_rank, int weight_index)
|
||||
: mInHiddenSize(in_hidden_size)
|
||||
, mTransA(transA)
|
||||
, mTransB(transB)
|
||||
@ -83,7 +83,7 @@ LoraPlugin::LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, in
|
||||
, mType(type)
|
||||
, mPluginProfiler(pluginProfiler)
|
||||
, mRemoveInputPadding(remove_input_padding)
|
||||
, mMaxContextLength(max_context_length)
|
||||
, mMaxNumTokens(max_num_tokens)
|
||||
, mMaxLowRank(max_low_rank)
|
||||
, mWeightIndex(weight_index)
|
||||
{
|
||||
@ -105,7 +105,7 @@ LoraPlugin::LoraPlugin(void const* data, size_t length, LoraPlugin::PluginProfil
|
||||
read(d, mNumLoraModules);
|
||||
read(d, mType);
|
||||
read(d, mRemoveInputPadding);
|
||||
read(d, mMaxContextLength);
|
||||
read(d, mMaxNumTokens);
|
||||
read(d, mMaxLowRank);
|
||||
read(d, mWeightIndex);
|
||||
mOutHiddenSizes.resize(mNumLoraModules);
|
||||
@ -266,10 +266,9 @@ void LoraPlugin::configurePlugin(nvinfer1::DynamicPluginTensorDesc const* in, in
|
||||
TLLM_LOG_TRACE("%s stop", __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
int64_t getLowRankWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t typeSize)
int64_t getLowRankWorkSpaceSize(int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t typeSize)
{
return divUp(nbReq * maxContextLength * maxLoraModuleNum * maxLowRank * typeSize, 16) * 16;
return divUp(maxNumTokens * maxLoraModuleNum * maxLowRank * typeSize, 16) * 16;
}

int64_t getGroupedGemmParamsWorkSpaceSize(int64_t nbReq)
@ -278,16 +277,15 @@ int64_t getGroupedGemmParamsWorkSpaceSize(int64_t nbReq)
}

int64_t getSplitkGroupedGemmWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
{
return divUp(nbReq * maxContextLength * maxLoraModuleNum * maxLowRank * sizeof(float) * splitKSlices, 16) * 16;
return divUp(maxNumTokens * maxLoraModuleNum * maxLowRank * sizeof(float) * splitKSlices, 16) * 16;
}

int64_t getGemmWorkSpaceSize(
int64_t nbReq, int64_t maxContextLength, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
int64_t getGemmWorkSpaceSize(int64_t maxNumTokens, int64_t maxLoraModuleNum, int64_t maxLowRank, int64_t splitKSlices)
{
return std::max((int64_t) CUBLAS_WORKSPACE_SIZE,
getSplitkGroupedGemmWorkSpaceSize(nbReq, maxContextLength, maxLoraModuleNum, maxLowRank, splitKSlices));
getSplitkGroupedGemmWorkSpaceSize(maxNumTokens, maxLoraModuleNum, maxLowRank, splitKSlices));
}

size_t LoraPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, int nbInputs,
@ -298,8 +296,8 @@ size_t LoraPlugin::getWorkspaceSize(nvinfer1::PluginTensorDesc const* inputs, in
auto const type = inputs[getInputTensorIdx()].type;
auto const typeSize = tensorrt_llm::runtime::BufferDataType(type).getSize();

return (size_t) getGemmWorkSpaceSize(nbReq, mMaxContextLength, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(nbReq, mMaxContextLength, mNumLoraModules, mMaxLowRank, typeSize)
return (size_t) getGemmWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices)
+ getLowRankWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, typeSize)
+ getGroupedGemmParamsWorkSpaceSize(nbReq * mNumLoraModules);
}

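For readers skimming this hunk, the sketch below restates the new workspace arithmetic in plain Python. It is an illustration only: `CUBLAS_WORKSPACE_SIZE`, the grouped-GEMM parameter size, and the example numbers are assumptions, since their definitions sit outside this diff.

```python
# Hypothetical sketch of the LoRA plugin workspace sizing after the switch from
# (nbReq * maxContextLength) to maxNumTokens. Values marked "assumed" are not
# taken from the patch.

def div_up(x: int, n: int) -> int:
    return (x + n - 1) // n

def align16(nbytes: int) -> int:
    # both workspace helpers round up to a 16-byte boundary
    return div_up(nbytes, 16) * 16

def low_rank_workspace_size(max_num_tokens, num_lora_modules, max_low_rank, type_size):
    # mirrors getLowRankWorkSpaceSize: one low-rank activation buffer per module,
    # sized by total tokens rather than requests * context length
    return align16(max_num_tokens * num_lora_modules * max_low_rank * type_size)

def splitk_grouped_gemm_workspace_size(max_num_tokens, num_lora_modules, max_low_rank, splitk_slices):
    # mirrors getSplitkGroupedGemmWorkSpaceSize: float (4-byte) accumulators per split-K slice
    return align16(max_num_tokens * num_lora_modules * max_low_rank * 4 * splitk_slices)

def gemm_workspace_size(max_num_tokens, num_lora_modules, max_low_rank, splitk_slices,
                        cublas_workspace_size):
    # mirrors getGemmWorkSpaceSize: at least the cuBLAS workspace, or the split-K buffer
    return max(cublas_workspace_size,
               splitk_grouped_gemm_workspace_size(max_num_tokens, num_lora_modules,
                                                  max_low_rank, splitk_slices))

# Example: 8192 tokens, 7 LoRA modules, rank 64, fp16 activations, 2 split-K slices,
# and an assumed 32 MiB cuBLAS workspace.
gemm = gemm_workspace_size(8192, 7, 64, 2, cublas_workspace_size=32 << 20)
low_rank = low_rank_workspace_size(8192, 7, 64, type_size=2)
print(f"gemm workspace: {gemm} B, low-rank workspace: {low_rank} B")
```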
@ -361,13 +359,12 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
|
||||
= mRemoveInputPadding ? static_cast<int32_t const*>(inputs[getHostContextLengthsIdx()]) : nullptr;
|
||||
RequestType const* reqTypes = static_cast<RequestType const*>(inputs[getHostRequestTypesIdx()]);
|
||||
|
||||
int64_t GemmWorkSpaceSize
|
||||
= getGemmWorkSpaceSize(batch_size, mMaxContextLength, mNumLoraModules, mMaxLowRank, mSplitKSlices);
|
||||
int64_t GemmWorkSpaceSize = getGemmWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, mSplitKSlices);
|
||||
int64_t groupGemmParamsWorkSpaceSize = getGroupedGemmParamsWorkSpaceSize(batch_size * mNumLoraModules);
|
||||
void* gemmWorkSpace = workspace; // [gemmWorkSpace, lowrankWorkSpace, groupGemmParamsWorkSpace]
|
||||
void* lowRankWorkSpace = static_cast<char*>(gemmWorkSpace) + GemmWorkSpaceSize;
|
||||
void* groupGemmParamsWorkSpace = static_cast<char*>(lowRankWorkSpace)
|
||||
+ getLowRankWorkSpaceSize(batch_size, mMaxContextLength, mNumLoraModules, mMaxLowRank, typeSize);
|
||||
+ getLowRankWorkSpaceSize(mMaxNumTokens, mNumLoraModules, mMaxLowRank, typeSize);
|
||||
|
||||
bool isWithLora = isEnableLora(batch_size, mNumLoraModules, &inputs[getLoraRanksIdx()]);
|
||||
|
||||
@ -514,21 +511,15 @@ int LoraPlugin::enqueue(nvinfer1::PluginTensorDesc const* inputDesc, nvinfer1::P
|
||||
ptrB.push_back(
|
||||
reinterpret_cast<void*>(lora_weights_ptr[batchIdx * 2] + K * N * typeSize * mWeightIndex));
|
||||
ptrC.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
|
||||
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
|
||||
+ handled_token_num * mMaxLowRank)
|
||||
* typeSize));
|
||||
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
|
||||
ptrD.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
|
||||
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
|
||||
+ handled_token_num * mMaxLowRank)
|
||||
* typeSize));
|
||||
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
|
||||
|
||||
auto const N2 = outputDesc[loraModuleIdx].dims.d[nbDimsA - 1];
|
||||
cutlass::gemm::GemmCoord problem_2(M, N2, N);
|
||||
problem_sizes_2.push_back(problem_2);
|
||||
ptrA_2.push_back(static_cast<void*>(static_cast<char*>(lowRankWorkSpace)
|
||||
+ (loraModuleIdx * batch_size * mMaxContextLength * mMaxLowRank
|
||||
+ handled_token_num * mMaxLowRank)
|
||||
* typeSize));
|
||||
+ (loraModuleIdx * mMaxNumTokens * mMaxLowRank + handled_token_num * mMaxLowRank) * typeSize));
|
||||
ptrB_2.push_back(
|
||||
reinterpret_cast<void*>(lora_weights_ptr[batchIdx * 2 + 1] + N2 * N * typeSize * mWeightIndex));
|
||||
ptrC_2.push_back(static_cast<void*>(
|
||||
@ -603,7 +594,7 @@ size_t LoraPlugin::getSerializationSize() const noexcept
|
||||
{
|
||||
TLLM_LOG_DEBUG("%s", __PRETTY_FUNCTION__);
|
||||
return sizeof(mInHiddenSize) + sizeof(mTransA) + sizeof(mTransB) + sizeof(mNumLoraModules) + sizeof(mType)
|
||||
+ mPluginProfiler->getSerializationSize(mGemmId) + sizeof(mRemoveInputPadding) + sizeof(mMaxContextLength)
|
||||
+ mPluginProfiler->getSerializationSize(mGemmId) + sizeof(mRemoveInputPadding) + sizeof(mMaxNumTokens)
|
||||
+ sizeof(mMaxLowRank) + sizeof(mWeightIndex) + sizeof(int) * mNumLoraModules; // selected tactics container size
|
||||
}
|
||||
|
||||
@ -617,7 +608,7 @@ void LoraPlugin::serialize(void* buffer) const noexcept
|
||||
write(d, mNumLoraModules);
|
||||
write(d, mType);
|
||||
write(d, mRemoveInputPadding);
|
||||
write(d, mMaxContextLength);
|
||||
write(d, mMaxNumTokens);
|
||||
write(d, mMaxLowRank);
|
||||
write(d, mWeightIndex);
|
||||
for (int i = 0; i < mNumLoraModules; i++)
|
||||
@ -674,7 +665,7 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
|
||||
int num_lora_modules;
|
||||
int in_hidden_size, transA, transB;
|
||||
bool remove_input_padding;
|
||||
int max_context_length;
|
||||
int max_num_tokens;
|
||||
int max_low_rank;
|
||||
int weight_index;
|
||||
// Read configurations from each fields
|
||||
@ -706,10 +697,10 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
|
||||
TLLM_CHECK(fields[i].type == PluginFieldType::kINT8);
|
||||
remove_input_padding = static_cast<bool>(*(static_cast<int8_t const*>(fields[i].data)));
|
||||
}
|
||||
else if (!strcmp(attrName, "max_context_length"))
|
||||
else if (!strcmp(attrName, "max_num_tokens"))
|
||||
{
|
||||
TLLM_CHECK(fields[i].type == PluginFieldType::kINT32);
|
||||
max_context_length = *(static_cast<int const*>(fields[i].data));
|
||||
max_num_tokens = *(static_cast<int const*>(fields[i].data));
|
||||
}
|
||||
else if (!strcmp(attrName, "max_low_rank"))
|
||||
{
|
||||
@ -748,7 +739,7 @@ IPluginV2* LoraPluginCreator::createPlugin(char const* name, PluginFieldCollecti
|
||||
// FIXME enable tactic profiler
|
||||
auto pluginProfiler = gemmPluginProfileManager.createGemmPluginProfiler(/* inference */ false, /* skip */ true);
|
||||
auto* obj = new LoraPlugin(in_hidden_size, out_hidden_sizes, transA, transB, num_lora_modules, type,
|
||||
pluginProfiler, remove_input_padding, max_context_length, max_low_rank, weight_index);
|
||||
pluginProfiler, remove_input_padding, max_num_tokens, max_low_rank, weight_index);
|
||||
obj->setPluginNamespace(mNamespace.c_str());
|
||||
return obj;
|
||||
}
|
||||
|
||||
@ -39,7 +39,7 @@ public:
|
||||
LoraPlugin() = delete;
|
||||
|
||||
LoraPlugin(int in_hidden_size, std::vector<int> out_hidden_sizes, int transA, int transB, int num_lora_modules,
|
||||
nvinfer1::DataType type, PluginProfilerPtr const& profiler, bool remove_input_padding, int max_context_length,
|
||||
nvinfer1::DataType type, PluginProfilerPtr const& profiler, bool remove_input_padding, int max_num_tokens,
|
||||
int max_low_rank, int weight_index);
|
||||
|
||||
LoraPlugin(void const* data, size_t length, PluginProfilerPtr const& profiler);
|
||||
@ -121,7 +121,7 @@ private:
|
||||
int mTransB;
|
||||
nvinfer1::DataType mType;
|
||||
bool mRemoveInputPadding;
|
||||
int mMaxContextLength;
|
||||
int mMaxNumTokens;
|
||||
int mMaxLowRank;
|
||||
int mNumLoraModules;
|
||||
int mWeightIndex;
|
||||
|
||||
@ -522,12 +522,6 @@ def run_multi_gpu_tests(build_dir: _pl.Path, timeout=1500):
leader_commands=[f"--gtest_output=xml:{xml_output_file}"])
run_command(trt_model_test, cwd=tests_dir, env=cpp_env,
timeout=timeout) # expecting ~ 1200s
cpp_blocking_env = copy.copy(cpp_env)
cpp_blocking_env["CUDA_LAUNCH_BLOCKING"] = '1'
run_command(trt_model_test,
cwd=tests_dir,
env=cpp_blocking_env,
timeout=timeout) # expecting ~ 1200s

#Executor test in leader mode
new_env = copy.copy(cpp_env)

@ -41,20 +41,14 @@ python3 examples/summarize.py \
We can also benchmark the efficiency of Weight Streaming. Here is an example:
```bash
python3 benchmarks/python/benchmark.py \
-m opt_30b \
--mode ootb \
--engine_dir /tmp/llama_7b/trt_engines/fp16/1-gpu/ \
--batch_size "1;32" \
--max_batch_size "32" \
--input_output_len "256,32" \
--max_input_len 256 \
--max_seq_len 288 \
--gpu_weights_percent "0.0;0.3;0.6;1.0" \
--dtype float16 \
--csv \
--log_level verbose

```
Here we use `ootb` mode so that the GEMM operators won't use plugins. `ootb-except-mha` mode is also valid.

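As a rough aid to reading the sweep above, the sketch below estimates how much weight memory stays resident on the GPU for each `--gpu_weights_percent` value. It assumes the flag is the fraction of engine weights kept on device and uses a LLaMA-7B-sized fp16 engine as the example; both are assumptions for illustration, not measurements from the benchmark.

```python
# Illustrative only: GPU-resident weight bytes for a given weight-streaming percentage.

def resident_weight_gib(num_params: float, bytes_per_param: int, gpu_weights_percent: float) -> float:
    # fraction of the total weight footprint kept on the GPU
    return num_params * bytes_per_param * gpu_weights_percent / 2**30

# Assumed: ~7e9 parameters at 2 bytes each (fp16), i.e. roughly 13 GiB of weights.
for pct in (0.0, 0.3, 0.6, 1.0):
    print(f"--gpu_weights_percent {pct:.1f} -> ~{resident_weight_gib(7e9, 2, pct):.1f} GiB on GPU")
```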
### API Changes

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.15.0
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
protobuf

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -241,17 +241,19 @@ In `benchmarks/python/`:
```bash
# Example 1: Single-GPU benchmark
python benchmark.py \
-m t5_small \
-m enc-dec \
--batch_size "1;8" \
--input_output_len "60,20;128,20" \
--engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
--dtype float32 \
--csv # optional

# Example 2: Multi-GPU benchmark
mpirun --allow-run-as-root -np 4 python benchmark.py \
-m t5_small \
-m enc-dec \
--batch_size "1;8" \
--input_output_len "60,20;128,20" \
--engine_dir tmp/trt_engines/${MODEL_NAME}/${INFERENCE_PRECISION} \
--dtype float32 \
--csv # optional
```

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
transformers>=4.31.0
datasets~=2.14.5
evaluate~=0.4.1

@ -3,7 +3,7 @@
# WAR the new posting of "nvidia-cudnn-cu12~=9.0".
# "jax[cuda12_pip]~=0.4.19" specifies "nvidia-cudnn-cu12>=8.9" but actually requires "nvidia-cudnn-cu12~=8.9".
nvidia-cudnn-cu12~=8.9; platform_machine == "x86_64"
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
flax~=0.8.0
# jax[cuda12_pip]~=0.4.19; platform_system != "Windows"
jax~=0.4.19; platform_system == "Windows"

File diff suppressed because it is too large
@ -22,10 +22,10 @@ from pathlib import Path
import numpy as np
import torch
import yaml
from convert_checkpoint import cpu_map_location, unpack_nemo_ckpt

from tensorrt_llm._utils import str_dtype_to_torch, to_json_file, torch_to_numpy
from tensorrt_llm.lora_manager import LoraManager, get_all_nemo_lora_weights
from tensorrt_llm.models.gpt.convert import cpu_map_location, unpack_nemo_ckpt

log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
logging.basicConfig(format=log_format)

@ -22,9 +22,9 @@ from pathlib import Path
import numpy as np
import torch
import yaml
from convert_checkpoint import cpu_map_location, unpack_nemo_ckpt

from tensorrt_llm._utils import torch_to_numpy
from tensorrt_llm.models.gpt.convert import cpu_map_location, unpack_nemo_ckpt

log_format = "%(asctime)s %(name)s [%(levelname)s] %(message)s"
logging.basicConfig(format=log_format)

@ -1,5 +1,5 @@
--extra-index-url https://pypi.nvidia.com
tensorrt_llm==0.12.0.dev2024072301
tensorrt_llm==0.12.0.dev2024072302
datasets~=2.14.5
evaluate~=0.4.1
rouge_score~=0.1.2

@ -1,17 +1,15 @@
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import time
|
||||
import traceback
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import safetensors
|
||||
import torch
|
||||
from transformers import AutoModelForCausalLM, GPTJConfig, GPTJForCausalLM
|
||||
from transformers import AutoModelForCausalLM
|
||||
|
||||
import tensorrt_llm
|
||||
from tensorrt_llm.hlapi import QuantConfig
|
||||
from tensorrt_llm.mapping import Mapping
|
||||
from tensorrt_llm.models import GPTJConfig, GPTJForCausalLM
|
||||
from tensorrt_llm.quantization import QuantAlgo
|
||||
|
||||
|
||||
@ -68,298 +66,44 @@ def parse_arguments():
|
||||
return args
|
||||
|
||||
|
||||
def load_gptj_config(model_dir: str) -> GPTJConfig:
|
||||
""" Helper utility to load GPTJConfig.
|
||||
|
||||
A pretrained checkpoint from modeling_RW.py has a different structure
|
||||
and is not compatible with `transformers.GPTJConfig` and
|
||||
`transformers.GPTJModel`. We need to manually set the config values.
|
||||
"""
|
||||
|
||||
config = GPTJConfig.from_pretrained(model_dir)
|
||||
return config
|
||||
|
||||
|
||||
def split(weight: torch.Tensor,
|
||||
tp_size: int,
|
||||
rank: int = 0,
|
||||
dim: int = 0) -> torch.Tensor:
|
||||
if tp_size == 1:
|
||||
return weight
|
||||
elif weight.ndim == 1:
|
||||
return torch.chunk(weight, tp_size)[rank].contiguous()
|
||||
else:
|
||||
return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()
|
||||
|
||||
|
||||
def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
|
||||
dim: int) -> torch.Tensor:
|
||||
return split(weight, tp_size, rank, dim=dim)
|
||||
|
||||
|
||||
def get_weight(params: Dict[str, torch.Tensor], prefix: str,
|
||||
dtype: torch.dtype) -> torch.Tensor:
|
||||
if f'{prefix}.weight' not in params:
|
||||
return None
|
||||
return params[f'{prefix}.weight'].to(dtype).detach().cpu()
|
||||
|
||||
|
||||
def get_bias(params: Dict[str, torch.Tensor], prefix: str,
|
||||
dtype: torch.dtype) -> torch.Tensor:
|
||||
if f'{prefix}.bias' not in params:
|
||||
return None
|
||||
return params[f'{prefix}.bias'].to(dtype).detach().cpu()
|
||||
|
||||
|
||||
def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str,
|
||||
dtype: torch.dtype) -> Tuple[torch.Tensor]:
|
||||
return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)
|
||||
|
||||
|
||||
def get_tllm_linear_weight(
|
||||
weight: torch.Tensor,
|
||||
prefix: str,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
use_weight_only: bool = False,
|
||||
plugin_weight_only_quant_type: torch.dtype = torch.int8
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
results = {}
|
||||
if use_weight_only:
|
||||
v = weight.t().contiguous()
|
||||
processed_torch_weights, torch_weight_scales = \
|
||||
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
|
||||
v, plugin_weight_only_quant_type)
|
||||
results[f'{prefix}.weight'] = processed_torch_weights
|
||||
results[f'{prefix}.per_channel_scale'] = torch_weight_scales
|
||||
else:
|
||||
results[f'{prefix}.weight'] = weight.contiguous()
|
||||
|
||||
if bias is not None:
|
||||
results[f'{prefix}.bias'] = bias
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def get_tllm_param(
|
||||
param: torch.Tensor,
|
||||
name: str,
|
||||
use_weight_only: bool = False,
|
||||
plugin_weight_only_quant_type: torch.dtype = torch.int8
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
results = {}
|
||||
if name.endswith('.weight') and use_weight_only:
|
||||
v = param.t().contiguous()
|
||||
processed_torch_weights, torch_weight_scales = \
|
||||
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
|
||||
v, plugin_weight_only_quant_type)
|
||||
results[name] = processed_torch_weights
|
||||
results[name.replace('weight',
|
||||
'per_channel_scale')] = torch_weight_scales
|
||||
else:
|
||||
results[name] = param
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def convert_hf_gptj(hf_model: GPTJForCausalLM,
|
||||
hf_config: GPTJConfig,
|
||||
mapping: Mapping,
|
||||
dtype: str = 'float32',
|
||||
use_weight_only: bool = False,
|
||||
plugin_weight_only_quant_type: torch.dtype = torch.int8):
|
||||
|
||||
weights = {}
|
||||
tik = time.time()
|
||||
|
||||
model_params = dict(hf_model.named_parameters())
|
||||
dtype = getattr(torch, dtype)
|
||||
num_hidden_layers = hf_config.num_hidden_layers
|
||||
|
||||
layers_range = mapping.pp_layers(num_hidden_layers)
|
||||
for l in layers_range:
|
||||
prefix = f'transformer.h.{l}'
|
||||
tllm_prex = f'transformer.layers.{l-layers_range[0]}'
|
||||
# Attention QKV (no bias)
|
||||
q_weight = get_weight(model_params, f'{prefix}.attn.q_proj', dtype)
|
||||
k_weight = get_weight(model_params, f'{prefix}.attn.k_proj', dtype)
|
||||
v_weight = get_weight(model_params, f'{prefix}.attn.v_proj', dtype)
|
||||
q_w = split_matrix(q_weight, mapping.tp_size, mapping.tp_rank, dim=0)
|
||||
k_w = split_matrix(k_weight, mapping.tp_size, mapping.tp_rank, dim=0)
|
||||
v_w = split_matrix(v_weight, mapping.tp_size, mapping.tp_rank, dim=0)
|
||||
qkv_w = torch.concatenate([q_w, k_w, v_w], dim=0)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', None,
|
||||
use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
# Attention dense (not bias)
|
||||
attn_dense_weight = get_weight(model_params, f'{prefix}.attn.out_proj',
|
||||
dtype)
|
||||
attn_dense_w = split_matrix(attn_dense_weight,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=1)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(attn_dense_w, f'{tllm_prex}.attention.dense',
|
||||
None, use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
# MLP fc_in (with bias)
|
||||
mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
|
||||
model_params, f'{prefix}.mlp.fc_in', dtype)
|
||||
mlp_fc_w = split_matrix(mlp_fc_weight,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
mlp_fc_b = split_matrix(mlp_fc_bias,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc', mlp_fc_b,
|
||||
use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
# MLP fc_out (with bias)
|
||||
mlp_proj_weight, mlp_proj_bias = get_weight_and_bias(
|
||||
model_params, f'{prefix}.mlp.fc_out', dtype)
|
||||
mlp_proj_w = split_matrix(mlp_proj_weight,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=1)
|
||||
# Only rank0 will get bias
|
||||
if mapping.tp_size > 1 and mapping.tp_rank > 0:
|
||||
mlp_proj_bias = torch.zeros(mlp_proj_weight.shape[0],
|
||||
dtype=mlp_proj_weight.dtype)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj',
|
||||
mlp_proj_bias, use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
|
||||
input_ln_weight, input_ln_bias = get_weight_and_bias(
|
||||
model_params, f'{prefix}.ln_1', dtype)
|
||||
weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight
|
||||
weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias
|
||||
|
||||
if mapping.is_first_pp_rank():
|
||||
# Embedding
|
||||
embed_w = get_weight(model_params, 'transformer.wte', dtype)
|
||||
weights['transformer.vocab_embedding.weight'] = embed_w
|
||||
if mapping.is_last_pp_rank():
|
||||
# lm_head weight and bias
|
||||
lm_head_w, ln_head_bias = get_weight_and_bias(model_params, 'lm_head',
|
||||
dtype)
|
||||
weights['lm_head.weight'] = split_matrix(lm_head_w,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
weights['lm_head.bias'] = split_matrix(ln_head_bias,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f',
|
||||
dtype)
|
||||
# ln_f weight and bias
|
||||
weights['transformer.ln_f.weight'] = ln_f_w
|
||||
if ln_f_b is not None:
|
||||
weights['transformer.ln_f.bias'] = ln_f_b
|
||||
|
||||
tok = time.time()
|
||||
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
|
||||
print(f'Weights loaded. Total time: {t}')
|
||||
return weights
|
||||
|
||||
|
||||
def main():
|
||||
# TODO(qijun): Currently, the convert script depends on a torch op:
|
||||
# torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix,
|
||||
# which is included in tensorrt_llm Python package. Otherwise, the convert
|
||||
# script does not need to import tensorrt_llm. Will remove it after reimplementing
|
||||
# the op with PyTorch.
|
||||
print(tensorrt_llm.__version__)
|
||||
args = parse_arguments()
|
||||
world_size = args.tp_size * args.pp_size
|
||||
|
||||
tik = time.time()
|
||||
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
def args_to_quant_config(args):
|
||||
quant_algo = None
|
||||
plugin_weight_only_quant_type = None
|
||||
if args.use_weight_only and args.weight_only_precision == 'int8':
|
||||
plugin_weight_only_quant_type = torch.int8
|
||||
quant_algo = QuantAlgo.W8A16
|
||||
elif args.use_weight_only and args.weight_only_precision == 'int4':
|
||||
plugin_weight_only_quant_type = torch.quint4x2
|
||||
quant_algo = QuantAlgo.W4A16
|
||||
return QuantConfig(quant_algo=quant_algo)
|
||||
|
||||
if args.model_dir is not None:
|
||||
hf_config = load_gptj_config(args.model_dir)
|
||||
architecture = hf_config.architectures[0]
|
||||
args.vocab_size = hf_config.vocab_size
|
||||
args.n_positions = hf_config.max_position_embeddings
|
||||
args.n_layer = hf_config.num_hidden_layers
|
||||
args.n_head = hf_config.num_attention_heads
|
||||
args.n_embd = hf_config.hidden_size
|
||||
args.norm_eps = hf_config.layer_norm_epsilon
|
||||
args.rotary_dim = hf_config.rotary_dim
|
||||
else:
|
||||
architecture = "GPTJForCausalLM"
|
||||
|
||||
config = {
|
||||
'architecture': architecture,
|
||||
'dtype': args.dtype,
|
||||
'num_hidden_layers': args.n_layer,
|
||||
'num_attention_heads': args.n_head,
|
||||
'hidden_size': args.n_embd,
|
||||
'norm_epsilon': args.norm_eps,
|
||||
'vocab_size': args.vocab_size,
|
||||
'position_embedding_type': 'rope_gptj',
|
||||
'max_position_embeddings': args.n_positions,
|
||||
'hidden_act': 'gelu',
|
||||
'quantization': {
|
||||
'quant_algo': quant_algo
|
||||
},
|
||||
'mapping': {
|
||||
'world_size': world_size,
|
||||
'tp_size': args.tp_size,
|
||||
'pp_size': args.pp_size,
|
||||
},
|
||||
'rotary_dim': args.rotary_dim,
|
||||
}
|
||||
def convert_and_save_hf(args):
|
||||
model_dir = args.model_dir
|
||||
world_size = args.tp_size * args.pp_size
|
||||
quant_config = args_to_quant_config(args)
|
||||
|
||||
with open(os.path.join(args.output_dir, 'config.json'), 'w') as f:
|
||||
json.dump(config, f, indent=4)
|
||||
hf_model = AutoModelForCausalLM.from_pretrained(model_dir,
|
||||
torch_dtype='auto',
|
||||
trust_remote_code=True)
|
||||
|
||||
if args.model_dir is None:
|
||||
return
|
||||
|
||||
hf_model = AutoModelForCausalLM.from_pretrained(args.model_dir,
|
||||
trust_remote_code=True,
|
||||
torch_dtype="auto")
|
||||
|
||||
def covert_and_save(rank):
|
||||
def convert_and_save_rank(args, rank):
|
||||
mapping = Mapping(world_size=world_size,
|
||||
rank=rank,
|
||||
tp_size=args.tp_size,
|
||||
pp_size=args.pp_size)
|
||||
|
||||
weights = convert_hf_gptj(
|
||||
hf_model,
|
||||
hf_config,
|
||||
mapping,
|
||||
dtype=args.dtype,
|
||||
use_weight_only=args.use_weight_only,
|
||||
plugin_weight_only_quant_type=plugin_weight_only_quant_type)
|
||||
|
||||
safetensors.torch.save_file(
|
||||
weights, os.path.join(args.output_dir, f'rank{rank}.safetensors'))
|
||||
model = GPTJForCausalLM.from_hugging_face(hf_model,
|
||||
args.dtype,
|
||||
mapping=mapping,
|
||||
quant_config=quant_config)
|
||||
model.save_checkpoint(args.output_dir, save_config=(rank == 0))
|
||||
del model
|
||||
|
||||
if args.workers == 1:
|
||||
for rank in range(world_size):
|
||||
covert_and_save(rank)
|
||||
convert_and_save_rank(args, rank)
|
||||
else:
|
||||
with ThreadPoolExecutor(max_workers=args.workers) as p:
|
||||
futures = [
|
||||
p.submit(covert_and_save, rank) for rank in range(world_size)
|
||||
p.submit(convert_and_save_rank, args, rank)
|
||||
for rank in range(world_size)
|
||||
]
|
||||
exceptions = []
|
||||
for future in as_completed(futures):
|
||||
@ -373,6 +117,38 @@ def main():
|
||||
) == 0, "Checkpoint conversion failed, please check error log."
|
||||
|
||||
del hf_model
|
||||
|
||||
|
||||
def main():
|
||||
print(tensorrt_llm.__version__)
|
||||
args = parse_arguments()
|
||||
|
||||
tik = time.time()
|
||||
|
||||
if not os.path.exists(args.output_dir):
|
||||
os.makedirs(args.output_dir)
|
||||
|
||||
if args.model_dir is None:
|
||||
config = GPTJConfig(architecture='GPTJForCausalLM',
|
||||
dtype=args.dtype,
|
||||
num_hidden_layers=args.n_layer,
|
||||
num_attention_heads=args.n_head,
|
||||
hidden_size=args.n_embd,
|
||||
norm_epsilon=args.norm_eps,
|
||||
vocab_size=args.vocab_size,
|
||||
position_embedding_type='rope_gptj',
|
||||
max_position_embeddings=args.n_positions,
|
||||
hidden_act='gelu',
|
||||
rotary_dim=args.rotary_dim,
|
||||
mapping=Mapping(world_size=args.tp_size *
|
||||
args.pp_size,
|
||||
tp_size=args.tp_size,
|
||||
pp_size=args.pp_size),
|
||||
quantization=args_to_quant_config(args))
|
||||
config.to_json_file(os.path.join(args.output_dir, 'config.json'))
|
||||
else:
|
||||
convert_and_save_hf(args)
|
||||
|
||||
tok = time.time()
|
||||
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
|
||||
print(f'Total time of converting checkpoints: {t}')
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,14 +0,0 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
@ -1,273 +0,0 @@
|
||||
# SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
"""
|
||||
Utilities for exporting a model to our custom format.
|
||||
"""
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
from tensorrt_llm._utils import torch_to_numpy
|
||||
|
||||
|
||||
def cpu_map_location(storage, loc):
|
||||
return storage.cpu()
|
||||
|
||||
|
||||
def gpu_map_location(storage, loc):
|
||||
if loc.startswith("cuda"):
|
||||
training_gpu_idx = int(loc.split(":")[1])
|
||||
inference_gpu_idx = training_gpu_idx % torch.cuda.device_count()
|
||||
return storage.cuda(inference_gpu_idx)
|
||||
elif loc.startswith("cpu"):
|
||||
return storage.cpu()
|
||||
else:
|
||||
raise ValueError(f"Not handled {loc}")
|
||||
|
||||
|
||||
def save_val(val, dir, key, tp_num=None):
|
||||
suffix = "bin" if tp_num is None else f"{tp_num}.bin"
|
||||
val.tofile(dir / f"model.{key}.{suffix}")
|
||||
|
||||
|
||||
def save_split(split_vals, dir, key, i, split_factor):
|
||||
for j, val in enumerate(split_vals):
|
||||
save_val(val, dir, key, i * split_factor + j)
|
||||
|
||||
|
||||
def generate_int8(weights, act_range, is_qkv=False, multi_query_mode=False):
|
||||
"""
|
||||
This function has two purposes:
|
||||
- compute quantized weights, scaled either per-tensor or per-column
|
||||
- compute scaling factors
|
||||
|
||||
Depending on the GEMM API (CUTLASS/CUBLAS) the required scaling factors differ.
|
||||
CUTLASS uses two sets of scaling factors. One for the activation X, one for the weight W.
|
||||
CUBLAS only has one (we can't do per-row scaling). So we must provide pre-multiplied scaling factor.
|
||||
|
||||
Here is the list of what we need (T means per-tensor, C per-column):
|
||||
- scale_x_orig_quant puts fp activation into the quantized range (i.e. [-128, 127], for int8). Used before the GEMM. (T)
|
||||
- scale_y_quant_orig puts quantized activation into the fp range. Used if the GEMM outputs int8. (T)
|
||||
- scale_w_quant_orig puts weights from quant range to fp range (used with CUTLASS) (T, C)
|
||||
- scale_y_accum_quant puts the GEMM result (XW) from accumulation range (int32)
|
||||
to quant range (int8) (used for CUBLAS) (T, C)
|
||||
|
||||
Note that we don't do anything special about row-parallel GEMM. Theoretically, we could have per-GPU scaling factors too,
|
||||
but then the model would change depending on the number of GPUs used.
|
||||
|
||||
For QKV projection, the behavior is special. Even if we have a single matrix to perform QKV projection, we consider it
|
||||
as three different matrices: Q, K, and V. So per-tensor actually means one scaling factor for each Q, K and V.
|
||||
"""
|
||||
|
||||
# compute weight scaling factors for fp->int8 and int8->fp
|
||||
if is_qkv and not multi_query_mode:
|
||||
scale_w_orig_quant_t = 127. / act_range["w"].reshape(3, -1).max(
|
||||
dim=-1, keepdims=True)[0].cpu().numpy()
|
||||
scale_w_orig_quant_c = 127. / act_range["w"].reshape(3,
|
||||
-1).cpu().numpy()
|
||||
elif is_qkv and multi_query_mode:
|
||||
raise ValueError(
|
||||
f"Multi-query w/ int8 quant has not been supported yet")
|
||||
else:
|
||||
scale_w_orig_quant_t = 127. / act_range["w"].max().cpu().numpy()
|
||||
scale_w_orig_quant_c = 127. / act_range["w"].cpu().numpy()
|
||||
scale_w_quant_orig_t = 1.0 / scale_w_orig_quant_t
|
||||
scale_w_quant_orig_c = 1.0 / scale_w_orig_quant_c
|
||||
|
||||
# compute the rest of needed scaling factors
|
||||
scale_x_orig_quant_t = np.array(127. / act_range["x"].max().item())
|
||||
scale_y_orig_quant_t = np.array(127. / act_range["y"].max().item())
|
||||
scale_y_quant_orig_t = np.array(act_range["y"].max().item() / 127.)
|
||||
scale_y_accum_quant_t = scale_y_orig_quant_t / (scale_x_orig_quant_t *
|
||||
scale_w_orig_quant_t)
|
||||
scale_y_accum_quant_c = scale_y_orig_quant_t / (scale_x_orig_quant_t *
|
||||
scale_w_orig_quant_c)
|
||||
if is_qkv:
|
||||
scale_y_accum_quant_t = np.broadcast_to(scale_y_accum_quant_t,
|
||||
scale_w_orig_quant_c.shape)
|
||||
scale_w_quant_orig_t = np.broadcast_to(scale_w_quant_orig_t,
|
||||
scale_w_orig_quant_c.shape)
|
||||
|
||||
to_i8 = lambda x: x.round().clip(-127, 127).astype(np.int8)
|
||||
return {
|
||||
"weight.int8": to_i8(weights * scale_w_orig_quant_t),
|
||||
"weight.int8.col": to_i8(weights * scale_w_orig_quant_c),
|
||||
"scale_x_orig_quant": scale_x_orig_quant_t.astype(np.float32),
|
||||
"scale_w_quant_orig": scale_w_quant_orig_t.astype(np.float32),
|
||||
"scale_w_quant_orig.col": scale_w_quant_orig_c.astype(np.float32),
|
||||
"scale_y_accum_quant": scale_y_accum_quant_t.astype(np.float32),
|
||||
"scale_y_accum_quant.col": scale_y_accum_quant_c.astype(np.float32),
|
||||
"scale_y_quant_orig": scale_y_quant_orig_t.astype(np.float32),
|
||||
}
|
||||
|
||||
|
||||
def write_int8(vals,
|
||||
dir,
|
||||
base_key,
|
||||
split_dim,
|
||||
tp_rank,
|
||||
split_factor,
|
||||
kv_cache_only=False):
|
||||
if not kv_cache_only:
|
||||
save_split(np.split(vals["weight.int8"], split_factor, axis=split_dim),
|
||||
dir, f"{base_key}.weight.int8", tp_rank, split_factor)
|
||||
save_split(
|
||||
np.split(vals["weight.int8.col"], split_factor, axis=split_dim),
|
||||
dir, f"{base_key}.weight.int8.col", tp_rank, split_factor)
|
||||
|
||||
saved_keys_once = ["scale_y_quant_orig"]
|
||||
if not kv_cache_only:
|
||||
saved_keys_once += [
|
||||
"scale_x_orig_quant", "scale_w_quant_orig", "scale_y_accum_quant"
|
||||
]
|
||||
# per-column scaling factors are loaded per-gpu for ColumnParallel GEMMs (QKV, FC1)
|
||||
if not kv_cache_only:
|
||||
if split_dim == -1:
|
||||
save_split(
|
||||
np.split(vals["scale_w_quant_orig.col"],
|
||||
split_factor,
|
||||
axis=split_dim), dir,
|
||||
f"{base_key}.scale_w_quant_orig.col", tp_rank, split_factor)
|
||||
save_split(
|
||||
np.split(vals["scale_y_accum_quant.col"],
|
||||
split_factor,
|
||||
axis=split_dim), dir,
|
||||
f"{base_key}.scale_y_accum_quant.col", tp_rank, split_factor)
|
||||
else:
|
||||
saved_keys_once += [
|
||||
"scale_w_quant_orig.col", "scale_y_accum_quant.col"
|
||||
]
|
||||
|
||||
if tp_rank == 0:
|
||||
for save_key in saved_keys_once:
|
||||
save_val(vals[save_key], dir, f"{base_key}.{save_key}")
|
||||
|
||||
|
||||
# Note: in multi_query_mode, only query heads are split between multiple GPUs, while key/value head
|
||||
# are not split as there is only one head per key/value.
|
||||
@torch.no_grad()
|
||||
def split_and_save_weight(tp_rank, saved_dir, split_factor, key, vals,
|
||||
storage_type, act_range, config):
|
||||
use_attention_nemo_shape = config.get("use_attention_nemo_shape", False)
|
||||
split_gated_activation = config.get("split_gated_activation", False)
|
||||
num_attention_heads = config.get("num_attention_heads", 0)
|
||||
tp_size = config.get("tp_size", 1)
|
||||
int8_outputs = config.get("int8_outputs", None)
|
||||
multi_query_mode = config.get("multi_query_mode", False)
|
||||
local_dim = config.get("local_dim", None)
|
||||
|
||||
save_int8 = int8_outputs == "all" or int8_outputs == "kv_cache_only"
|
||||
|
||||
if not isinstance(vals, list):
|
||||
vals = [vals]
|
||||
|
||||
if config.get("transpose_weights", False) and vals[0].ndim == 2:
|
||||
vals = [val.T for val in vals]
|
||||
if "layernorm.weight" in key and config.get("apply_layernorm_1p", False):
|
||||
vals = [val + 1.0 for val in vals]
|
||||
vals = [torch_to_numpy(val.cpu().to(storage_type)) for val in vals]
|
||||
|
||||
if "input_layernorm.weight" in key or "input_layernorm.bias" in key or \
|
||||
"final_layernorm.weight" in key or "final_layernorm.bias" in key or \
|
||||
"mlp.dense_4h_to_h.bias" in key:
|
||||
|
||||
# shared weights, only need to convert the weights of rank 0
|
||||
if tp_rank == 0:
|
||||
save_val(vals[0], saved_dir, key)
|
||||
|
||||
elif "attention.dense.weight" in key or "mlp.dense_4h_to_h.weight" in key:
|
||||
cat_dim = 0
|
||||
val = np.concatenate(vals, axis=cat_dim)
|
||||
split_vals = np.split(val, split_factor, axis=cat_dim)
|
||||
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
|
||||
if act_range is not None and int8_outputs == "all":
|
||||
base_key = key.replace(".weight", "")
|
||||
vals_i8 = generate_int8(val,
|
||||
act_range,
|
||||
multi_query_mode=multi_query_mode)
|
||||
write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank,
|
||||
split_factor)
|
||||
|
||||
elif "mlp.dense_h_to_4h.weight" in key or "mlp.dense_h_to_4h.bias" in key:
|
||||
if split_gated_activation:
|
||||
splits = [np.split(val, 2, axis=-1) for val in vals]
|
||||
vals, gates = list(zip(*splits))
|
||||
cat_dim = -1
|
||||
val = np.concatenate(vals, axis=cat_dim)
|
||||
split_vals = np.split(val, split_factor, axis=cat_dim)
|
||||
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
|
||||
if act_range is not None and int8_outputs == "all":
|
||||
base_key = key.replace(".weight", "")
|
||||
vals_i8 = generate_int8(val,
|
||||
act_range,
|
||||
multi_query_mode=multi_query_mode)
|
||||
write_int8(vals_i8, saved_dir, base_key, cat_dim, tp_rank,
|
||||
split_factor)
|
||||
|
||||
if split_gated_activation:
|
||||
assert not save_int8
|
||||
prefix, dot, suffix = key.rpartition(".")
|
||||
key = prefix + ".gate" + dot + suffix
|
||||
|
||||
gate = np.concatenate(gates, axis=cat_dim)
|
||||
split_vals = np.split(gate, split_factor, axis=cat_dim)
|
||||
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
|
||||
|
||||
elif "attention.query_key_value.weight" in key:
|
||||
hidden_dim = vals[0].shape[0]
|
||||
if local_dim is None:
|
||||
local_dim = vals[0].shape[-1] // 3
|
||||
if multi_query_mode:
|
||||
val = vals[0]
|
||||
# out_feature = local_dim + 2 * head_size; assumes local_dim equals to hidden_dim
|
||||
head_size = (val.shape[-1] - local_dim) // 2
|
||||
val = val.reshape(hidden_dim, local_dim + 2 * head_size)
|
||||
w_q, w_kv = np.split(val, [local_dim], axis=-1)
|
||||
w_q_split = np.split(w_q, split_factor, axis=-1)
|
||||
split_vals = [np.concatenate((i, w_kv), axis=-1) for i in w_q_split]
|
||||
else:
|
||||
if use_attention_nemo_shape:
|
||||
head_num = num_attention_heads // tp_size
|
||||
size_per_head = hidden_dim // num_attention_heads
|
||||
vals = [
|
||||
val.reshape(hidden_dim, head_num, 3, size_per_head)
|
||||
for val in vals
|
||||
]
|
||||
vals = [val.transpose(0, 2, 1, 3) for val in vals]
|
||||
|
||||
vals = [val.reshape(hidden_dim, 3, local_dim) for val in vals]
|
||||
cat_dim = -1
|
||||
val = np.concatenate(vals, axis=cat_dim)
|
||||
split_vals = np.split(val, split_factor, axis=cat_dim)
|
||||
save_split(split_vals, saved_dir, key, tp_rank, split_factor)
|
||||
if save_int8:
|
||||
base_key = key.replace(".weight", "")
|
||||
vals_i8 = generate_int8(val,
|
||||
act_range,
|
||||
is_qkv=True,
|
||||
multi_query_mode=multi_query_mode)
|
||||
write_int8(vals_i8,
|
||||
saved_dir,
|
||||
base_key,
|
||||
cat_dim,
|
||||
tp_rank,
|
||||
split_factor,
|
||||
kv_cache_only=int8_outputs == "kv_cache_only")
|
||||
elif ("attention.query.weight" in key or "attention.query.bias" in key
|
||||
or "attention.key_value.weight" in key
|
||||
or "attention.key_value.bias" in key):
|
||||
pass
|
||||
else:
|
||||
assert False, f"[ERROR] {key} not handled by converter"
|
||||
@ -1,178 +0,0 @@
|
||||
"""Byte pair encoding utilities"""
|
||||
|
||||
# Modified MIT License
|
||||
|
||||
# Software Copyright (c) 2019 OpenAI
|
||||
|
||||
# We don’t claim ownership of the content you create with GPT-2, so it is yours to do with as you please.
|
||||
# We only ask that you use GPT-2 responsibly and clearly indicate your content was created using GPT-2.
|
||||
|
||||
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and
|
||||
# associated documentation files (the "Software"), to deal in the Software without restriction,
|
||||
# including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
|
||||
# and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so,
|
||||
# subject to the following conditions:
|
||||
|
||||
# The above copyright notice and this permission notice shall be included
|
||||
# in all copies or substantial portions of the Software.
|
||||
# The above copyright notice and this permission notice need not be included
|
||||
# with content created by the Software.
|
||||
|
||||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED,
|
||||
# INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
|
||||
# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
|
||||
# TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE
|
||||
# OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
# Copyright (c) 2021-2022, NVIDIA CORPORATION. All rights reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
import json
|
||||
from functools import lru_cache
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
@lru_cache()
|
||||
def bytes_to_unicode():
|
||||
"""
|
||||
Returns list of utf-8 byte and a corresponding list of unicode strings.
|
||||
The reversible bpe codes work on unicode strings.
|
||||
This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
|
||||
When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
|
||||
This is a significant percentage of your normal, say, 32K bpe vocab.
|
||||
To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
|
||||
And avoids mapping to whitespace/control characters the bpe code barfs on.
|
||||
"""
|
||||
bs = list(range(ord("!"),
|
||||
ord("~") + 1)) + list(range(
|
||||
ord("¡"),
|
||||
ord("¬") + 1)) + list(range(ord("®"),
|
||||
ord("ÿ") + 1))
|
||||
cs = bs[:]
|
||||
n = 0
|
||||
for b in range(2**8):
|
||||
if b not in bs:
|
||||
bs.append(b)
|
||||
cs.append(2**8 + n)
|
||||
n += 1
|
||||
cs = [chr(n) for n in cs]
|
||||
return dict(zip(bs, cs))
|
||||
|
||||
|
||||
def get_pairs(word):
|
||||
"""Return set of symbol pairs in a word.
|
||||
|
||||
Word is represented as tuple of symbols (symbols being variable-length strings).
|
||||
"""
|
||||
pairs = set()
|
||||
prev_char = word[0]
|
||||
for char in word[1:]:
|
||||
pairs.add((prev_char, char))
|
||||
prev_char = char
|
||||
return pairs
|
||||
|
||||
|
||||
class Encoder:
|
||||
|
||||
def __init__(self, encoder, bpe_merges, errors='replace'):
|
||||
self.encoder = encoder
|
||||
self.decoder = {v: k for k, v in self.encoder.items()}
|
||||
self.errors = errors # how to handle errors in decoding
|
||||
self.byte_encoder = bytes_to_unicode()
|
||||
self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
|
||||
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
|
||||
self.cache = {}
|
||||
|
||||
# Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
|
||||
self.pat = re.compile(
|
||||
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
|
||||
)
|
||||
|
||||
def bpe(self, token):
|
||||
if token in self.cache:
|
||||
return self.cache[token]
|
||||
word = tuple(token)
|
||||
pairs = get_pairs(word)
|
||||
|
||||
if not pairs:
|
||||
return token
|
||||
|
||||
while True:
|
||||
bigram = min(
|
||||
pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf')))
|
||||
if bigram not in self.bpe_ranks:
|
||||
break
|
||||
first, second = bigram
|
||||
new_word = []
|
||||
i = 0
|
||||
while i < len(word):
|
||||
try:
|
||||
j = word.index(first, i)
|
||||
new_word.extend(word[i:j])
|
||||
i = j
|
||||
except:
|
||||
new_word.extend(word[i:])
|
||||
break
|
||||
|
||||
if word[i] == first and i < len(word) - 1 and word[i +
|
||||
1] == second:
|
||||
new_word.append(first + second)
|
||||
i += 2
|
||||
else:
|
||||
new_word.append(word[i])
|
||||
i += 1
|
||||
new_word = tuple(new_word)
|
||||
word = new_word
|
||||
if len(word) == 1:
|
||||
break
|
||||
else:
|
||||
pairs = get_pairs(word)
|
||||
word = ' '.join(word)
|
||||
self.cache[token] = word
|
||||
return word
|
||||
|
||||
def encode(self, text):
|
||||
bpe_tokens = []
|
||||
for token in re.findall(self.pat, text):
|
||||
token = ''.join(self.byte_encoder[b] for b in token.encode('utf-8'))
|
||||
bpe_tokens.extend(self.encoder[bpe_token]
|
||||
for bpe_token in self.bpe(token).split(' '))
|
||||
return bpe_tokens
|
||||
|
||||
def decode(self, tokens):
|
||||
text = ''.join([self.decoder[token] for token in tokens])
|
||||
text = bytearray([self.byte_decoder[c]
|
||||
for c in text]).decode('utf-8', errors=self.errors)
|
||||
return text
|
||||
|
||||
def batch_decode(self, output):
|
||||
ret = []
|
||||
for tokens in output:
|
||||
ret.append(self.decode(tokens))
|
||||
return ret
|
||||
|
||||
|
||||
def get_encoder(vocab_file, bpe_file):
|
||||
with open(vocab_file, 'r', encoding="utf-8") as f:
|
||||
encoder = json.load(f)
|
||||
with open(bpe_file, 'r', encoding="utf-8") as f:
|
||||
bpe_data = f.read()
|
||||
bpe_merges = [
|
||||
tuple(merge_str.split()) for merge_str in bpe_data.split('\n')[1:-1]
|
||||
]
|
||||
return Encoder(
|
||||
encoder=encoder,
|
||||
bpe_merges=bpe_merges,
|
||||
)
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
rouge_score~=0.1.2
|
||||
evaluate~=0.4.1
|
||||
|
||||
@ -1,6 +1,6 @@
|
||||
-f https://storage.googleapis.com/jax-releases/jax_cuda_releases.html
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets==2.14.6
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,2 +1,2 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets==2.14.5
|
||||
rouge_score~=0.1.2
|
||||
sentencepiece~=0.1.99
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets==2.14.6
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
transformers>=4.39.0
|
||||
datasets~=2.14.5
|
||||
evaluate
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
rouge_score~=0.1.2
|
||||
sentencepiece~=0.1.99
|
||||
|
||||
@ -1,4 +1,4 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
transformers==4.38.2
|
||||
accelerate==0.25.0
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
transformers==4.40.2
|
||||
# https://github.com/NVIDIA/NeMo/issues/9793
|
||||
huggingface_hub==0.23.5
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.14.5
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets>=2.14.4
|
||||
nemo-toolkit[all]<=1.20.0,>=1.18.0
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.16.0
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.16.0
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
git+https://github.com/google-deepmind/recurrentgemma.git
|
||||
flax>=0.8.2
|
||||
jax~=0.4.23
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets~=2.16.1
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
datasets==2.14.6
|
||||
evaluate~=0.4.1
|
||||
rouge_score~=0.1.2
|
||||
|
||||
@ -182,7 +182,7 @@ def main(args):
input_ids = tokenizer.encode(curr_text,
return_tensors='pt').squeeze(0)
input_ids = input_ids[:test_token_num]
elif model_name == 'QWenForCausalLM' and model_version == 'qwen':
elif 'qwen' in model_name.lower() and model_version == 'qwen':
# use make_content to generate prompt
system_prompt = "You are a useful assistant, please directly output the corresponding summary according to the article entered by the user."
_, input_id_list = make_context(
@ -194,7 +194,7 @@ def main(args):
)
input_ids = torch.tensor(input_id_list)
else:
if model_name == 'QWenForCausalLM' and 'qwen2' in model_version:
if 'qwen' in model_name.lower() and 'qwen2' in model_version:
messages = [{
"role": "system",
"content": "You are a helpful assistant."
@ -527,7 +527,7 @@ def main(args):
ite_count += 1
del runner

if test_hf:
if test_hf and runtime_rank == 0:
profiler.start('load HF model')
dtype_alias_mapping = {
'fp32': 'float32',

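The same relaxation from an exact `QWenForCausalLM` match to a `'qwen' in model_name.lower()` check recurs in the utility hunks below. A small, hypothetical sketch of the intent, using the `QWEN_PROMPT_TEMPLATE` string that this change introduces:

```python
# Illustration (not part of the patch): all Qwen variants registered in this commit
# share one prompt template, so a substring check replaces the exact-name check.

QWEN_PROMPT_TEMPLATE = ("<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
                        "<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n")

def is_qwen(model_name: str) -> bool:
    # matches QWenLMHeadModel, QWenForCausalLM, Qwen2ForCausalLM, Qwen2MoeForCausalLM, ...
    return 'qwen' in model_name.lower()

for name in ('QWenForCausalLM', 'Qwen2ForCausalLM', 'Qwen2MoeForCausalLM', 'LlamaForCausalLM'):
    print(name, is_qwen(name))

# Fill the shared chat template with user input before tokenization.
prompt = QWEN_PROMPT_TEMPLATE.format(input_text="Summarize the article below.")
```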
@ -37,7 +37,10 @@ DEFAULT_HF_MODEL_DIRS = {
'MPTForCausalLM': 'mosaicml/mpt-7b',
'PhiForCausalLM': 'microsoft/phi-2',
'OPTForCausalLM': 'facebook/opt-350m',
'QWenLMHeadModel': 'Qwen/Qwen-7B',
'QWenForCausalLM': 'Qwen/Qwen-7B',
'Qwen2ForCausalLM': 'Qwen/Qwen1.5-7B',
'Qwen2MoeForCausalLM': 'Qwen/Qwen1.5-MoE-A2.7B',
'RecurrentGemmaForCausalLM': 'google/recurrentgemma-2b',
}

@ -46,14 +49,16 @@ INTERNLM_META_INSTRUCTION = """You are an AI assistant whose name is InternLM (
- InternLM (书生·浦语) can understand and communicate fluently in the language chosen by the user such as English and 中文.
"""

QWEN_PROMPT_TEMPLATE = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n"

DEFAULT_PROMPT_TEMPLATES = {
'InternLMForCausalLM':
"<|User|>:{input_text}<eoh>\n<|Bot|>:",
'InternLM2ForCausalLM':
"<|im_start|>system\n" + INTERNLM_META_INSTRUCTION +
'InternLMForCausalLM': "<|User|>:{input_text}<eoh>\n<|Bot|>:",
'InternLM2ForCausalLM': "<|im_start|>system\n" + INTERNLM_META_INSTRUCTION +
"<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
'QWenForCausalLM':
"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_text}<|im_end|>\n<|im_start|>assistant\n",
'QWenLMHeadModel': QWEN_PROMPT_TEMPLATE,
'QWenForCausalLM': QWEN_PROMPT_TEMPLATE,
'Qwen2ForCausalLM': QWEN_PROMPT_TEMPLATE,
'Qwen2MoeForCausalLM': QWEN_PROMPT_TEMPLATE,
}

@ -83,7 +88,7 @@ def read_model_name(engine_dir: str):
model_version = None
if 'GLM' in model_arch:
model_version = config['pretrained_config']['chatglm_version']
if model_arch == 'QWenForCausalLM':
if 'qwen' in model_arch.lower():
model_version = config['pretrained_config']['qwen_type']
return model_arch, model_version

@ -134,7 +139,7 @@ def load_tokenizer(tokenizer_dir: Optional[str] = None,
padding_side='left',
truncation_side='left',
legacy=False)
if model_name == 'QWenForCausalLM' and model_version == 'qwen':
if 'qwen' in model_name.lower() and model_version == 'qwen':
with open(Path(tokenizer_dir) / "generation_config.json") as f:
gen_config = json.load(f)
pad_id = gen_config['pad_token_id']

@ -1,5 +1,5 @@
|
||||
--extra-index-url https://pypi.nvidia.com
|
||||
tensorrt_llm==0.12.0.dev2024072301
|
||||
tensorrt_llm==0.12.0.dev2024072302
|
||||
tiktoken
|
||||
datasets
|
||||
kaldialign
|
||||
|
||||
@ -28,6 +28,7 @@ from ._common import _is_building, check_max_num_tokens, serialize_engine
|
||||
from ._utils import str_dtype_to_trt, to_json_file
|
||||
from .auto_parallel import auto_parallel
|
||||
from .auto_parallel.config import AutoParallelConfig
|
||||
from .functional import PositionEmbeddingType
|
||||
from .graph_rewriting import optimize
|
||||
from .logger import logger
|
||||
from .lora_manager import LoraConfig
|
||||
@ -466,34 +467,6 @@ class BuildConfig:
|
||||
dry_run: bool = False
|
||||
visualize_network: bool = False
|
||||
|
||||
def __post_init__(self):
|
||||
"""
|
||||
Check and may modify max_num_tokens and opt_num_tokens after instantiation
|
||||
"""
|
||||
max_num_tokens, opt_num_tokens = check_max_num_tokens(
|
||||
max_num_tokens=self.max_num_tokens,
|
||||
opt_num_tokens=self.opt_num_tokens,
|
||||
max_batch_size=self.max_batch_size,
|
||||
max_input_len=self.max_input_len,
|
||||
max_seq_len=self.max_seq_len,
|
||||
max_beam_width=self.max_beam_width,
|
||||
remove_input_padding=self.plugin_config.remove_input_padding,
|
||||
enable_context_fmha=self.plugin_config.context_fmha,
|
||||
tokens_per_block=self.plugin_config.tokens_per_block,
|
||||
multiple_profiles=self.plugin_config.multiple_profiles,
|
||||
)
|
||||
self.max_num_tokens, self.opt_num_tokens = max_num_tokens, opt_num_tokens
|
||||
|
||||
if self.plugin_config.remove_input_padding and self.plugin_config.context_fmha:
|
||||
if self.max_input_len:
|
||||
logger.warning(
|
||||
'padding removal and fMHA are both enabled, max_input_len is not required and will be ignored'
|
||||
)
|
||||
else:
|
||||
assert self.max_input_len is not None, 'padding removal and fMHA aren\'t both enabled, max_input_len is required'
|
||||
if self.max_seq_len:
|
||||
assert self.max_input_len <= self.max_seq_len, 'max_input_len should not be larger than max_seq_len'
|
||||
|
||||
@classmethod
|
||||
def from_dict(cls, config, plugin_config=None):
|
||||
max_input_len = config.pop('max_input_len')
|
||||
@ -507,7 +480,7 @@ class BuildConfig:
|
||||
'max_prompt_embedding_table_size', 0)
|
||||
gather_context_logits = config.pop('gather_context_logits', False)
|
||||
gather_generation_logits = config.pop('gather_generation_logits', False)
|
||||
strongly_typed = config.pop('strongly_typed', False)
|
||||
strongly_typed = config.pop('strongly_typed', True)
|
||||
builder_opt = config.pop('builder_opt', None)
|
||||
force_num_profiles = config.pop('force_num_profiles', None)
|
||||
weight_sparsity = config.pop('weight_sparsity', False)
|
||||
@ -730,6 +703,79 @@ def optimize_model_with_config(model: PretrainedModel,
|
||||
return model
|
||||
|
||||
|
||||
def _init_max_seq_len(model_config, build_config):
|
||||
"""
|
||||
If max_seq_len is not specified, set it to max_position_embeddings * rotary_factor
|
||||
Additional checks to ensure max_seq_len, max_input_len, and max_num_tokens have valid values.
|
||||
"""
|
||||
# Extract rotary scaling which will be used for checks and default value of max_seq_len
|
||||
rotary_scaling = getattr(model_config, "rotary_scaling", None)
|
||||
if rotary_scaling is not None:
|
||||
rotary_type = rotary_scaling.get('type',
|
||||
rotary_scaling.get('rope_type'))
|
||||
rotary_factor = rotary_scaling.get('factor',
|
||||
1.0) if rotary_type != 'su' else 1
|
||||
else:
|
||||
rotary_factor = 1
|
||||
|
||||
if build_config.max_seq_len is None:
|
||||
# Step 1: Find the upper bound of max_seq_len
|
||||
deduced_max_seq_len = 2048
|
||||
if model_config.max_position_embeddings is not None:
|
||||
deduced_max_seq_len = model_config.max_position_embeddings
|
||||
|
||||
# Step 2: Scale max_seq_len with rotary scaling
|
||||
if rotary_factor != 1:
|
||||
deduced_max_seq_len *= rotary_factor
|
||||
logger.warning(
|
||||
f'max_seq_len is scaled to {deduced_max_seq_len} by rotary scaling {rotary_factor}'
|
||||
)
|
||||
|
||||
# Step 3: Assign the new max_seq_len
|
||||
build_config.max_seq_len = deduced_max_seq_len
|
||||
logger.info(
|
||||
f'max_seq_len is not specified, using deduced value {deduced_max_seq_len}'
|
||||
)
|
||||
else:
|
||||
if not build_config.plugin_config.streamingllm and model_config.max_position_embeddings is not None \
|
||||
and model_config.position_embedding_type != PositionEmbeddingType.relative:
|
||||
if build_config.max_seq_len > model_config.max_position_embeddings * rotary_factor:
|
||||
logger.warning(
|
||||
f'max_seq_len {build_config.max_seq_len} is larger than max_position_embeddings {model_config.max_position_embeddings} * rotary scaling {rotary_factor}, '
|
||||
'the model accuracy might be affected')
|
||||
|
||||
if build_config.max_input_len > build_config.max_seq_len:
|
||||
logger.warning(
|
||||
f'max_input_len {build_config.max_input_len} is larger than max_seq_len {build_config.max_seq_len}, clipping it to max_seq_len'
|
||||
)
|
||||
build_config.max_input_len = build_config.max_seq_len
|
||||
|
||||
# Check and, if needed, adjust max_num_tokens and opt_num_tokens (this must happen after max_seq_len is deduced)
|
||||
max_num_tokens, opt_num_tokens = check_max_num_tokens(
|
||||
max_num_tokens=build_config.max_num_tokens,
|
||||
opt_num_tokens=build_config.opt_num_tokens,
|
||||
max_batch_size=build_config.max_batch_size,
|
||||
max_input_len=build_config.max_input_len,
|
||||
max_seq_len=build_config.max_seq_len,
|
||||
max_beam_width=build_config.max_beam_width,
|
||||
remove_input_padding=build_config.plugin_config.remove_input_padding,
|
||||
enable_context_fmha=build_config.plugin_config.context_fmha,
|
||||
tokens_per_block=build_config.plugin_config.tokens_per_block,
|
||||
multiple_profiles=build_config.plugin_config.multiple_profiles,
|
||||
)
|
||||
build_config.max_num_tokens, build_config.opt_num_tokens = max_num_tokens, opt_num_tokens
|
||||
|
||||
if build_config.plugin_config.remove_input_padding and build_config.plugin_config.context_fmha:
|
||||
if build_config.max_input_len:
|
||||
logger.warning(
|
||||
'padding removal and fMHA are both enabled, max_input_len is not required and will be ignored'
|
||||
)
|
||||
else:
|
||||
assert build_config.max_input_len is not None, 'padding removal and fMHA aren\'t both enabled, max_input_len is required'
|
||||
if build_config.max_seq_len:
|
||||
assert build_config.max_input_len <= build_config.max_seq_len, 'max_input_len should not be larger than max_seq_len'
|
||||
|
||||
|
||||
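A standalone sketch of the deduction step above, useful for sanity-checking configs; `deduce_max_seq_len` is a hypothetical helper written only for illustration, not a function in the library.

```python
# Hypothetical helper mirroring _init_max_seq_len's default deduction (illustration only).
def deduce_max_seq_len(max_position_embeddings, rotary_scaling):
    if rotary_scaling is not None:
        rotary_type = rotary_scaling.get('type', rotary_scaling.get('rope_type'))
        rotary_factor = rotary_scaling.get('factor', 1.0) if rotary_type != 'su' else 1
    else:
        rotary_factor = 1
    # Fall back to 2048 when the model config does not expose max_position_embeddings.
    base = max_position_embeddings if max_position_embeddings is not None else 2048
    return int(base * rotary_factor)

assert deduce_max_seq_len(4096, {'type': 'linear', 'factor': 2.0}) == 8192
assert deduce_max_seq_len(None, None) == 2048
assert deduce_max_seq_len(4096, {'rope_type': 'su', 'factor': 4.0}) == 4096
```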
def build(model: PretrainedModel,
|
||||
build_config: BuildConfig,
|
||||
return_build_config: bool = False) -> Engine | BuildConfig:
|
||||
@ -743,6 +789,8 @@ def build(model: PretrainedModel,
|
||||
build_config = copy.deepcopy(build_config)
|
||||
build_config.plugin_config.dtype = model.config.dtype
|
||||
|
||||
_init_max_seq_len(model.config, build_config)
|
||||
|
||||
if model.config.quantization.quant_algo == QuantAlgo.FP8 or \
|
||||
model.config.quantization.kv_cache_quant_algo == QuantAlgo.FP8:
|
||||
build_config.strongly_typed = True
|
||||
|
||||
@ -27,7 +27,6 @@ import torch
|
||||
from tensorrt_llm.auto_parallel import infer_cluster_config
|
||||
from tensorrt_llm.auto_parallel.cluster_info import cluster_infos
|
||||
from tensorrt_llm.builder import BuildConfig, Engine, build
|
||||
from tensorrt_llm.functional import PositionEmbeddingType
|
||||
from tensorrt_llm.logger import logger
|
||||
from tensorrt_llm.lora_manager import LoraConfig, LoraManager
|
||||
from tensorrt_llm.models import MODEL_MAP, PretrainedConfig
|
||||
@ -444,48 +443,6 @@ def main():
|
||||
else:
|
||||
cluster_config = infer_cluster_config()
|
||||
|
||||
# Extract rotary scaling which will be used for checks and default value of max_seq_len
|
||||
rotary_scaling = getattr(model_config, "rotary_scaling", None)
|
||||
if rotary_scaling is not None:
|
||||
rotary_type = rotary_scaling.get('type',
|
||||
rotary_scaling.get('rope_type'))
|
||||
rotary_factor = rotary_scaling.get(
|
||||
'factor', 1.0) if rotary_type != 'su' else 1
|
||||
else:
|
||||
rotary_factor = 1
|
||||
|
||||
if args.max_seq_len is None:
|
||||
# Step 1: Find the upper bound of max_seq_len
|
||||
deduced_max_seq_len = 2048
|
||||
if model_config.max_position_embeddings is not None:
|
||||
deduced_max_seq_len = model_config.max_position_embeddings
|
||||
|
||||
# Step 2: Scale max_seq_len with rotary scaling
|
||||
if rotary_factor != 1:
|
||||
deduced_max_seq_len *= rotary_factor
|
||||
logger.warning(
|
||||
f'max_seq_len is scaled to {deduced_max_seq_len} by rotary scaling {rotary_factor}'
|
||||
)
|
||||
|
||||
# Step 3: Assign the new max_seq_len
|
||||
args.max_seq_len = deduced_max_seq_len
|
||||
logger.info(
|
||||
f'max_seq_len is not specified, using value {deduced_max_seq_len}'
|
||||
)
|
||||
else:
|
||||
if not plugin_config.streamingllm and model_config.max_position_embeddings is not None \
|
||||
and model_config.position_embedding_type != PositionEmbeddingType.relative:
|
||||
if args.max_seq_len > model_config.max_position_embeddings * rotary_factor:
|
||||
logger.warning(
|
||||
f'max_seq_len {args.max_seq_len} is larger than max_position_embeddings {model_config.max_position_embeddings} * rotary scaling {rotary_factor}, '
|
||||
'the model accuracy might be affected')
|
||||
|
||||
if args.max_input_len > args.max_seq_len:
|
||||
logger.warning(
|
||||
f'max_input_len {args.max_input_len} is larger than max_seq_len {args.max_seq_len}, clipping it to max_seq_len'
|
||||
)
|
||||
args.max_input_len = args.max_seq_len
|
||||
|
||||
build_config = BuildConfig.from_dict(
|
||||
{
|
||||
'max_input_len': args.max_input_len,
|
||||
|
||||
@ -4975,6 +4975,7 @@ def gpt_attention(
|
||||
])
|
||||
|
||||
attn_plug = attn_plg_creator.create_plugin("causal_attn", pfc)
|
||||
assert attn_plug
|
||||
plug_inputs = [*qkv] if is_unfuse_qkv_gemm else [qkv]
|
||||
if use_cache:
|
||||
plug_inputs += [
|
||||
@ -5510,7 +5511,7 @@ def lora_plugin(
|
||||
transa: bool = False,
|
||||
transb: bool = False,
|
||||
host_context_lengths: Tensor = None, # for pad-free input mode
|
||||
max_context_length: int = 0,
|
||||
max_num_tokens: int = 0,
|
||||
max_low_rank: int = 0,
|
||||
lora_ranks: List[Tensor] = None,
|
||||
lora_weights_pointers: List[Tensor] = None,
|
||||
@ -5541,8 +5542,8 @@ def lora_plugin(
|
||||
host_context_lengths: cpu Tensor = None
|
||||
A host tensor that contains the lengths of the different inputs,
|
||||
|
||||
max_context_length : int
|
||||
Maximum length during context phase, used to determine the workspace size.
|
||||
max_num_tokens : int
|
||||
Maximum number of tokens, used to determine the workspace size.
|
||||
|
||||
max_low_rank : int
|
||||
Maximum low_rank, used to determine the workspace size.
|
||||
@ -5591,8 +5592,8 @@ def lora_plugin(
|
||||
"remove_input_padding",
|
||||
np.array(np.int8(default_net().plugin_config.remove_input_padding),
|
||||
dtype=np.int8), trt.PluginFieldType.INT8)
|
||||
max_context_length_field = trt.PluginField(
|
||||
"max_context_length", np.array(max_context_length, dtype=np.int32),
|
||||
max_num_tokens_field = trt.PluginField(
|
||||
"max_num_tokens", np.array(max_num_tokens, dtype=np.int32),
|
||||
trt.PluginFieldType.INT32)
|
||||
max_low_rank_field = trt.PluginField("max_low_rank",
|
||||
np.array(max_low_rank, dtype=np.int32),
|
||||
@ -5607,7 +5608,7 @@ def lora_plugin(
|
||||
|
||||
pfc = trt.PluginFieldCollection([
|
||||
in_hidden_size_field, transa, transb, num_lora_modules_field, pf_type,
|
||||
remove_input_padding, max_context_length_field, max_low_rank_field,
|
||||
remove_input_padding, max_num_tokens_field, max_low_rank_field,
|
||||
weight_index_field
|
||||
] + out_hidden_size_field_list)
|
||||
lora_plug = plg_creator.create_plugin("lora", pfc)
|
||||
|
||||
@ -288,6 +288,12 @@ class LlmArgs:
|
||||
else:
|
||||
self.tokenizer = tokenizer_factory(self.tokenizer)
|
||||
|
||||
if torch.cuda.get_device_properties(0).major < 8:
|
||||
if self.dtype == 'auto':
|
||||
self.dtype = 'float16'
|
||||
if self.dtype == 'bfloat16':
|
||||
raise RuntimeError("Pre SM 80 GPUs do not support bfloat16")
|
||||
|
||||
self._engine_config: Optional[EngineConfig] = None
|
||||
|
||||
self.auto_parallel_config = AutoParallelConfig(
|
||||
@ -1021,7 +1027,10 @@ class ModelLoader:
|
||||
raise NotImplementedError(
|
||||
f"Unsupported model architecture in HLAPI: {architecture}")
|
||||
|
||||
if self.llm_args.quant_config.quant_mode.has_any_quant():
|
||||
use_weight_only = self.llm_args.quant_config.quant_algo in (
|
||||
QuantAlgo.W4A16, QuantAlgo.W8A16)
|
||||
if self.llm_args.quant_config.quant_mode.has_any_quant(
|
||||
) and not use_weight_only:
|
||||
assert self.workspace is not None
|
||||
checkpoint_dir = f"{self.workspace}/quantized-checkpoint"
|
||||
if self.rank == 0:
|
||||
|
||||
@ -612,7 +612,7 @@ class Attention(Module):
|
||||
],
|
||||
host_request_types=q_lora_params.host_request_types,
|
||||
host_context_lengths=q_lora_params.host_context_lengths,
|
||||
max_context_length=q_lora_params.max_context_length,
|
||||
max_num_tokens=q_lora_params.max_num_tokens,
|
||||
max_encoder_context_length=q_lora_params.
|
||||
max_encoder_context_length,
|
||||
host_encoder_input_lengths=q_lora_params.
|
||||
@ -1337,7 +1337,7 @@ class BertAttention(Module):
|
||||
],
|
||||
host_request_types=q_lora_params.host_request_types,
|
||||
host_context_lengths=q_lora_params.host_context_lengths,
|
||||
max_context_length=q_lora_params.max_context_length)
|
||||
max_num_tokens=q_lora_params.max_num_tokens)
|
||||
|
||||
q_lora, k_lora, v_lora = self.qkv_lora(hidden_states,
|
||||
qkv_lora_params)
|
||||
|
||||
@ -13,7 +13,7 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import List
|
||||
from typing import List, Optional
|
||||
|
||||
from .._common import default_net
|
||||
from ..functional import Tensor, lora_plugin
|
||||
@ -28,7 +28,7 @@ class LoraRuntimeParams(object):
|
||||
lora_weights_pointers: List[Tensor] = None,
|
||||
host_request_types: Tensor = None,
|
||||
host_context_lengths: Tensor = None,
|
||||
max_context_length: Tensor = None,
|
||||
max_num_tokens: Optional[int] = None,
|
||||
max_encoder_context_length: Tensor = None,
|
||||
host_encoder_input_lengths: Tensor = None,
|
||||
weight_index: int = 0,
|
||||
@ -38,7 +38,7 @@ class LoraRuntimeParams(object):
|
||||
self.lora_weights_pointers = lora_weights_pointers
|
||||
self.host_request_types = host_request_types
|
||||
self.host_context_lengths = host_context_lengths
|
||||
self.max_context_length = max_context_length
|
||||
self.max_num_tokens = max_num_tokens
|
||||
self.max_encoder_context_length = max_encoder_context_length
|
||||
self.host_encoder_input_lengths = host_encoder_input_lengths
|
||||
self.weight_index = weight_index
|
||||
@ -71,8 +71,8 @@ class Lora(Module):
|
||||
host_context_lengths=lora_runtime_params.host_context_lengths
|
||||
if not is_cross_attention else
|
||||
lora_runtime_params.host_encoder_input_lengths,
|
||||
# For cross attention, max_encoder_context_length should be used instead of max_context_length
|
||||
max_context_length=lora_runtime_params.max_context_length
|
||||
# For cross attention, max_encoder_context_length should be used instead of max_num_tokens
|
||||
max_num_tokens=lora_runtime_params.max_num_tokens
|
||||
if not is_cross_attention else
|
||||
lora_runtime_params.max_encoder_context_length,
|
||||
max_low_rank=self.max_low_rank,
|
||||
@ -93,7 +93,7 @@ class LoraParams(object):
|
||||
lora_ranks=None, # : List[dict[Tensor]]
|
||||
lora_weights_pointers=None, # : List[dict[Tensor]]
|
||||
host_context_lengths: Tensor = None,
|
||||
max_context_length: Tensor = None,
|
||||
max_num_tokens: Optional[int] = None,
|
||||
max_encoder_context_length: Tensor = None, # For cross attention
|
||||
host_request_types: Tensor = None,
|
||||
host_encoder_input_lengths: Tensor = None, # For cross attention
|
||||
@ -104,7 +104,7 @@ class LoraParams(object):
|
||||
self.lora_weights_pointers = lora_weights_pointers
|
||||
|
||||
self.host_context_lengths = host_context_lengths
|
||||
self.max_context_length = max_context_length
|
||||
self.max_num_tokens = max_num_tokens
|
||||
self.max_encoder_context_length = max_encoder_context_length
|
||||
self.host_request_types = host_request_types
|
||||
self.host_encoder_input_lengths = host_encoder_input_lengths
|
||||
@ -115,7 +115,7 @@ class LoraParams(object):
|
||||
lora_ranks=[self.lora_ranks[layer_idx]],
|
||||
lora_weights_pointers=[self.lora_weights_pointers[layer_idx]],
|
||||
host_context_lengths=self.host_context_lengths,
|
||||
max_context_length=self.max_context_length,
|
||||
max_num_tokens=self.max_num_tokens,
|
||||
max_encoder_context_length=self.max_encoder_context_length,
|
||||
host_request_types=self.host_request_types,
|
||||
host_encoder_input_lengths=self.host_encoder_input_lengths,
|
||||
@ -133,7 +133,7 @@ class LoraParams(object):
|
||||
[f"{lora_module}_lora_weights_pointers"]
|
||||
],
|
||||
host_context_lengths=self.host_context_lengths,
|
||||
max_context_length=self.max_context_length,
|
||||
max_num_tokens=self.max_num_tokens,
|
||||
max_encoder_context_length=self.max_encoder_context_length,
|
||||
host_request_types=self.host_request_types,
|
||||
host_encoder_input_lengths=self.host_encoder_input_lengths,
|
||||
|
||||
@ -47,7 +47,7 @@ def fc_gate_lora(hidden_states, lora, lora_layer_params):
|
||||
],
|
||||
host_request_types=mlp_fc_lora_params.host_request_types,
|
||||
host_context_lengths=mlp_fc_lora_params.host_context_lengths,
|
||||
max_context_length=mlp_fc_lora_params.max_context_length)
|
||||
max_num_tokens=mlp_fc_lora_params.max_num_tokens)
|
||||
|
||||
mlp_fc_lora, mlp_gate_lora = lora(hidden_states, mlp_in_lora_params)
|
||||
mlp_in_result = concat([mlp_gate_lora, mlp_fc_lora],
|
||||
|
||||
@ -24,12 +24,11 @@ from tensorrt_llm.layers.lora import LoraParams
|
||||
|
||||
from .._common import default_net, default_trtnet
|
||||
from .._utils import int32_array
|
||||
from ..functional import (AllReduceFusionParams, AllReduceStrategy,
|
||||
_add_plugin_info, _create_tensor, allreduce, cast,
|
||||
concat, constant, div, expand, gather_nd,
|
||||
is_gated_activation, non_gated_version, nonzero,
|
||||
repeat_interleave, scatter_nd, shape, softmax, split,
|
||||
sum, topk)
|
||||
from ..functional import (AllReduceFusionParams, _add_plugin_info,
|
||||
_create_tensor, allreduce, cast, concat, constant,
|
||||
div, expand, gather_nd, is_gated_activation,
|
||||
non_gated_version, nonzero, repeat_interleave,
|
||||
scatter_nd, shape, softmax, split, sum, topk)
|
||||
from ..layers import MLP, GatedMLP
|
||||
from ..mapping import Mapping
|
||||
from ..module import Module, ModuleList
|
||||
@ -531,7 +530,7 @@ class MoeOOTB(MOE):
|
||||
gate_lora_weights_pointers,
|
||||
}],
|
||||
host_context_lengths=lora_layer_params.host_context_lengths,
|
||||
max_context_length=lora_layer_params.max_context_length,
|
||||
max_num_tokens=lora_layer_params.max_num_tokens,
|
||||
max_encoder_context_length=lora_layer_params.
|
||||
max_encoder_context_length,
|
||||
host_request_types=lora_layer_params.host_request_types,
|
||||
@ -603,6 +602,10 @@ class MoeOOTB(MOE):
|
||||
expert_weights = split(experts_weights, 1, dim=0)
|
||||
|
||||
for i, expert in enumerate(self.experts):
|
||||
if self.mapping.has_moe_ep():
|
||||
index = i + self.experts_per_node * self.mapping.moe_ep_rank
|
||||
else:
|
||||
index = i
|
||||
# get mask token index
|
||||
non_zero_index = nonzero(experts_mask[i].view(
|
||||
concat([-1, hidden_size])))
|
||||
@ -627,16 +630,9 @@ class MoeOOTB(MOE):
|
||||
|
||||
output = output.view(shape(hidden_states))
|
||||
|
||||
need_ep_reduce = self.mapping.has_moe_ep(
|
||||
) and self.mapping.moe_ep_group is not None
|
||||
need_tp_reduce = self.mapping.has_moe_tp(
|
||||
) and self.mapping.moe_tp_group is not None
|
||||
if need_tp_reduce or need_ep_reduce:
|
||||
group = self.mapping.moe_ep_group if need_ep_reduce else self.mapping.moe_tp_group
|
||||
# TODO: remove this NCCL strategy WAR after fixed https://nvbugspro.nvidia.com/bug/4740067
|
||||
if self.tp_size > 1 and self.tp_group is not None:
|
||||
output = allreduce(output,
|
||||
group,
|
||||
strategy=AllReduceStrategy.NCCL,
|
||||
self.mapping.tp_group,
|
||||
reduce_fusion_params=reduce_fusion_params)
|
||||
|
||||
return output
|
||||
|
||||
@ -27,6 +27,7 @@ from .falcon.model import FalconForCausalLM, FalconModel
|
||||
from .gemma.model import GemmaForCausalLM
|
||||
from .gpt.config import GPTConfig
|
||||
from .gpt.model import GPTForCausalLM, GPTModel
|
||||
from .gptj.config import GPTJConfig
|
||||
from .gptj.model import GPTJForCausalLM, GPTJModel
|
||||
from .gptneox.model import GPTNeoXForCausalLM, GPTNeoXModel
|
||||
from .grok.model import GrokForCausalLM
|
||||
@ -65,6 +66,7 @@ __all__ = [
|
||||
'MedusaConfig',
|
||||
'MedusaForCausalLm',
|
||||
'ReDrafterForCausalLM',
|
||||
'GPTJConfig',
|
||||
'GPTJModel',
|
||||
'GPTJForCausalLM',
|
||||
'GPTNeoXModel',
|
||||
|
||||
@ -667,6 +667,7 @@ class EncoderModel(PretrainedModel):
|
||||
def prepare_inputs(self,
|
||||
max_batch_size,
|
||||
max_input_len,
|
||||
max_num_tokens,
|
||||
prompt_embedding_table_size: int = 0,
|
||||
lora_target_modules: List[str] = None,
|
||||
*args,
|
||||
@ -889,7 +890,7 @@ class EncoderModel(PretrainedModel):
|
||||
lora_params = LoraParams(
|
||||
lora_ranks=lora_ranks,
|
||||
lora_weights_pointers=lora_weights_pointers,
|
||||
max_context_length=max_input_len,
|
||||
max_num_tokens=max_num_tokens,
|
||||
host_request_types=host_request_types,
|
||||
host_context_lengths=host_context_lengths,
|
||||
)
|
||||
@ -1225,6 +1226,7 @@ class DecoderModel(PretrainedModel):
|
||||
max_beam_width,
|
||||
max_decoder_input_len,
|
||||
max_seq_len,
|
||||
max_num_tokens,
|
||||
max_encoder_input_len,
|
||||
gather_context_logits: bool = False,
|
||||
gather_generation_logits: bool = False,
|
||||
@ -1594,7 +1596,7 @@ class DecoderModel(PretrainedModel):
|
||||
lora_ranks=lora_ranks,
|
||||
lora_weights_pointers=lora_weights_pointers,
|
||||
host_context_lengths=host_context_lengths,
|
||||
max_context_length=max_decoder_input_len,
|
||||
max_num_tokens=max_num_tokens,
|
||||
max_encoder_context_length=max_encoder_input_len,
|
||||
host_request_types=host_request_types,
|
||||
host_encoder_input_lengths=host_encoder_input_lengths,
|
||||
|
||||
@ -15,14 +15,20 @@
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ..._utils import torch_dtype_to_str
|
||||
from ...layers import MoeConfig
|
||||
from ..modeling_utils import PretrainedConfig
|
||||
from ...logger import logger
|
||||
from ...mapping import Mapping
|
||||
from ..modeling_utils import PretrainedConfig, QuantConfig
|
||||
|
||||
|
||||
class GPTConfig(PretrainedConfig):
|
||||
|
||||
def __init__(self,
|
||||
*,
|
||||
gpt_variant: str = 'gpt2',
|
||||
bias: bool = True,
|
||||
q_scaling: float = 1.0,
|
||||
embedding_scale: Optional[float] = None,
|
||||
@ -30,8 +36,11 @@ class GPTConfig(PretrainedConfig):
|
||||
rotary_pct: float = 1.0,
|
||||
rotary_base: float = 10000.0,
|
||||
rotary_scaling: Optional[dict] = None,
|
||||
inner_layernorm: bool = False,
|
||||
norm_before_bmm1: bool = False,
|
||||
moe: Optional[Union[MoeConfig, dict]] = None,
|
||||
**kwargs):
|
||||
self.gpt_variant = gpt_variant
|
||||
self.bias = bias
|
||||
self.q_scaling = q_scaling
|
||||
self.embedding_scale = embedding_scale
|
||||
@ -39,6 +48,8 @@ class GPTConfig(PretrainedConfig):
|
||||
self.rotary_pct = rotary_pct
|
||||
self.rotary_base = rotary_base
|
||||
self.rotary_scaling = rotary_scaling
|
||||
self.inner_layernorm = inner_layernorm
|
||||
self.norm_before_bmm1 = norm_before_bmm1
|
||||
if moe is None:
|
||||
# Legacy MOE config fields
|
||||
moe = MoeConfig(
|
||||
@ -57,6 +68,7 @@ class GPTConfig(PretrainedConfig):
|
||||
def to_dict(self):
|
||||
output = super().to_dict()
|
||||
# Serialize the fields added in GPTConfig
|
||||
output['gpt_variant'] = self.gpt_variant
|
||||
output['bias'] = self.bias
|
||||
output['q_scaling'] = self.q_scaling
|
||||
output['embedding_scale'] = self.embedding_scale
|
||||
@ -65,5 +77,244 @@ class GPTConfig(PretrainedConfig):
|
||||
output['rotary_pct'] = self.rotary_pct
|
||||
output['rotary_base'] = self.rotary_base
|
||||
output['rotary_scaling'] = self.rotary_scaling
|
||||
output['inner_layernorm'] = self.inner_layernorm
|
||||
output['norm_before_bmm1'] = self.norm_before_bmm1
|
||||
output['moe'] = self.moe.to_dict()
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def from_hugging_face(
|
||||
cls,
|
||||
hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'],
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config: Optional[QuantConfig] = None,
|
||||
**kwargs):
|
||||
import transformers
|
||||
|
||||
from .convert import get_needed_padding
|
||||
|
||||
if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
|
||||
hf_config = hf_config_or_dir
|
||||
else:
|
||||
hf_config = transformers.AutoConfig.from_pretrained(
|
||||
hf_config_or_dir, trust_remote_code=True)
|
||||
|
||||
gpt_variant = kwargs.pop('gpt_variant', None)
|
||||
if gpt_variant is None:
|
||||
logger.info("Inferring gpt variant from path...")
|
||||
for v in [
|
||||
'starcoder2', 'starcoder', 'santacoder', 'gpt2',
|
||||
'persimmon', 'fuyu', 'kosmos-2', 'jais'
|
||||
]:
|
||||
if v in hf_config._name_or_path:
|
||||
gpt_variant = v
|
||||
break
|
||||
if gpt_variant == 'fuyu':
|
||||
gpt_variant = 'persimmon'
|
||||
|
||||
assert gpt_variant in [
|
||||
'gpt2', 'santacoder', 'starcoder', 'starcoder2', 'persimmon',
|
||||
'kosmos-2', 'jais'
|
||||
]
|
||||
logger.info(f"Gpt variant: {gpt_variant}")
|
||||
|
||||
if gpt_variant in ['starcoder2', 'persimmon']:
|
||||
hf_config.n_embd = hf_config.hidden_size
|
||||
hf_config.n_inner = hf_config.intermediate_size
|
||||
hf_config.n_head = hf_config.num_attention_heads
|
||||
hf_config.n_kv_head = hf_config.num_key_value_heads if hasattr(
|
||||
hf_config, 'num_key_value_heads') else hf_config.n_head
|
||||
hf_config.n_layer = hf_config.num_hidden_layers
|
||||
hf_config.n_positions = hf_config.max_position_embeddings
|
||||
hf_config.activation_function = 'gelu' if gpt_variant == 'starcoder2' else 'squared-relu'
|
||||
hf_config.layer_norm_epsilon = hf_config.norm_epsilon if gpt_variant == 'starcoder2' else hf_config.layer_norm_eps
|
||||
hf_config.bias = hf_config.use_bias if gpt_variant == 'starcoder2' else True
|
||||
hf_config.position_embedding_type = 'rope_gpt_neox'
|
||||
hf_config.rotary_base = hf_config.rope_theta
|
||||
hf_config.rotary_pct = getattr(hf_config, 'partial_rotary_factor',
|
||||
1.0)
|
||||
elif gpt_variant == "kosmos-2":
|
||||
hf_config.n_embd = hf_config.text_config.embed_dim
|
||||
hf_config.n_inner = hf_config.text_config.ffn_dim
|
||||
hf_config.n_head = hf_config.text_config.attention_heads
|
||||
hf_config.n_kv_head = hf_config.n_head
|
||||
hf_config.n_layer = hf_config.text_config.layers
|
||||
hf_config.n_positions = hf_config.text_config.max_position_embeddings
|
||||
hf_config.activation_function = hf_config.text_config.activation_function
|
||||
hf_config.layer_norm_epsilon = hf_config.text_config.layer_norm_eps
|
||||
hf_config.bias = True
|
||||
hf_config.vocab_size = hf_config.text_config.vocab_size
|
||||
else:
|
||||
if hf_config.n_inner is None:
|
||||
hf_config.n_inner = hf_config.n_embd * 4
|
||||
if gpt_variant in ['santacoder', 'starcoder']:
|
||||
hf_config.n_kv_head = 1
|
||||
else:
|
||||
hf_config.n_kv_head = hf_config.n_head
|
||||
|
||||
if gpt_variant == 'jais':
|
||||
hf_config.q_scaling = (hf_config.n_embd // hf_config.n_head)**0.5
|
||||
if hasattr(hf_config, 'width_scale'):
|
||||
hf_config.logits_scale = hf_config.width_scale
|
||||
else:
|
||||
hf_config.logits_scale = hf_config.mup_output_alpha * hf_config.mup_width_scale
|
||||
|
||||
if hasattr(hf_config, 'mup_embeddings_scale'):
|
||||
hf_config.embeddings_scale = hf_config.mup_embeddings_scale
|
||||
else:
|
||||
assert hasattr(hf_config, 'embeddings_scale')
|
||||
|
||||
hf_config.n_inner += get_needed_padding(hf_config.n_inner,
|
||||
mapping.tp_size)
|
||||
|
||||
if gpt_variant == 'kosmos-2':
|
||||
if hf_config.text_config.scale_embedding:
|
||||
hf_config.embeddings_scale = hf_config.n_embd**0.5
|
||||
|
||||
if dtype == 'auto':
|
||||
dtype = getattr(hf_config, 'torch_dtype', None)
|
||||
if dtype is None:
|
||||
dtype = 'float16'
|
||||
if isinstance(dtype, torch.dtype):
|
||||
dtype = torch_dtype_to_str(dtype)
|
||||
if dtype == 'float32':
|
||||
dtype = 'float16'
|
||||
|
||||
return cls(architecture=hf_config.architectures[0],
|
||||
dtype=dtype,
|
||||
num_hidden_layers=hf_config.n_layer,
|
||||
num_attention_heads=hf_config.n_head,
|
||||
num_key_value_heads=hf_config.n_kv_head,
|
||||
hidden_size=hf_config.n_embd,
|
||||
intermediate_size=hf_config.n_inner,
|
||||
norm_epsilon=hf_config.layer_norm_epsilon,
|
||||
vocab_size=hf_config.vocab_size,
|
||||
position_embedding_type=getattr(hf_config,
|
||||
'position_embedding_type',
|
||||
'learned_absolute'),
|
||||
max_position_embeddings=hf_config.n_positions,
|
||||
hidden_act=hf_config.activation_function,
|
||||
gpt_variant=gpt_variant,
|
||||
bias=getattr(hf_config, 'bias', True),
|
||||
apply_query_key_layer_scaling=getattr(
|
||||
hf_config, 'apply_query_key_layer_scaling', False),
|
||||
rotary_pct=getattr(hf_config, 'rotary_pct', 1.0),
|
||||
rotary_base=getattr(hf_config, 'rotary_base', 10000.0),
|
||||
rotary_scaling=getattr(hf_config, 'rotary_scaling', None),
|
||||
qk_layernorm=gpt_variant == 'persimmon',
|
||||
inner_layernorm=gpt_variant == 'kosmos-2',
|
||||
norm_before_bmm1=gpt_variant == 'kosmos-2',
|
||||
q_scaling=getattr(hf_config, 'q_scaling', 1),
|
||||
embedding_scale=getattr(hf_config, 'embeddings_scale', None),
|
||||
mapping=mapping,
|
||||
quantization=quant_config,
|
||||
**kwargs)
|
||||
|
||||
@classmethod
|
||||
def from_nemo(cls,
|
||||
nemo_ckpt_dir: str,
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config: Optional[QuantConfig] = None,
|
||||
**kwargs):
|
||||
import transformers
|
||||
|
||||
from .convert import (UnpackedNemoCheckpointDir, cpu_map_location,
|
||||
gpu_map_location, rename_keys)
|
||||
|
||||
load_model_on_cpu = kwargs.pop('load_model_on_cpu', False)
|
||||
nemo_rename_key = kwargs.pop('nemo_rename_key', [])
|
||||
layer_rename_config = {
|
||||
pattern.split(':')[0]: pattern.split(':')[1]
|
||||
for pattern in nemo_rename_key
|
||||
}
|
||||
|
||||
unpacked_checkpoints_dir = UnpackedNemoCheckpointDir(
|
||||
nemo_ckpt_dir, load_checkpoints_to_cpu=load_model_on_cpu)
|
||||
nemo_model_config = unpacked_checkpoints_dir.model_config
|
||||
|
||||
training_tp_size = nemo_model_config.get("tensor_model_parallel_size",
|
||||
1)
|
||||
training_pp_size = nemo_model_config.get("pipeline_model_parallel_size",
|
||||
1)
|
||||
|
||||
checkpoints_paths = unpacked_checkpoints_dir.get_checkpoints_paths(
|
||||
training_tp_size,
|
||||
training_pp_size,
|
||||
)
|
||||
if unpacked_checkpoints_dir._load_checkpoints_to_cpu:
|
||||
map_location_fn = cpu_map_location
|
||||
else:
|
||||
map_location_fn = gpu_map_location
|
||||
model_00 = torch.load(checkpoints_paths[0][0],
|
||||
map_location=map_location_fn)
|
||||
model_00 = rename_keys(model_00, layer_rename_config)
|
||||
vocab_size = model_00[
|
||||
"model.language_model.embedding.word_embeddings.weight"].shape[
|
||||
0] * training_tp_size
|
||||
del model_00
|
||||
|
||||
hf_config = transformers.GPT2Config(
|
||||
vocab_size=vocab_size,
|
||||
n_positions=nemo_model_config['max_position_embeddings'],
|
||||
n_embd=nemo_model_config['hidden_size'],
|
||||
n_layer=nemo_model_config['num_layers'],
|
||||
n_head=nemo_model_config['num_attention_heads'],
|
||||
n_inner=nemo_model_config['ffn_hidden_size'],
|
||||
activation_function=nemo_model_config['activation'],
|
||||
layer_norm_epsilon=nemo_model_config['layernorm_epsilon'],
|
||||
)
|
||||
hf_config.n_kv_head = hf_config.n_head
|
||||
hf_config.bias = nemo_model_config['bias']
|
||||
hf_config.apply_query_key_layer_scaling = False
|
||||
|
||||
hf_config.position_embedding_type = nemo_model_config.get(
|
||||
'position_embedding_type', 'learned_absolute')
|
||||
if hf_config.position_embedding_type == 'rope':
|
||||
hf_config.position_embedding_type = 'rope_gpt_neox'
|
||||
hf_config.rotary_base = nemo_model_config.get('rotary_base', 10000.0)
|
||||
hf_config.rotary_pct = nemo_model_config.get('rotary_percentage', 1.0)
|
||||
assert hf_config.rotary_pct >= 0 and hf_config.rotary_pct <= 1
|
||||
|
||||
rotary_scaling_factor = nemo_model_config.get(
|
||||
'seq_len_interpolation_factor', None)
|
||||
if rotary_scaling_factor is None:
|
||||
hf_config.rotary_scaling = None
|
||||
else:
|
||||
assert rotary_scaling_factor > 1
|
||||
hf_config.rotary_scaling = {
|
||||
'type': 'linear',
|
||||
'factor': rotary_scaling_factor
|
||||
}
|
||||
|
||||
if dtype == 'auto':
|
||||
dtype = nemo_model_config['precision']
|
||||
if dtype is None:
|
||||
dtype = 'float16'
|
||||
elif 'bf16' in dtype or 'bfloat16' in dtype:
|
||||
dtype = 'bfloat16'
|
||||
else:
|
||||
dtype = 'float16'
|
||||
|
||||
return cls(architecture='GPTForCausalLM',
|
||||
dtype=dtype,
|
||||
num_hidden_layers=hf_config.n_layer,
|
||||
num_attention_heads=hf_config.n_head,
|
||||
num_key_value_heads=hf_config.n_kv_head,
|
||||
hidden_size=hf_config.n_embd,
|
||||
intermediate_size=hf_config.n_inner,
|
||||
norm_epsilon=hf_config.layer_norm_epsilon,
|
||||
vocab_size=hf_config.vocab_size,
|
||||
position_embedding_type=hf_config.position_embedding_type,
|
||||
max_position_embeddings=hf_config.n_positions,
|
||||
hidden_act=hf_config.activation_function,
|
||||
bias=hf_config.bias,
|
||||
apply_query_key_layer_scaling=hf_config.
|
||||
apply_query_key_layer_scaling,
|
||||
rotary_pct=hf_config.rotary_pct,
|
||||
rotary_base=hf_config.rotary_base,
|
||||
rotary_scaling=hf_config.rotary_scaling,
|
||||
mapping=mapping,
|
||||
quantization=quant_config,
|
||||
**kwargs)
|
||||
|
||||
tensorrt_llm/models/gpt/convert.py (1581 lines): diff suppressed because it is too large.
@ -13,6 +13,8 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from ..._utils import pad_vocab_size
|
||||
from ...functional import (Tensor, is_gated_activation, non_gated_version, recv,
|
||||
send)
|
||||
@ -22,9 +24,12 @@ from ...layers import (MLP, MOE, Attention, AttentionMaskType, ColumnLinear,
|
||||
from ...lora_manager import LoraConfig, use_lora
|
||||
from ...mapping import Mapping
|
||||
from ...module import Module
|
||||
from ...quantization import QuantMode
|
||||
from ..modeling_utils import DecoderLayerList, DecoderModelForCausalLM
|
||||
from ...quantization import W8A8_SQ_PLUGIN_LIST, QuantAlgo, QuantMode
|
||||
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
|
||||
QuantConfig, check_share_embedding)
|
||||
from .config import GPTConfig
|
||||
from .convert import (load_hf_gpt, load_weights_from_hf_model,
|
||||
load_weights_from_nemo)
|
||||
|
||||
|
||||
def MLPFactory(hidden_size,
|
||||
@ -276,5 +281,123 @@ class GPTForCausalLM(DecoderModelForCausalLM):
|
||||
}
|
||||
super().__init__(config, transformer, lm_head)
|
||||
|
||||
@classmethod
|
||||
def from_hugging_face(
|
||||
cls,
|
||||
hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'],
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config: Optional[QuantConfig] = None,
|
||||
**kwargs):
|
||||
''' Create a GPTForCausalLM object from given parameters
|
||||
'''
|
||||
import transformers
|
||||
|
||||
load_model_on_cpu = kwargs.pop('load_model_on_cpu', False)
|
||||
|
||||
assert hf_model_or_dir is not None
|
||||
use_preloading = isinstance(hf_model_or_dir,
|
||||
transformers.PreTrainedModel)
|
||||
if use_preloading:
|
||||
hf_model = hf_model_or_dir
|
||||
hf_config_or_dir = hf_model.config
|
||||
else:
|
||||
hf_model_dir = hf_model_or_dir
|
||||
hf_config_or_dir = hf_model_or_dir
|
||||
|
||||
config = GPTConfig.from_hugging_face(hf_config_or_dir,
|
||||
dtype=dtype,
|
||||
mapping=mapping,
|
||||
quant_config=quant_config,
|
||||
**kwargs)
|
||||
|
||||
if not use_preloading:
|
||||
hf_model = load_hf_gpt(hf_model_dir, load_model_on_cpu)
|
||||
weights = load_weights_from_hf_model(hf_model, config)
|
||||
|
||||
check_share_embedding(weights, config)
|
||||
model = cls(config)
|
||||
model.load(weights)
|
||||
return model
|
||||
|
||||
@classmethod
|
||||
def quantize(
|
||||
cls,
|
||||
hf_model_dir: str,
|
||||
output_dir: str,
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config: Optional[QuantConfig] = None,
|
||||
*,
|
||||
device: str = 'cuda',
|
||||
calib_dataset: str = 'cnn_dailymail',
|
||||
calib_batches: int = 512,
|
||||
calib_batch_size: int = 1,
|
||||
calib_max_seq_length: int = 512,
|
||||
random_seed: int = 1234,
|
||||
tokenizer_max_seq_length: int = 2048,
|
||||
**kwargs,
|
||||
):
|
||||
DEFAULT_MODELOPT_FLOW = [
|
||||
QuantAlgo.W4A16_AWQ, QuantAlgo.FP8, QuantAlgo.W8A8_SQ_PER_CHANNEL,
|
||||
QuantAlgo.W4A8_AWQ
|
||||
]
|
||||
config = GPTConfig.from_hugging_face(hf_model_dir,
|
||||
dtype=dtype,
|
||||
mapping=mapping,
|
||||
quant_config=quant_config,
|
||||
**kwargs)
|
||||
|
||||
if quant_config.quant_algo in DEFAULT_MODELOPT_FLOW:
|
||||
super().quantize(hf_model_dir,
|
||||
output_dir,
|
||||
dtype=config.dtype,
|
||||
mapping=config.mapping,
|
||||
quant_config=config.quantization,
|
||||
device=device,
|
||||
calib_dataset=calib_dataset,
|
||||
calib_batches=calib_batches,
|
||||
calib_batch_size=calib_batch_size,
|
||||
calib_max_seq_length=calib_max_seq_length,
|
||||
random_seed=random_seed,
|
||||
tokenizer_max_seq_length=tokenizer_max_seq_length)
|
||||
else:
|
||||
# non-modelopt, the legacy TRT-LLM native quantization algorithm:
|
||||
# sq, int4/int8 weights only, int8 kv cache
|
||||
NATIVE_QUANT_FLOW = [QuantAlgo.W4A16, QuantAlgo.W8A16, None
|
||||
] + W8A8_SQ_PLUGIN_LIST
|
||||
is_valid_native_quant = (quant_config.quant_algo in NATIVE_QUANT_FLOW) and \
|
||||
(quant_config.kv_cache_quant_algo in [QuantAlgo.INT8, None])
|
||||
assert quant_config.quant_algo is not None or quant_config.kv_cache_quant_algo is not None, \
|
||||
"There is no point to call the quantize function if both quant_algo and kv_cache_quant_algo is None"
|
||||
assert is_valid_native_quant, f"Internal error: shall call Modelopt for this quantization {quant_config}"
|
||||
|
||||
from . import convert
|
||||
convert.quantize(hf_model_dir,
|
||||
output_dir,
|
||||
config=config,
|
||||
device=device,
|
||||
calib_dataset=calib_dataset)
|
||||
|
||||
@classmethod
|
||||
def from_nemo(cls,
|
||||
nemo_ckpt_dir: str,
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config: Optional[QuantConfig] = None,
|
||||
**kwargs):
|
||||
config = GPTConfig.from_nemo(nemo_ckpt_dir,
|
||||
dtype=dtype,
|
||||
mapping=mapping,
|
||||
quant_config=quant_config,
|
||||
**kwargs)
|
||||
|
||||
weights = load_weights_from_nemo(nemo_ckpt_dir, config, **kwargs)
|
||||
|
||||
check_share_embedding(weights, config)
|
||||
model = cls(config)
|
||||
model.load(weights)
|
||||
return model
|
||||
|
||||
def use_lora(self, lora_config: LoraConfig):
|
||||
use_lora(self, lora_config, self.trtllm_modules_to_hf_modules)
|
||||
|
||||
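To make the new GPT quantization entry point concrete, here is a hedged usage sketch: the checkpoint paths are assumptions, and the call follows the `GPTForCausalLM.quantize` classmethod signature added above (INT8 KV cache stays on the native, non-Modelopt branch).

```python
# Hypothetical paths; the call shape follows the GPTForCausalLM.quantize classmethod above.
from tensorrt_llm.models import GPTForCausalLM
from tensorrt_llm.models.modeling_utils import QuantConfig
from tensorrt_llm.quantization import QuantAlgo

# INT8 KV cache is handled by the native TRT-LLM quantization flow shown above.
quant_config = QuantConfig(kv_cache_quant_algo=QuantAlgo.INT8)
GPTForCausalLM.quantize('gpt2-medium',               # HF model dir (assumption)
                        './gpt2-int8kv-checkpoint',  # output dir (assumption)
                        dtype='float16',
                        quant_config=quant_config,
                        calib_dataset='cnn_dailymail')
```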
tensorrt_llm/models/gptj/config.py (new file, 63 lines)
@ -0,0 +1,63 @@
|
||||
from typing import Mapping, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from ..._utils import torch_dtype_to_str
|
||||
from ...mapping import Mapping
|
||||
from ..modeling_utils import PretrainedConfig, QuantConfig
|
||||
|
||||
|
||||
class GPTJConfig(PretrainedConfig):
|
||||
"""
|
||||
This is the configuration class to store the configuration of GPTJ model.
|
||||
"""
|
||||
|
||||
def __init__(self, *, rotary_dim: int = 64, **kwargs):
|
||||
self.rotary_dim = rotary_dim
|
||||
super().__init__(**kwargs)
|
||||
|
||||
def to_dict(self):
|
||||
output = super().to_dict()
|
||||
output.update(rotary_dim=self.rotary_dim)
|
||||
return output
|
||||
|
||||
@classmethod
|
||||
def from_hugging_face(
|
||||
cls,
|
||||
hf_config_or_dir: Union[str, 'transformers.PretrainedConfig'],
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config: Optional[QuantConfig] = None,
|
||||
**kwargs):
|
||||
import transformers
|
||||
|
||||
if isinstance(hf_config_or_dir, transformers.PretrainedConfig):
|
||||
hf_config = hf_config_or_dir
|
||||
else:
|
||||
hf_config_dir = str(hf_config_or_dir)
|
||||
hf_config = transformers.AutoConfig.from_pretrained(
|
||||
hf_config_dir, trust_remote_code=True)
|
||||
|
||||
if dtype == 'auto':
|
||||
dtype = getattr(hf_config, 'torch_dtype', None)
|
||||
if dtype is None:
|
||||
dtype = 'float16'
|
||||
if isinstance(dtype, torch.dtype):
|
||||
dtype = torch_dtype_to_str(dtype)
|
||||
if dtype == 'float32':
|
||||
dtype = 'float16'
|
||||
|
||||
return cls(architecture=hf_config.architectures[0],
|
||||
dtype=dtype,
|
||||
num_hidden_layers=hf_config.num_hidden_layers,
|
||||
num_attention_heads=hf_config.num_attention_heads,
|
||||
hidden_size=hf_config.hidden_size,
|
||||
norm_epsilon=hf_config.layer_norm_epsilon,
|
||||
vocab_size=hf_config.vocab_size,
|
||||
position_embedding_type='rope_gptj',
|
||||
max_position_embeddings=hf_config.max_position_embeddings,
|
||||
hidden_act='gelu',
|
||||
rotary_dim=hf_config.rotary_dim,
|
||||
mapping=mapping,
|
||||
quantization=quant_config,
|
||||
**kwargs)
|
||||
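The `dtype='auto'` branch above can be summarized as a tiny pure function; the sketch below is an assumption-labeled illustration of that resolution order (HF `torch_dtype`, then a float16 fallback, with float32 downgraded to float16), not the library API.

```python
# Standalone illustration of the dtype resolution in GPTJConfig.from_hugging_face above.
import torch
from tensorrt_llm._utils import torch_dtype_to_str

def resolve_dtype(requested, hf_torch_dtype):
    dtype = hf_torch_dtype if requested == 'auto' else requested
    if dtype is None:
        dtype = 'float16'
    if isinstance(dtype, torch.dtype):
        dtype = torch_dtype_to_str(dtype)
    if dtype == 'float32':
        dtype = 'float16'  # fall back to fp16 when the checkpoint is fp32
    return dtype

assert resolve_dtype('auto', torch.float32) == 'float16'
assert resolve_dtype('auto', None) == 'float16'
assert resolve_dtype('bfloat16', torch.float32) == 'bfloat16'
```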
tensorrt_llm/models/gptj/convert.py (new file, 205 lines)
@ -0,0 +1,205 @@
|
||||
import time
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
|
||||
from tensorrt_llm.quantization import QuantAlgo
|
||||
|
||||
from .config import GPTJConfig
|
||||
|
||||
|
||||
def split(weight: torch.Tensor,
|
||||
tp_size: int,
|
||||
rank: int = 0,
|
||||
dim: int = 0) -> torch.Tensor:
|
||||
if tp_size == 1:
|
||||
return weight
|
||||
elif weight.ndim == 1:
|
||||
return torch.chunk(weight, tp_size)[rank].contiguous()
|
||||
else:
|
||||
return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()
|
||||
|
||||
|
||||
def split_matrix(weight: torch.Tensor, tp_size: int, rank: int,
|
||||
dim: int) -> torch.Tensor:
|
||||
return split(weight, tp_size, rank, dim=dim)
|
||||
|
||||
|
||||
def get_weight(params: Dict[str, torch.Tensor], prefix: str,
|
||||
dtype: torch.dtype) -> torch.Tensor:
|
||||
if f'{prefix}.weight' not in params:
|
||||
return None
|
||||
return params[f'{prefix}.weight'].to(dtype).detach().cpu()
|
||||
|
||||
|
||||
def get_bias(params: Dict[str, torch.Tensor], prefix: str,
|
||||
dtype: torch.dtype) -> torch.Tensor:
|
||||
if f'{prefix}.bias' not in params:
|
||||
return None
|
||||
return params[f'{prefix}.bias'].to(dtype).detach().cpu()
|
||||
|
||||
|
||||
def get_weight_and_bias(params: Dict[str, torch.Tensor], prefix: str,
|
||||
dtype: torch.dtype) -> Tuple[torch.Tensor]:
|
||||
return get_weight(params, prefix, dtype), get_bias(params, prefix, dtype)
|
||||
|
||||
|
||||
def get_tllm_linear_weight(
|
||||
weight: torch.Tensor,
|
||||
prefix: str,
|
||||
bias: Optional[torch.Tensor] = None,
|
||||
use_weight_only: bool = False,
|
||||
plugin_weight_only_quant_type: torch.dtype = torch.int8
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
results = {}
|
||||
if use_weight_only:
|
||||
v = weight.t().contiguous()
|
||||
processed_torch_weights, torch_weight_scales = \
|
||||
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
|
||||
v, plugin_weight_only_quant_type)
|
||||
results[f'{prefix}.weight'] = processed_torch_weights
|
||||
results[f'{prefix}.per_channel_scale'] = torch_weight_scales
|
||||
else:
|
||||
results[f'{prefix}.weight'] = weight.contiguous()
|
||||
|
||||
if bias is not None:
|
||||
results[f'{prefix}.bias'] = bias
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def get_tllm_param(
|
||||
param: torch.Tensor,
|
||||
name: str,
|
||||
use_weight_only: bool = False,
|
||||
plugin_weight_only_quant_type: torch.dtype = torch.int8
|
||||
) -> Dict[str, torch.Tensor]:
|
||||
results = {}
|
||||
if name.endswith('.weight') and use_weight_only:
|
||||
v = param.t().contiguous()
|
||||
processed_torch_weights, torch_weight_scales = \
|
||||
torch.ops.trtllm.symmetric_quantize_last_axis_of_batched_matrix(
|
||||
v, plugin_weight_only_quant_type)
|
||||
results[name] = processed_torch_weights
|
||||
results[name.replace('weight',
|
||||
'per_channel_scale')] = torch_weight_scales
|
||||
else:
|
||||
results[name] = param
|
||||
|
||||
return results
|
||||
|
||||
|
||||
def load_weights_from_hf_model(hf_model, config: GPTJConfig):
|
||||
quant_algo = config.quantization.quant_algo
|
||||
use_weight_only = quant_algo in [QuantAlgo.W8A16, QuantAlgo.W4A16]
|
||||
if quant_algo == QuantAlgo.W8A16:
|
||||
plugin_weight_only_quant_type = torch.int8
|
||||
elif quant_algo == QuantAlgo.W4A16:
|
||||
plugin_weight_only_quant_type = torch.quint4x2
|
||||
else:
|
||||
plugin_weight_only_quant_type = None
|
||||
|
||||
weights = {}
|
||||
tik = time.time()
|
||||
|
||||
model_params = dict(hf_model.named_parameters())
|
||||
dtype = getattr(torch, config.dtype)
|
||||
num_hidden_layers = config.num_hidden_layers
|
||||
mapping = config.mapping
|
||||
|
||||
layers_range = mapping.pp_layers(num_hidden_layers)
|
||||
for l in layers_range:
|
||||
prefix = f'transformer.h.{l}'
|
||||
tllm_prex = f'transformer.layers.{l-layers_range[0]}'
|
||||
# Attention QKV (no bias)
|
||||
q_weight = get_weight(model_params, f'{prefix}.attn.q_proj', dtype)
|
||||
k_weight = get_weight(model_params, f'{prefix}.attn.k_proj', dtype)
|
||||
v_weight = get_weight(model_params, f'{prefix}.attn.v_proj', dtype)
|
||||
q_w = split_matrix(q_weight, mapping.tp_size, mapping.tp_rank, dim=0)
|
||||
k_w = split_matrix(k_weight, mapping.tp_size, mapping.tp_rank, dim=0)
|
||||
v_w = split_matrix(v_weight, mapping.tp_size, mapping.tp_rank, dim=0)
|
||||
qkv_w = torch.concatenate([q_w, k_w, v_w], dim=0)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(qkv_w, f'{tllm_prex}.attention.qkv', None,
|
||||
use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
# Attention dense (no bias)
|
||||
attn_dense_weight = get_weight(model_params, f'{prefix}.attn.out_proj',
|
||||
dtype)
|
||||
attn_dense_w = split_matrix(attn_dense_weight,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=1)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(attn_dense_w, f'{tllm_prex}.attention.dense',
|
||||
None, use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
# MLP fc_in (with bias)
|
||||
mlp_fc_weight, mlp_fc_bias = get_weight_and_bias(
|
||||
model_params, f'{prefix}.mlp.fc_in', dtype)
|
||||
mlp_fc_w = split_matrix(mlp_fc_weight,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
mlp_fc_b = split_matrix(mlp_fc_bias,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(mlp_fc_w, f'{tllm_prex}.mlp.fc', mlp_fc_b,
|
||||
use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
# MLP fc_out (with bias)
|
||||
mlp_proj_weight, mlp_proj_bias = get_weight_and_bias(
|
||||
model_params, f'{prefix}.mlp.fc_out', dtype)
|
||||
mlp_proj_w = split_matrix(mlp_proj_weight,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=1)
|
||||
# Only rank 0 keeps the real bias; other TP ranks use zeros so the bias is added once after the all-reduce
|
||||
if mapping.tp_size > 1 and mapping.tp_rank > 0:
|
||||
mlp_proj_bias = torch.zeros(mlp_proj_weight.shape[0],
|
||||
dtype=mlp_proj_weight.dtype)
|
||||
weights.update(
|
||||
get_tllm_linear_weight(mlp_proj_w, f'{tllm_prex}.mlp.proj',
|
||||
mlp_proj_bias, use_weight_only,
|
||||
plugin_weight_only_quant_type))
|
||||
|
||||
input_ln_weight, input_ln_bias = get_weight_and_bias(
|
||||
model_params, f'{prefix}.ln_1', dtype)
|
||||
weights[f'{tllm_prex}.input_layernorm.weight'] = input_ln_weight
|
||||
weights[f'{tllm_prex}.input_layernorm.bias'] = input_ln_bias
|
||||
|
||||
if mapping.is_first_pp_rank():
|
||||
# Embedding
|
||||
embed_w = get_weight(model_params, 'transformer.wte', dtype)
|
||||
if config.use_parallel_embedding:
|
||||
embed_w = split_matrix(embed_w,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
weights['transformer.vocab_embedding.weight'] = embed_w
|
||||
|
||||
if mapping.is_last_pp_rank():
|
||||
# lm_head weight and bias
|
||||
lm_head_w, ln_head_bias = get_weight_and_bias(model_params, 'lm_head',
|
||||
dtype)
|
||||
weights['lm_head.weight'] = split_matrix(lm_head_w,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
weights['lm_head.bias'] = split_matrix(ln_head_bias,
|
||||
mapping.tp_size,
|
||||
mapping.tp_rank,
|
||||
dim=0)
|
||||
ln_f_w, ln_f_b = get_weight_and_bias(model_params, 'transformer.ln_f',
|
||||
dtype)
|
||||
# ln_f weight and bias
|
||||
weights['transformer.ln_f.weight'] = ln_f_w
|
||||
if ln_f_b is not None:
|
||||
weights['transformer.ln_f.bias'] = ln_f_b
|
||||
|
||||
tok = time.time()
|
||||
t = time.strftime('%H:%M:%S', time.gmtime(tok - tik))
|
||||
print(f'Weights loaded. Total time: {t}')
|
||||
return weights
|
||||
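The tensor-parallel sharding used throughout this converter reduces to `torch.chunk` along one dimension. Below is a self-contained rerun of the `split` helper above on a toy tensor (the 4x4 weight is an illustration, not from a checkpoint); it is copied here only so the snippet runs on its own.

```python
# Standalone illustration: each TP rank keeps one contiguous chunk of the weight.
import torch

def split(weight: torch.Tensor, tp_size: int, rank: int = 0, dim: int = 0) -> torch.Tensor:
    # Copied from the helper above so this snippet is runnable by itself.
    if tp_size == 1:
        return weight
    elif weight.ndim == 1:
        return torch.chunk(weight, tp_size)[rank].contiguous()
    else:
        return torch.chunk(weight, tp_size, dim=dim)[rank].contiguous()

w = torch.arange(16.0).reshape(4, 4)
assert split(w, tp_size=2, rank=0, dim=0).shape == (2, 4)  # row shard (e.g. QKV, fc_in)
assert split(w, tp_size=2, rank=1, dim=1).shape == (4, 2)  # column shard (e.g. out_proj, fc_out)
```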
@ -13,18 +13,23 @@
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
|
||||
from typing import Optional, Union
|
||||
|
||||
from ..._utils import pad_vocab_size
|
||||
from ...functional import PositionEmbeddingType, Tensor, allreduce
|
||||
from ...layers import (MLP, Attention, AttentionMaskType, ColumnLinear,
|
||||
Embedding, LayerNorm)
|
||||
from ...mapping import Mapping
|
||||
from ...module import Module
|
||||
from ..modeling_utils import (DecoderLayerList, DecoderModelForCausalLM,
|
||||
PretrainedConfig)
|
||||
check_share_embedding)
|
||||
from .config import GPTJConfig
|
||||
from .convert import load_weights_from_hf_model
|
||||
|
||||
|
||||
class GPTJDecoderLayer(Module):
|
||||
|
||||
def __init__(self, config: PretrainedConfig, layer_idx: int):
|
||||
def __init__(self, config: GPTJConfig, layer_idx: int):
|
||||
super().__init__()
|
||||
self.layer_idx = layer_idx
|
||||
self.config = config
|
||||
@ -104,7 +109,7 @@ class GPTJDecoderLayer(Module):
|
||||
|
||||
class GPTJModel(Module):
|
||||
|
||||
def __init__(self, config: PretrainedConfig):
|
||||
def __init__(self, config: GPTJConfig):
|
||||
super().__init__()
|
||||
self.config = config
|
||||
|
||||
@ -144,9 +149,9 @@ class GPTJModel(Module):
|
||||
|
||||
|
||||
class GPTJForCausalLM(DecoderModelForCausalLM):
|
||||
config_class = GPTJConfig
|
||||
|
||||
def __init__(self, config: PretrainedConfig):
|
||||
self.check_config(config)
|
||||
def __init__(self, config: GPTJConfig):
|
||||
transformer = GPTJModel(config)
|
||||
vocab_size_padded = pad_vocab_size(config.vocab_size,
|
||||
config.mapping.tp_size)
|
||||
@ -162,5 +167,36 @@ class GPTJForCausalLM(DecoderModelForCausalLM):
|
||||
lm_head = None
|
||||
super().__init__(config, transformer, lm_head)
|
||||
|
||||
def check_config(self, config):
|
||||
config.set_if_not_exist('rotary_dim', 64)
|
||||
@classmethod
|
||||
def from_hugging_face(
|
||||
cls,
|
||||
hf_model_or_dir: Union[str, 'transformers.PreTrainedModel'],
|
||||
dtype: str = 'auto',
|
||||
mapping: Optional[Mapping] = None,
|
||||
quant_config=None,
|
||||
**kwargs):
|
||||
import transformers
|
||||
use_preloading = isinstance(hf_model_or_dir,
|
||||
transformers.PreTrainedModel)
|
||||
if use_preloading:
|
||||
hf_model = hf_model_or_dir
|
||||
hf_config_or_dir = hf_model.config
|
||||
else:
|
||||
hf_model_dir = hf_model_or_dir
|
||||
hf_config_or_dir = hf_model_or_dir
|
||||
|
||||
config = GPTJConfig.from_hugging_face(hf_config_or_dir,
|
||||
dtype=dtype,
|
||||
mapping=mapping,
|
||||
quant_config=quant_config,
|
||||
**kwargs)
|
||||
|
||||
if not use_preloading:
|
||||
hf_model = transformers.AutoModelForCausalLM.from_pretrained(
|
||||
hf_model_dir, torch_dtype='auto', trust_remote_code=True)
|
||||
weights = load_weights_from_hf_model(hf_model, config)
|
||||
|
||||
check_share_embedding(weights, config)
|
||||
model = GPTJForCausalLM(config)
|
||||
model.load(weights)
|
||||
return model
|
||||
|
||||
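As a usage sketch of the new GPT-J flow (the model path and output directory are placeholders, and `save_checkpoint` is assumed to be the usual `PretrainedModel` helper rather than something introduced by this change):

```python
# Hypothetical usage: convert a Hugging Face GPT-J checkpoint to TensorRT-LLM format.
from tensorrt_llm.models import GPTJForCausalLM

model = GPTJForCausalLM.from_hugging_face('EleutherAI/gpt-j-6b', dtype='float16')
model.save_checkpoint('./gptj-trtllm-checkpoint')  # assumed PretrainedModel helper
```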
@ -21,7 +21,6 @@ import torch
|
||||
|
||||
from ..._utils import torch_dtype_to_str
|
||||
from ...layers import MoeConfig
|
||||
from ...logger import logger
|
||||
from ...mapping import Mapping
|
||||
from ..modeling_utils import PretrainedConfig, QuantConfig
|
||||
|
||||
@ -146,14 +145,9 @@ class LLaMAConfig(PretrainedConfig):
|
||||
dtype = torch_dtype_to_str(dtype)
|
||||
if dtype == 'float32':
|
||||
dtype = 'float16'
|
||||
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
|
||||
0).major < 8:
|
||||
logger.warning(
|
||||
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
|
||||
dtype = 'float16'
|
||||
|
||||
return cls(
|
||||
architecture='LlamaForCausalLM',
|
||||
architecture=hf_config.architectures[0],
|
||||
dtype=dtype,
|
||||
num_hidden_layers=hf_config.num_hidden_layers,
|
||||
num_attention_heads=hf_config.num_attention_heads,
|
||||
@ -208,11 +202,6 @@ class LLaMAConfig(PretrainedConfig):
|
||||
|
||||
if dtype == 'auto':
|
||||
dtype = 'bfloat16'
|
||||
if dtype == 'bfloat16' and torch.cuda.get_device_properties(
|
||||
0).major < 8:
|
||||
logger.warning(
|
||||
"Pre SM 80 GPUs do not support bfloat16, fallback to float16")
|
||||
dtype = 'float16'
|
||||
|
||||
if meta_config.get('use_scaled_rope'):
|
||||
rotary_scaling = {"type": "llama3"}
|
||||
|
||||
@ -14,7 +14,6 @@
|
||||
# limitations under the License.
|
||||
import copy
|
||||
import functools
|
||||
import json
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
@ -473,8 +472,8 @@ def fp8_per_channel_quant_weight_gpu(weight, clamp_val, rank=0):
|
||||
xmax = x.abs().max(-1, keepdim=True).values
|
||||
# minimum scaling factor.
|
||||
torch_weight_scales = (xmax / 448.0).clamp(min=1.0 / (448.0 * 512.0))
|
||||
out = x / torch_weight_scales
|
||||
torch_weight_scales = torch_weight_scales.reshape(-1)
|
||||
out = x * 448.0 / xmax
|
||||
out = torch.clamp(out, -448, 448)
|
||||
processed_torch_weights = out.to(torch.float8_e4m3fn)
|
||||
|
||||
@ -1315,13 +1314,12 @@ def quantize(hf_model_dir: str,
|
||||
'''
|
||||
#TODO: currently only smooth quant and kv cache quantization are supported, needs to support mode quant algorithm calling modelopt
|
||||
|
||||
with open(os.path.join(output_dir, 'config.json'), 'w') as f:
|
||||
json.dump(config.to_dict(), f, indent=4)
|
||||
config.to_json_file(os.path.join(output_dir, 'config.json'))
|
||||
|
||||
mapping = config.mapping
|
||||
assert mapping.rank == -1, "You shall call quantize only once in one rank, assert rank==-1 for precaution"
|
||||
quant_config = config.quantization
|
||||
|
||||
quant_config = config.quantization
|
||||
use_smooth_quant = quant_config.use_plugin_sq
|
||||
int8_kv_cache = quant_config.kv_cache_quant_algo == QuantAlgo.INT8
|
||||
|
||||
|
||||
@ -14,6 +14,8 @@
|
||||
# limitations under the License.
|
||||
from typing import Optional, Union
|
||||
|
||||
import transformers
|
||||
|
||||
from ..._common import default_net
|
||||
from ..._utils import pad_vocab_size
|
||||
from ...functional import (AllReduceFusionOp, AllReduceFusionParams, Tensor,
|
||||
@ -323,7 +325,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
|
||||
weights = load_weights_from_hf_model(hf_model, config)
|
||||
|
||||
check_share_embedding(weights, config)
|
||||
model = LLaMAForCausalLM(config)
|
||||
model = cls(config)
|
||||
model.load(weights)
|
||||
return model
|
||||
|
||||
@ -349,7 +351,7 @@ class LLaMAForCausalLM(DecoderModelForCausalLM):
|
||||
weights = load_weights_from_meta_ckpt(meta_ckpt_dir, config)
|
||||
|
||||
check_share_embedding(weights, config)
|
||||
model = LLaMAForCausalLM(config)
|
||||
model = cls(config)
|
||||
model.load(weights)
|
||||
return model
|
||||
|
||||
|
||||
@ -615,7 +615,7 @@ class PretrainedModel(Module,
|
||||
model_inputs['lora_ranks'],
|
||||
model_inputs['lora_weights_pointers'],
|
||||
host_context_lengths=model_inputs['host_context_lengths'],
|
||||
max_context_length=max_input_len,
|
||||
max_num_tokens=max_num_tokens,
|
||||
host_request_types=model_inputs['host_request_types'])
|
||||
if model_inputs['spec_decoding_params'] is not None:
|
||||
result['spec_decoding_params'] = model_inputs[
|
||||
@ -757,6 +757,10 @@ def fuse_gate_mlp(
|
||||
from ..quantization.quantize import fp8_quantize
|
||||
|
||||
quant_algo = model.config.quantization.quant_algo
|
||||
if quant_algo != QuantAlgo.FP8 and quant_algo is not None:
|
||||
logger.warning("fuse_gate_mlp cannot be done for this model. Skipping.")
|
||||
return model
|
||||
|
||||
for name, mlp, layer in model.named_modules_with_parent():
|
||||
if isinstance(mlp, GatedMLP):
|
||||
init_params = get_init_params(mlp)
|
||||
|
||||
@ -18,7 +18,6 @@ import torch

from ..._utils import torch_dtype_to_str
from ...layers import MoeConfig
from ...logger import logger
from ...mapping import Mapping
from ..modeling_utils import PretrainedConfig, QuantConfig

@ -123,14 +122,9 @@ class QWenConfig(PretrainedConfig):
        dtype = torch_dtype_to_str(dtype)
        if dtype == 'float32':
            dtype = 'float16'
        if dtype == 'bfloat16' and torch.cuda.get_device_properties(
                0).major < 8:
            logger.warning(
                "Pre SM 80 GPUs do not support bfloat16, fallback to float16")
            dtype = 'float16'

        return cls(
            architecture='QWenForCausalLM',
            architecture=hf_config.architectures[0],
            dtype=dtype,
            num_hidden_layers=hf_config.num_hidden_layers,
            num_attention_heads=hf_config.num_attention_heads,

@ -12,4 +12,4 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
__version__ = "0.12.0.dev2024072301"
__version__ = "0.12.0.dev2024072302"

@ -21,6 +21,10 @@ from utils.util import force_ampere, similar

from tensorrt_llm.models.llama.model import LLaMAForCausalLM

skip_single_gpu = pytest.mark.skipif(
    torch.cuda.device_count() < 2,
    reason="The test needs at least 2 GPUs, skipping")

# The unittests are based on the tiny-llama, which is fast to build and run.
# There are other tests based on llama-7B model, such as the end-to-end tests in test_e2e.py, and parallel tests in
# test_llm_multi_gpu.py.

114 tests/hlapi/test_llm_models.py Normal file
@ -0,0 +1,114 @@
from typing import List, Optional

import pytest
import torch

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.hlapi import QuantAlgo, QuantConfig

try:
    from .test_llm import get_model_path
except ImportError:
    from test_llm import get_model_path

import os
import sys

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import force_ampere, similar, skip_pre_hopper

gptj_model_path = get_model_path('gpt-j-6b')
gpt2_model_path = get_model_path('gpt2-medium')
starcoder2_model_path = get_model_path('starcoder2-3b')

sampling_params = SamplingParams(max_new_tokens=10)


def llm_test_harness(model_dir: str,
                     prompts: List[str],
                     references: List[str],
                     *,
                     sampling_params: Optional[SamplingParams] = None,
                     similar_threshold: float = 0.8,
                     **llm_kwargs):

    # skip if there are not enough GPUs
    tp_size = llm_kwargs.get('tensor_parallel_size', 1)
    pp_size = llm_kwargs.get('pipeline_parallel_size', 1)
    world_size = tp_size * pp_size
    if world_size > torch.cuda.device_count():
        pytest.skip(
            f"world_size ({world_size}) is greater than available GPUs ({torch.cuda.device_count()})"
        )

    llm = LLM(model_dir, tokenizer=model_dir, **llm_kwargs)
    outputs = llm.generate(prompts, sampling_params=sampling_params)
    for out, ref in zip(outputs, references):
        assert similar(out.outputs[0].text, ref, threshold=similar_threshold)


@force_ampere
def test_llm_gptj():
    llm_test_harness(gptj_model_path,
                     prompts=["A B C"],
                     references=["D E F G H I J K L M"],
                     sampling_params=sampling_params)


@force_ampere
def test_llm_gptj_int4_weight_only():
    quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16)
    llm_test_harness(gptj_model_path,
                     prompts=["A B C"],
                     references=["D E F G H I J K L M"],
                     sampling_params=sampling_params,
                     quant_config=quant_config)


@force_ampere
def test_llm_gptj_tp2():
    llm_test_harness(gptj_model_path,
                     prompts=["A B C"],
                     references=["D E F G H I J K L M"],
                     sampling_params=sampling_params,
                     tensor_parallel_size=2)


@force_ampere
def test_llm_gpt2():
    llm_test_harness(gpt2_model_path,
                     prompts=["A B C"],
                     references=["D E F G H I J K L M"],
                     sampling_params=sampling_params)


@skip_pre_hopper
def test_llm_gpt2_fp8():
    quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
    llm_test_harness(gpt2_model_path,
                     prompts=["A B C"],
                     references=["D E F G H I J K L M"],
                     sampling_params=sampling_params,
                     quant_config=quant_config)


@force_ampere
def test_llm_starcoder2():
    llm_test_harness(starcoder2_model_path,
                     prompts=["def print_hello_world():"],
                     references=['\n print("Hello World")\n\ndef print'],
                     sampling_params=sampling_params)


@skip_pre_hopper
def test_llm_starcoder2_fp8():
    quant_config = QuantConfig(quant_algo=QuantAlgo.FP8)
    llm_test_harness(starcoder2_model_path,
                     prompts=["def print_hello_world():"],
                     references=['\n print("Hello World")\n\ndef print'],
                     sampling_params=sampling_params,
                     quant_config=quant_config)


if __name__ == '__main__':
    test_llm_gpt2()

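The new `tests/hlapi/test_llm_models.py` above follows a single pattern: point `llm_test_harness` at a model directory, give it one prompt plus an expected continuation, and optionally pass a `QuantConfig` or a parallelism size through `**llm_kwargs`. A hedged sketch of what adding another model to this module might look like (the model name and expected continuation are made up for illustration and do not exist in the repository):

```python
# Hypothetical extra test appended to the module above, reusing its
# get_model_path, llm_test_harness, sampling_params, QuantConfig, QuantAlgo
# and force_ampere definitions.
my_model_path = get_model_path('my-tiny-model')  # assumed model name


@force_ampere
def test_llm_my_tiny_model_int4():
    quant_config = QuantConfig(quant_algo=QuantAlgo.W4A16)
    llm_test_harness(my_model_path,
                     prompts=["A B C"],
                     references=["D E F G H I J K L M"],  # assumed output
                     sampling_params=sampling_params,
                     quant_config=quant_config)
```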
@ -22,15 +22,11 @@ from tensorrt_llm.models.llama.model import LLaMAForCausalLM
try:
    from .test_llm import (_test_llm_generate_async, default_model_name,
                           get_model_path, llama_model_path, mixtral_model_name,
                           prompts)
                           prompts, skip_single_gpu)
except ImportError:
    from test_llm import (_test_llm_generate_async, default_model_name,
                          get_model_path, llama_model_path, mixtral_model_name,
                          prompts)

skip_single_gpu = pytest.mark.skipif(
    torch.cuda.device_count() < 2,
    reason="The test needs at least 2 GPUs, skipping")
                          prompts, skip_single_gpu)


@pytest.fixture(scope="module")

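With the hunk above, `skip_single_gpu` is defined once in `test_llm.py` and imported by the multi-GPU test module instead of being re-declared locally. As a quick illustration of how such a shared pytest marker is applied (the test body here is a placeholder, not from this commit):

```python
import pytest
import torch

# Same definition as in tests/hlapi/test_llm.py in this diff.
skip_single_gpu = pytest.mark.skipif(
    torch.cuda.device_count() < 2,
    reason="The test needs at least 2 GPUs, skipping")


@skip_single_gpu
def test_placeholder_tp2():
    # Placeholder body: a real test would build or load a 2-way
    # tensor-parallel engine here.
    assert torch.cuda.device_count() >= 2
```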
@ -33,6 +33,7 @@ from tensorrt_llm import Builder
from tensorrt_llm._utils import str_dtype_to_torch
from tensorrt_llm.functional import RotaryScalingType
from tensorrt_llm.layers import PositionEmbeddingType
from tensorrt_llm.models.gpt.convert import load_weights_from_hf_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType
from tensorrt_llm.runtime import ModelConfig, SamplingConfig
@ -40,9 +41,6 @@ from tensorrt_llm.runtime.generation import _prepare_attention_mask
from tensorrt_llm.runtime.kv_cache_manager import (GenerationSequence,
                                                   KVCacheManager)

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))
from examples.gpt.convert_checkpoint import convert_hf_gpt

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func

@ -86,12 +84,9 @@ class TestGPT(unittest.TestCase):
            'bias': getattr(gpt_config, 'bias', True),
            'apply_query_key_layer_scaling': apply_query_key_layer_scaling,
        }
        config = tensorrt_llm.models.PretrainedConfig.from_dict(config)
        weights = convert_hf_gpt(hf_gpt,
                                 gpt_config,
                                 "gpt2",
                                 config.mapping,
                                 dtype=dtype)
        config = tensorrt_llm.models.GPTConfig.from_dict(config)
        weights = load_weights_from_hf_model(hf_gpt, config)

        tensorrt_llm_gpt = tensorrt_llm.models.GPTForCausalLM(config)
        tensorrt_llm_gpt.load(weights)


@ -29,13 +29,10 @@ from transformers import GPTJConfig, GPTJForCausalLM

import tensorrt_llm
from tensorrt_llm import Builder
from tensorrt_llm.models.gptj.convert import load_weights_from_hf_model
from tensorrt_llm.network import net_guard
from tensorrt_llm.plugin.plugin import ContextFMHAType

sys.path.append(os.path.join(os.path.dirname(__file__), '../..'))

from examples.gptj.convert_checkpoint import convert_hf_gptj

sys.path.append(os.path.join(os.path.dirname(__file__), '..'))
from utils.util import skip_fp32_accum_pre_ampere, unittest_name_func

@ -82,10 +79,7 @@ class TestGPTJ(unittest.TestCase):
        }
        config = tensorrt_llm.models.PretrainedConfig.from_dict(config)
        config.set_rank(rank)
        weights = convert_hf_gptj(hf_gpt,
                                  gpt_config,
                                  config.mapping,
                                  dtype=dtype)
        weights = load_weights_from_hf_model(hf_gpt, config)
        trtllm_model = tensorrt_llm.models.GPTJForCausalLM(config)
        trtllm_model.load(weights)


@ -86,8 +86,9 @@ class TestLLaMA(unittest.TestCase):

        # Initialize model
        config = tensorrt_llm.models.LLaMAConfig.from_dict(config)
        tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM(config)
        weights = load_weights_from_hf_model(hf_llama, config)

        tensorrt_llm_llama = tensorrt_llm.models.LLaMAForCausalLM(config)
        tensorrt_llm_llama.load(weights)
        optimize_model(tensorrt_llm_llama, **opt_flags)

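The GPT, GPT-J, and LLaMA unit-test hunks above all switch from the example-script converters (`convert_hf_gpt`, `convert_hf_gptj`) to the in-tree `load_weights_from_hf_model` helpers plus `model.load(weights)`. A condensed, hedged sketch of that common pattern, using GPT as the example; the `hf_gpt` and `config_dict` arguments are assumed to be prepared exactly as in `TestGPT` above (their contents are elided here):

```python
import tensorrt_llm
from tensorrt_llm.models.gpt.convert import load_weights_from_hf_model


def build_trtllm_gpt(hf_gpt, config_dict):
    """Condensed sketch of the new conversion path used in TestGPT.

    hf_gpt: a transformers GPT-2 model; config_dict: the dict the test
    assembles from the Hugging Face config.
    """
    config = tensorrt_llm.models.GPTConfig.from_dict(config_dict)
    weights = load_weights_from_hf_model(hf_gpt, config)
    model = tensorrt_llm.models.GPTForCausalLM(config)
    model.load(weights)
    return model
```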
@ -12,16 +12,12 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import unittest
from pathlib import Path

import numpy as np
import torch

sys.path.append(str(Path(__file__).parent.resolve() /
                    "../examples/gpt"))  # more precise, avoid confusion
from convert_checkpoint import generate_int8
from tensorrt_llm.models.gpt.convert import generate_int8


def dist(x, y):