# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
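"""Model configurations that the benchmark scripts are allowed to build and run."""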

from typing import Literal, Optional

from pydantic import BaseModel, Extra

from tensorrt_llm.functional import PositionEmbeddingType


class BuildConfig(BaseModel, extra=Extra.allow):
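    """Engine build parameters for a benchmark model.

    `extra=Extra.allow` lets family-specific fields that are not declared
    here (e.g. `use_alibi`, `parallel_attention`, `remove_input_padding`)
    pass through unchecked.
    """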
num_layers: int
num_heads: int
hidden_size: int
vocab_size: int
hidden_act: Optional[str]
n_positions: int
max_batch_size: int
max_input_len: int
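    # Number of KV heads for multi/grouped-query attention; None means
    # num_kv_heads == num_heads (plain multi-head attention).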
num_kv_heads: Optional[int] = None
max_output_len: Optional[int] = None
# TRT builder_optimization_level from 0 to 5
builder_opt: Optional[int] = None
inter_size: Optional[int] = None
rotary_dim: Optional[int] = None
type_vocab_size: Optional[int] = None
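    # SmoothQuant (INT8) options: per_token/per_channel select finer-grained
    # activation/weight scaling than the per-tensor default.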
use_smooth_quant: bool = False
per_token: bool = False
per_channel: bool = False
pre_norm: Optional[bool] = None
do_layer_norm_before: Optional[bool] = None
enable_qk_half_accum: bool = False
enable_context_fmha: bool = True
# None means using the model family's default value defined in the ctor
position_embedding_type: Optional[PositionEmbeddingType] = None
    # Only meaningful when the position embedding is RoPE; default to None
    # (rather than 0 or 1) to prevent misuse.
rotary_pct: Optional[float] = None
bias: bool = True


class ModelConfig(BaseModel):
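    """A benchmarkable model: its name, family, benchmark type, and build configuration."""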
name: str
family: str
benchmark_type: Literal["gpt", "bert"]
build_config: BuildConfig
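

# Registry of the models the benchmarks may build. Keys are the model names
# accepted by get_build_config()/get_model_family() below.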
_allowed_configs = {
"gpt_350m":
ModelConfig(name="gpt_350m",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=1024,
vocab_size=51200,
hidden_act='gelu',
n_positions=1024,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"gpt_1.5b":
ModelConfig(name="gpt_1.5b",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=48,
num_heads=25,
hidden_size=1600,
vocab_size=51200,
hidden_act='gelu',
n_positions=1024,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"gpt_175b":
ModelConfig(name="gpt_175b",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=96,
num_heads=96,
hidden_size=12288,
vocab_size=51200,
hidden_act='gelu',
n_positions=2048,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"gpt_350m_sq_per_tensor":
ModelConfig(name="gpt_350m_sq_per_tensor",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=1024,
vocab_size=51200,
hidden_act='gelu',
n_positions=1024,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
use_smooth_quant=True,
)),
"gpt_350m_sq_per_token_channel":
ModelConfig(name="gpt_350m_sq_per_token_channel",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=1024,
vocab_size=51200,
hidden_act='gelu',
n_positions=1024,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
use_smooth_quant=True,
per_token=True,
per_channel=True,
)),
"gpt-next_2b":
ModelConfig(name="gpt-next_2b",
family="gpt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=2048,
vocab_size=256000,
hidden_act='swiglu',
n_positions=1024,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
position_embedding_type=PositionEmbeddingType.rope_gpt_neox,
rotary_pct=0.5,
bias=False,
)),
"opt_350m":
ModelConfig(name="opt_350m",
family="opt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=1024,
vocab_size=50272,
hidden_act='relu',
n_positions=2048,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
pre_norm=False,
do_layer_norm_before=False,
)),
"opt_2.7b":
ModelConfig(name="opt_2.7b",
family="opt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=2560,
vocab_size=50272,
hidden_act='relu',
n_positions=2048,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
pre_norm=False,
do_layer_norm_before=True,
)),
"opt_6.7b":
ModelConfig(name="opt_6.7b",
family="opt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=50272,
hidden_act='relu',
n_positions=2048,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
pre_norm=False,
do_layer_norm_before=True,
)),
"opt_66b":
ModelConfig(name="opt_66b",
family="opt",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=64,
num_heads=72,
hidden_size=9216,
vocab_size=50272,
hidden_act='relu',
n_positions=2048,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
pre_norm=True,
do_layer_norm_before=True,
)),
"llama_7b":
ModelConfig(name="llama_7b",
family="llama",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=32,
hidden_size=4096,
vocab_size=32000,
hidden_act='silu',
n_positions=2048,
inter_size=11008,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"llama_13b":
ModelConfig(name="llama_13b",
family="llama",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=40,
num_heads=40,
hidden_size=5120,
vocab_size=32000,
hidden_act='silu',
n_positions=2048,
inter_size=13824,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"llama_30b":
ModelConfig(name="llama_30b",
family="llama",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=60,
num_heads=52,
hidden_size=6656,
vocab_size=32000,
hidden_act='silu',
n_positions=2048,
inter_size=17920,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"llama_70b":
ModelConfig(name="llama_70b",
family="llama",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=80,
num_heads=64,
num_kv_heads=8,
hidden_size=8192,
vocab_size=32000,
hidden_act='silu',
n_positions=2048,
inter_size=28672,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"llama_70b_sq_per_tensor":
    ModelConfig(name="llama_70b_sq_per_tensor",
                family="llama",
                benchmark_type="gpt",
                build_config=BuildConfig(
                    num_layers=80,
                    num_heads=64,
                    num_kv_heads=8,
                    hidden_size=8192,
                    vocab_size=32000,
                    hidden_act='silu',
                    n_positions=2048,
                    inter_size=28672,
                    max_batch_size=128,
                    max_input_len=512,
                    max_output_len=200,
                    builder_opt=None,
                    use_smooth_quant=True,
                )),
"gptj_6b":
ModelConfig(name="gptj_6b",
family="gptj",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=28,
num_heads=16,
hidden_size=4096,
vocab_size=50401,
hidden_act='gelu',
n_positions=1024,
rotary_dim=64,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
)),
"gptneox_20b":
ModelConfig(name="gptneox_20b",
family="gptneox",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=44,
num_heads=64,
hidden_size=6144,
vocab_size=50432,
hidden_act='gelu',
n_positions=2048,
rotary_dim=24,
max_batch_size=16,
max_input_len=512,
max_output_len=512,
builder_opt=None,
)),
"chatglm_6b":
ModelConfig(name="chatglm_6b",
family="chatglm",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=28,
num_heads=32,
hidden_size=4096,
vocab_size=130528,
hidden_act='gelu',
n_positions=2048,
max_batch_size=256,
max_input_len=512,
max_output_len=200,
builder_opt=None,
remove_input_padding=False,
)),
"bloom_560m":
ModelConfig(name="bloom_560m",
family="bloom",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=1024,
vocab_size=250880,
hidden_act=None,
n_positions=2048,
max_batch_size=8,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
)),
"bloom_176b":
ModelConfig(name="bloom_176b",
family="bloom",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=70,
num_heads=112,
hidden_size=14336,
vocab_size=250880,
hidden_act=None,
n_positions=2048,
max_batch_size=8,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
)),
"bert_base":
ModelConfig(name="bert_base",
family="bert",
benchmark_type="bert",
build_config=BuildConfig(
num_layers=12,
num_heads=12,
hidden_size=768,
vocab_size=30522,
type_vocab_size=2,
hidden_act='gelu',
n_positions=1024,
max_batch_size=256,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"bert_large":
ModelConfig(name="bert_large",
family="bert",
benchmark_type="bert",
build_config=BuildConfig(
num_layers=24,
num_heads=16,
hidden_size=1024,
vocab_size=30522,
type_vocab_size=2,
hidden_act='gelu',
n_positions=1024,
max_batch_size=64,
max_input_len=512,
builder_opt=None,
enable_qk_half_accum=False,
enable_context_fmha=False,
)),
"falcon_rw_1b":
ModelConfig(name="falcon_rw_1b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=24,
num_heads=32,
hidden_size=2048,
vocab_size=50304,
hidden_act=None,
n_positions=2048,
max_batch_size=256,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
bias=True,
use_alibi=True,
parallel_attention=False,
new_decoder_architecture=False,
)),
"falcon_7b":
ModelConfig(name="falcon_7b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=32,
num_heads=71,
num_kv_heads=1,
hidden_size=4544,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=128,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
"falcon_40b":
ModelConfig(name="falcon_40b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=60,
num_heads=128,
num_kv_heads=8,
hidden_size=8192,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=64,
max_input_len=512,
max_output_len=200,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
"falcon_180b":
ModelConfig(name="falcon_180b",
family="falcon",
benchmark_type="gpt",
build_config=BuildConfig(
num_layers=80,
num_heads=232,
num_kv_heads=8,
hidden_size=14848,
vocab_size=65024,
hidden_act=None,
n_positions=2048,
max_batch_size=8,
max_input_len=1024,
max_output_len=1024,
builder_opt=None,
bias=False,
use_alibi=False,
parallel_attention=True,
new_decoder_architecture=False,
)),
}


def get_allowed_models(benchmark_type=None):
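    """Return the set of allowed model names.

    If `benchmark_type` ("gpt" or "bert") is given, only models of that
    benchmark type are returned.
    """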
if benchmark_type is None:
return set(_allowed_configs.keys())
else:
return set(i.name for i in _allowed_configs.values()
if i.benchmark_type == benchmark_type)


def get_build_config(model_name):
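    """Return the build configuration for `model_name` as a plain dict.

    Raises KeyError if the model is not registered in `_allowed_configs`.
    """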
if model_name in _allowed_configs:
return dict(_allowed_configs[model_name].build_config)
else:
raise KeyError(f'Unexpected model: {model_name}. Please add the model '
'to allowed_configs.py')


def get_model_family(model_name):
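    """Return the model family (e.g. "llama", "falcon") for `model_name`."""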
if model_name in _allowed_configs:
return _allowed_configs[model_name].family
else:
raise KeyError(f'Unexpected model: {model_name}. Please add the model '
'to allowed_configs.py')
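

# A minimal usage sketch (illustrative only, not one of the benchmark entry
# points); it just exercises the lookup helpers defined above.
if __name__ == "__main__":
    # All models registered for the GPT-style benchmark.
    print(sorted(get_allowed_models(benchmark_type="gpt")))

    # Family and a few build parameters for one registered model.
    name = "llama_7b"
    config = get_build_config(name)
    print(name, get_model_family(name), config["num_layers"],
          config["hidden_size"])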