mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
* move rest models to examples/models/core directory Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * update multimodal readme Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix example path Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix cpp test Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix tensorrt test Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq <22017000+QiJune@users.noreply.github.com> * fix ci Signed-off-by: junq 
<22017000+QiJune@users.noreply.github.com> --------- Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
131 lines
5.0 KiB
Python
131 lines
5.0 KiB
Python
from argparse import BooleanOptionalAction
|
|
|
|
|
|
def add_common_args(parser):
    """Register CLI arguments shared by the example run scripts.

    Adds sampling, runtime, LoRA, model-path, and memory/KV-cache options to
    *parser* in place.

    Args:
        parser (argparse.ArgumentParser): parser to extend.

    Returns:
        argparse.ArgumentParser: the same parser, for call chaining.
    """
    # sampling arguments
    parser.add_argument('--num_beams',
                        type=int,
                        help="Use beam search if num_beams > 1",
                        default=1)
    parser.add_argument('--num_return_sequences',
                        type=int,
                        help="Number of sequences to generate for each input.",
                        default=None)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0.0)
    parser.add_argument('--length_penalty', type=float, default=1.0)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--presence_penalty', type=float, default=0.0)
    parser.add_argument('--frequency_penalty', type=float, default=0.0)
    parser.add_argument('--random_seed', type=int, default=0)
    parser.add_argument('--early_stopping',
                        type=int,
                        # NOTE: the original help text concatenated
                        # "...non-early-stopping" and "other values..." with no
                        # separator; a ", " has been inserted between them.
                        help='Use early stopping if num_beams > 1, '
                        '1 for early-stopping, 0 for non-early-stopping, '
                        'other values for stopping by length',
                        default=1)
    parser.add_argument('--no_repeat_ngram_size', type=int, default=None)

    # common runtime arguments
    parser.add_argument('--sink_token_length',
                        type=int,
                        default=None,
                        help='The sink token length.')
    parser.add_argument(
        '--max_attention_window_size',
        type=int,
        default=None,
        nargs="+",
        help=
        'The attention window size that controls the sliding window attention / cyclic kv cache behavior'
    )
    parser.add_argument(
        '--multi_block_mode',
        # custom boolean function to convert input string to boolean
        type=lambda s: s.lower() in ("yes", "true", "t", "1"),
        default=True,
        help=
        "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel."
    )
    parser.add_argument('--enable_context_fmha_fp32_acc',
                        action='store_true',
                        help="Enable FMHA runner FP32 accumulation.")
    parser.add_argument('--cuda_graph_mode',
                        action='store_true',
                        help="Enable cuda graphs in the inference.")
    parser.add_argument(
        '--log_level',
        type=str,
        choices=['verbose', 'info', 'warning', 'error', 'internal_error'],
        default='info')
    parser.add_argument('--use_py_session',
                        default=False,
                        action='store_true',
                        help="Whether or not to use Python runtime session")
    parser.add_argument('--debug_mode',
                        default=False,
                        action='store_true',
                        help="Whether or not to turn on the debug mode")
    parser.add_argument('--lora_dir',
                        type=str,
                        default=None,
                        nargs="+",
                        help="The directory of LoRA weights")
    parser.add_argument('--lora_ckpt_source',
                        type=str,
                        default="hf",
                        choices=["hf", "nemo"],
                        help="The source of lora checkpoint.")
    parser.add_argument(
        '--lora_task_uids',
        type=str,
        default=None,
        nargs="+",
        help="The list of LoRA task uids; use -1 to disable the LoRA module")

    # model arguments
    parser.add_argument('--engine_dir', type=str, default='engine_outputs')
    # '--model_dir' is kept as an alias of '--hf_model_dir' for backward
    # compatibility; both write to the 'hf_model_dir' destination.
    parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None)
    parser.add_argument(
        '--tokenizer_dir',
        default=None,
        help='tokenizer path; defaults to hf_model_dir if left unspecified')

    # memory argument
    parser.add_argument(
        '--gpu_weights_percent',
        default=1,
        type=float,
        help=
        'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.',
    )
    parser.add_argument(
        '--max_tokens_in_paged_kv_cache',
        default=None,
        type=int,
        help=
        'Specify the maximum number of tokens in a kv cache page (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_enable_block_reuse',
        default=True,
        # BooleanOptionalAction also generates the '--no-...' negative flag.
        action=BooleanOptionalAction,
        help=
        'Enables block reuse in kv cache (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_free_gpu_memory_fraction',
        default=0.9,
        type=float,
        help='Specify the free gpu memory fraction.',
    )
    parser.add_argument(
        '--enable_chunked_context',
        action='store_true',
        help='Enables chunked context (only available with cpp session).',
    )

    return parser