TensorRT-LLMs/examples/models/core/qwen2audio/utils.py
QI JUN d51ae53940
move the rest of the models into the examples/models/core directory (#3555)
* move the rest of the models to the examples/models/core directory

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* update multimodal readme

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix example path

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix cpp test

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix tensorrt test

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

* fix ci

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>

---------

Signed-off-by: junq <22017000+QiJune@users.noreply.github.com>
2025-04-19 20:48:59 -07:00

131 lines
5.0 KiB
Python

from argparse import BooleanOptionalAction
def add_common_args(parser):
    """Register the command-line arguments shared by the runtime examples.

    Adds sampling, runtime, LoRA, model-path, and memory/KV-cache options
    to *parser*. Argument names, types, and defaults are part of the CLI
    contract and are kept unchanged.

    Args:
        parser: an ``argparse.ArgumentParser`` (or compatible object
            exposing ``add_argument``) to extend in place.

    Returns:
        The same ``parser``, to allow call chaining.
    """
    # sampling arguments
    parser.add_argument('--num_beams',
                        type=int,
                        help="Use beam search if num_beams > 1",
                        default=1)
    parser.add_argument('--num_return_sequences',
                        type=int,
                        help="Number of sequences to generate for each input.",
                        default=None)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0.0)
    parser.add_argument('--length_penalty', type=float, default=1.0)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--presence_penalty', type=float, default=0.0)
    parser.add_argument('--frequency_penalty', type=float, default=0.0)
    parser.add_argument('--random_seed', type=int, default=0)
    parser.add_argument('--early_stopping',
                        type=int,
                        help='Use early stopping if num_beams > 1, '
                        '1 for early-stopping, 0 for non-early-stopping, '
                        'other values for stopping by length',
                        default=1)
    parser.add_argument('--no_repeat_ngram_size', type=int, default=None)
    # common runtime arguments
    parser.add_argument('--sink_token_length',
                        type=int,
                        default=None,
                        help='The sink token length.')
    parser.add_argument(
        '--max_attention_window_size',
        type=int,
        default=None,
        nargs="+",
        help=
        'The attention window size that controls the sliding window attention / cyclic kv cache behavior'
    )
    parser.add_argument(
        '--multi_block_mode',
        # Custom converter so the flag accepts an explicit string value
        # (e.g. "--multi_block_mode false"); any string outside the truthy
        # set below parses as False.
        type=lambda s: s.lower() in ("yes", "true", "t", "1"),
        default=True,
        help=
        "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel."
    )
    parser.add_argument('--enable_context_fmha_fp32_acc',
                        action='store_true',
                        help="Enable FMHA runner FP32 accumulation.")
    parser.add_argument('--cuda_graph_mode',
                        action='store_true',
                        help="Enable cuda graphs in the inference.")
    parser.add_argument(
        '--log_level',
        type=str,
        choices=['verbose', 'info', 'warning', 'error', 'internal_error'],
        default='info')
    parser.add_argument('--use_py_session',
                        default=False,
                        action='store_true',
                        help="Whether or not to use Python runtime session")
    parser.add_argument('--debug_mode',
                        default=False,
                        action='store_true',
                        help="Whether or not to turn on the debug mode")
    parser.add_argument('--lora_dir',
                        type=str,
                        default=None,
                        nargs="+",
                        help="The directory of LoRA weights")
    parser.add_argument('--lora_ckpt_source',
                        type=str,
                        default="hf",
                        choices=["hf", "nemo"],
                        help="The source of lora checkpoint.")
    parser.add_argument(
        '--lora_task_uids',
        type=str,
        default=None,
        nargs="+",
        help="The list of LoRA task uids; use -1 to disable the LoRA module")
    # model arguments
    parser.add_argument('--engine_dir', type=str, default='engine_outputs')
    # '--model_dir' is kept as an alias of '--hf_model_dir' for backward
    # compatibility with older example invocations.
    parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None)
    parser.add_argument(
        '--tokenizer_dir',
        default=None,
        help='tokenizer path; defaults to hf_model_dir if left unspecified')
    # memory argument
    parser.add_argument(
        '--gpu_weights_percent',
        default=1,
        type=float,
        help=
        'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.',
    )
    parser.add_argument(
        '--max_tokens_in_paged_kv_cache',
        default=None,
        type=int,
        help=
        'Specify the maximum number of tokens in a kv cache page (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_enable_block_reuse',
        default=True,
        # BooleanOptionalAction auto-generates the paired
        # '--no-kv_cache_enable_block_reuse' negative flag.
        action=BooleanOptionalAction,
        help=
        'Enables block reuse in kv cache (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_free_gpu_memory_fraction',
        default=0.9,
        type=float,
        help='Specify the free gpu memory fraction.',
    )
    parser.add_argument(
        '--enable_chunked_context',
        action='store_true',
        help='Enables chunked context (only available with cpp session).',
    )
    return parser