from argparse import BooleanOptionalAction


def add_common_args(parser):
    """Register the CLI arguments shared by the inference entry points.

    Adds sampling, runtime, LoRA, model-path, and memory/KV-cache options to
    *parser* in place and returns it, so calls can be chained.

    Args:
        parser: an ``argparse.ArgumentParser`` (or compatible) to extend.

    Returns:
        The same *parser*, with all common arguments registered.
    """
    # sampling arguments
    parser.add_argument('--num_beams',
                        type=int,
                        help="Use beam search if num_beams > 1",
                        default=1)
    parser.add_argument('--num_return_sequences',
                        type=int,
                        help="Number of sequences to generate for each input.",
                        default=None)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0.0)
    parser.add_argument('--length_penalty', type=float, default=1.0)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--presence_penalty', type=float, default=0.0)
    parser.add_argument('--frequency_penalty', type=float, default=0.0)
    parser.add_argument('--random_seed', type=int, default=0)
    # BUGFIX: the original help text was two adjacent string literals with no
    # separator, rendering as "non-early-stoppingother values".
    parser.add_argument('--early_stopping',
                        type=int,
                        help='Use early stopping if num_beams > 1, '
                        '1 for early-stopping, 0 for non-early-stopping, '
                        'other values for stopping by length',
                        default=1)
    parser.add_argument('--no_repeat_ngram_size', type=int, default=None)

    # common runtime arguments
    parser.add_argument('--sink_token_length',
                        type=int,
                        default=None,
                        help='The sink token length.')
    parser.add_argument(
        '--max_attention_window_size',
        type=int,
        default=None,
        nargs="+",
        help=
        'The attention window size that controls the sliding window attention / cyclic kv cache behavior'
    )
    parser.add_argument(
        '--multi_block_mode',
        # custom boolean function to convert input string to boolean;
        # any string other than yes/true/t/1 (case-insensitive) means False
        type=lambda s: s.lower() in ("yes", "true", "t", "1"),
        default=True,
        help=
        "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel."
    )
    parser.add_argument('--enable_context_fmha_fp32_acc',
                        action='store_true',
                        help="Enable FMHA runner FP32 accumulation.")
    parser.add_argument('--cuda_graph_mode',
                        action='store_true',
                        help="Enable cuda graphs in the inference.")
    parser.add_argument(
        '--log_level',
        type=str,
        choices=['verbose', 'info', 'warning', 'error', 'internal_error'],
        default='info')
    parser.add_argument('--use_py_session',
                        default=False,
                        action='store_true',
                        help="Whether or not to use Python runtime session")
    parser.add_argument('--debug_mode',
                        default=False,
                        action='store_true',
                        help="Whether or not to turn on the debug mode")
    parser.add_argument('--lora_dir',
                        type=str,
                        default=None,
                        nargs="+",
                        help="The directory of LoRA weights")
    parser.add_argument('--lora_ckpt_source',
                        type=str,
                        default="hf",
                        choices=["hf", "nemo"],
                        help="The source of lora checkpoint.")
    parser.add_argument(
        '--lora_task_uids',
        type=str,
        default=None,
        nargs="+",
        help="The list of LoRA task uids; use -1 to disable the LoRA module")

    # model arguments
    parser.add_argument('--engine_dir', type=str, default='engine_outputs')
    parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None)
    parser.add_argument(
        '--tokenizer_dir',
        default=None,
        help='tokenizer path; defaults to hf_model_dir if left unspecified')

    # memory argument
    # BUGFIX: default was the int 1; argparse does not run `type` on defaults,
    # so the attribute was int when unspecified but float when given on the
    # command line. Use 1.0 for a consistent float type.
    parser.add_argument(
        '--gpu_weights_percent',
        default=1.0,
        type=float,
        help=
        'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.',
    )
    parser.add_argument(
        '--max_tokens_in_paged_kv_cache',
        default=None,
        type=int,
        help=
        'Specify the maximum number of tokens in a kv cache page (only available with cpp session).',
    )
    # BooleanOptionalAction auto-generates the paired
    # --no-kv_cache_enable_block_reuse flag for disabling.
    parser.add_argument(
        '--kv_cache_enable_block_reuse',
        default=True,
        action=BooleanOptionalAction,
        help=
        'Enables block reuse in kv cache (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_free_gpu_memory_fraction',
        default=0.9,
        type=float,
        help='Specify the free gpu memory fraction.',
    )
    parser.add_argument(
        '--enable_chunked_context',
        action='store_true',
        help='Enables chunked context (only available with cpp session).',
    )
    return parser