This merge request adds support for more SWA KV cache functionality inside the KV cache manager. Before this merge request, the KV cache for sliding window attention (SWA) held only "window size" blocks and reused them cyclically. That design cannot utilize additional GPU memory, which limits the maximum batch size and therefore throughput, and it cannot support KV cache reuse. This MR changes the behavior so that the manager writes blocks linearly: as the attention window moves on, out-of-window (OOW) blocks are detached. For now, for the sake of a correct feature first, we directly offload OOW blocks from the primary block pool (GPU memory) to the secondary block pool (host memory); we will improve this in the future by delegating the block movement to the eviction policy. KV cache reuse for SWA is not developed in this merge request and will be added in a follow-up merge request.

With linear block writing, the maximum number of blocks allocated for a sequence (`GenerationRequest`) is determined by the specified "max sequence length": the `GenerationRequest` that stores the cache block bookkeeping structure now keeps "max sequence length" tokens' worth of blocks.

Given the above, the main changes are (more context in the MR):

- Remove the "cyclic" concept from the KV cache manager; this concept originally guarded block reuse under the KV cache manager.
- Add a detach mechanism and invoke it under `KVCacheManager::addToken` (see the sketch after this description). Note that detach is still guarded off for SWA when reuse is enabled; a follow-up merge request will improve this.
- Enforce "max sequence length" as a non-optional parameter to the `KVCacheManager`/`BlockManager`.
- Let all window-size resource pools get an identical proportion of memory.
- Fix the free-memory calculation under `resource_manager.py`.

Signed-off-by: eopXD <yuehtingc@nvidia.com>
Co-authored-by: Tomer Asida <tasida@nvidia.com>
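For intuition, here is a minimal Python sketch of the intended bookkeeping: linear block writing with out-of-window (OOW) detach. All names here (`SWABlockBookkeeper`, `add_token`, `_detach_out_of_window`) are hypothetical stand-ins, not the actual API; the real logic lives in the C++ `KVCacheManager`/`BlockManager`, and this sketch ignores sink tokens, beam search, and reuse.

```python
from dataclasses import dataclass, field
from typing import List


@dataclass
class SWABlockBookkeeper:
    """Per-sequence block bookkeeping under linear writing (illustrative only)."""
    window_size: int        # attention window, in tokens
    tokens_per_block: int
    max_seq_len: int        # now a required bound on blocks per sequence
    primary: List[str] = field(default_factory=list)    # GPU-resident blocks
    secondary: List[str] = field(default_factory=list)  # host-resident blocks
    num_tokens: int = 0
    num_detached: int = 0   # blocks already moved out of `primary`

    def add_token(self) -> None:
        """Write linearly, then detach blocks that fell out of the window."""
        assert self.num_tokens < self.max_seq_len, "exceeds max sequence length"
        if self.num_tokens % self.tokens_per_block == 0:
            # Allocate a fresh block instead of cycling back to an old one.
            self.primary.append(f"block-{self.num_detached + len(self.primary)}")
        self.num_tokens += 1
        self._detach_out_of_window()

    def _detach_out_of_window(self) -> None:
        first_in_window = max(0, self.num_tokens - self.window_size)
        # Index of the first block that still holds an in-window token.
        first_needed = first_in_window // self.tokens_per_block
        while self.num_detached < first_needed:
            # Correctness-first: offload directly from the primary (GPU) pool
            # to the secondary (host) pool; delegating this movement to the
            # eviction policy is planned future work.
            self.secondary.append(self.primary.pop(0))
            self.num_detached += 1
```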
from argparse import BooleanOptionalAction


def add_common_args(parser):
    # sampling arguments
    parser.add_argument('--num_beams',
                        type=int,
                        help="Use beam search if num_beams > 1",
                        default=1)
    parser.add_argument('--num_return_sequences',
                        type=int,
                        help="Number of sequences to generate for each input.",
                        default=None)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0.0)
    parser.add_argument('--length_penalty', type=float, default=1.0)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--presence_penalty', type=float, default=0.0)
    parser.add_argument('--frequency_penalty', type=float, default=0.0)
    parser.add_argument('--random_seed', type=int, default=0)
    parser.add_argument('--early_stopping',
                        type=int,
                        help='Use early stopping if num_beams > 1, '
                        '1 for early-stopping, 0 for non-early-stopping, '
                        'other values for stopping by length',
                        default=1)
    parser.add_argument('--no_repeat_ngram_size', type=int, default=None)

    # common runtime arguments
    parser.add_argument('--sink_token_length',
                        type=int,
                        default=None,
                        help='The sink token length.')
    parser.add_argument(
        '--max_attention_window_size',
        type=int,
        default=None,
        nargs="+",
        help=
        'The attention window size that controls the sliding window attention kv cache behavior'
    )
    parser.add_argument(
        '--multi_block_mode',
        # custom boolean function to convert input string to boolean
        type=lambda s: s.lower() in ("yes", "true", "t", "1"),
        default=True,
        help=
        "Distribute the work across multiple CUDA thread-blocks on the GPU for masked MHA kernel."
    )
    parser.add_argument('--enable_context_fmha_fp32_acc',
                        action='store_true',
                        help="Enable FMHA runner FP32 accumulation.")
    parser.add_argument('--cuda_graph_mode',
                        action='store_true',
                        help="Enable cuda graphs in the inference.")
    parser.add_argument(
        '--log_level',
        type=str,
        choices=['verbose', 'info', 'warning', 'error', 'internal_error'],
        default='info')
    parser.add_argument('--use_py_session',
                        default=False,
                        action='store_true',
                        help="Whether or not to use Python runtime session")
    parser.add_argument('--debug_mode',
                        default=False,
                        action='store_true',
                        help="Whether or not to turn on the debug mode")
    parser.add_argument('--lora_dir',
                        type=str,
                        default=None,
                        nargs="+",
                        help="The directory of LoRA weights")
    parser.add_argument('--lora_ckpt_source',
                        type=str,
                        default="hf",
                        choices=["hf", "nemo"],
                        help="The source of lora checkpoint.")
    parser.add_argument(
        '--lora_task_uids',
        type=str,
        default=None,
        nargs="+",
        help="The list of LoRA task uids; use -1 to disable the LoRA module")

    # model arguments
    parser.add_argument('--engine_dir', type=str, default='engine_outputs')
    parser.add_argument('--hf_model_dir', '--model_dir', type=str, default=None)
    parser.add_argument(
        '--tokenizer_dir',
        default=None,
        help='tokenizer path; defaults to hf_model_dir if left unspecified')

    # memory arguments
    parser.add_argument(
        '--gpu_weights_percent',
        default=1,
        type=float,
        help=
        'Specify the percentage of weights that reside on GPU instead of CPU and streaming load during runtime.',
    )
    parser.add_argument(
        '--max_tokens_in_paged_kv_cache',
        default=None,
        type=int,
        help=
        'Specify the maximum number of tokens in a kv cache page (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_enable_block_reuse',
        default=True,
        action=BooleanOptionalAction,
        help=
        'Enables block reuse in kv cache (only available with cpp session).',
    )
    parser.add_argument(
        '--kv_cache_free_gpu_memory_fraction',
        default=0.9,
        type=float,
        help='Specify the free gpu memory fraction.',
    )
    parser.add_argument(
        '--enable_chunked_context',
        action='store_true',
        help='Enables chunked context (only available with cpp session).',
    )

    return parser
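A minimal usage sketch of the helper above; the engine path and values are illustrative, not defaults from the repo:

```python
import argparse

parser = argparse.ArgumentParser(description="TensorRT-LLM example runner")
add_common_args(parser)
args = parser.parse_args([
    '--engine_dir', './llama_engine',          # illustrative path
    '--max_attention_window_size', '4096',     # SWA window; nargs="+" -> list
    '--kv_cache_free_gpu_memory_fraction', '0.8',
    '--no-kv_cache_enable_block_reuse',        # BooleanOptionalAction off-switch
])
print(args.max_attention_window_size)    # [4096]
print(args.kv_cache_enable_block_reuse)  # False
```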