TensorRT-LLM/examples/models/core/multimodal/utils.py
katec846 eeb605abd6
feat: Offloading Multimodal embedding table to CPU in Chunked Prefill Mode (#3380)
* Feat: Offload ptable to CPU if enable_chunked_context
* Feat: Offload ptable to CPU for chunked context mode
* Fix and add comment
* Update README for multimodal and add a new param mm_embedding_offloading
* fix: Correct prompt table offloading condition in PromptTuningBuffers
* Clean up the code
* Add comments to explain copy from CPU <-> GPU using pinned memory
* Fix namings based on comments
* Fix format based on pre-commit
* Modify --mm_embedding_offloading flag

Signed-off-by: Kate Cheng <yunhsuanc@nvidia.com>
Co-authored-by: Haohang Huang <31998628+symphonylyh@users.noreply.github.com>
2025-04-21 14:31:01 +08:00

158 lines
6.1 KiB
Python

def add_common_args(parser):
    parser.add_argument('--max_new_tokens', type=int, default=128)
    parser.add_argument('--batch_size', type=int, default=1)
    parser.add_argument('--log_level', type=str, default='info')
    parser.add_argument('--engine_dir',
                        type=str,
                        default=None,
                        help='Directory containing visual and LLM TRT engines')
    parser.add_argument('--visual_engine_name',
                        type=str,
                        default='model.engine',
                        help='Name of visual TRT engine')
    parser.add_argument('--audio_engine_name',
                        type=str,
                        default='model.engine',
                        help='Name of audio TRT engine')
    parser.add_argument('--hf_model_dir',
                        type=str,
                        default=None,
                        help="Directory containing tokenizer")
    parser.add_argument('--input_text',
                        type=str,
                        nargs='+',
                        default=None,
                        help='Text prompt to LLM')
    parser.add_argument('--num_beams',
                        type=int,
                        help="Use beam search if num_beams > 1",
                        default=1)
    parser.add_argument('--top_k', type=int, default=1)
    parser.add_argument('--top_p', type=float, default=0.0)
    parser.add_argument('--temperature', type=float, default=1.0)
    parser.add_argument('--repetition_penalty', type=float, default=1.0)
    parser.add_argument('--run_profiling',
                        action='store_true',
                        help='Profile runtime over several iterations')
    parser.add_argument('--profiling_iterations',
                        type=int,
                        help="Number of iterations to run profiling",
                        default=20)
    parser.add_argument('--check_accuracy',
                        action='store_true',
                        help='Check correctness of text output')
    parser.add_argument(
        '--video_path',
        type=str,
        default=None,
        help=
        'Path to your local video file; pass \'llava-onevision-accuracy\' to check the Llava-OneVision model accuracy'
    )
    parser.add_argument(
        '--video_num_frames',
        type=int,
        help=
        "The number of frames sampled from the video in the Llava-OneVision model.",
        default=None)
    parser.add_argument("--image_path",
                        type=str,
                        nargs='+',
                        default=None,
                        help='List of input image paths, separated by the --path_sep symbol')
    parser.add_argument("--audio_path",
                        type=str,
                        default=None,
                        help='Input audio path')
    parser.add_argument("--path_sep",
                        type=str,
                        default=",",
                        help='Path separator symbol')
    parser.add_argument("--prompt_sep",
                        type=str,
                        default=",",
                        help="Prompt separator symbol")
    parser.add_argument('--enable_context_fmha_fp32_acc',
                        action='store_true',
                        default=None,
                        help="Enable FMHA runner FP32 accumulation.")
    parser.add_argument(
        '--enable_chunked_context',
        action='store_true',
        help='Enable chunked context (only available with the cpp session).',
    )
    parser.add_argument(
        '--mm_embedding_offloading',
        type=lambda s: s.lower() == "true",
        default=None,
        help=
        'Enable offloading the multimodal embedding (prompt) table to CPU. When not specified, defaults to True if using a multimodal model with chunked context.'
    )
    parser.add_argument(
        '--session',
        default='cpp_llm_only',
        type=str,
        choices=['python', 'cpp_llm_only', 'cpp'],
        help=
        'Runtime used to run the models. \n`cpp_llm_only`: the vision engine runs in the python runtime, but the LLM runs in the pybind cpp runtime\n`python`: everything runs in the python runtime\n`cpp`: everything runs in the C++ runtime'
    )
    parser.add_argument(
        '--kv_cache_free_gpu_memory_fraction',
        default=0.7,
        type=float,
        help='Fraction of free GPU memory to reserve for the KV cache.',
    )
    parser.add_argument(
        '--cross_kv_cache_fraction',
        default=0.5,
        type=float,
        help=
        'Fraction of the KV cache reserved for cross attention. Only applicable to encoder-decoder models. By default, 0.5 for self attention and 0.5 for cross attention.',
    )
    parser.add_argument(
        '--multi_block_mode',
        # Custom boolean conversion from the input string.
        type=lambda s: s.lower() in ("yes", "true", "t", "1"),
        default=True,
        help=
        "Distribute the work across multiple CUDA thread blocks on the GPU for the masked MHA kernel."
    )
    parser.add_argument(
        '--lora_task_uids',
        type=str,
        default=None,
        nargs="+",
        help="The list of LoRA task uids; use -1 to disable the LoRA module")
    parser.add_argument('--debug_mode',
                        default=False,
                        action='store_true',
                        help="Whether or not to turn on debug mode")
    return parser
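
# A driver script (such as run.py in this example directory) is expected to consume
# add_common_args roughly as sketched below. This is an illustrative sketch kept as a
# comment so that importing this module stays side-effect free; the extra
# `--max_input_length` flag is hypothetical and only shows how model-specific options
# can be layered on top of the common ones.
#
#     import argparse
#     parser = argparse.ArgumentParser(description='Multimodal example runner')
#     parser.add_argument('--max_input_length', type=int, default=1024)  # hypothetical
#     args = add_common_args(parser).parse_args()
#     print(args.engine_dir, args.session, args.enable_chunked_context)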


def levenshtein_distance(s1, s2):
    """Compute the Levenshtein (edit) distance between two strings."""
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            # Cost of reaching this cell via insertion, deletion, or substitution.
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    return previous_row[-1]
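
# Worked example: turning "kitten" into "sitting" takes three single-character edits
# (substitute k->s, substitute e->i, insert g), so
# levenshtein_distance("kitten", "sitting") == 3.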


def compute_str_match_rate(s1, s2):
    """Return the percentage similarity of two strings based on edit distance."""
    distance = levenshtein_distance(s1, s2)
    max_length = max(len(s1), len(s2))
    # Two empty strings are identical; avoid dividing by zero.
    if max_length == 0:
        return 100.0
    match_rate = (1 - distance / max_length) * 100
    return match_rate
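

if __name__ == "__main__":
    # Minimal self-check sketch for the string matching helpers above (the kind of
    # comparison a --check_accuracy style test might perform). The strings below are
    # arbitrary illustrations, not model outputs.
    assert levenshtein_distance("kitten", "sitting") == 3
    # One deletion against an 11-character reference -> (1 - 1/11) * 100 ≈ 90.9%.
    print(f"match rate: {compute_str_match_rate('hello world', 'hello word'):.1f}%")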