[enhancement] Add beam width to low latency. (#4812)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
This commit is contained in:
Frank 2025-06-03 02:24:55 -07:00 committed by GitHub
parent 3de02582dd
commit 80f9989a1e
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -75,6 +75,12 @@ from tensorrt_llm.sampling_params import SamplingParams
@optgroup.group("Request Load Control Options",
cls=MutuallyExclusiveOptionGroup,
help="Limits how requests are loaded.")
@optgroup.option(
"--beam_width",
type=int,
default=1,
help="Number of search beams.",
)
@optgroup.option(
"--concurrency",
type=int,
@ -133,6 +139,7 @@ def latency_command(
checkpoint_path: Path = bench_env.checkpoint_path or bench_env.model
engine_dir: Path = params.pop("engine_dir")
concurrency: int = params.pop("concurrency")
beam_width: int = params.pop("beam_width")
warmup: int = params.get("warmup")
# Engine configuration parsing
exec_settings, build_cfg = get_settings_from_engine(engine_dir)
@ -153,7 +160,7 @@ def latency_command(
exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
exec_settings["settings_config"]["max_batch_size"] = 1
exec_settings["settings_config"]["max_num_tokens"] = engine_tokens
exec_settings["settings_config"]["beam_width"] = 1
exec_settings["settings_config"]["beam_width"] = beam_width
exec_settings["settings_config"]["chunking"] = False
exec_settings["settings_config"][
"scheduler_policy"] = CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
@ -208,7 +215,8 @@ def latency_command(
sampling_params = SamplingParams(
end_id=eos_id,
pad_id=pad_id,
n=1,
n=beam_width,
use_beam_search=beam_width > 1,
)
llm = LLM(**kwargs)