mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
[enhancement] Add beam width to low latency. (#4812)
Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
This commit is contained in:
parent
3de02582dd
commit
80f9989a1e
@@ -75,6 +75,12 @@ from tensorrt_llm.sampling_params import SamplingParams
|
||||
@optgroup.group("Request Load Control Options",
|
||||
cls=MutuallyExclusiveOptionGroup,
|
||||
help="Limits how requests are loaded.")
|
||||
@optgroup.option(
|
||||
"--beam_width",
|
||||
type=int,
|
||||
default=1,
|
||||
help="Number of search beams.",
|
||||
)
|
||||
@optgroup.option(
|
||||
"--concurrency",
|
||||
type=int,
|
||||
@@ -133,6 +139,7 @@ def latency_command(
|
||||
checkpoint_path: Path = bench_env.checkpoint_path or bench_env.model
|
||||
engine_dir: Path = params.pop("engine_dir")
|
||||
concurrency: int = params.pop("concurrency")
|
||||
beam_width: int = params.pop("beam_width")
|
||||
warmup: int = params.get("warmup")
|
||||
# Engine configuration parsing
|
||||
exec_settings, build_cfg = get_settings_from_engine(engine_dir)
|
||||
@@ -153,7 +160,7 @@ def latency_command(
|
||||
exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
|
||||
exec_settings["settings_config"]["max_batch_size"] = 1
|
||||
exec_settings["settings_config"]["max_num_tokens"] = engine_tokens
|
||||
exec_settings["settings_config"]["beam_width"] = 1
|
||||
exec_settings["settings_config"]["beam_width"] = beam_width
|
||||
exec_settings["settings_config"]["chunking"] = False
|
||||
exec_settings["settings_config"][
|
||||
"scheduler_policy"] = CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
|
||||
@@ -208,7 +215,8 @@ def latency_command(
|
||||
sampling_params = SamplingParams(
|
||||
end_id=eos_id,
|
||||
pad_id=pad_id,
|
||||
n=1,
|
||||
n=beam_width,
|
||||
use_beam_search=beam_width > 1,
|
||||
)
|
||||
llm = LLM(**kwargs)
|
||||
|
||||
|
||||
Loading…
Reference in New Issue
Block a user