[enhancement] Add beam width to low latency. (#4812)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
2026-01-14 06:27:45 +08:00 · 2025-06-03 02:24:55 -07:00 · 2025-06-03 02:24:55 -07:00 · 80f9989a1e
commit 80f9989a1e
parent 3de02582dd
1 changed files with 10 additions and 2 deletions
--- a/tensorrt_llm/bench/benchmark/low_latency.py
+++ b/tensorrt_llm/bench/benchmark/low_latency.py
@ -75,6 +75,12 @@ from tensorrt_llm.sampling_params import SamplingParams
@optgroup.group("Request Load Control Options",
                cls=MutuallyExclusiveOptionGroup,
                help="Limits how requests are loaded.")
+@optgroup.option(
+    "--beam_width",
+    type=int,
+    default=1,
+    help="Number of search beams.",
+)
@optgroup.option(
    "--concurrency",
    type=int,
@ -133,6 +139,7 @@ def latency_command(
    checkpoint_path: Path = bench_env.checkpoint_path or bench_env.model
    engine_dir: Path = params.pop("engine_dir")
    concurrency: int = params.pop("concurrency")
+    beam_width: int = params.pop("beam_width")
    warmup: int = params.get("warmup")
    # Engine configuration parsing
    exec_settings, build_cfg = get_settings_from_engine(engine_dir)
@ -153,7 +160,7 @@ def latency_command(
    exec_settings["settings_config"]["kv_cache_percent"] = kv_cache_percent
    exec_settings["settings_config"]["max_batch_size"] = 1
    exec_settings["settings_config"]["max_num_tokens"] = engine_tokens
-    exec_settings["settings_config"]["beam_width"] = 1
+    exec_settings["settings_config"]["beam_width"] = beam_width
    exec_settings["settings_config"]["chunking"] = False
    exec_settings["settings_config"][
        "scheduler_policy"] = CapacitySchedulerPolicy.GUARANTEED_NO_EVICT
@ -208,7 +215,8 @@ def latency_command(
        sampling_params = SamplingParams(
            end_id=eos_id,
            pad_id=pad_id,
-            n=1,
+            n=beam_width,
+            use_beam_search=beam_width > 1,
        )
        llm = LLM(**kwargs)