[feat]: Allow for a settable end-of-sequence/padding token in max throughput benchmark. (#3776)

* Move world options to a different group for clarity.

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>

* Add eos_id option.

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>

---------

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
Author: Frank Di Natale, committed by GitHub on 2025-04-30 21:42:46 -04:00
Commit: 1e317c98c6 (parent: 9cc5922a0b)

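A quick aside on the "move world options to a different group" part of this change: the benchmark CLI builds its options with click and click_option_group, and an @optgroup.group decorator only controls how the options that follow it are grouped in --help output; parsing and the values handed to the command body are unchanged. The sketch below is a hypothetical stand-alone demo, not the benchmark CLI itself (the "Benchmark Options" group name and the demo command are illustrative), showing the same pattern used in the diff.

import click
from click_option_group import optgroup


@click.command()
# Illustrative group holding the new --eos_id flag, analogous to the diff below.
@optgroup.group("Benchmark Options", help="General benchmark options.")
@optgroup.option("--eos_id",
                 type=int,
                 default=-1,
                 help="End-of-sequence token id; -1 disables EOS.")
# Grouped world options, analogous to --tp moving under "World Configuration".
@optgroup.group("World Configuration",
                help="Options for configuring the backend multi-GPU world.")
@optgroup.option("--tp", type=int, default=1, help="tensor parallelism size")
def demo(**params):
    # Grouped and ungrouped options arrive in the same params mapping,
    # so regrouping changes only the --help layout.
    click.echo(f"tp={params['tp']} eos_id={params['eos_id']}")


if __name__ == "__main__":
    demo()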

@@ -94,6 +94,14 @@ from tensorrt_llm.sampling_params import SamplingParams
     required=False,
     help="Pass in a dataset file for parsing instead of stdin.",
 )
+@optgroup.option(
+    "--eos_id",
+    type=int,
+    default=-1,
+    required=False,
+    help=
+    "Set the end-of-sequence token for the benchmark. Set to -1 to disable EOS.",
+)
 @optgroup.option(
     "--modality",
     type=click.Choice(["image", "video"]),
@@ -122,6 +130,22 @@ from tensorrt_llm.sampling_params import SamplingParams
     default=2,
     help="Number of requests warm up benchmark.",
 )
+@optgroup.option(
+    "--target_input_len",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Target (average) input length for tuning heuristics.",
+)
+@optgroup.option(
+    "--target_output_len",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Target (average) sequence length for tuning heuristics.",
+)
+@optgroup.group(
+    "World Configuration",
+    help="Options for configuring the backend multi-GPU world.",
+)
 @optgroup.option(
     "--tp",
     type=int,
@@ -146,18 +170,6 @@ from tensorrt_llm.sampling_params import SamplingParams
     default=None,
     help="expert cluster parallelism size",
 )
-@optgroup.option(
-    "--target_input_len",
-    default=None,
-    type=click.IntRange(min=1),
-    help="Target (average) input length for tuning heuristics.",
-)
-@optgroup.option(
-    "--target_output_len",
-    default=None,
-    type=click.IntRange(min=1),
-    help="Target (average) sequence length for tuning heuristics.",
-)
 @optgroup.group("Request Load Control Options",
                 cls=MutuallyExclusiveOptionGroup,
                 help="Limits how requests are loaded.")
@@ -218,6 +230,7 @@ def throughput_command(
     # Parameters from CLI
     # Model, experiment, and engine params
     dataset_path: Path = params.pop("dataset")
+    eos_id: int = params.pop("eos_id")
     warmup: int = params.get("warmup")
     num_requests: int = params.pop("num_requests")
     max_seq_len: int = params.pop("max_seq_len")
@@ -329,8 +342,8 @@ def throughput_command(
     else:
         llm = LLM(**kwargs)

-    sampling_params = SamplingParams(end_id=-1,
-                                     pad_id=-1,
+    sampling_params = SamplingParams(end_id=eos_id,
+                                     pad_id=eos_id,
                                      beam_width=beam_width)

     # Perform warmup if requested.
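For reference on the last hunk: the only behavioural change is which token id the benchmark's SamplingParams receives for end_id and pad_id. Below is a minimal sketch of the two settings; SamplingParams and its end_id/pad_id/beam_width arguments are taken from the diff above, while the concrete token id 2 is purely illustrative.

from tensorrt_llm.sampling_params import SamplingParams

# Default (--eos_id -1), matching the old hard-coded behaviour: EOS and
# padding are disabled, so every request generates its full output length.
ignore_eos = SamplingParams(end_id=-1, pad_id=-1, beam_width=1)

# With --eos_id set to the model's real end-of-sequence token id (2 is
# illustrative only), generation may stop early when the model emits EOS.
eos_id = 2
respect_eos = SamplingParams(end_id=eos_id, pad_id=eos_id, beam_width=1)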