Mirror of https://github.com/NVIDIA/TensorRT-LLM.git
[feat]: Allow for a settable end-of-sequence/padding token in max throughput benchmark. (#3776)
* Move world options to a different group for clarity.
* Add eos_id option.

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
parent 9cc5922a0b
commit 1e317c98c6
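In effect, the patch threads one integer from the CLI into the sampler configuration. A condensed sketch of the resulting behavior, using only the SamplingParams fields visible in the diff below (the helper name is illustrative, not from the patch):

    # Condensed sketch: the benchmark's sampling parameters now take a
    # user-supplied token id instead of a hard-coded -1. Passing -1 keeps
    # the previous behavior (EOS/padding disabled).
    from tensorrt_llm.sampling_params import SamplingParams

    def make_sampling_params(eos_id: int, beam_width: int) -> SamplingParams:
        return SamplingParams(end_id=eos_id, pad_id=eos_id, beam_width=beam_width)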
@@ -94,6 +94,14 @@ from tensorrt_llm.sampling_params import SamplingParams
     required=False,
     help="Pass in a dataset file for parsing instead of stdin.",
 )
+@optgroup.option(
+    "--eos_id",
+    type=int,
+    default=-1,
+    required=False,
+    help=
+    "Set the end-of-sequence token for the benchmark. Set to -1 to disable EOS.",
+)
 @optgroup.option(
     "--modality",
     type=click.Choice(["image", "video"]),
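For readers unfamiliar with click-option-group, here is a minimal, self-contained sketch of the @optgroup.option pattern used above; only the --eos_id option mirrors the diff, while the command and group names are made up:

    # Minimal sketch of the @optgroup.option pattern; run it and pass
    # --eos_id to see the parsed value. Group/command names are illustrative.
    import click
    from click_option_group import optgroup

    @click.command()
    @optgroup.group("Benchmark Options", help="Illustrative option group.")
    @optgroup.option(
        "--eos_id",
        type=int,
        default=-1,
        required=False,
        help="Set the end-of-sequence token for the benchmark. "
        "Set to -1 to disable EOS.",
    )
    def bench(**params):
        click.echo(f"eos_id={params['eos_id']}")

    if __name__ == "__main__":
        bench()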
@@ -122,6 +130,22 @@ from tensorrt_llm.sampling_params import SamplingParams
     default=2,
     help="Number of requests warm up benchmark.",
 )
+@optgroup.option(
+    "--target_input_len",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Target (average) input length for tuning heuristics.",
+)
+@optgroup.option(
+    "--target_output_len",
+    default=None,
+    type=click.IntRange(min=1),
+    help="Target (average) sequence length for tuning heuristics.",
+)
+@optgroup.group(
+    "World Configuration",
+    help="Options for configuring the backend multi-GPU world.",
+)
 @optgroup.option(
     "--tp",
     type=int,
@@ -146,18 +170,6 @@ from tensorrt_llm.sampling_params import SamplingParams
     default=None,
     help="expert cluster parallelism size",
 )
-@optgroup.option(
-    "--target_input_len",
-    default=None,
-    type=click.IntRange(min=1),
-    help="Target (average) input length for tuning heuristics.",
-)
-@optgroup.option(
-    "--target_output_len",
-    default=None,
-    type=click.IntRange(min=1),
-    help="Target (average) sequence length for tuning heuristics.",
-)
 @optgroup.group("Request Load Control Options",
                 cls=MutuallyExclusiveOptionGroup,
                 help="Limits how requests are loaded.")
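The "Request Load Control Options" context lines rely on click-option-group's MutuallyExclusiveOptionGroup, which makes click reject an invocation that sets more than one option in the group. A minimal sketch with hypothetical option names:

    # Minimal sketch of MutuallyExclusiveOptionGroup: supplying both options
    # in one invocation fails with a usage error. Option names are hypothetical.
    import click
    from click_option_group import MutuallyExclusiveOptionGroup, optgroup

    @click.command()
    @optgroup.group("Request Load Control Options",
                    cls=MutuallyExclusiveOptionGroup,
                    help="Limits how requests are loaded.")
    @optgroup.option("--max_requests", type=int, default=None)
    @optgroup.option("--request_rate", type=float, default=None)
    def bench(**params):
        click.echo(params)

    if __name__ == "__main__":
        bench()  # `bench --max_requests 10 --request_rate 1.0` is rejected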
@@ -218,6 +230,7 @@ def throughput_command(
     # Parameters from CLI
     # Model, experiment, and engine params
     dataset_path: Path = params.pop("dataset")
+    eos_id: int = params.pop("eos_id")
     warmup: int = params.get("warmup")
     num_requests: int = params.pop("num_requests")
     max_seq_len: int = params.pop("max_seq_len")
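The handler receives every CLI option through **params and pops the keys it consumes, leaving the rest for later forwarding. A reduced, illustrative sketch of that pattern:

    # Reduced sketch of the pop/get pattern in throughput_command: popped keys
    # are consumed here; keys read with get() stay in params for later use.
    import click

    @click.command()
    @click.option("--eos_id", type=int, default=-1)
    @click.option("--warmup", type=int, default=2)
    def throughput_command(**params):
        eos_id: int = params.pop("eos_id")   # consumed by this handler
        warmup: int = params.get("warmup")   # read, but left in params
        click.echo(f"eos_id={eos_id} warmup={warmup} leftover={params}")

    if __name__ == "__main__":
        throughput_command()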
@@ -329,8 +342,8 @@ def throughput_command(
     else:
         llm = LLM(**kwargs)
 
-    sampling_params = SamplingParams(end_id=-1,
-                                     pad_id=-1,
+    sampling_params = SamplingParams(end_id=eos_id,
+                                     pad_id=eos_id,
                                      beam_width=beam_width)
 
     # Perform warmup if requested.
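What replacing the hard-coded -1 buys: with a real EOS id the benchmark can stop sequences early instead of always generating to the length cap, which changes the measured throughput. A hedged sketch of the difference; the checkpoint name, the token id 2, and the max_tokens/generate usage are assumptions for illustration, not part of this diff:

    # Illustrative only: model name and token id are placeholders.
    from tensorrt_llm import LLM
    from tensorrt_llm.sampling_params import SamplingParams

    llm = LLM(model="TinyLlama/TinyLlama-1.1B-Chat-v1.0")  # placeholder model

    # Old behavior (--eos_id -1): EOS disabled, always runs to max_tokens.
    no_eos = SamplingParams(end_id=-1, pad_id=-1, max_tokens=64)
    # New behavior (e.g. --eos_id 2): generation may stop at the EOS token.
    with_eos = SamplingParams(end_id=2, pad_id=2, max_tokens=64)

    for sp in (no_eos, with_eos):
        out = llm.generate(["The capital of France is"], sp)
        print(len(out[0].outputs[0].token_ids))  # with_eos may be shorter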