Add Acceptance Rate calculation to benchmark_serving (#6240)

Signed-off-by: Zero Zeng <38289304+zerollzeng@users.noreply.github.com>
Parent: 97f7e12588
Commit: c9b8b6180f
@@ -44,6 +44,7 @@ class RequestFuncOutput:
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
+    decode_iteration: int = 0  # Number of decoding iterations


 async def async_request_trt_llm(
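For context, a minimal sketch of the per-request record that now carries the iteration count. Only fields that appear somewhere in this diff are shown and the defaults are assumptions; the real `RequestFuncOutput` has additional members:

```python
from dataclasses import dataclass, field


@dataclass
class RequestFuncOutput:
    # Sketch: only the fields referenced by this patch; defaults are assumed.
    generated_text: str = ""
    success: bool = False
    latency: float = 0.0
    ttft: float = 0.0                                # time to first token
    itl: list[float] = field(default_factory=list)   # inter-token latencies
    tpot: float = 0.0                                # avg next-token latencies
    prompt_len: int = 0
    output_tokens: int = 0
    error: str = ""
    decode_iteration: int = 0                        # decoding iterations reported per request
```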
@@ -77,6 +78,7 @@ async def async_request_trt_llm(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url, json=payload) as response:
             if response.status == 200:
@@ -102,9 +104,12 @@
                         else:
                             output.itl.append(timestamp - most_recent_timestamp)

+                        # Increment decode iteration for each chunk
+                        decode_iteration_count += 1
                         most_recent_timestamp = timestamp

                     output.latency = most_recent_timestamp - st
+                    output.decode_iteration = decode_iteration_count
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
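The streaming path treats every server-sent chunk as one decoding iteration: the counter is bumped alongside the inter-token-latency bookkeeping and stored on the output record once the stream ends. A simplified, synchronous sketch of that loop (TTFT handling and JSON parsing elided; `chunks` stands in for the aiohttp `response.content` iterator):

```python
import time


def consume_stream(chunks, output):
    """Sketch of the chunk loop: one chunk counts as one decoding iteration."""
    st = time.perf_counter()
    most_recent_timestamp = st
    decode_iteration_count = 0
    for _chunk in chunks:
        timestamp = time.perf_counter()
        output.itl.append(timestamp - most_recent_timestamp)
        decode_iteration_count += 1          # increment per streamed chunk
        most_recent_timestamp = timestamp
    output.latency = most_recent_timestamp - st
    output.decode_iteration = decode_iteration_count
    return output
```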
@@ -112,6 +117,9 @@
                     output.itl = []
                     output.generated_text = data["text_output"]
                     output.latency = time.perf_counter() - st
+                    # For non-streaming, estimate decode_iteration as number of output tokens
+                    output.decode_iteration = len(output.generated_text.split(
+                    )) if output.generated_text else 1

             else:
                 output.error = response.reason or ""
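The non-streaming TRT-LLM branch has no per-chunk signal, so the patch falls back to a rough proxy: whitespace-split the generated text and treat each piece as one decoding iteration, with a floor of 1. A tiny illustration with a made-up response string:

```python
# Made-up non-streaming response text.
generated_text = "The quick brown fox jumps over the lazy dog"

# Same heuristic as the patch: word count stands in for the iteration count.
decode_iteration = len(generated_text.split()) if generated_text else 1
print(decode_iteration)  # 9
```

A whitespace split undercounts subword tokens, so acceptance rates derived from this branch are only approximate; the streaming path and the OpenAI `usage` fields give tighter numbers.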
@@ -170,6 +178,7 @@ async def async_request_openai_completions(
     generated_text = ""
     st = time.perf_counter()
     most_recent_timestamp = st
+    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -206,6 +215,9 @@ async def async_request_openai_completions(
                                 output.itl.append(timestamp -
                                                   most_recent_timestamp)

+                            # Increment decode iteration for each chunk with text
+                            if text is not None:
+                                decode_iteration_count += 1
                             most_recent_timestamp = timestamp
                             generated_text += text or ""
                         elif usage := data.get("usage"):
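Unlike the TRT-LLM handler, which counts every chunk, the OpenAI-style handlers only count chunks that actually carry `text` (or `content` in the chat variant), so usage-only or empty chunks do not inflate the count. A compact sketch of that guard over a hypothetical chunk sequence:

```python
# Hypothetical decoded SSE payloads: two text chunks, then a usage-only chunk.
chunks = [{"text": "Hello"}, {"text": " world"}, {"usage": {"completion_tokens": 2}}]

decode_iteration_count = 0
generated_text = ""
for data in chunks:
    text = data.get("text")
    if text is not None:                 # count only chunks that carry text
        decode_iteration_count += 1
    generated_text += text or ""

print(decode_iteration_count, repr(generated_text))  # 2 'Hello world'
```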
@@ -220,6 +232,7 @@ async def async_request_openai_completions(
                             "This response will be marked as failed!")
                     output.generated_text = generated_text
                     output.latency = most_recent_timestamp - st
+                    output.decode_iteration = decode_iteration_count
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
@@ -230,6 +243,8 @@ async def async_request_openai_completions(
                     output.ttft = -1
                     output.itl = []
                     output.output_tokens = data["usage"]["completion_tokens"]
+                    # For non-streaming, estimate decode_iteration as number of output tokens
+                    output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -306,6 +321,7 @@ async def async_request_openai_chat_completions(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -336,6 +352,9 @@ async def async_request_openai_chat_completions(
                                 output.itl.append(timestamp -
                                                   most_recent_timestamp)

+                            # Increment decode iteration for each chunk with content
+                            if content is not None:
+                                decode_iteration_count += 1
                             generated_text += content or ""
                         elif usage := data.get("usage"):
                             output.output_tokens = usage.get(
@@ -345,6 +364,7 @@ async def async_request_openai_chat_completions(

                     output.generated_text = generated_text
                     output.latency = most_recent_timestamp - st
+                    output.decode_iteration = decode_iteration_count
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
@@ -354,6 +374,8 @@ async def async_request_openai_chat_completions(
                     output.itl = []
                     output.latency = time.perf_counter() - st
                     output.ttft = -1
+                    # For non-streaming, estimate decode_iteration as number of output tokens
+                    output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1

             else:
                 output.error = response.reason or ""
@@ -79,6 +79,11 @@ class BenchmarkMetrics:
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
     tput_user: list[float]
+    # Request accuracy rate metrics
+    mean_request_ar: float
+    median_request_ar: float
+    std_request_ar: float
+    percentiles_request_ar: list[tuple[float, float]]


 async def get_request(
@@ -131,7 +136,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
+) -> tuple[BenchmarkMetrics, list[int], list[float]]:
     actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
@@ -142,6 +147,7 @@ def calculate_metrics(
     ttfts: list[float] = []
     e2els: list[float] = []
     tput_user: list[float] = []
+    request_ars: list[float] = []  # Request accuracy rates
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -167,9 +173,24 @@ def calculate_metrics(
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             tput_user.append(output_len / (outputs[i].latency))
+
+            # Calculate request accuracy rate (num_generated_tokens / (decode_iteration + 1))
+            decode_iter = outputs[i].decode_iteration
+            if decode_iter >= 0:
+                # For generated tokens, we use output_len - 1 (excluding the first token if needed)
+                # But according to the reference, it should be num_generated_tokens
+                num_generated_tokens = max(0, output_len -
+                                           1) if output_len > 1 else output_len
+                request_ar = num_generated_tokens / (
+                    decode_iter + 1) if decode_iter >= 0 else 0.0
+                request_ars.append(request_ar)
+            else:
+                request_ars.append(0.0)
+
             completed += 1
         else:
             actual_output_lens.append(0)
+            request_ars.append(0.0)

     if goodput_config_dict:
         valid_metrics = []
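This is the core of the feature: per request, the acceptance rate (AR) is the number of generated tokens divided by the number of decode iterations plus one. With speculative decoding each iteration can accept several draft tokens, so an AR well above 1.0 indicates drafts are being accepted, while plain autoregressive decoding stays around 1.0. A worked example that mirrors the patch's arithmetic, using assumed numbers:

```python
def request_acceptance_rate(output_len: int, decode_iteration: int) -> float:
    """Mirror of the per-request AR computation in calculate_metrics."""
    if decode_iteration < 0:
        return 0.0
    # As in the patch: drop the first token when more than one token was produced.
    num_generated_tokens = max(0, output_len - 1) if output_len > 1 else output_len
    return num_generated_tokens / (decode_iteration + 1)


# Assumed numbers: 16 output tokens produced in 7 decode iterations (speculative run).
print(request_acceptance_rate(16, 7))    # 1.875  -> drafts are being accepted
# Assumed numbers: 16 output tokens, one new token per step after the first.
print(request_acceptance_rate(16, 15))   # 0.9375 -> effectively no speculation
```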
@@ -228,8 +249,13 @@ def calculate_metrics(
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
         tput_user=np.mean(tput_user or 0),
+        mean_request_ar=np.mean(request_ars or 0),
+        median_request_ar=np.median(request_ars or 0),
+        std_request_ar=np.std(request_ars or 0),
+        percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
+                                for p in selected_percentiles],
     )
-    return metrics, actual_output_lens
+    return metrics, actual_output_lens, request_ars


 async def benchmark(
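Aggregation then follows the same pattern as the existing latency metrics: mean, median, standard deviation, and the selected percentiles over the per-request list, with `or 0` guarding the empty case. A self-contained sketch with made-up values:

```python
import numpy as np

# Made-up per-request acceptance rates; failed requests contribute 0.0.
request_ars = [1.9, 1.7, 2.1, 0.0, 1.8]
selected_percentiles = [50.0, 90.0, 99.0]

mean_request_ar = np.mean(request_ars or 0)
median_request_ar = np.median(request_ars or 0)
std_request_ar = np.std(request_ars or 0)
percentiles_request_ar = [(p, np.percentile(request_ars or 0, p))
                          for p in selected_percentiles]

print(mean_request_ar, median_request_ar)   # 1.5 1.8
print(percentiles_request_ar)
```

Because failed requests are recorded as 0.0, a batch with errors drags the mean down; the median and percentiles are more robust in that case.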
@@ -403,7 +429,7 @@ async def benchmark(
     # Close the session
     await session.close()

-    metrics, actual_output_lens = calculate_metrics(
+    metrics, actual_output_lens, request_ars = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -431,6 +457,10 @@ async def benchmark(
                                     metrics.total_token_throughput))
     print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
                                     metrics.tput_user))
+    print("{:<40} {:<10.4f}".format("Mean Request AR:",
+                                    metrics.mean_request_ar))
+    print("{:<40} {:<10.4f}".format("Median Request AR:",
+                                    metrics.median_request_ar))

     result = {
         "duration": benchmark_duration,
@@ -443,12 +473,16 @@ async def benchmark(
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "user_throughput": metrics.tput_user,
+        "mean_request_ar": metrics.mean_request_ar,
+        "median_request_ar": metrics.median_request_ar,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "request_ars": request_ars,
+        "decode_iterations": [output.decode_iteration for output in outputs],
     }

     def process_one_metric(
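The console report reuses the script's fixed-width format strings, just with four decimal places for the AR values. With assumed numbers, the two new lines render like this:

```python
# Assumed values, only to show the layout of the new report lines.
mean_request_ar, median_request_ar = 1.875, 1.9

print("{:<40} {:<10.4f}".format("Mean Request AR:", mean_request_ar))
print("{:<40} {:<10.4f}".format("Median Request AR:", median_request_ar))
# Mean Request AR:                         1.8750
# Median Request AR:                       1.9000
```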
@@ -534,11 +568,15 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     metrics = [
         "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
         "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
+        "mean_request_ar", "median_request_ar", "std_request_ar"
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    ignored_metrics = [
+        "ttfts", "itls", "generated_texts", "errors", "request_ars",
+        "decode_iterations"
+    ]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]]
@@ -762,7 +800,8 @@ def main(args: argparse.Namespace):
         # Remove fields with too many data points
         for field in [
                 "input_lens", "output_lens", "ttfts", "itls",
-                "generated_texts", "errors"
+                "generated_texts", "errors", "request_ars",
+                "decode_iterations"
         ]:
             if field in result_json:
                 del result_json[field]
@@ -963,11 +1002,11 @@ if __name__ == "__main__":
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl",
+        default="ttft,tpot,itl,request_ar",
         help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
-        "Default value is \"ttft,tpot,itl\".")
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
+        "Default value is \"ttft,tpot,itl,request_ar\".")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
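With the new default, request_ar percentiles are reported automatically. The snippet below shows how the two CLI strings presumably map onto the lists consumed by `calculate_metrics`; the comma-split parsing is an assumption based on the flag format, and the `--metric-percentiles` value is made up:

```python
# New default from this patch, plus an assumed percentile selection.
percentile_metrics = "ttft,tpot,itl,request_ar"     # --percentile-metrics
metric_percentiles = "50,90,99"                     # --metric-percentiles (assumed)

selected_percentile_metrics = percentile_metrics.split(",")
selected_percentiles = [float(p) for p in metric_percentiles.split(",")]

print(selected_percentile_metrics)  # ['ttft', 'tpot', 'itl', 'request_ar']
print(selected_percentiles)         # [50.0, 90.0, 99.0]
```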