perf: Add total token throughput metric. (#3212)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
This commit is contained in:
Frank 2025-04-04 22:17:59 -07:00 committed by GitHub
parent e12e7a753d
commit f8a4cc0629
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 14 additions and 0 deletions

View File

@ -194,6 +194,12 @@ class ReportUtility:
"""Output throughput in tokens per second."""
return self.convert_rate_to_s(self.statistics.output_throughput_tok_ns)
@property
def total_token_throughput_tok_s(self) -> float:
    """Total token throughput in tokens per second.

    Reads ``total_token_throughput_tok_ns`` from the benchmark statistics
    and passes it through ``convert_rate_to_s``.

    Returns:
        The rate as returned by ``convert_rate_to_s``.
    """
    to_per_second = self.convert_rate_to_s
    return to_per_second(self.statistics.total_token_throughput_tok_ns)
@property
def per_user_generation_token_throughput_s(self) -> float:
"""Output throughput per user in tokens per second."""
@ -314,6 +320,8 @@ class ReportUtility:
"system_output_throughput_tok_s":
self.output_throughput_tok_s,
# Total token throughput (input + output tokens combined)
"system_total_throughput_tok_s":
self.total_token_throughput_tok_s,
# Output throughput per user (average per request output throughput)
"output_throughput_per_user_tok_s":
self.per_user_output_throughput_tok_s,
# Output throughput per GPU (total throughput / world size)
@ -477,6 +485,7 @@ class ReportUtility:
f"Total Output Throughput (tokens/sec): {perf['system_output_throughput_tok_s']:.4f}\n"
f"Per User Output Throughput (tokens/sec/user): {perf['output_throughput_per_user_tok_s']:.4f}\n"
f"Per GPU Output Throughput (tokens/sec/gpu): {perf['output_throughput_per_gpu_tok_s']:.4f}\n"
f"Total Token Throughput (tokens/sec): {perf['system_total_throughput_tok_s']:.4f}\n"
f"Total Latency (ms): {perf['total_latency_ms']:.4f}\n"
f"Average request latency (ms): {perf['avg_request_latency_ms']:.4f}\n"
)

View File

@ -183,6 +183,11 @@ class BenchmarkStatistics(BaseModel):
def output_throughput_tok_ns(self) -> float:
return float(self.total_output_tokens) / self.total_latency_ns
@computed_field
def total_token_throughput_tok_ns(self) -> float:
    # Combined throughput: input tokens plus output tokens, divided by the
    # total latency in nanoseconds.
    # (Kept as comments rather than a docstring: pydantic would pick a
    # docstring up as this computed field's schema description.)
    token_count = self.total_input_tokens + self.total_output_tokens
    return float(token_count) / self.total_latency_ns
@computed_field
def output_throughput_tok_ns_per_user(self) -> float:
    # Per-user output throughput, reported as the `average` of the output
    # throughput percentile statistics.
    # (Kept as comments rather than a docstring: pydantic would pick a
    # docstring up as this computed field's schema description.)
    percentile_stats = self.output_throughput_percentiles
    return percentile_stats.average