From f8a4cc062900e0e84e4d67298a2c6e86717a3d0c Mon Sep 17 00:00:00 2001
From: Frank <3429989+FrankD412@users.noreply.github.com>
Date: Fri, 4 Apr 2025 22:17:59 -0700
Subject: [PATCH] perf: Add total token throughput metric. (#3212)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
---
 tensorrt_llm/bench/dataclasses/reporting.py  | 9 +++++++++
 tensorrt_llm/bench/dataclasses/statistics.py | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py
index 373d9c47bd..ad4bf9ed9f 100755
--- a/tensorrt_llm/bench/dataclasses/reporting.py
+++ b/tensorrt_llm/bench/dataclasses/reporting.py
@@ -194,6 +194,12 @@ class ReportUtility:
         """Output throughput in tokens per second."""
         return self.convert_rate_to_s(self.statistics.output_throughput_tok_ns)
 
+    @property
+    def total_token_throughput_tok_s(self) -> float:
+        """Total token throughput in tokens per second."""
+        return self.convert_rate_to_s(
+            self.statistics.total_token_throughput_tok_ns)
+
     @property
     def per_user_generation_token_throughput_s(self) -> float:
         """Output throughput per user in tokens per second."""
@@ -314,6 +320,8 @@ class ReportUtility:
             "system_output_throughput_tok_s":
             self.output_throughput_tok_s,
             # Output throughput per user (average per request output throughput)
+            "system_total_throughput_tok_s":
+            self.total_token_throughput_tok_s,
             "output_throughput_per_user_tok_s":
             self.per_user_output_throughput_tok_s,
             # Output throughput per GPU (total throughput / world size)
@@ -477,6 +485,7 @@ class ReportUtility:
             f"Total Output Throughput (tokens/sec): {perf['system_output_throughput_tok_s']:.4f}\n"
             f"Per User Output Throughput (tokens/sec/user): {perf['output_throughput_per_user_tok_s']:.4f}\n"
             f"Per GPU Output Throughput (tokens/sec/gpu): {perf['output_throughput_per_gpu_tok_s']:.4f}\n"
+            f"Total Token Throughput (tokens/sec): {perf['system_total_throughput_tok_s']:.4f}\n"
             f"Total Latency (ms): {perf['total_latency_ms']:.4f}\n"
             f"Average request latency (ms): {perf['avg_request_latency_ms']:.4f}\n"
         )
diff --git a/tensorrt_llm/bench/dataclasses/statistics.py b/tensorrt_llm/bench/dataclasses/statistics.py
index 0aa88d5555..84a1ac089e 100644
--- a/tensorrt_llm/bench/dataclasses/statistics.py
+++ b/tensorrt_llm/bench/dataclasses/statistics.py
@@ -183,6 +183,11 @@ class BenchmarkStatistics(BaseModel):
     def output_throughput_tok_ns(self) -> float:
         return float(self.total_output_tokens) / self.total_latency_ns
 
+    @computed_field
+    def total_token_throughput_tok_ns(self) -> float:
+        return float(self.total_input_tokens +
+                     self.total_output_tokens) / self.total_latency_ns
+
     @computed_field
     def output_throughput_tok_ns_per_user(self) -> float:
         return self.output_throughput_percentiles.average
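
For quick reference, the new metric is simply (input tokens + output tokens) divided by the total benchmark latency: BenchmarkStatistics keeps it as a per-nanosecond rate and ReportUtility rescales it to tokens per second for the summary. The following is a minimal standalone sketch with made-up numbers; the 1e9 scale factor is an assumption about what convert_rate_to_s does, and the variable names only mirror the BenchmarkStatistics fields rather than importing them.

# Minimal sketch of the added metric (hypothetical values, not from the benchmark).
total_input_tokens = 1_000_000      # prompt tokens summed over all requests
total_output_tokens = 250_000       # generated tokens summed over all requests
total_latency_ns = 30_000_000_000   # end-to-end benchmark latency in nanoseconds

# total_token_throughput_tok_ns: all tokens (input + output) per nanosecond.
total_token_throughput_tok_ns = float(total_input_tokens +
                                      total_output_tokens) / total_latency_ns

# Reporting rescales the per-nanosecond rate to tokens/sec; a plain 1e9 factor
# is assumed here in place of ReportUtility.convert_rate_to_s.
total_token_throughput_tok_s = total_token_throughput_tok_ns * 1.0e9

print(f"Total Token Throughput (tokens/sec): {total_token_throughput_tok_s:.4f}")
# -> Total Token Throughput (tokens/sec): 41666.6667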