perf: Add total token throughput metric. (#3212)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
2026-01-14 06:27:45 +08:00 · 2025-04-04 22:17:59 -07:00 · 2025-04-04 22:17:59 -07:00 · f8a4cc0629
commit f8a4cc0629
parent e12e7a753d
2 changed files with 14 additions and 0 deletions
--- a/tensorrt_llm/bench/dataclasses/reporting.py
+++ b/tensorrt_llm/bench/dataclasses/reporting.py
@@ -194,6 +194,12 @@ class ReportUtility:
        """Output throughput in tokens per second."""
        return self.convert_rate_to_s(self.statistics.output_throughput_tok_ns)

+    @property
+    def total_token_throughput_tok_s(self) -> float:
+        """Input plus output token throughput in tokens per second."""
+        rate_tok_ns = self.statistics.total_token_throughput_tok_ns
+        return self.convert_rate_to_s(rate_tok_ns)
+
    @property
    def per_user_generation_token_throughput_s(self) -> float:
        """Output throughput per user in tokens per second."""
@@ -314,6 +320,8 @@ class ReportUtility:
            "system_output_throughput_tok_s":
            self.output_throughput_tok_s,
+            "system_total_throughput_tok_s":
+            self.total_token_throughput_tok_s,
            # Output throughput per user (average per request output throughput)
            "output_throughput_per_user_tok_s":
            self.per_user_output_throughput_tok_s,
            # Output throughput per GPU (total throughput / world size)
@@ -477,6 +485,7 @@ class ReportUtility:
            f"Total Output Throughput (tokens/sec):             {perf['system_output_throughput_tok_s']:.4f}\n"
            f"Per User Output Throughput (tokens/sec/user):     {perf['output_throughput_per_user_tok_s']:.4f}\n"
            f"Per GPU Output Throughput (tokens/sec/gpu):       {perf['output_throughput_per_gpu_tok_s']:.4f}\n"
+            f"Total Token Throughput (tokens/sec):              {perf['system_total_throughput_tok_s']:.4f}\n"
            f"Total Latency (ms):                               {perf['total_latency_ms']:.4f}\n"
            f"Average request latency (ms):                     {perf['avg_request_latency_ms']:.4f}\n"
        )
--- a/tensorrt_llm/bench/dataclasses/statistics.py
+++ b/tensorrt_llm/bench/dataclasses/statistics.py
@@ -183,6 +183,11 @@ class BenchmarkStatistics(BaseModel):
    def output_throughput_tok_ns(self) -> float:
        return float(self.total_output_tokens) / self.total_latency_ns

+    @computed_field
+    def total_token_throughput_tok_ns(self) -> float:
+        total_tokens = self.total_input_tokens + self.total_output_tokens
+        return float(total_tokens) / self.total_latency_ns
+
    @computed_field
    def output_throughput_tok_ns_per_user(self) -> float:
        return self.output_throughput_percentiles.average