From f8a4cc062900e0e84e4d67298a2c6e86717a3d0c Mon Sep 17 00:00:00 2001
From: Frank <3429989+FrankD412@users.noreply.github.com>
Date: Fri, 4 Apr 2025 22:17:59 -0700
Subject: [PATCH] perf: Add total token throughput metric. (#3212)

Signed-off-by: Frank Di Natale <3429989+FrankD412@users.noreply.github.com>
---
 tensorrt_llm/bench/dataclasses/reporting.py  | 9 +++++++++
 tensorrt_llm/bench/dataclasses/statistics.py | 5 +++++
 2 files changed, 14 insertions(+)

diff --git a/tensorrt_llm/bench/dataclasses/reporting.py b/tensorrt_llm/bench/dataclasses/reporting.py
index 373d9c47bd..ad4bf9ed9f 100755
--- a/tensorrt_llm/bench/dataclasses/reporting.py
+++ b/tensorrt_llm/bench/dataclasses/reporting.py
@@ -194,6 +194,12 @@ class ReportUtility:
         """Output throughput in tokens per second."""
         return self.convert_rate_to_s(self.statistics.output_throughput_tok_ns)
 
+    @property
+    def total_token_throughput_tok_s(self) -> float:
+        """Total token throughput in tokens per second."""
+        return self.convert_rate_to_s(
+            self.statistics.total_token_throughput_tok_ns)
+
     @property
     def per_user_generation_token_throughput_s(self) -> float:
         """Output throughput per user in tokens per second."""
@@ -314,6 +320,8 @@ class ReportUtility:
             "system_output_throughput_tok_s":
             self.output_throughput_tok_s,
             # Output throughput per user (average per request output throughput)
+            "system_total_throughput_tok_s":
+            self.total_token_throughput_tok_s,
             "output_throughput_per_user_tok_s":
             self.per_user_output_throughput_tok_s,
             # Output throughput per GPU (total throughput / world size)
@@ -477,6 +485,7 @@ class ReportUtility:
             f"Total Output Throughput (tokens/sec): {perf['system_output_throughput_tok_s']:.4f}\n"
             f"Per User Output Throughput (tokens/sec/user): {perf['output_throughput_per_user_tok_s']:.4f}\n"
             f"Per GPU Output Throughput (tokens/sec/gpu): {perf['output_throughput_per_gpu_tok_s']:.4f}\n"
+            f"Total Token Throughput (tokens/sec): {perf['system_total_throughput_tok_s']:.4f}\n"
             f"Total Latency (ms): {perf['total_latency_ms']:.4f}\n"
             f"Average request latency (ms): {perf['avg_request_latency_ms']:.4f}\n"
         )
diff --git a/tensorrt_llm/bench/dataclasses/statistics.py b/tensorrt_llm/bench/dataclasses/statistics.py
index 0aa88d5555..84a1ac089e 100644
--- a/tensorrt_llm/bench/dataclasses/statistics.py
+++ b/tensorrt_llm/bench/dataclasses/statistics.py
@@ -183,6 +183,11 @@ class BenchmarkStatistics(BaseModel):
     def output_throughput_tok_ns(self) -> float:
         return float(self.total_output_tokens) / self.total_latency_ns
 
+    @computed_field
+    def total_token_throughput_tok_ns(self) -> float:
+        return float(self.total_input_tokens +
+                     self.total_output_tokens) / self.total_latency_ns
+
     @computed_field
     def output_throughput_tok_ns_per_user(self) -> float:
         return self.output_throughput_percentiles.average
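
For quick reference, the new metric is simply (input tokens + output tokens) divided by the total benchmark latency: BenchmarkStatistics keeps it as a per-nanosecond rate and ReportUtility rescales it to tokens per second for the summary. The following is a minimal standalone sketch with made-up numbers; the 1e9 scale factor is an assumption about what convert_rate_to_s does, and the variable names only mirror the BenchmarkStatistics fields rather than importing them.

# Minimal sketch of the added metric (hypothetical values, not from the benchmark).
total_input_tokens = 1_000_000      # prompt tokens summed over all requests
total_output_tokens = 250_000       # generated tokens summed over all requests
total_latency_ns = 30_000_000_000   # end-to-end benchmark latency in nanoseconds

# total_token_throughput_tok_ns: all tokens (input + output) per nanosecond.
total_token_throughput_tok_ns = float(total_input_tokens +
                                      total_output_tokens) / total_latency_ns

# Reporting rescales the per-nanosecond rate to tokens/sec; a plain 1e9 factor
# is assumed here in place of ReportUtility.convert_rate_to_s.
total_token_throughput_tok_s = total_token_throughput_tok_ns * 1.0e9

print(f"Total Token Throughput (tokens/sec): {total_token_throughput_tok_s:.4f}")
# -> Total Token Throughput (tokens/sec): 41666.6667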