# Source: TensorRT-LLM — tensorrt_llm/bench/dataclasses/statistics.py
# Commit: 6a3a921284 (kris1025)
# [TRTLLM-6685][feat] Add speculative metrics for trt llm bench (#6476)
# Signed-off-by: linquanh <linquanh@nvidia.com>
# Date: 2025-08-04 15:22:57 -07:00
from __future__ import annotations
from typing import Any, List, Optional
from pydantic import BaseModel, computed_field
class RequestRecord(BaseModel):
    """Per-request record of token and timing events captured during a benchmark.

    Timestamp fields hold integer timestamps (presumably nanoseconds — the
    *_ns naming in BenchmarkStatistics suggests so; confirm against the
    caller); a value of -1 means "not yet recorded".
    """
    # Unique request identifier (-1 until assigned).
    id: int = -1
    # Number of prompt tokens for this request (-1 until known).
    num_input_tokens: int = -1
    # All output tokens received so far, in arrival order.
    tokens: List[int] = []
    # Number of responses that were flagged as errors.
    error_tokens: int = 0
    # Event timestamps; -1 until the corresponding event happens.
    start_timestamp: int = -1
    first_token_timestamp: int = -1
    end_timestamp: int = -1
    # Decoder iteration count reported with the most recent event.
    decode_iteration: int = 0

    def register_event(self,
                       is_error: bool,
                       is_final: bool,
                       timestamp: int,
                       decoding_iter: int,
                       tokens: List[int],
                       first_token_timestamp: Optional[int] = None) -> None:
        """Record one streamed response event for this request.

        Args:
            is_error: Whether this response was an error response.
            is_final: Whether this is the last response of the request.
            timestamp: Time at which the response was observed.
            decoding_iter: Decoder iteration reported by the runtime.
            tokens: Output tokens carried by this response.
            first_token_timestamp: Explicit first-token time; only honored on
                the final event (it overrides any previously recorded value).
        """
        if is_final:
            self.end_timestamp = timestamp
        elif self.first_token_timestamp == -1:
            # First non-final response marks the time-to-first-token.
            self.first_token_timestamp = timestamp
        # An explicitly supplied first-token time on the final event wins.
        if first_token_timestamp is not None and is_final:
            self.first_token_timestamp = first_token_timestamp
        if is_error:
            self.error_tokens += 1
        self.tokens += tokens
        self.decode_iteration = decoding_iter

    @computed_field
    def num_total_output_tokens(self) -> int:
        """
        Returns the total number of output tokens generated by the request.
        """
        return len(self.tokens)

    @computed_field
    def num_generated_tokens(self) -> int:
        """
        Returns the number of generated (OSL - 1) tokens by the request.
        """
        return self.num_total_output_tokens - 1

    @computed_field
    def generation_time(self) -> float:
        """
        Returns the generation time of the request (E2E Latency - TTFT).
        """
        return self.end_to_end_latency - self.time_to_first_token

    @computed_field
    def time_to_first_token(self) -> float:
        """
        Returns the time to first token (first token time - start time),
        or 0.0 if no first token was recorded.
        """
        return (self.first_token_timestamp -
                self.start_timestamp if self.first_token_timestamp > 0 else 0.0)

    @computed_field
    def intertoken_latency(self) -> float:
        """
        Returns the time-per-output-token latency of the request [(E2E Latency - TTFT) / (OSL - 1)].
        """
        return ((self.end_timestamp - self.first_token_timestamp) /
                self.num_generated_tokens
                if self.num_generated_tokens > 0 else 0.0)

    @computed_field
    def end_to_end_latency(self) -> int:
        """
        Returns the end-to-end latency of the request (end time - start time).
        """
        return self.end_timestamp - self.start_timestamp

    @computed_field
    def output_token_throughput(self) -> float:
        """
        Returns the total token throughput of the request (Total output tokens / E2E Latency),
        or 0.0 if no latency was recorded.
        """
        # Guarded like intertoken_latency to avoid ZeroDivisionError on
        # requests that never completed.
        return (float(self.num_total_output_tokens) / self.end_to_end_latency
                if self.end_to_end_latency > 0 else 0.0)

    @computed_field
    def generation_token_throughput(self) -> float:
        """
        Returns the generation throughput of the request (generated tokens / generation time),
        or 0.0 if no generation time was recorded.
        """
        return (self.num_generated_tokens / self.generation_time
                if self.generation_time > 0 else 0.0)
class PercentileStats(BaseModel):
    """Percentile summary (p50/p90/p95/p99, min, max, mean) of a metric."""
    p50: float
    p90: float
    p95: float
    p99: float
    minimum: float
    maximum: float
    average: float

    @classmethod
    def from_iterable(cls, values: List[Any]) -> PercentileStats:
        """Compute percentile statistics from a non-empty list of values.

        Percentiles use the nearest-rank convention: the element at index
        int(len * q) of the sorted values.

        Args:
            values: Numeric samples to summarize.

        Returns:
            A populated PercentileStats instance.

        Raises:
            ValueError: If ``values`` is empty.
        """
        # Fail fast with a clear message instead of a cryptic
        # IndexError/ZeroDivisionError further down.
        if not values:
            raise ValueError(
                "Cannot compute percentile statistics from an empty iterable.")
        length = len(values)
        sorted_values = sorted(values)
        return cls(
            p50=sorted_values[int(length * 0.50)],
            p90=sorted_values[int(length * 0.90)],
            p95=sorted_values[int(length * 0.95)],
            p99=sorted_values[int(length * 0.99)],
            average=float(sum(values)) / length,
            minimum=min(values),
            maximum=max(values),
        )
class BenchmarkStatistics(BaseModel):
    """Aggregate statistics for a benchmark run.

    The *_ns suffixes indicate nanosecond-based quantities. The Optional
    percentile fields must be populated before their corresponding computed
    properties are accessed (they dereference ``.average`` unguarded).
    """
    # Time-related Properties
    total_latency_ns: float
    # Token-related Properties
    total_output_tokens: int
    total_input_tokens: int
    # General Information
    num_requests: int
    issue_rate_ns: float
    # Speculative Information
    acceptance_length: float
    # Percentile-related Statistics
    request_latency_percentiles: Optional[PercentileStats] = None
    output_throughput_percentiles: Optional[PercentileStats] = None
    token_percentiles: Optional[PercentileStats] = None
    tpot_percentiles: Optional[PercentileStats] = None
    ttft_percentiles: Optional[PercentileStats] = None
    generation_tp_percentiles: Optional[PercentileStats] = None
    generation_latency_percentiles: Optional[PercentileStats] = None
    # Percentile-related Speculative Statistics
    num_draft_tokens_percentiles: Optional[PercentileStats] = None
    num_accepted_draft_tokens_percentiles: Optional[PercentileStats] = None
    draft_acceptance_rate_percentiles: Optional[PercentileStats] = None
    acceptance_length_percentiles: Optional[PercentileStats] = None

    @computed_field
    def sum_per_request_latencies_ns(self) -> float:
        """Sum of all per-request end-to-end latencies (mean latency * request count)."""
        return self.request_latency_percentiles.average * self.num_requests

    @computed_field
    def avg_concurrent_requests(self) -> float:
        """Average number of requests in flight (sum of request latencies / wall time)."""
        return self.sum_per_request_latencies_ns / self.total_latency_ns

    @computed_field
    def generation_tokens(self) -> int:
        """Total generated tokens, excluding each request's first output token."""
        return int(self.total_output_tokens - self.num_requests)

    @computed_field
    def total_generation_time_ns(self) -> float:
        """Sum of all per-request generation times (mean generation latency * request count)."""
        return self.generation_latency_percentiles.average * self.num_requests

    @computed_field
    def per_user_time_per_output_token_ns(self) -> float:
        """Average time-per-output-token across requests."""
        return self.tpot_percentiles.average

    @computed_field
    def per_user_time_to_first_token_ns(self) -> float:
        """Average time-to-first-token across requests."""
        return self.ttft_percentiles.average

    @computed_field
    def per_user_generation_token_throughput_ns(self) -> float:
        """Average per-request generation token throughput."""
        return self.generation_tp_percentiles.average

    @computed_field
    def request_throughput_ns(self) -> float:
        """Completed requests per unit time (requests / total latency)."""
        return float(self.num_requests) / self.total_latency_ns

    @computed_field
    def average_input_length(self) -> float:
        """Mean number of input tokens per request."""
        return float(self.total_input_tokens) / self.num_requests

    @computed_field
    def average_output_length(self) -> float:
        """Mean number of output tokens per request."""
        return float(self.total_output_tokens) / self.num_requests

    @computed_field
    def output_throughput_tok_ns(self) -> float:
        """System-wide output token throughput (output tokens / total latency)."""
        return float(self.total_output_tokens) / self.total_latency_ns

    @computed_field
    def total_token_throughput_tok_ns(self) -> float:
        """System-wide total token throughput (input + output tokens / total latency)."""
        return float(self.total_input_tokens +
                     self.total_output_tokens) / self.total_latency_ns

    @computed_field
    def output_throughput_tok_ns_per_user(self) -> float:
        """Average per-request output token throughput."""
        return self.output_throughput_percentiles.average