diff --git a/tensorrt_llm/serve/scripts/backend_request_func.py b/tensorrt_llm/serve/scripts/backend_request_func.py
index 990fcc72e9..c65cd8e839 100644
--- a/tensorrt_llm/serve/scripts/backend_request_func.py
+++ b/tensorrt_llm/serve/scripts/backend_request_func.py
@@ -44,6 +44,7 @@ class RequestFuncOutput:
     tpot: float = 0.0  # avg next-token latencies
     prompt_len: int = 0
     error: str = ""
+    decode_iteration: int = 0  # Number of decoding iterations


 async def async_request_trt_llm(
@@ -77,6 +78,7 @@ async def async_request_trt_llm(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url, json=payload) as response:
             if response.status == 200:
@@ -102,9 +104,12 @@ async def async_request_trt_llm(
                         else:
                             output.itl.append(timestamp - most_recent_timestamp)

+                        # Increment decode iteration for each chunk
+                        decode_iteration_count += 1
                         most_recent_timestamp = timestamp

                     output.latency = most_recent_timestamp - st
+                    output.decode_iteration = decode_iteration_count
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
@@ -112,6 +117,9 @@
                     output.itl = []
                     output.generated_text = data["text_output"]
                     output.latency = time.perf_counter() - st
+                    # For non-streaming, estimate decode_iteration as number of output tokens
+                    output.decode_iteration = len(output.generated_text.split(
+                    )) if output.generated_text else 1

             else:
                 output.error = response.reason or ""
@@ -170,6 +178,7 @@ async def async_request_openai_completions(
     generated_text = ""
     st = time.perf_counter()
     most_recent_timestamp = st
+    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -206,6 +215,9 @@
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)
+                                # Increment decode iteration for each chunk with text
+                                if text is not None:
+                                    decode_iteration_count += 1
                                 most_recent_timestamp = timestamp
                                 generated_text += text or ""
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
@@ -220,6 +232,7 @@
                             "This response will be marked as failed!")
                     output.generated_text = generated_text
                     output.latency = most_recent_timestamp - st
+                    output.decode_iteration = decode_iteration_count
                 else:
                     content = await response.content.read()
                     data = json.loads(content.decode())
@@ -230,6 +243,8 @@
                     output.ttft = -1
                     output.itl = []
                     output.output_tokens = data["usage"]["completion_tokens"]
+                    # For non-streaming, estimate decode_iteration as number of output tokens
+                    output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1
             else:
                 output.error = response.reason or ""
                 output.success = False
@@ -306,6 +321,7 @@ async def async_request_openai_chat_completions(
     ttft = 0.0
     st = time.perf_counter()
     most_recent_timestamp = st
+    decode_iteration_count = 0  # Track decoding iterations
     try:
         async with request_session.post(url=api_url,
                                         json=payload,
@@ -336,6 +352,9 @@
                                     output.itl.append(timestamp -
                                                       most_recent_timestamp)

+                                # Increment decode iteration for each chunk with content
+                                if content is not None:
+                                    decode_iteration_count += 1
                                 generated_text += content or ""
                             elif usage := data.get("usage"):
                                 output.output_tokens = usage.get(
@@ -345,6 +364,7 @@

                 output.generated_text = generated_text
                 output.latency = most_recent_timestamp - st
+                output.decode_iteration = decode_iteration_count
             else:
                 content = await response.content.read()
                 data = json.loads(content.decode())
@@ -354,6 +374,8 @@
                 output.itl = []
                 output.latency = time.perf_counter() - st
                 output.ttft = -1
+                # For non-streaming, estimate decode_iteration as number of output tokens
+                output.decode_iteration = output.output_tokens if output.output_tokens > 0 else 1

             else:
                 output.error = response.reason or ""
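The request functions above count one decode iteration per streamed chunk (only chunks carrying text/content for the OpenAI endpoints) and fall back to a token-count estimate for non-streaming responses. The snippet below is an illustrative sketch of that bookkeeping, not code from the patch; `fake_stream` and `consume` are hypothetical helpers used only to make the counting and ITL logic concrete.

```python
import asyncio
import time


async def fake_stream(chunks, delay_s=0.01):
    # Stand-in for a streaming response: yields one text chunk per decode step.
    for chunk in chunks:
        await asyncio.sleep(delay_s)
        yield chunk


async def consume(stream):
    # Count one decode iteration per chunk that carries text (is not None) and
    # record inter-token latencies from the second counted chunk onwards,
    # loosely mirroring the bookkeeping added in the patch above.
    decode_iteration_count = 0
    itl = []
    st = time.perf_counter()
    most_recent_timestamp = st
    generated_text = ""
    async for chunk in stream:
        timestamp = time.perf_counter()
        if chunk is not None:
            decode_iteration_count += 1
        if decode_iteration_count > 1:
            itl.append(timestamp - most_recent_timestamp)
        most_recent_timestamp = timestamp
        generated_text += chunk or ""
    return generated_text, decode_iteration_count, itl


if __name__ == "__main__":
    text, iters, itl = asyncio.run(consume(fake_stream(["Hel", "lo", "!"])))
    print(text, iters, len(itl))  # -> Hello! 3 2
```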
diff --git a/tensorrt_llm/serve/scripts/benchmark_serving.py b/tensorrt_llm/serve/scripts/benchmark_serving.py
index cedbe34056..5ca3a63a5d 100644
--- a/tensorrt_llm/serve/scripts/benchmark_serving.py
+++ b/tensorrt_llm/serve/scripts/benchmark_serving.py
@@ -79,6 +79,11 @@ class BenchmarkMetrics:
     std_e2el_ms: float
     percentiles_e2el_ms: list[tuple[float, float]]
     tput_user: list[float]
+    # Request accuracy rate metrics
+    mean_request_ar: float
+    median_request_ar: float
+    std_request_ar: float
+    percentiles_request_ar: list[tuple[float, float]]


 async def get_request(
@@ -131,7 +136,7 @@ def calculate_metrics(
     selected_percentile_metrics: list[str],
     selected_percentiles: list[float],
     goodput_config_dict: dict[str, float],
-) -> tuple[BenchmarkMetrics, list[int]]:
+) -> tuple[BenchmarkMetrics, list[int], list[float]]:
     actual_output_lens: list[int] = []
     total_input = 0
     completed = 0
@@ -142,6 +147,7 @@
     ttfts: list[float] = []
     e2els: list[float] = []
     tput_user: list[float] = []
+    request_ars: list[float] = []  # Request accuracy rates
     for i in range(len(outputs)):
         if outputs[i].success:
             output_len = outputs[i].output_tokens
@@ -167,9 +173,24 @@
             ttfts.append(outputs[i].ttft)
             e2els.append(outputs[i].latency)
             tput_user.append(output_len / (outputs[i].latency))
+
+            # Calculate request accuracy rate (num_generated_tokens / (decode_iteration + 1))
+            decode_iter = outputs[i].decode_iteration
+            if decode_iter >= 0:
+                # For generated tokens, we use output_len - 1 (excluding the first token if needed)
+                # But according to the reference, it should be num_generated_tokens
+                num_generated_tokens = max(0, output_len -
+                                           1) if output_len > 1 else output_len
+                request_ar = num_generated_tokens / (
+                    decode_iter + 1) if decode_iter >= 0 else 0.0
+                request_ars.append(request_ar)
+            else:
+                request_ars.append(0.0)
+
             completed += 1
         else:
             actual_output_lens.append(0)
+            request_ars.append(0.0)

     if goodput_config_dict:
         valid_metrics = []
@@ -228,8 +249,13 @@
         percentiles_e2el_ms=[(p, np.percentile(e2els or 0, p) * 1000)
                              for p in selected_percentiles],
         tput_user=np.mean(tput_user or 0),
+        mean_request_ar=np.mean(request_ars or 0),
+        median_request_ar=np.median(request_ars or 0),
+        std_request_ar=np.std(request_ars or 0),
+        percentiles_request_ar=[(p, np.percentile(request_ars or 0, p))
+                                for p in selected_percentiles],
     )
-    return metrics, actual_output_lens
+    return metrics, actual_output_lens, request_ars


 async def benchmark(
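For reference, here is a minimal, self-contained sketch of the per-request metric that `calculate_metrics` derives above. The helper name `request_ar_for` and the sample numbers are mine, not the script's: the patch divides the generated-token count (minus the first token when more than one was produced) by `decode_iteration + 1`, so values near 1.0 mean one new token per decode iteration and larger values presumably indicate several tokens per iteration (e.g. accepted draft tokens).

```python
import numpy as np


def request_ar_for(output_len: int, decode_iteration: int) -> float:
    # Mirrors the patch: generated tokens (excluding the first, when possible)
    # divided by the number of decode iterations.
    if decode_iteration < 0:
        return 0.0
    num_generated = max(0, output_len - 1) if output_len > 1 else output_len
    return num_generated / (decode_iteration + 1)


# Toy aggregation in the style of calculate_metrics (sample values are made up).
request_ars = [request_ar_for(n, d) for n, d in [(128, 127), (128, 63), (64, 21)]]
print(round(float(np.mean(request_ars)), 4))                           # mean AR
print([(p, float(np.percentile(request_ars, p))) for p in (50, 99)])   # percentiles
```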
@@ -403,7 +429,7 @@
     # Close the session
     await session.close()

-    metrics, actual_output_lens = calculate_metrics(
+    metrics, actual_output_lens, request_ars = calculate_metrics(
         input_requests=input_requests,
         outputs=outputs,
         dur_s=benchmark_duration,
@@ -431,6 +457,10 @@
                                     metrics.total_token_throughput))
     print("{:<40} {:<10.2f}".format("User throughput (tok/s):",
                                     metrics.tput_user))
+    print("{:<40} {:<10.4f}".format("Mean Request AR:",
+                                    metrics.mean_request_ar))
+    print("{:<40} {:<10.4f}".format("Median Request AR:",
+                                    metrics.median_request_ar))

     result = {
         "duration": benchmark_duration,
@@ -443,12 +473,16 @@
         "output_throughput": metrics.output_throughput,
         "total_token_throughput": metrics.total_token_throughput,
         "user_throughput": metrics.tput_user,
+        "mean_request_ar": metrics.mean_request_ar,
+        "median_request_ar": metrics.median_request_ar,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,
         "ttfts": [output.ttft for output in outputs],
         "itls": [output.itl for output in outputs],
         "generated_texts": [output.generated_text for output in outputs],
         "errors": [output.error for output in outputs],
+        "request_ars": request_ars,
+        "decode_iterations": [output.decode_iteration for output in outputs],
     }

     def process_one_metric(
@@ -534,11 +568,15 @@ def save_to_pytorch_benchmark_format(args: argparse.Namespace,
     metrics = [
         "median_ttft_ms", "mean_ttft_ms", "std_ttft_ms", "p99_ttft_ms",
         "mean_tpot_ms", "median_tpot_ms", "std_tpot_ms", "p99_tpot_ms",
-        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms"
+        "median_itl_ms", "mean_itl_ms", "std_itl_ms", "p99_itl_ms",
+        "mean_request_ar", "median_request_ar", "std_request_ar"
     ]
     # These raw data might be useful, but they are rather big. They can be added
     # later if needed
-    ignored_metrics = ["ttfts", "itls", "generated_texts", "errors"]
+    ignored_metrics = [
+        "ttfts", "itls", "generated_texts", "errors", "request_ars",
+        "decode_iterations"
+    ]
     pt_records = convert_to_pytorch_benchmark_format(
         args=args,
         metrics={k: [results[k]
@@ -762,7 +800,8 @@ def main(args: argparse.Namespace):
             # Remove fields with too many data points
             for field in [
                     "input_lens", "output_lens", "ttfts", "itls",
-                    "generated_texts", "errors"
+                    "generated_texts", "errors", "request_ars",
+                    "decode_iterations"
             ]:
                 if field in result_json:
                     del result_json[field]
@@ -963,11 +1002,11 @@
     parser.add_argument(
         "--percentile-metrics",
         type=str,
-        default="ttft,tpot,itl",
+        default="ttft,tpot,itl,request_ar",
        help="Comma-separated list of selected metrics to report percentils. "
         "This argument specifies the metrics to report percentiles. "
-        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\". "
-        "Default value is \"ttft,tpot,itl\".")
+        "Allowed metric names are \"ttft\", \"tpot\", \"itl\", \"e2el\", \"request_ar\". "
+        "Default value is \"ttft,tpot,itl,request_ar\".")
     parser.add_argument(
         "--metric-percentiles",
         type=str,
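Finally, a small sketch of how the new default interacts with the two flags touched above. Only the flag names and the default strings come from the diff; the split/float parsing shown here is illustrative of the values `calculate_metrics` ultimately needs (a list of metric names plus a list of percentiles), not a verbatim copy of the script's argument handling.

```python
# New default from the diff; the "99" is just an example --metric-percentiles value.
percentile_metrics = "ttft,tpot,itl,request_ar"
metric_percentiles = "99"

selected_percentile_metrics = percentile_metrics.split(",")
selected_percentiles = [float(p) for p in metric_percentiles.split(",")]

assert "request_ar" in selected_percentile_metrics
print(selected_percentile_metrics, selected_percentiles)
```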