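"""Shared helpers for benchmarking TensorRT-LLM models served through Triton.

Covers client creation (HTTP/gRPC), request tensor preparation, synchronous
and asynchronous request submission, and latency/throughput reporting.
"""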
import csv
import json
import math
import queue
import random
from datetime import timedelta
from functools import partial

import numpy as np
import pandas as pd
import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient
from tabulate import tabulate
from tritonclient.utils import np_to_triton_dtype


class UserData:
    """Per-request bookkeeping shared with the async completion callback."""

    def __init__(self):
        self._completed_requests = queue.Queue()
        self._latencies = []
        self._latency_dict = {}
        self._start_time_dict = {}
        self._stop_time_dict = {}


# Callback function used for async_stream_infer()
def completion_callback(user_data, result, error):
    # Pass both the result and any error out to the caller for handling.
    user_data._completed_requests.put((result, error))


def prepare_tensor(name, input, protocol):
    client_util = httpclient if protocol == "http" else grpcclient
    t = client_util.InferInput(name, input.shape,
                               np_to_triton_dtype(input.dtype))
    t.set_data_from_numpy(input)
    return t


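# Usage sketch (illustrative values, not taken from the benchmark itself):
#   ids = np.array([[1, 2, 3]], dtype=np.int32)
#   tensor = prepare_tensor("input_ids", ids, "grpc")
# yields an InferInput carrying the batch of token ids in the dtype string
# Triton expects ("INT32" here, via np_to_triton_dtype).

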
def prepare_outputs(protocol,
                    return_log_probs=False,
                    return_context_logits=False,
                    return_generation_logits=False,
                    return_finish_reason=False,
                    return_stop_reason=False,
                    return_cumulative_logprob=False):

    client_util = httpclient if protocol == "http" else grpcclient

    outputs = [client_util.InferRequestedOutput("text_output")]

    if return_log_probs:
        outputs.append(client_util.InferRequestedOutput("cum_log_probs"))
        outputs.append(client_util.InferRequestedOutput("output_log_probs"))

    if return_context_logits:
        outputs.append(client_util.InferRequestedOutput("context_logits"))

    if return_generation_logits:
        outputs.append(client_util.InferRequestedOutput("generation_logits"))

    if return_finish_reason:
        outputs.append(client_util.InferRequestedOutput("finish_reason"))

    if return_stop_reason:
        outputs.append(client_util.InferRequestedOutput("stop_reason"))

    if return_cumulative_logprob:
        outputs.append(client_util.InferRequestedOutput("cumulative_logprob"))

    return outputs


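# For example, to also fetch log probabilities over gRPC:
#   outputs = prepare_outputs("grpc", return_log_probs=True)
# requests the "text_output", "cum_log_probs", and "output_log_probs" tensors.

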
def prepare_inputs(input_start_ids, input_len, pad_id, end_id, flags):
    output_len = np.ones([input_start_ids.shape[0], 1]).astype(
        np.int32) * flags.output_len
    runtime_top_k = (flags.topk *
                     np.ones([input_start_ids.shape[0], 1])).astype(np.int32)
    runtime_top_p = flags.topp * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    beam_search_diversity_rate = 0.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    temperature = 1.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    len_penalty = 1.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    repetition_penalty = 1.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    seed = 0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.uint64)
    output_log_probs = True * \
        np.ones([input_start_ids.shape[0], 1]).astype(bool)
    beam_width = (flags.beam_width *
                  np.ones([input_start_ids.shape[0], 1])).astype(np.int32)
    pad_ids = pad_id * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.int32)
    end_ids = end_id * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.int32)
    min_tokens = 1 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.int32)
    presence_penalty = 0.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    frequency_penalty = 0.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    bad_words_list = np.concatenate([
        np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32),
        (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)
    ],
                                    axis=1)
    stop_word_list = np.concatenate([
        np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32),
        (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)
    ],
                                    axis=1)
    inputs = [
        prepare_tensor("input_ids", input_start_ids, flags.protocol),
        prepare_tensor("input_lengths", input_len, flags.protocol),
        prepare_tensor("request_output_len", output_len, flags.protocol),
        prepare_tensor("pad_id", pad_ids, flags.protocol),
        prepare_tensor("end_id", end_ids, flags.protocol),
        prepare_tensor("beam_width", beam_width, flags.protocol),
        prepare_tensor("temperature", temperature, flags.protocol),
        prepare_tensor("runtime_top_k", runtime_top_k, flags.protocol),
        prepare_tensor("runtime_top_p", runtime_top_p, flags.protocol),
        prepare_tensor("len_penalty", len_penalty, flags.protocol),
        prepare_tensor("repetition_penalty", repetition_penalty,
                       flags.protocol),
        prepare_tensor("min_tokens", min_tokens, flags.protocol),
        prepare_tensor("presence_penalty", presence_penalty, flags.protocol),
        prepare_tensor("frequency_penalty", frequency_penalty, flags.protocol),
        prepare_tensor("seed", seed, flags.protocol),
        prepare_tensor("output_log_probs", output_log_probs, flags.protocol),
        # prepare_tensor("bad_words_list", bad_words_list, flags.protocol),
        # prepare_tensor("stop_words_list", stop_word_list, flags.protocol),
    ]
    return inputs


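# Note: bad_words_list / stop_word_list above appear to follow the
# [batch, 2, num_words] ids/offsets convention, here encoding an empty word
# list (token id 0 with offset -1 marking an unused slot); they are only sent
# if the commented-out tensors are re-enabled.

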
def create_inference_server_client(protocol, url, concurrency, verbose):
    client_util = httpclient if protocol == "http" else grpcclient
    if protocol == "http":
        # Only the HTTP client takes a client-side concurrency setting.
        return client_util.InferenceServerClient(url,
                                                 concurrency=concurrency,
                                                 verbose=verbose)
    elif protocol == "grpc":
        return client_util.InferenceServerClient(url, verbose=verbose)


def send_requests(model_name, inputs, client, request_parallelism):
    results = []
    for _ in range(request_parallelism):
        result = client.infer(model_name, inputs)
        results.append(result)
    return results


def send_requests_async(model_name, inputs, client, flags, request_parallelism):
    # HTTP returns a list of async request handles; gRPC collects results
    # through a UserData queue populated by completion_callback.
    if flags.protocol == "http":
        async_requests = []
        for _ in range(request_parallelism):
            async_requests.append(client.async_infer(model_name, inputs))
        return async_requests
    else:
        user_data = UserData()
        for _ in range(request_parallelism):
            client.async_infer(model_name, inputs,
                               partial(completion_callback, user_data))
        return user_data


def get_http_results(async_requests):
    results = []
    for async_request in async_requests:
        results.append(async_request.get_result())
    return results


def get_grpc_results(user_data, request_parallelism):
    results = []
    processed_count = 0
    while processed_count < request_parallelism:
        (result, error) = user_data._completed_requests.get()
        processed_count += 1
        if error is not None:
            raise RuntimeError(error)
        results.append(result)
    return results


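# Typical flow (sketch): fire N async requests, then drain them with the
# helper matching the protocol in use:
#   handles = send_requests_async(model_name, inputs, client, flags, 8)
#   results = get_http_results(handles)        # flags.protocol == "http"
#   results = get_grpc_results(handles, 8)     # flags.protocol == "grpc"

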
def append_start_and_end_ids(inputs,
                             batch_size,
                             flags,
                             start_id=None,
                             end_id=None):
    if start_id is not None:
        start_ids = start_id * np.ones([batch_size, 1]).astype(np.int32)
        inputs.append(prepare_tensor("start_id", start_ids, flags.protocol))
    if end_id is not None:
        end_ids = end_id * np.ones([batch_size, 1]).astype(np.int32)
        inputs.append(prepare_tensor("end_id", end_ids, flags.protocol))


def generate_histogram(range_buckets, frequencies):
    histogram = []

    # Seed once for determinism across the whole histogram (re-seeding inside
    # the loop would replay the same random stream for every bucket).
    random.seed(420)

    for i in range(len(range_buckets)):
        bucket = range_buckets[i]
        frequency = frequencies[i]

        # Split the bucket range into min and max values
        min_range, max_range = bucket

        # Generate 'frequency' random values within the specified range
        random_values = [
            random.randint(min_range, max_range) for _ in range(frequency)
        ]

        # Extend the histogram with the random values
        histogram.extend(random_values)

    # Randomize the order of values in the histogram
    random.shuffle(histogram)

    return histogram


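# E.g. generate_histogram([(1, 5), (6, 10)], [2, 3]) yields a shuffled list of
# five values: two drawn from 1-5 and three drawn from 6-10.

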
def get_token_list_from_histogram(histogram_key):

    histogram_buckets = {
        "example_ip": [(151, 175), (176, 200), (201, 225), (226, 250),
                       (251, 275)],
        "example_op": [(6, 10), (11, 15), (16, 20), (21, 25), (26, 30)]
    }
    histogram_freq = {
        "example_ip": [220, 225, 150, 150, 140],
        "example_op": [76, 210, 174, 130, 152]
    }

    range_buckets = histogram_buckets[histogram_key]
    freqs = histogram_freq[histogram_key]
    assert len(range_buckets) == len(freqs)

    return generate_histogram(range_buckets, freqs)


def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs):
    if delay_dist == "constant":
        delays = [mean_time_bet_reqs] * num_reqs
    elif delay_dist == "exponential_dist":
        delays = get_exponential_dist_delays(mean_time_bet_reqs, num_reqs)
    else:
        raise ValueError(f"Unknown delay distribution: {delay_dist}")

    return delays


def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs):
    # set seed for determinism
    np.random.seed(420)
    return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist()


def get_norm_dist_tokens(mean, stdev, num_reqs):
    # set seed for determinism
    np.random.seed(420)
    numbers_list = np.random.normal(loc=mean, scale=stdev,
                                    size=num_reqs).tolist()
    return [max(1, math.ceil(x)) for x in numbers_list]


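# Exponential inter-arrival delays make the request stream a Poisson process
# with rate 1 / mean_time_bet_reqs; e.g.
#   get_list_of_delays("exponential_dist", 0.1, 100)
# models roughly 10 requests/sec on average.

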
def gen_random_start_ids(ip_lens):
    input_start_ids = []
    for ip_len in ip_lens:
        start_ids = list(
            np.random.randint(low=0,
                              high=np.iinfo(np.int32).max,
                              size=ip_len,
                              dtype=np.int32))
        input_start_ids.append(np.array([start_ids]))

    return input_start_ids


def get_inflight_reqs_profile(start_times, end_times, requests_per_sec):
    """
    Receives start and end times of all requests, divides the total E2E time
    into equal intervals, and counts how many requests are in flight in each
    interval.
    """
    # Calculate min of start time and max of end time
    min_start_time = min(start_times)
    max_end_time = max(end_times)

    # Need enough resolution relative to the average per-request latency:
    # intervals are 10x smaller than the mean request processing time.
    sec_per_request = 1.0 / requests_per_sec
    NUM_INTERVALS = int((max_end_time - min_start_time) /
                        timedelta(seconds=(sec_per_request / 10)))

    # Calculate interval length
    interval_length = (max_end_time - min_start_time) / NUM_INTERVALS

    # Initialize a list to store the count of requests in each interval
    interval_counts = [0] * NUM_INTERVALS

    # Iterate through the requests and update interval counts
    for i in range(len(start_times)):
        start = start_times[i]
        end = end_times[i]

        # Calculate which interval the request falls into
        interval_index = int((start - min_start_time) / interval_length)

        # Increment the count for that interval and subsequent intervals
        # until the request ends.
        while start < end and interval_index < NUM_INTERVALS:
            interval_counts[interval_index] += 1
            interval_index += 1
            start += interval_length

    return interval_counts


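# Sketch: two requests spanning [t0, t0+2s] and [t0+1s, t0+3s] at
# requests_per_sec=1 produce interval counts of 1 at the edges, rising to 2
# during the overlapping middle second.

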
def extract_print_stats(ip_token_len_list, responses, user_data, FLAGS):

    #### Gather info about requests
    op_token_len_list = []
    op_token_len_ooo = {}

    for response in responses:
        # Extract the output length from the response JSON. Responses arrive
        # out of order, so key them by request id first.
        op_token_len_ooo[response.get_response(as_json=True)['id']] = \
            int(response.get_response(as_json=True)['outputs'][0]['shape'][2])

    op_token_len_list = [
        value for key, value in sorted(op_token_len_ooo.items())
    ]

    assert len(op_token_len_list) == len(ip_token_len_list)
    if not FLAGS.exclude_input_in_output:
        for i in range(len(op_token_len_list)):
            op_token_len_list[i] = op_token_len_list[i] - ip_token_len_list[i]

    # Get latencies per request, ordered by issue order.
    latency_list_in_order = [
        value for key, value in sorted(user_data._latency_dict.items())
    ]
    start_time_list_in_order = [
        value for key, value in sorted(user_data._start_time_dict.items())
    ]
    stop_time_list_in_order = [
        value for key, value in sorted(user_data._stop_time_dict.items())
    ]

    latency_sorted = np.sort(latency_list_in_order)
    index_99 = math.ceil(len(latency_sorted) * 0.99)
    index_90 = math.ceil(len(latency_sorted) * 0.90)

    data = {
        'latency': latency_list_in_order,
        'start_time': start_time_list_in_order,
        'stop_time': stop_time_list_in_order,
        'num_ip_tokens': ip_token_len_list,
        'num_op_tokens': op_token_len_list
    }

    # Bundle everything in a single DataFrame
    df = pd.DataFrame(data)

    # Stats
    avg_ip_tokens = df['num_ip_tokens'].mean()
    total_op_tokens = df['num_op_tokens'].sum()
    avg_op_tokens = df['num_op_tokens'].mean()

    tend = max(df['stop_time'].tolist())
    t0 = min(df['start_time'].tolist())
    total_latency = (tend - t0).total_seconds()
    requests_per_sec = len(responses) / total_latency
    tokens_generated_per_sec = total_op_tokens / total_latency

    # Placeholder: in-flight request profiling is not computed here.
    avg_in_flight_requests = 0

    print_data_dict = {}
    print_data_dict["Requests/Sec"] = requests_per_sec
    print_data_dict["OP tokens/sec"] = tokens_generated_per_sec
    print_data_dict["Avg. latency (ms)"] = np.mean(latency_list_in_order)
    print_data_dict["P99 latency (ms)"] = latency_sorted[index_99 - 1]
    print_data_dict["P90 latency (ms)"] = latency_sorted[index_90 - 1]
    print_data_dict["Avg. Input tokens per request"] = avg_ip_tokens
    print_data_dict["Avg. Output tokens per request"] = avg_op_tokens
    print_data_dict["Avg. InFlight requests"] = avg_in_flight_requests
    print_data_dict["Total latency (ms)"] = total_latency * 1000
    print_data_dict["Total requests"] = len(responses)

    print_data = [["Requests/Sec", requests_per_sec],
                  ["OP tokens/sec", tokens_generated_per_sec],
                  ["Avg. latency (ms)",
                   np.mean(latency_list_in_order)],
                  ["P99 latency (ms)", latency_sorted[index_99 - 1]],
                  ["P90 latency (ms)", latency_sorted[index_90 - 1]],
                  ["Avg. IP tokens per request", avg_ip_tokens],
                  ["Avg. OP tokens per request", avg_op_tokens],
                  ["Avg. InFlight requests", avg_in_flight_requests],
                  ["Total latency (ms)", total_latency * 1000],
                  ["Total requests", len(responses)]]

    # Format numerical values to 2 decimal places
    formatted_data = [[item, f"{value:.2f}"] for item, value in print_data]
    headers = ["Stat", "Value"]
    table = tabulate(formatted_data, headers=headers, tablefmt="pretty")

    if FLAGS.op_stats_csv is not None:
        with open(FLAGS.op_stats_csv, "a", newline="") as file:
            fieldnames = print_data_dict.keys()
            writer = csv.DictWriter(file, fieldnames=fieldnames)

            # Check if the file is empty, and write the header if needed
            if file.tell() == 0:
                writer.writeheader()

            # Write the dictionaries as new rows
            writer.writerow(print_data_dict)

    print(table)

    if FLAGS.dump_perfetto_trace:
        json_dict = []
        for i in range(len(op_token_len_list)):
            req_dict = {}
            req_dict['name'] = 'req_{}'.format(i)
            req_dict["cat"] = "batch"
            req_dict["ph"] = "X"
            req_dict["ts"] = (start_time_list_in_order[i].timestamp() -
                              t0.timestamp()) * 1000000  # perfetto expects us
            req_dict["dur"] = (
                stop_time_list_in_order[i] -
                start_time_list_in_order[i]).total_seconds() * 1000000
            req_dict["pid"] = "1"
            req_dict["args"] = {
                "isl": int(ip_token_len_list[i]),
                "osl": int(op_token_len_list[i])
            }
            json_dict.append(req_dict)

        with open("perfetto_dump.json", "w") as file:
            json.dump(json_dict, file, indent=4)

    return print_data_dict


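# The trace dump above uses the Chrome trace-event format: "ph" == "X" marks a
# complete event with microsecond "ts"/"dur", so the file can be loaded
# directly in the Perfetto UI to visualize request overlap.

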
def extract_string_from_nested_list(nested_list):
    """Depth-first search for the first non-empty string in a nested list."""
    if isinstance(nested_list, str):
        return nested_list
    elif isinstance(nested_list, list):
        for item in nested_list:
            extracted_string = extract_string_from_nested_list(item)
            if extracted_string:
                return extracted_string
    return ""
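
# E.g. extract_string_from_nested_list([[], ["", ["hello"]], "world"])
# returns "hello".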