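"""Shared helpers for benchmarking TensorRT-LLM models served through Triton.

Covers client creation (HTTP/gRPC), request tensor preparation, synchronous
and asynchronous request submission, and latency/throughput reporting.
"""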
import csv
import json
import math
import queue
import random
from datetime import timedelta
from functools import partial

import numpy as np
import pandas as pd
import tritonclient.grpc as grpcclient
import tritonclient.http as httpclient
from tabulate import tabulate
from tritonclient.utils import np_to_triton_dtype


class UserData:
    """Per-request bookkeeping shared with the async completion callback."""

    def __init__(self):
        self._completed_requests = queue.Queue()
        self._latencies = []
        self._latency_dict = {}
        self._start_time_dict = {}
        self._stop_time_dict = {}


# Callback function used for async_stream_infer()
def completion_callback(user_data, result, error):
    # Pass both the result and any error out to the caller for handling.
    user_data._completed_requests.put((result, error))


def prepare_tensor(name, input, protocol):
    client_util = httpclient if protocol == "http" else grpcclient
    t = client_util.InferInput(name, input.shape,
                               np_to_triton_dtype(input.dtype))
    t.set_data_from_numpy(input)
    return t


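# Usage sketch (illustrative values, not taken from the benchmark itself):
#   ids = np.array([[1, 2, 3]], dtype=np.int32)
#   tensor = prepare_tensor("input_ids", ids, "grpc")
# yields an InferInput carrying the batch of token ids in the dtype string
# Triton expects ("INT32" here, via np_to_triton_dtype).

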
def prepare_outputs(protocol,
                    return_log_probs=False,
                    return_context_logits=False,
                    return_generation_logits=False,
                    return_finish_reason=False,
                    return_stop_reason=False,
                    return_cumulative_logprob=False):

    client_util = httpclient if protocol == "http" else grpcclient

    outputs = [client_util.InferRequestedOutput("text_output")]

    if return_log_probs:
        outputs.append(client_util.InferRequestedOutput("cum_log_probs"))
        outputs.append(client_util.InferRequestedOutput("output_log_probs"))

    if return_context_logits:
        outputs.append(client_util.InferRequestedOutput("context_logits"))

    if return_generation_logits:
        outputs.append(client_util.InferRequestedOutput("generation_logits"))

    if return_finish_reason:
        outputs.append(client_util.InferRequestedOutput("finish_reason"))

    if return_stop_reason:
        outputs.append(client_util.InferRequestedOutput("stop_reason"))

    if return_cumulative_logprob:
        outputs.append(client_util.InferRequestedOutput("cumulative_logprob"))

    return outputs


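# For example, to also fetch log probabilities over gRPC:
#   outputs = prepare_outputs("grpc", return_log_probs=True)
# requests the "text_output", "cum_log_probs", and "output_log_probs" tensors.

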
def prepare_inputs(input_start_ids, input_len, pad_id, end_id, flags):
    output_len = np.ones([input_start_ids.shape[0], 1]).astype(
        np.int32) * flags.output_len
    runtime_top_k = (flags.topk *
                     np.ones([input_start_ids.shape[0], 1])).astype(np.int32)
    runtime_top_p = flags.topp * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    beam_search_diversity_rate = 0.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    temperature = 1.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    len_penalty = 1.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    repetition_penalty = 1.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    seed = 0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.uint64)
    output_log_probs = True * \
        np.ones([input_start_ids.shape[0], 1]).astype(bool)
    beam_width = (flags.beam_width *
                  np.ones([input_start_ids.shape[0], 1])).astype(np.int32)
    pad_ids = pad_id * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.int32)
    end_ids = end_id * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.int32)
    min_tokens = 1 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.int32)
    presence_penalty = 0.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    frequency_penalty = 0.0 * \
        np.ones([input_start_ids.shape[0], 1]).astype(np.float32)
    bad_words_list = np.concatenate([
        np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32),
        (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)
    ],
                                    axis=1)
    stop_word_list = np.concatenate([
        np.zeros([input_start_ids.shape[0], 1, 1]).astype(np.int32),
        (-1 * np.ones([input_start_ids.shape[0], 1, 1])).astype(np.int32)
    ],
                                    axis=1)
    inputs = [
        prepare_tensor("input_ids", input_start_ids, flags.protocol),
        prepare_tensor("input_lengths", input_len, flags.protocol),
        prepare_tensor("request_output_len", output_len, flags.protocol),
        prepare_tensor("pad_id", pad_ids, flags.protocol),
        prepare_tensor("end_id", end_ids, flags.protocol),
        prepare_tensor("beam_width", beam_width, flags.protocol),
        prepare_tensor("temperature", temperature, flags.protocol),
        prepare_tensor("runtime_top_k", runtime_top_k, flags.protocol),
        prepare_tensor("runtime_top_p", runtime_top_p, flags.protocol),
        prepare_tensor("len_penalty", len_penalty, flags.protocol),
        prepare_tensor("repetition_penalty", repetition_penalty,
                       flags.protocol),
        prepare_tensor("min_tokens", min_tokens, flags.protocol),
        prepare_tensor("presence_penalty", presence_penalty, flags.protocol),
        prepare_tensor("frequency_penalty", frequency_penalty, flags.protocol),
        prepare_tensor("seed", seed, flags.protocol),
        prepare_tensor("output_log_probs", output_log_probs, flags.protocol),
        # prepare_tensor("bad_words_list", bad_words_list, flags.protocol),
        # prepare_tensor("stop_words_list", stop_word_list, flags.protocol),
    ]
    return inputs


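# Note: bad_words_list / stop_word_list above appear to follow the
# [batch, 2, num_words] ids/offsets convention, here encoding an empty word
# list (token id 0 with offset -1 marking an unused slot); they are only sent
# if the commented-out tensors are re-enabled.

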
def create_inference_server_client(protocol, url, concurrency, verbose):
    client_util = httpclient if protocol == "http" else grpcclient
    if protocol == "http":
        # Only the HTTP client takes a client-side concurrency setting.
        return client_util.InferenceServerClient(url,
                                                 concurrency=concurrency,
                                                 verbose=verbose)
    elif protocol == "grpc":
        return client_util.InferenceServerClient(url, verbose=verbose)


def send_requests(model_name, inputs, client, request_parallelism):
    results = []
    for _ in range(request_parallelism):
        result = client.infer(model_name, inputs)
        results.append(result)
    return results


def send_requests_async(model_name, inputs, client, flags, request_parallelism):
    # HTTP returns a list of async request handles; gRPC collects results
    # through a UserData queue populated by completion_callback.
    if flags.protocol == "http":
        async_requests = []
        for _ in range(request_parallelism):
            async_requests.append(client.async_infer(model_name, inputs))
        return async_requests
    else:
        user_data = UserData()
        for _ in range(request_parallelism):
            client.async_infer(model_name, inputs,
                               partial(completion_callback, user_data))
        return user_data


def get_http_results(async_requests):
    results = []
    for async_request in async_requests:
        results.append(async_request.get_result())
    return results


def get_grpc_results(user_data, request_parallelism):
    results = []
    processed_count = 0
    while processed_count < request_parallelism:
        (result, error) = user_data._completed_requests.get()
        processed_count += 1
        if error is not None:
            raise RuntimeError(error)
        results.append(result)
    return results


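# Typical flow (sketch): fire N async requests, then drain them with the
# helper matching the protocol in use:
#   handles = send_requests_async(model_name, inputs, client, flags, 8)
#   results = get_http_results(handles)        # flags.protocol == "http"
#   results = get_grpc_results(handles, 8)     # flags.protocol == "grpc"

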
def append_start_and_end_ids(inputs,
                             batch_size,
                             flags,
                             start_id=None,
                             end_id=None):
    if start_id is not None:
        start_ids = start_id * np.ones([batch_size, 1]).astype(np.int32)
        inputs.append(prepare_tensor("start_id", start_ids, flags.protocol))
    if end_id is not None:
        end_ids = end_id * np.ones([batch_size, 1]).astype(np.int32)
        inputs.append(prepare_tensor("end_id", end_ids, flags.protocol))


def generate_histogram(range_buckets, frequencies):
    histogram = []

    # Seed once for determinism across the whole histogram (re-seeding inside
    # the loop would replay the same random stream for every bucket).
    random.seed(420)

    for i in range(len(range_buckets)):
        bucket = range_buckets[i]
        frequency = frequencies[i]

        # Split the bucket range into min and max values
        min_range, max_range = bucket

        # Generate 'frequency' random values within the specified range
        random_values = [
            random.randint(min_range, max_range) for _ in range(frequency)
        ]

        # Extend the histogram with the random values
        histogram.extend(random_values)

    # Randomize the order of values in the histogram
    random.shuffle(histogram)

    return histogram


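# E.g. generate_histogram([(1, 5), (6, 10)], [2, 3]) yields a shuffled list of
# five values: two drawn from 1-5 and three drawn from 6-10.

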
def get_token_list_from_histogram(histogram_key):

    histogram_buckets = {
        "example_ip": [(151, 175), (176, 200), (201, 225), (226, 250),
                       (251, 275)],
        "example_op": [(6, 10), (11, 15), (16, 20), (21, 25), (26, 30)]
    }
    histogram_freq = {
        "example_ip": [220, 225, 150, 150, 140],
        "example_op": [76, 210, 174, 130, 152]
    }

    range_buckets = histogram_buckets[histogram_key]
    freqs = histogram_freq[histogram_key]
    assert len(range_buckets) == len(freqs)

    return generate_histogram(range_buckets, freqs)


def get_list_of_delays(delay_dist, mean_time_bet_reqs, num_reqs):
    if delay_dist == "constant":
        delays = [mean_time_bet_reqs] * num_reqs
    elif delay_dist == "exponential_dist":
        delays = get_exponential_dist_delays(mean_time_bet_reqs, num_reqs)
    else:
        raise ValueError(f"Unknown delay distribution: {delay_dist}")

    return delays


def get_exponential_dist_delays(mean_time_bet_reqs, num_reqs):
    # set seed for determinism
    np.random.seed(420)
    return np.random.exponential(mean_time_bet_reqs, num_reqs).tolist()


def get_norm_dist_tokens(mean, stdev, num_reqs):
    # set seed for determinism
    np.random.seed(420)
    numbers_list = np.random.normal(loc=mean, scale=stdev,
                                    size=num_reqs).tolist()
    return [max(1, math.ceil(x)) for x in numbers_list]


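# Exponential inter-arrival delays make the request stream a Poisson process
# with rate 1 / mean_time_bet_reqs; e.g.
#   get_list_of_delays("exponential_dist", 0.1, 100)
# models roughly 10 requests/sec on average.

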
def gen_random_start_ids(ip_lens):
    input_start_ids = []
    for ip_len in ip_lens:
        start_ids = list(
            np.random.randint(low=0,
                              high=np.iinfo(np.int32).max,
                              size=ip_len,
                              dtype=np.int32))
        input_start_ids.append(np.array([start_ids]))

    return input_start_ids


def get_inflight_reqs_profile(start_times, end_times, requests_per_sec):
    """
    Receives start and end times of all requests, divides the total E2E time
    into equal intervals, and counts how many requests are in flight in each
    interval.
    """
    # Calculate min of start time and max of end time
    min_start_time = min(start_times)
    max_end_time = max(end_times)

    # Need enough resolution relative to the average per-request latency:
    # intervals are 10x smaller than the mean request processing time.
    sec_per_request = 1.0 / requests_per_sec
    NUM_INTERVALS = int((max_end_time - min_start_time) /
                        timedelta(seconds=(sec_per_request / 10)))

    # Calculate interval length
    interval_length = (max_end_time - min_start_time) / NUM_INTERVALS

    # Initialize a list to store the count of requests in each interval
    interval_counts = [0] * NUM_INTERVALS

    # Iterate through the requests and update interval counts
    for i in range(len(start_times)):
        start = start_times[i]
        end = end_times[i]

        # Calculate which interval the request falls into
        interval_index = int((start - min_start_time) / interval_length)

        # Increment the count for that interval and subsequent intervals
        # until the request ends.
        while start < end and interval_index < NUM_INTERVALS:
            interval_counts[interval_index] += 1
            interval_index += 1
            start += interval_length

    return interval_counts


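# Sketch: two requests spanning [t0, t0+2s] and [t0+1s, t0+3s] at
# requests_per_sec=1 produce interval counts of 1 at the edges, rising to 2
# during the overlapping middle second.

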
def extract_print_stats(ip_token_len_list, responses, user_data, FLAGS):

    #### Gather info about requests
    op_token_len_list = []
    op_token_len_ooo = {}

    for response in responses:
        # Extract the output length from the response JSON. Responses arrive
        # out of order, so key them by request id first.
        op_token_len_ooo[response.get_response(as_json=True)['id']] = \
            int(response.get_response(as_json=True)['outputs'][0]['shape'][2])

    op_token_len_list = [
        value for key, value in sorted(op_token_len_ooo.items())
    ]

    assert len(op_token_len_list) == len(ip_token_len_list)
    if not FLAGS.exclude_input_in_output:
        for i in range(len(op_token_len_list)):
            op_token_len_list[i] = op_token_len_list[i] - ip_token_len_list[i]

    # Get latencies per request, ordered by issue order.
    latency_list_in_order = [
        value for key, value in sorted(user_data._latency_dict.items())
    ]
    start_time_list_in_order = [
        value for key, value in sorted(user_data._start_time_dict.items())
    ]
    stop_time_list_in_order = [
        value for key, value in sorted(user_data._stop_time_dict.items())
    ]

    latency_sorted = np.sort(latency_list_in_order)
    index_99 = math.ceil(len(latency_sorted) * 0.99)
    index_90 = math.ceil(len(latency_sorted) * 0.90)

    data = {
        'latency': latency_list_in_order,
        'start_time': start_time_list_in_order,
        'stop_time': stop_time_list_in_order,
        'num_ip_tokens': ip_token_len_list,
        'num_op_tokens': op_token_len_list
    }

    # Bundle everything in a single DataFrame
    df = pd.DataFrame(data)

    # Stats
    avg_ip_tokens = df['num_ip_tokens'].mean()
    total_op_tokens = df['num_op_tokens'].sum()
    avg_op_tokens = df['num_op_tokens'].mean()

    tend = max(df['stop_time'].tolist())
    t0 = min(df['start_time'].tolist())
    total_latency = (tend - t0).total_seconds()
    requests_per_sec = len(responses) / total_latency
    tokens_generated_per_sec = total_op_tokens / total_latency

    # Placeholder: in-flight request profiling is not computed here.
    avg_in_flight_requests = 0

    print_data_dict = {}
    print_data_dict["Requests/Sec"] = requests_per_sec
    print_data_dict["OP tokens/sec"] = tokens_generated_per_sec
    print_data_dict["Avg. latency (ms)"] = np.mean(latency_list_in_order)
    print_data_dict["P99 latency (ms)"] = latency_sorted[index_99 - 1]
    print_data_dict["P90 latency (ms)"] = latency_sorted[index_90 - 1]
    print_data_dict["Avg. Input tokens per request"] = avg_ip_tokens
    print_data_dict["Avg. Output tokens per request"] = avg_op_tokens
    print_data_dict["Avg. InFlight requests"] = avg_in_flight_requests
    print_data_dict["Total latency (ms)"] = total_latency * 1000
    print_data_dict["Total requests"] = len(responses)

    print_data = [["Requests/Sec", requests_per_sec],
                  ["OP tokens/sec", tokens_generated_per_sec],
                  ["Avg. latency (ms)",
                   np.mean(latency_list_in_order)],
                  ["P99 latency (ms)", latency_sorted[index_99 - 1]],
                  ["P90 latency (ms)", latency_sorted[index_90 - 1]],
                  ["Avg. IP tokens per request", avg_ip_tokens],
                  ["Avg. OP tokens per request", avg_op_tokens],
                  ["Avg. InFlight requests", avg_in_flight_requests],
                  ["Total latency (ms)", total_latency * 1000],
                  ["Total requests", len(responses)]]

    # Format numerical values to 2 decimal places
    formatted_data = [[item, f"{value:.2f}"] for item, value in print_data]
    headers = ["Stat", "Value"]
    table = tabulate(formatted_data, headers=headers, tablefmt="pretty")

    if FLAGS.op_stats_csv is not None:
        with open(FLAGS.op_stats_csv, "a", newline="") as file:
            fieldnames = print_data_dict.keys()
            writer = csv.DictWriter(file, fieldnames=fieldnames)

            # Check if the file is empty, and write the header if needed
            if file.tell() == 0:
                writer.writeheader()

            # Write the dictionaries as new rows
            writer.writerow(print_data_dict)

    print(table)

    if FLAGS.dump_perfetto_trace:
        json_dict = []
        for i in range(len(op_token_len_list)):
            req_dict = {}
            req_dict['name'] = 'req_{}'.format(i)
            req_dict["cat"] = "batch"
            req_dict["ph"] = "X"
            req_dict["ts"] = (start_time_list_in_order[i].timestamp() -
                              t0.timestamp()) * 1000000  # perfetto expects us
            req_dict["dur"] = (
                stop_time_list_in_order[i] -
                start_time_list_in_order[i]).total_seconds() * 1000000
            req_dict["pid"] = "1"
            req_dict["args"] = {
                "isl": int(ip_token_len_list[i]),
                "osl": int(op_token_len_list[i])
            }
            json_dict.append(req_dict)

        with open("perfetto_dump.json", "w") as file:
            json.dump(json_dict, file, indent=4)

    return print_data_dict


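# The trace dump above uses the Chrome trace-event format: "ph" == "X" marks a
# complete event with microsecond "ts"/"dur", so the file can be loaded
# directly in the Perfetto UI to visualize request overlap.

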
def extract_string_from_nested_list(nested_list):
    """Depth-first search for the first non-empty string in a nested list."""
    if isinstance(nested_list, str):
        return nested_list
    elif isinstance(nested_list, list):
        for item in nested_list:
            extracted_string = extract_string_from_nested_list(item)
            if extracted_string:
                return extracted_string
    return ""
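
# E.g. extract_string_from_nested_list([[], ["", ["hello"]], "world"])
# returns "hello".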