TensorRT-LLM/tensorrt_llm/profiler.py

# SPDX-FileCopyrightText: Copyright (c) 2022-2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import time
import traceback
from functools import partial
from typing import Literal, Optional, Tuple, Union

import tensorrt as trt
import torch

try:
    import psutil
except ImportError:
    psutil = None
try:
    import pynvml
except ImportError:
    pynvml = None

from tensorrt_llm.builder import _is_building
from tensorrt_llm.logger import logger

class Timer:
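    """Wall-clock timer that accumulates elapsed seconds per string tag.

    A minimal usage sketch:

        timer = Timer()
        timer.start('build')
        ...                    # timed work
        timer.stop('build')    # returns the elapsed seconds of this call
        timer.summary()        # logs the accumulated time of every tag
    """
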
def __init__(self):
self._start_times = {}
self._total_elapsed_times = {}
def start(self, tag):
self._start_times[tag] = time.time()
def stop(self, tag) -> float:
elapsed_time = time.time() - self._start_times[tag]
if tag not in self._total_elapsed_times:
self._total_elapsed_times[tag] = 0
self._total_elapsed_times[tag] += elapsed_time
return elapsed_time
    def elapsed_time_in_sec(self, tag) -> Optional[float]:
        if tag not in self._total_elapsed_times:
            return None
        return self._total_elapsed_times[tag]
def reset(self):
self._start_times.clear()
self._total_elapsed_times.clear()
def summary(self):
logger.info('Profile Results')
for tag, elapsed_time in self._total_elapsed_times.items():
logger.info(f' - {tag.ljust(30, ".")}: {elapsed_time:.6f} (sec)')
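

# Module-level convenience wrappers around a single shared Timer instance.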
_default_timer = Timer()
def start(tag):
_default_timer.start(tag)
def stop(tag):
return _default_timer.stop(tag)
def elapsed_time_in_sec(tag):
return _default_timer.elapsed_time_in_sec(tag)
def reset():
_default_timer.reset()
def summary():
_default_timer.summary()
_pynvml_initialized = False
def initialize_pynvml():
global _pynvml_initialized
if pynvml is not None and not _pynvml_initialized:
pynvml.nvmlInit()
_pynvml_initialized = True
def finalize_pynvml():
global _pynvml_initialized
if pynvml is not None and _pynvml_initialized:
        pynvml.nvmlShutdown()
_pynvml_initialized = False
class MemoryMonitor:
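    """Reports host (via psutil) and device (via pynvml) memory usage and
    tracks the peak allocated bytes observed on each."""
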
TAG = '[MemUsage]'
UnitType = Literal['GiB', 'MiB', 'KiB']
units = {'GiB': 1 << 30, 'MiB': 1 << 20, 'KiB': 1 << 10}
# For convenience.
    _rename_map = {'GB': 'GiB', 'MB': 'MiB', 'KB': 'KiB'}
_maybe_warned = False
def __init__(self):
# bytes
self._peak_host_memory = 0
self._peak_device_memory = 0
self._check_required_packages()
self.device_handles = {}
initialize_pynvml()
        # Compare version components numerically; a plain lexicographic
        # string comparison would misorder versions such as '9.0.0'.
        pynvml_version = tuple(
            int(v) for v in pynvml.__version__.split('.')[:2])
        if pynvml_version < (11, 5):
            logger.warning(f'Found pynvml=={pynvml.__version__}. Please use '
                           f'pynvml>=11.5.0 to get accurate memory usage.')
            # Support legacy pynvml. Note that an old API could return
            # wrong GPU memory usage.
            self._device_mem_fn = pynvml.nvmlDeviceGetMemoryInfo
        else:
            self._device_mem_fn = partial(pynvml.nvmlDeviceGetMemoryInfo,
                                          version=pynvml.nvmlMemory_v2)
    @classmethod
    def _check_required_packages(cls):
        # Warn at most once per process.
        if cls._maybe_warned:
            return
        if psutil is None:
            logger.warning(
                "A required package 'psutil' is not installed. Will not "
                "monitor the host memory usages. Please install the package "
                "first, e.g., 'pip install psutil'.")
        if pynvml is None:
            logger.warning(
                "A required package 'pynvml' is not installed. Will not "
                "monitor the device memory usages. Please install the "
                "package first, e.g., 'pip install pynvml>=11.5.0'.")
        cls._maybe_warned = True
    def host_memory_info(self) -> Tuple[int, int, int]:
        """Return (allocated, free, total) host memory in bytes."""
process = psutil.Process()
# USS reports the amount of memory that would be freed if the process
# was terminated right now.
# https://psutil.readthedocs.io/en/latest/index.html#psutil.Process.memory_full_info
vmem = psutil.virtual_memory()
total_mem = vmem.total
free_mem = vmem.available
alloc_mem = process.memory_full_info().uss
if alloc_mem > self._peak_host_memory:
self._peak_host_memory = alloc_mem
return alloc_mem, free_mem, total_mem
    def device_memory_info(
        self,
        device: Optional[Union[torch.device, int]] = None,
    ) -> Tuple[int, int, int]:
        """Return (used, free, total) device memory in bytes."""
if device is None:
device = torch.cuda.current_device()
index = device.index if isinstance(device, torch.device) else device
if index not in self.device_handles:
handle = pynvml.nvmlDeviceGetHandleByIndex(index)
self.device_handles[index] = handle
        mem_info = self._device_mem_fn(self.device_handles[index])
if mem_info.used > self._peak_device_memory:
self._peak_device_memory = mem_info.used
return mem_info.used, mem_info.free, mem_info.total
    @staticmethod
    def _normalize_unit_name(unit: str):
        # Rename GB -> GiB, MB -> MiB, KB -> KiB.
        return {'GB': 'GiB', 'MB': 'MiB', 'KB': 'KiB'}[unit]
@classmethod
def _format(cls, mem_bytes: int, unit: UnitType) -> str:
if unit not in cls.units:
unit = cls._rename_map[unit]
mem_usage = float(mem_bytes) / cls.units[unit]
return f'{mem_usage:.4f} ({unit})'
@classmethod
def _print_message(cls, msg: str, tag: Optional[str] = None):
if tag:
msg = f'{tag} - {msg}'
logger.info(f'{cls.TAG} {msg}')
def print_host_memory_usage(self,
tag: Optional[str] = None,
unit: UnitType = 'GiB'):
if psutil is None:
return
alloc_mem, _, _ = self.host_memory_info()
msg = f'Allocated Host Memory {self._format(alloc_mem, unit)}'
self._print_message(msg, tag)
def print_device_memory_usage(
self,
tag: Optional[str] = None,
        unit: UnitType = 'GiB',
device: Optional[Union[torch.device, int]] = None,
):
alloc_mem, _, _ = self.device_memory_info(device)
msg = f'Allocated Device Memory {self._format(alloc_mem, unit)}'
self._print_message(msg, tag)
def print_memory_usage(
self,
tag: Optional[str] = None,
unit: UnitType = 'GiB',
device: Optional[Union[torch.device, int]] = None,
):
alloc_host_mem, _, _ = self.host_memory_info()
alloc_device_mem, _, _ = self.device_memory_info(device=device)
msg = f'Allocated Memory: Host {self._format(alloc_host_mem, unit)} '\
f'Device {self._format(alloc_device_mem, unit)}'
self._print_message(msg, tag)
def print_peak_memory_usage(self, unit: UnitType = 'GiB'):
self._print_message(
f'Peak Memory Usage: '
f'Host {self._format(self._peak_host_memory, unit)} '
f'Device {self._format(self._peak_device_memory, unit)}')
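

# The default monitor (used by the module-level helpers below) is created
# only when both psutil and pynvml are available; otherwise the helpers
# silently no-op.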
if psutil is not None and pynvml is not None:
_default_memory_monitor = MemoryMonitor()
else:
_default_memory_monitor = None
def host_memory_info():
if _default_memory_monitor is not None:
return _default_memory_monitor.host_memory_info()
def device_memory_info(device: Optional[Union[torch.device, int]] = None):
if _default_memory_monitor is not None:
return _default_memory_monitor.device_memory_info(device)
def print_host_memory_usage(tag: Optional[str] = None,
unit: MemoryMonitor.UnitType = 'GiB'):
if _default_memory_monitor is not None:
_default_memory_monitor.print_host_memory_usage(tag=tag, unit=unit)
def print_device_memory_usage(tag: Optional[str] = None,
unit: MemoryMonitor.UnitType = 'GiB'):
if _default_memory_monitor is not None:
_default_memory_monitor.print_device_memory_usage(tag=tag, unit=unit)
def print_memory_usage(tag: Optional[str] = None,
unit: MemoryMonitor.UnitType = 'GiB'):
if _default_memory_monitor is not None:
_default_memory_monitor.print_memory_usage(tag=tag, unit=unit)
def print_peak_memory_usage(unit: MemoryMonitor.UnitType = 'GiB'):
if _default_memory_monitor is not None:
_default_memory_monitor.print_peak_memory_usage(unit=unit)
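

# Build-time helper: the _is_building decorator (from tensorrt_llm.builder)
# marks this function as part of the engine-building flow.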
@_is_building
def check_gpt_mem_usage(engine, kv_dtype, use_gpt_attention_plugin,
paged_kv_cache, max_batch_size, max_beam_width,
max_input_len, max_output_len, local_num_kv_heads,
head_size, num_layers):
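    # Estimate the peak runtime GPU memory as the sum of the engine's
    # activation memory, its weights, and the max-shape KV cache, then
    # warn if the estimate exceeds the total device memory.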
    # Deserialize the engine to query its activation memory size.
    runtime = trt.Runtime(logger.trt_logger)
activation_size = 0
try:
cuda_engine = runtime.deserialize_cuda_engine(engine)
assert cuda_engine is not None
activation_size = cuda_engine.device_memory_size / 1024 / 1024
del cuda_engine
    except Exception:
        logger.warning(
            f'Exception when deserializing engine: {traceback.format_exc()}')
        logger.warning('Activation memory size will be regarded as 0.')
logger.info(f'Activation memory size: {activation_size:.2f} MiB')
weights_size = engine.nbytes / 1024 / 1024
logger.info(f'Weights memory size: {weights_size:.2f} MiB')
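    # KV cache bytes = batch * beam * 2 (K and V) * local KV heads
    #                  * (input + output length) * head size * layers
    #                  * bytes per element.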
kv_cache_size = max_batch_size * max_beam_width * 2 * local_num_kv_heads * (
max_input_len +
max_output_len) * (head_size) * num_layers * kv_dtype.itemsize
    # Without the GPT attention plugin, two sets of KV cache buffers are
    # needed: one for inputs and the other for outputs.
    if not use_gpt_attention_plugin:
        kv_cache_size *= 2
kv_cache_size = kv_cache_size / 1024 / 1024
logger.info(f'Max KV Cache memory size: {kv_cache_size:.2f} MiB')
est_memory_size = activation_size + weights_size + kv_cache_size
logger.info(
f'Estimated max memory usage on runtime: {est_memory_size:.2f} MiB')
    _, _, total_mem = device_memory_info(torch.cuda.current_device())
    total_mem = total_mem / 1024 / 1024
    if est_memory_size > total_mem:
        logger.warning(
            f'Engine is successfully built, but GPU memory ({total_mem:.2f} '
            f'MiB) may not be enough when running inference on max shape.')
        if paged_kv_cache:
            logger.warning(
                'Since paged_kv_cache is enabled, the max KV cache memory '
                'size is an estimate for the most extreme case; most runs '
                'are unlikely to hit OOM.')
        else:
            logger.warning(
                'Enabling `--paged_kv_cache` could help reduce the GPU '
                'memory usage at runtime.')
return est_memory_size