import unittest

import tensorrt_llm as tllm
import tensorrt_llm.profiler as profiler

import psutil  # isort:skip


def create_model():
    '''Lots of parameters are created here, so host memory usage increases.'''
    profiler.print_memory_usage('Before creating Module')

    config = {
        'architecture': "LlamaForCausalLM",
        'dtype': 'float32',
        'num_hidden_layers': 2,
        'num_attention_heads': 80,
        'hidden_size': 12800,
        'num_key_value_heads': 80,
        'vocab_size': 50000,
        'position_embedding_type': 'rope_gpt_neox',
        'max_position_embeddings': 2048,
        'hidden_act': 'silu'
    }
    config = tllm.models.PretrainedConfig.from_dict(config)

    # Roughly a 24 GiB model: big enough that a leak stands out from noise
    # (avoiding false positives), yet small enough that a single-GPU CI
    # machine can run it.
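    # Back-of-envelope for that figure (assuming intermediate_size defaults to
    # roughly 4 * hidden_size): embedding + lm_head are about
    # 2 * 50000 * 12800 ~= 1.3e9 params; each of the 2 layers adds roughly
    # 4*h^2 (attention) + 12*h^2 (gated MLP) ~= 2.6e9 params with h = 12800,
    # so ~6.5e9 params * 4 bytes (float32) ~= 24 GiB.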
    model = tllm.models.LLaMAForCausalLM.from_config(config)
    profiler.print_memory_usage('After creating Module')
    return model


def create_optimize_network():
    builder = tllm.Builder()
    model = create_model()
    network = builder.create_network()
    network.plugin_config.gpt_attention_plugin = 'float16'
    profiler.print_memory_usage('Before creating Network')
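    # Note (my reading of the API, not stated in this file): net_guard makes
    # `network` the active default while tracing, so the layers created by
    # model(**inputs) below are recorded into this network.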
    with tllm.net_guard(network):
        # Forward
        inputs = model.prepare_inputs(max_batch_size=1,
                                      max_input_len=1024,
                                      max_seq_len=1024 + 32,
                                      max_num_tokens=1024,
                                      use_cache=True,
                                      max_beam_width=1)
        model(**inputs)
    profiler.print_memory_usage('After creating Network')

    # When the Network contains a gpt attention plugin layer, graph-rewriting
    # pattern matching is triggered: Network._get_graph_impl is called and an
    # lru_cache entry is created that caches this Network object, and with it
    # the ndarrays registered inside the Network. Those objects are destroyed
    # only when the cache is full or the program ends.
    tllm.graph_rewriting.optimize(network)


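# A minimal sketch (hypothetical; not used by the test) of the lru_cache
# behavior described above: functools.lru_cache keeps strong references to its
# keys and values, so cached objects stay alive until they are evicted or the
# process exits.
def _lru_cache_pins_objects_demo():
    import functools

    @functools.lru_cache(maxsize=2)
    def build_graph(network_id):
        # Stand-in for a Graph holding large ndarrays; the returned list stays
        # referenced by the cache even after the caller drops its reference.
        return [0.0] * 1_000_000

    build_graph(0)
    # The ~8 MB list above is still reachable from the cache here, and remains
    # so until two newer entries evict it.

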
def run():
    # Create a TRT builder first to warm up memory and avoid noise in the leak
    # detection; builder creation instantiates global objects such as kernels.
    _ = tllm.Builder()

    used, _, _ = profiler.host_memory_info()
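    # For reference, a roughly comparable baseline could be taken with the
    # psutil import above (assuming host_memory_info reports process-level
    # usage akin to RSS):
    #   used = psutil.Process().memory_info().rss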

    for i in range(5):
        # Ideally, all memory used inside create_optimize_network is released
        # once the function returns.
        profiler.print_memory_usage(f'create_optimize_network {i} started')
        create_optimize_network()
        profiler.print_memory_usage(f'create_optimize_network {i} returned')

    used_after, _, _ = profiler.host_memory_info()
    mem_increase_in_gb = (used_after - used) / (1024**3)
    # The model alone is well over 10 GiB, so a real leak would increase
    # memory by far more than 1 GiB.
    assert mem_increase_in_gb < 1, f"Memory increased by {mem_increase_in_gb:.2f} GiB"


class TestHostMemLeak(unittest.TestCase):

    def test_host_mem_leak(self):
        tllm.logger.set_level('info')
        run()


if __name__ == '__main__':
    unittest.main()