import unittest

import tensorrt_llm as tllm
import tensorrt_llm.profiler as profiler
import psutil  # isort:skip


def create_model():
    """Build a large LLaMA module so a host-memory leak is easy to spot.

    Lots of parameters are created here, and thus memory increases.
    """
    profiler.print_memory_usage('Before creating Module')
    cfg = tllm.models.PretrainedConfig.from_dict({
        'architecture': "LlamaForCausalLM",
        'dtype': 'float32',
        'num_hidden_layers': 2,
        'num_attention_heads': 80,
        'hidden_size': 12800,
        'num_key_value_heads': 80,
        'vocab_size': 50000,
        'position_embedding_type': 'rope_gpt_neox',
        'max_position_embeddings': 2048,
        'hidden_act': 'silu',
    })
    # About 24GiB model size: big enough to detect a leak and avoid noise and
    # false positives, and small enough that a CI single-gpu machine can run it.
    module = tllm.models.LLaMAForCausalLM.from_config(cfg)
    profiler.print_memory_usage('After creating Module')
    return module


def create_optimize_network():
    """Trace the model into a TRT-LLM Network and run graph rewriting on it."""
    builder = tllm.Builder()
    module = create_model()

    net = builder.create_network()
    net.plugin_config.gpt_attention_plugin = 'float16'
    net.plugin_config.paged_kv_cache = True

    profiler.print_memory_usage('Before creating Network')
    with tllm.net_guard(net):
        # Forward
        net_inputs = module.prepare_inputs(max_batch_size=1,
                                           max_input_len=1024,
                                           max_seq_len=1024 + 32,
                                           max_num_tokens=1024,
                                           use_cache=True,
                                           max_beam_width=1)
        module(**net_inputs)
    profiler.print_memory_usage('After creating Network')

    # When the Network has a gpt attention plugin layer, graph-rewriting
    # pattern matching is triggered, so Network._get_graph_impl is called and
    # an lru_cache caches this Network object together with the ndarrays
    # registered inside it; those objects are destroyed only when the cache is
    # full or the program ends.
    tllm.graph_rewriting.optimize(net)


def run():
    """Repeatedly build/optimize a network and assert host memory stays flat."""
    # Create a TRT builder to warm up the memory, and avoid the noise of leak
    # detection. Builder creation will create global objects like kernels.
    _ = tllm.Builder()

    used, _, _ = profiler.host_memory_info()
    for i in range(5):
        # Ideally every byte allocated inside create_optimize_network is
        # released again once the function returns.
        profiler.print_memory_usage(f'create_optimize_network {i} started')
        create_optimize_network()
        profiler.print_memory_usage(f'create_optimize_network {i} returned')

    used_after, _, _ = profiler.host_memory_info()
    mem_increase_in_gb = (used_after - used) / (1024**3)
    # The model has more than 10GB, so if there is a leak it will be
    # absolutely bigger than 1GB.
    assert mem_increase_in_gb < 1, f"Memory increased {mem_increase_in_gb} GB"


class TestHostMemLeak(unittest.TestCase):

    def test_host_mem_leak(self):
        tllm.logger.set_level('info')
        run()


if __name__ == '__main__':
    unittest.main()