TensorRT-LLMs/tests/test_leak.py

import unittest

import tensorrt_llm as tllm
import tensorrt_llm.profiler as profiler

import psutil  # isort:skip


def create_model():
    '''Lots of parameters are created here, and thus memory usage increases.'''
    profiler.print_memory_usage('Before creating Module')
    config = {
        'architecture': "LlamaForCausalLM",
        'dtype': 'float32',
        'num_hidden_layers': 2,
        'num_attention_heads': 80,
        'hidden_size': 12800,
        'num_key_value_heads': 80,
        'vocab_size': 50000,
        'position_embedding_type': 'rope_gpt_neox',
        'max_position_embeddings': 2048,
        'hidden_act': 'silu'
    }
    config = tllm.models.PretrainedConfig.from_dict(config)
    # The model is about 24 GiB: big enough to detect a leak without noise or
    # false positives, yet small enough for a single-GPU CI machine to run.
    model = tllm.models.LLaMAForCausalLM.from_config(config)
    profiler.print_memory_usage('After creating Module')
    return model


def create_optimize_network():
    builder = tllm.Builder()
    model = create_model()
    network = builder.create_network()
    network.plugin_config.gpt_attention_plugin = 'float16'
    profiler.print_memory_usage('Before creating Network')
    with tllm.net_guard(network):
        # Forward
        inputs = model.prepare_inputs(max_batch_size=1,
                                      max_input_len=1024,
                                      max_seq_len=1024 + 32,
                                      max_num_tokens=1024,
                                      use_cache=True,
                                      max_beam_width=1)
        model(**inputs)
    profiler.print_memory_usage('After creating Network')
    # When the Network contains a GPT attention plugin layer, graph-rewriting
    # pattern matching is triggered, so Network._get_graph_impl is called and an
    # lru_cache is created that caches this Network object together with the
    # ndarrays registered inside it. These objects are destroyed only when the
    # cache is full or the program ends.
    tllm.graph_rewriting.optimize(network)
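

# The comment above describes cache-induced retention rather than a true leak.
# A minimal, TensorRT-agnostic sketch of the same effect using
# functools.lru_cache: the returned buffer stays reachable through the cache
# even after the caller drops its reference.
import functools


@functools.lru_cache(maxsize=16)
def _make_buffer(key):
    return bytearray(1024 * 1024)  # 1 MiB, held alive by the cache per key


# _make_buffer(0) retains 1 MiB until eviction or _make_buffer.cache_clear().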


def run():
    # Create a TRT builder first to warm up the memory and reduce noise in the
    # leak detection; builder creation instantiates global objects such as
    # kernels.
    _ = tllm.Builder()
    used, _, _ = profiler.host_memory_info()
    for i in range(5):
        # Ideally, all memory used inside create_optimize_network is released
        # after the function returns.
        profiler.print_memory_usage(f'create_optimize_network {i} started')
        create_optimize_network()
        profiler.print_memory_usage(f'create_optimize_network {i} returned')
    used_after, _, _ = profiler.host_memory_info()
    mem_increase_in_gb = (used_after - used) / (1024**3)
    # The model alone is well over 10 GB, so a real leak across five iterations
    # would far exceed the 1 GB threshold.
    assert mem_increase_in_gb < 1, f"Memory increased {mem_increase_in_gb} GB"
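

# As a cross-check of profiler.host_memory_info(), host RSS can also be read
# directly through psutil (imported above); a minimal sketch, assuming the
# profiler reports bytes the same way psutil does:
def _rss_bytes():
    # Resident set size of the current process, in bytes.
    return psutil.Process().memory_info().rss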


class TestHostMemLeak(unittest.TestCase):

    def test_host_mem_leak(self):
        tllm.logger.set_level('info')
        run()


if __name__ == '__main__':
    unittest.main()