mirror of
https://github.com/vllm-project/vllm.git
synced 2026-06-06 00:16:14 +00:00
35d0141a0b
Signed-off-by: Divakar Verma <divakar.verma@amd.com>
132 lines
4.8 KiB
Python
132 lines
4.8 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
from unittest.mock import MagicMock, patch
|
|
|
|
import torch
|
|
from vllm_test_utils.monitor import monitor
|
|
|
|
from vllm.utils.mem_utils import MemorySnapshot, memory_profiling
|
|
|
|
from ..utils import create_new_process_for_each_test
|
|
|
|
|
|
@create_new_process_for_each_test()
def test_memory_profiling():
    """Drive ``memory_profiling`` through a scripted series of GPU allocations.

    The scenario fakes model loading plus inference:

    * 512 MiB is allocated through the CUDA runtime before the baseline
      snapshot, standing in for memory used outside of this torch instance.
    * 512 MiB of float32 "weights" are allocated through torch.
    * Inside the profiling context a 1 GiB torch tensor is created and
      dropped (the spike), then 256 MiB more is allocated through the CUDA
      runtime (simulating NCCL).

    The test asserts that the monitored non-torch usage grew by exactly
    256 MiB, that the profiler's ``non_torch_increase`` is within 5% of
    256 MiB, and that ``torch_peak_increase`` is exactly 1 GiB.
    """
    # Memory used by other processes shows up as cuda usage outside of torch,
    # so raw runtime allocations are used to imitate it.
    from vllm.distributed.device_communicators.cuda_wrapper import CudaRTLibrary

    mib = 1024 * 1024
    gib = 1024 * mib
    expected_non_torch_growth = 256 * mib

    cudart = CudaRTLibrary()
    # 512 MiB allocation outside of this instance.
    outside_handle = cudart.cudaMalloc(512 * mib)

    # Warm up PyTorch's CUDA/ROCm context so that its internal initialization
    # overhead (streams, cuBLAS handles, etc.) lands in the baseline instead
    # of inflating the non-torch increase, which is larger on ROCm than on
    # CUDA.
    _warmup = torch.zeros(1, device="cuda")
    del _warmup
    torch.accelerator.empty_cache()

    baseline_snapshot = MemorySnapshot()

    # "Load" the weights: 128 Mi float32 elements.
    weights = torch.randn(128, 1024, 1024, device="cuda", dtype=torch.float32)
    weights_memory = 128 * mib * 4  # 512 MiB

    def non_torch_bytes_in_use():
        # Device memory in use minus what torch has reserved.
        free_bytes, total_bytes = torch.cuda.mem_get_info()
        return (total_bytes - free_bytes) - torch.accelerator.memory_reserved()

    with (
        memory_profiling(
            baseline_snapshot=baseline_snapshot,
            weights_memory=weights_memory,
        ) as result,
        monitor(non_torch_bytes_in_use) as monitored_values,
    ):
        # Make a memory spike of 1 GiB and release it right away.
        spike = torch.randn(256, 1024, 1024, device="cuda", dtype=torch.float32)
        del spike

        # Add some extra non-torch memory, 256 MiB (simulate NCCL).
        nccl_like_handle = cudart.cudaMalloc(256 * mib)

    # This is an analytic value and therefore exact: the only non-torch
    # growth while monitoring was the 256 MiB allocation above.
    samples = monitored_values.values
    assert samples[-1] - samples[0] == expected_non_torch_growth

    # The profiler's figure is only required to be within 5% of the expected
    # value. The tolerance exists because the cuda runtime cannot be
    # controlled at byte granularity, which causes a small error
    # (<10 MiB in practice).
    non_torch_ratio = result.non_torch_increase / expected_non_torch_growth
    assert abs(non_torch_ratio - 1) <= 0.05
    assert result.torch_peak_increase == gib

    # Release everything this test allocated.
    del weights
    cudart.cudaFree(outside_handle)
    cudart.cudaFree(nccl_like_handle)
|
|
|
|
|
|
def test_memory_snapshot_uses_psutil_on_integrated_gpu():
    """On integrated (UMA) GPUs, free_memory should come from psutil."""
    gib = 1024**3
    device_free_bytes = 40 * gib
    device_total_bytes = 120 * gib
    host_available_bytes = 100 * gib

    with (
        patch("vllm.utils.mem_utils.current_platform") as platform_mock,
        patch("vllm.utils.mem_utils.psutil") as psutil_mock,
    ):
        # The platform reports itself as an integrated GPU and returns
        # device-side numbers that differ from the psutil figure, so the
        # assertions below can tell which source was used.
        platform_mock.configure_mock(
            **{
                "mem_get_info.return_value": (
                    device_free_bytes,
                    device_total_bytes,
                ),
                "is_integrated_gpu.return_value": True,
                "memory_stats.return_value": {"allocated_bytes.all.peak": 0},
                "memory_reserved.return_value": 0,
            }
        )
        platform_mock.current_device = lambda: "cuda:0"

        # psutil.virtual_memory() yields an object exposing `.available`.
        psutil_mock.virtual_memory.return_value = MagicMock(
            available=host_available_bytes
        )

        snapshot = MemorySnapshot(device="cuda:0")

        # Free memory must be the psutil value; total still comes from the
        # platform's mem_get_info.
        assert snapshot.free_memory == host_available_bytes
        assert snapshot.total_memory == device_total_bytes
        psutil_mock.virtual_memory.assert_called_once()
|
|
|
|
|
|
def test_memory_snapshot_uses_cuda_on_discrete_gpu():
    """On discrete GPUs, free_memory should come from CUDA mem_get_info."""
    gib = 1024**3
    device_free_bytes = 70 * gib
    device_total_bytes = 80 * gib

    with (
        patch("vllm.utils.mem_utils.current_platform") as platform_mock,
        patch("vllm.utils.mem_utils.psutil") as psutil_mock,
    ):
        # The platform reports itself as a discrete GPU; psutil is patched
        # only so that any call to it can be detected.
        platform_mock.configure_mock(
            **{
                "mem_get_info.return_value": (
                    device_free_bytes,
                    device_total_bytes,
                ),
                "is_integrated_gpu.return_value": False,
                "memory_stats.return_value": {"allocated_bytes.all.peak": 0},
                "memory_reserved.return_value": 0,
            }
        )
        platform_mock.current_device = lambda: "cuda:0"

        snapshot = MemorySnapshot(device="cuda:0")

        # Both figures must be the ones reported by mem_get_info, and host
        # memory must not have been queried at all.
        assert snapshot.free_memory == device_free_bytes
        assert snapshot.total_memory == device_total_bytes
        psutil_mock.virtual_memory.assert_not_called()
|