[None][chore] Print memory usage before/after accuracy test in CI (#11155)

Signed-off-by: Taylor Yeonbok Lee <249374542+taylor-yb-lee@users.noreply.github.com>
Taylor Yeonbok Lee 2026-02-02 21:23:14 -08:00 committed by GitHub
parent 12b4ebd0ad
commit 304dc6f3c0

@@ -14,6 +14,7 @@
# limitations under the License.
import pytest
import torch
from defs.conftest import skip_pre_blackwell
from test_common.llm_data import hf_id_to_local_model_dir, llm_models_root
@@ -25,6 +26,34 @@ from ..conftest import get_device_count, llm_models_root
from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
def print_memory_usage(label: str):
"""Print detailed memory usage for all CUDA devices."""
print(f"\n{'=' * 60}")
print(f"Memory Usage: {label}")
print(f"{'=' * 60}")
num_devices = torch.cuda.device_count()
for device_id in range(num_devices):
allocated = torch.cuda.memory_allocated(device_id) / 1024**3
reserved = torch.cuda.memory_reserved(device_id) / 1024**3
peak_allocated = torch.cuda.max_memory_allocated(device_id) / 1024**3
peak_reserved = torch.cuda.max_memory_reserved(device_id) / 1024**3
cached_available = reserved - allocated # Available in PyTorch's cache
free, total = torch.cuda.mem_get_info(device_id)
free_gb = free / 1024**3
total_gb = total / 1024**3
used_gb = total_gb - free_gb
print(f" Device {device_id}:")
print(f" Allocated: {allocated:.2f} GB")
print(f" Reserved: {reserved:.2f} GB")
print(f" Peak Allocated: {peak_allocated:.2f} GB")
print(f" Peak Reserved: {peak_reserved:.2f} GB")
print(
f" Available: {cached_available:.2f} GB (in PyTorch cache) | {free_gb:.2f} GB (on GPU)"
)
print(f" GPU Total: {used_gb:.2f} / {total_gb:.2f} GB")
print(f"{'=' * 60}\n")
class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.1-8B"
MODEL_PATH = hf_id_to_local_model_dir(MODEL_NAME)
@@ -310,6 +339,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
def test_bf16(self):
kwargs = self.get_default_kwargs()
sampling_params = self.get_default_sampling_params()
print_memory_usage("Before evaluation")
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
tokenizer=self.MODEL_PATH_BF16,
world_size=4,
@@ -318,6 +348,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
task.evaluate(llm, sampling_params=sampling_params)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
print_memory_usage("After evaluation")
@pytest.mark.skip("Skipping FP8 test until it is supported")
@pytest.mark.skip_less_device_memory(180000)