[None][chore] Print memory usage before/after accuracy test in CI (#11155)

Signed-off-by: Taylor Yeonbok Lee <249374542+taylor-yb-lee@users.noreply.github.com>
Taylor Yeonbok Lee 2026-02-02 21:23:14 -08:00 committed by GitHub
parent 12b4ebd0ad
commit 304dc6f3c0

@@ -14,6 +14,7 @@
# limitations under the License.
import pytest
import torch
from defs.conftest import skip_pre_blackwell
from test_common.llm_data import hf_id_to_local_model_dir, llm_models_root
@@ -25,6 +26,34 @@ from ..conftest import get_device_count, llm_models_root
from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
def print_memory_usage(label: str):
"""Print detailed memory usage for all CUDA devices."""
print(f"\n{'=' * 60}")
print(f"Memory Usage: {label}")
print(f"{'=' * 60}")
num_devices = torch.cuda.device_count()
for device_id in range(num_devices):
allocated = torch.cuda.memory_allocated(device_id) / 1024**3
reserved = torch.cuda.memory_reserved(device_id) / 1024**3
peak_allocated = torch.cuda.max_memory_allocated(device_id) / 1024**3
peak_reserved = torch.cuda.max_memory_reserved(device_id) / 1024**3
cached_available = reserved - allocated # Available in PyTorch's cache
free, total = torch.cuda.mem_get_info(device_id)
free_gb = free / 1024**3
total_gb = total / 1024**3
used_gb = total_gb - free_gb
print(f" Device {device_id}:")
print(f" Allocated: {allocated:.2f} GB")
print(f" Reserved: {reserved:.2f} GB")
print(f" Peak Allocated: {peak_allocated:.2f} GB")
print(f" Peak Reserved: {peak_reserved:.2f} GB")
print(
f" Available: {cached_available:.2f} GB (in PyTorch cache) | {free_gb:.2f} GB (on GPU)"
)
print(f" GPU Total: {used_gb:.2f} / {total_gb:.2f} GB")
print(f"{'=' * 60}\n")
class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
MODEL_NAME = "meta-llama/Llama-3.1-8B"
MODEL_PATH = hf_id_to_local_model_dir(MODEL_NAME)
@@ -310,6 +339,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
def test_bf16(self):
kwargs = self.get_default_kwargs()
sampling_params = self.get_default_sampling_params()
print_memory_usage("Before evaluation")
with AutoDeployLLM(model=self.MODEL_PATH_BF16,
tokenizer=self.MODEL_PATH_BF16,
world_size=4,
@@ -318,6 +348,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
task.evaluate(llm, sampling_params=sampling_params)
task = GSM8K(self.MODEL_NAME)
task.evaluate(llm)
print_memory_usage("After evaluation")
@pytest.mark.skip("Skipping FP8 test until it is supported")
@pytest.mark.skip_less_device_memory(180000)