diff --git a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
index 1ae49622a0..7c36bac3db 100644
--- a/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
+++ b/tests/integration/defs/accuracy/test_llm_api_autodeploy.py
@@ -14,6 +14,7 @@
 # limitations under the License.
 
 import pytest
+import torch
 
 from defs.conftest import skip_pre_blackwell
 from test_common.llm_data import hf_id_to_local_model_dir, llm_models_root
@@ -25,6 +26,34 @@
 from ..conftest import get_device_count, llm_models_root
 from .accuracy_core import GSM8K, MMLU, CnnDailymail, LlmapiAccuracyTestHarness
 
+def print_memory_usage(label: str):
+    """Print detailed memory usage for all CUDA devices."""
+    print(f"\n{'=' * 60}")
+    print(f"Memory Usage: {label}")
+    print(f"{'=' * 60}")
+    num_devices = torch.cuda.device_count()
+    for device_id in range(num_devices):
+        allocated = torch.cuda.memory_allocated(device_id) / 1024**3
+        reserved = torch.cuda.memory_reserved(device_id) / 1024**3
+        peak_allocated = torch.cuda.max_memory_allocated(device_id) / 1024**3
+        peak_reserved = torch.cuda.max_memory_reserved(device_id) / 1024**3
+        cached_available = reserved - allocated  # Available in PyTorch's cache
+        free, total = torch.cuda.mem_get_info(device_id)
+        free_gb = free / 1024**3
+        total_gb = total / 1024**3
+        used_gb = total_gb - free_gb
+        print(f"  Device {device_id}:")
+        print(f"    Allocated: {allocated:.2f} GB")
+        print(f"    Reserved: {reserved:.2f} GB")
+        print(f"    Peak Allocated: {peak_allocated:.2f} GB")
+        print(f"    Peak Reserved: {peak_reserved:.2f} GB")
+        print(
+            f"    Available: {cached_available:.2f} GB (in PyTorch cache) | {free_gb:.2f} GB (on GPU)"
+        )
+        print(f"    GPU Total: {used_gb:.2f} / {total_gb:.2f} GB")
+    print(f"{'=' * 60}\n")
+
+
 class TestLlama3_1_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-3.1-8B"
     MODEL_PATH = hf_id_to_local_model_dir(MODEL_NAME)
@@ -310,6 +339,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
     def test_bf16(self):
         kwargs = self.get_default_kwargs()
         sampling_params = self.get_default_sampling_params()
+        print_memory_usage("Before evaluation")
         with AutoDeployLLM(model=self.MODEL_PATH_BF16,
                            tokenizer=self.MODEL_PATH_BF16,
                            world_size=4,
@@ -318,6 +348,7 @@ class TestNemotronSuperV3(LlmapiAccuracyTestHarness):
             task.evaluate(llm, sampling_params=sampling_params)
         task = GSM8K(self.MODEL_NAME)
         task.evaluate(llm)
+        print_memory_usage("After evaluation")
 
     @pytest.mark.skip("Skipping FP8 test until it is supported")
     @pytest.mark.skip_less_device_memory(180000)
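
Note on usage (not part of the patch): the peak counters read by print_memory_usage accumulate from process start, so the "After evaluation" peaks also include memory used during model loading. Below is a minimal sketch of how the peaks could be scoped to the evaluation phase alone, assuming that is the intent; reset_peak_stats is a hypothetical helper, not something this patch adds.

import torch

def reset_peak_stats():
    """Reset peak-memory counters on every visible CUDA device (hypothetical helper)."""
    # torch.cuda.reset_peak_memory_stats() zeroes the counters behind
    # max_memory_allocated()/max_memory_reserved() for one device.
    for device_id in range(torch.cuda.device_count()):
        torch.cuda.reset_peak_memory_stats(device_id)

# Hypothetical placement inside test_bf16:
#     print_memory_usage("Before evaluation")
#     reset_peak_stats()
#     ... run evaluation ...
#     print_memory_usage("After evaluation")  # peaks now cover evaluation only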