<section id="memory-usage-of-tensorrt-llm">
|
||
<span id="memory"></span><h1>Memory Usage of TensorRT-LLM<a class="headerlink" href="#memory-usage-of-tensorrt-llm" title="Link to this heading">#</a></h1>
|
||
<p>This document summarizes the memory usage of TensorRT-LLM, and addresses common issues and questions reported by users.</p>
## Understanding inference-time GPU memory usage

At inference time, there are three major contributors to GPU memory usage for a given TRT engine generated from a TensorRT-LLM model: weights, internal activation tensors, and I/O tensors. For I/O tensors, the major memory footprint comes from the KV cache tensor.
### 1. Weights size
The weights size is fixed, determined by the model size, the chosen precision of the weights, and the parallelization strategy. Using a lower precision such as INT8 or FP8 reduces the weights size. When tensor parallelism or pipeline parallelism is used, each rank stores only a portion of the weights. For example, each rank typically holds just 1/8 of the model weights with 8-way tensor parallelism or 8-stage pipeline parallelism.
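As a rough back-of-the-envelope check (illustrative arithmetic only, not a TensorRT-LLM API), the per-rank weights footprint can be estimated from the parameter count, the bytes per parameter for the chosen precision, and the parallelism degree; the 7B-parameter model below is an assumed example:

```python
# Rough per-rank weights-size estimate; numbers are illustrative assumptions.
def weights_size_gib(num_params: float, bytes_per_param: float, world_size: int) -> float:
    """Approximate per-rank weights size in GiB for TP or PP of degree `world_size`."""
    return num_params * bytes_per_param / world_size / 1024**3

params = 7e9  # assumed 7B-parameter model
print(f"FP16, single GPU: {weights_size_gib(params, 2.0, 1):.1f} GiB")  # ~13.0 GiB
print(f"FP16, TP=8      : {weights_size_gib(params, 2.0, 8):.1f} GiB")  # ~1.6 GiB per rank
print(f"FP8,  TP=8      : {weights_size_gib(params, 1.0, 8):.1f} GiB")  # ~0.8 GiB per rank
```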
### 2. Activation size
TensorRT can optimize memory usage by reusing memory across different tensors based on liveness analysis and tensor sizes. To avoid out-of-memory errors at runtime, and to reduce the runtime cost of switching optimization profiles and changing shapes, **TensorRT pre-computes the activation memory requirement at build time**. The requirement is computed from the optimized TensorRT graph: each profile's memory usage is computed using the maximum tensor shapes, and the requirement of one engine is the maximum across its profiles. External and internal factors can affect the activation size reported by TensorRT, such as the network structure, kernel fusion, and operation scheduling.
Once the TensorRT engine is built, the activation memory size of that engine **cannot be changed**. It can be queried with the API `trt.ICudaEngine.device_memory_size_v2`.
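For example, here is a minimal sketch of querying that value from a serialized engine with the TensorRT Python API (the engine path is a placeholder; older TensorRT releases expose `device_memory_size` instead of `device_memory_size_v2`):

```python
import tensorrt as trt

logger = trt.Logger(trt.Logger.INFO)
runtime = trt.Runtime(logger)

# Placeholder path to an engine produced by trtllm-build.
with open("engine_dir/rank0.engine", "rb") as f:
    engine = runtime.deserialize_cuda_engine(f.read())

# Activation memory TensorRT reserves for an execution context of this engine.
print(f"Activation memory: {engine.device_memory_size_v2 / (1 << 20):.1f} MiB")
```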
Practically, for a given model, precision, and parallelization strategy, you can tune the activation memory usage by adjusting the max batch size, max input length, max beam width, max number of tokens, the padding-removal on/off flag, and the context-FMHA on/off flag. Here is how these values affect memory (a build-configuration sketch follows the list):
1. Reduce the build-time max number of input tokens (`max_num_tokens`)

   Most tensors inside a transformer network have a linear relationship with the number of input tokens, so the activation size is close to `max number of input tokens * some constant factor`, where the constant factor depends on the network structure and TRT internal optimizations. The max number of input tokens is derived from build-time arguments: you can change the parameters provided to the `prepare_inputs` function (such as `PretrainedModel.prepare_inputs`) to affect memory usage, or change the command-line options of the `trtllm-build` command used in the examples.

   When the [packed tensors](../advanced/gpt-attention.html#padded-and-packed-tensors) format is used and `max_num_tokens` is specified, reducing its value also reduces the activation memory size.

   When the [padded tensors](../advanced/gpt-attention.html#padded-and-packed-tensors) format is used, the max number of input tokens equals `max_batch_size * max_input_len`, so reducing `max_batch_size` and `max_input_len` reduces the activation memory size almost linearly.

   The packed tensors format is recommended because it saves both memory and compute.

   The beam width is folded into the batch size dimension when the tensor ranges are passed to TensorRT, so reducing `max_beam_width` also reduces memory usage.

2. Turn on context FMHA

   When the GPT attention plugin is used, turning on the plugin's `context_fmha_type` reduces the memory footprint significantly; see [Context Phase](../advanced/gpt-attention.html#context-phase) for details. When `context_fmha_type` is set to disabled, the plugin's workspace size depends quadratically on the sequence length.

3. Tensor parallelism and pipeline parallelism

   TensorRT reuses memory between layers as much as possible. For a typical example with *N* decoder blocks in one transformer network, TRT does not allocate *N* copies of activation memory, since the memory of tensors in the first block can be released after execution and reused for later blocks; only one block's worth of memory is needed.

   With tensor parallelism, some tensors are split into smaller chunks and each rank holds only one chunk, so the activation memory size of each rank is smaller than when executing the network on a single GPU. With pipeline parallelism, each rank executes several decoder blocks, and all the tensors are full size, so the activation memory size is still equal to one block's memory size. Thus, tensor parallelism normally has higher memory efficiency than pipeline parallelism when all other parameters are the same.
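As mentioned above, here is a minimal sketch of how these build-time limits might be set through the Python `BuildConfig`; the field names follow the current LLM API and the values are illustrative assumptions (the `trtllm-build` command exposes equivalent options):

```python
from tensorrt_llm import BuildConfig

# Illustrative build-time limits; smaller values generally mean less activation memory.
build_config = BuildConfig(
    max_batch_size=8,      # max requests per engine enqueue call
    max_input_len=2048,    # max prompt length per request
    max_num_tokens=4096,   # max total input tokens per forward pass (packed format)
    max_beam_width=1,      # beams are folded into the batch dimension
)
```

Such a config can then be passed to the LLM API when building the engine, or the same limits can be given to `trtllm-build` on the command line.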
### 3. I/O tensors
#### 3.1 Runtime and decoder buffers except KV cache tensor
##### C++ runtime

Before KV cache blocks are allocated, the C++ runtime pre-allocates some GPU memory for the I/O tensors of the TensorRT engine and for the decoupled dynamic decoder. These buffers are sized from the runtime `max_batch_size` and `max_seq_len`, so that out-of-memory errors are avoided when that many requests are actually scheduled.
#### 3.2 KV cache tensor
##### C++ runtime

The TensorRT-LLM runtime pre-allocates paged KV cache pools during initialization for a configured number of blocks and distributes those blocks to requests at runtime.

KV cache tensors are allocated based on the `KVCacheConfig` object passed when creating the `Executor`. If neither `maxTokens` nor `freeGpuMemoryFraction` is specified, the KV cache by default allocates 90% of the remaining free GPU memory. If either `maxTokens` or `freeGpuMemoryFraction` is specified, that value is used to compute the KV cache memory size. If both are specified, `freeGpuMemoryFraction` is first used to compute a number of KV cache tokens, and then the minimum of that number and `maxTokens` is used.
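Through the Python LLM API, the same two knobs are exposed on `KvCacheConfig` (snake_case field names); a minimal sketch with illustrative values and a placeholder model path:

```python
from tensorrt_llm import LLM
from tensorrt_llm.llmapi import KvCacheConfig

# Illustrative: cap the KV cache at min(80% of the remaining free memory, 65536 tokens).
kv_cache_config = KvCacheConfig(
    free_gpu_memory_fraction=0.8,  # fraction of free GPU memory after the engine is loaded
    max_tokens=65536,              # hard cap on the number of KV cache tokens
)

# "model_dir" is a placeholder for a local checkpoint directory or model name.
llm = LLM(model="model_dir", kv_cache_config=kv_cache_config)
```

When both fields are set, the smaller of the two resulting token counts wins, matching the precedence described above.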
With in-flight batching, the scheduler can automatically schedule requests as long as enough KV cache space is available (the exact behavior depends on the scheduler policy).
##### Python runtime (not recommended)

The Python runtime allocates KV cache tensors based on the parameters of the `GenerationSession.setup` function; the KV cache size depends linearly on `batch_size` and `max_context_length + max_new_tokens`. **Note: this may change in the future, as the Python bindings of the C++ runtime may replace the current Python runtime. The Python bindings of the C++ runtime behave like the C++ runtime.**
## Memory pool
The TensorRT-LLM C++ runtime uses a stream-ordered memory allocator to allocate and free buffers; see [`BufferManager::initMemoryPool`](https://github.com/NVIDIA/TensorRT-LLM/tree/HEAD/cpp/tensorrt_llm/runtime/bufferManager.cpp), which uses the default memory pool managed by the CUDA driver. When a `TrtGptModel` object is destroyed, memory is returned to the memory pool and can be reused by the next `TrtGptModel` instance. Memory is released from the pool if it is required for other memory allocations.

However, `nvidia-smi` may still show high memory occupation after memory is returned to the CUDA driver's memory pool. This is intended behavior and should not be a concern. The amount of reserved and free memory in the pool can be inspected with [`BufferManager::memoryPoolReserved()`](https://github.com/NVIDIA/TensorRT-LLM/tree/HEAD/cpp/tensorrt_llm/runtime/bufferManager.cpp) and [`BufferManager::memoryPoolFree()`](https://github.com/NVIDIA/TensorRT-LLM/tree/HEAD/cpp/tensorrt_llm/runtime/bufferManager.cpp), respectively.
## Known Issues

When FP8 GEMM is used, the activation memory may be larger than the theoretically optimal size. This will be improved in a future release.
## FAQ
1. How to debug the memory usage of TensorRT-LLM?

   When the `info` logging level is used, TensorRT and TensorRT-LLM print messages about memory usage details. Here is part of an example log with the `info` logging level at runtime:
   ```
   [TensorRT-LLM][INFO] Loaded engine size: 6695 MiB
   [TensorRT-LLM][INFO] [MemUsageChange] Allocated 1134.01 MiB for execution context memory.
   [TensorRT-LLM][INFO] [MS] Running engine with multi stream info
   [TensorRT-LLM][INFO] [MS] Number of aux streams is 1
   [TensorRT-LLM][INFO] [MS] Number of total worker streams is 2
   [TensorRT-LLM][INFO] [MS] The main stream provided by execute/enqueue calls is the first worker stream
   [TensorRT-LLM][INFO] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +0, GPU +0, now: CPU 0, GPU 6678 (MiB)
   [TensorRT-LLM][INFO] [MemUsageChange] Allocated 43.29 MB GPU memory for runtime buffers.
   [TensorRT-LLM][INFO] [MemUsageChange] Allocated 180.30 MB GPU memory for decoder.
   [TensorRT-LLM][INFO] Memory usage when calculating max tokens in paged kv cache: total: 79.10 GiB, available: 70.48 GiB
   [TensorRT-LLM][INFO] Number of blocks in KV cache primary pool: 4060
   [TensorRT-LLM][INFO] Number of blocks in KV cache secondary pool: 0, onboard blocks to primary memory before reuse: true
   [TensorRT-LLM][INFO] Max KV cache pages per sequence: 32
   [TensorRT-LLM][INFO] Number of tokens per block: 64.
   [TensorRT-LLM][INFO] [MemUsageChange] Allocated 63.44 GiB for max tokens in paged KV cache (259840).
   ```
   You can see that several GPU memory allocations, marked with the `[MemUsageChange]` keyword, happen at runtime.

   The line showing "Total Weights Memory" indicates the weights memory size, and the line showing "Total Activation Memory" indicates the activation memory size.

   Normally the weights memory size is close to the TensorRT engine size, since for LLM networks most of the content in the engine is weights.
2. Why is the memory size large even though a small batch size and sequence length are used at runtime?

   As explained above, the activation memory size is computed from the max tensor shapes at TensorRT engine build time. Try reducing the engine build-time parameters, such as `max_num_tokens`; see [Activation size](#activation-size) for details.
3. Why can the engine be built, but inference runs out of memory (OOM) at runtime?

   At engine build time, TensorRT tunes the kernel selection layer by layer and does not necessarily allocate all the memory required to run the entire engine. If the activation tensors required to run a single layer are small while the I/O tensors (such as the KV cache) required to run the engine are large, building can succeed because the large I/O tensors may never be allocated, while runtime can fail with OOM errors when allocating them.

   TensorRT-LLM provides a `check_gpt_mem_usage` utility function to check the upper bound of the memory size given an engine and the related batch size, I/O sequence lengths, and so on; when this upper bound exceeds the GPU physical memory size, warning messages are printed.
4. For pipeline parallelism, is the build-time max batch size the limit of the micro batch size?

   Yes. In pipeline parallel mode, the TensorRT-LLM runtime splits the batch of requests into micro batches and enqueues these micro batches into the TRT engine sequentially.

   The build-time `max_batch_size` means that the batch size of one engine enqueue call must be no larger than it. The total batch size before splitting into micro batches can be larger than the build-time `max_batch_size`.

   For example, with 4-stage pipeline parallelism, suppose you intend to run the engine with micro batch size 2 and 16 micro batches (total batch size 32) in one `generate` call. You can set the build-time `max_batch_size` to 2 instead of 32; setting the build-time `max_batch_size` to 32 would occupy almost 16x more activation memory.