# Multi-Head, Multi-Query, and Group-Query Attention

This document details the implementation of multi-head attention (MHA), multi-query attention (MQA), and group-query attention (GQA) for autoregressive models in TensorRT LLM's PyTorch backend.

Multi-head attention involves a sequence of batched matrix multiplications, a softmax operation, and another batched matrix multiplication, as described in the [Attention Is All You Need](https://arxiv.org/abs/1706.03762) paper. [Multi-Query Attention (MQA)](https://arxiv.org/abs/1911.02150) and [Group-Query Attention (GQA)](https://arxiv.org/abs/2307.09288) are variants of MHA that use fewer KV heads than query heads. TensorRT LLM provides several implementations using different backends in `tensorrt_llm/_torch/attention_backend/`. The following sections explain how to use these implementations and provide a brief guide on implementing new backends.

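For illustration, the sketch below (plain PyTorch, not TensorRT LLM code) shows the only structural difference between the three variants: the number of KV heads. Setting `num_kv_heads` equal to `num_heads` gives MHA, setting it to 1 gives MQA, and any divisor in between gives GQA.

```python
# Minimal sketch (not TensorRT LLM code): MHA, GQA, and MQA differ only in the
# number of KV heads relative to query heads.
import torch
import torch.nn.functional as F

num_tokens, num_heads, num_kv_heads, head_dim = 8, 32, 8, 128  # GQA: 8 KV heads
q = torch.randn(num_tokens, num_heads, head_dim)
k = torch.randn(num_tokens, num_kv_heads, head_dim)
v = torch.randn(num_tokens, num_kv_heads, head_dim)

# Each group of num_heads // num_kv_heads query heads shares one KV head.
k = k.repeat_interleave(num_heads // num_kv_heads, dim=1)
v = v.repeat_interleave(num_heads // num_kv_heads, dim=1)

# (heads, tokens, head_dim) layout for scaled_dot_product_attention.
out = F.scaled_dot_product_attention(
    q.transpose(0, 1), k.transpose(0, 1), v.transpose(0, 1), is_causal=True)
print(out.shape)  # torch.Size([32, 8, 128])
```
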
## Attention Backends

There are currently three attention backends: the vanilla backend, the TRT-LLM backend, and the Flashinfer backend. You can select the backend with `PyTorchConfig.attn_backend`. For instance, to use the Flashinfer backend, pass `attn_backend="flashinfer"` to the `LLM` constructor, as in `LLM(attn_backend="flashinfer")`; the model then runs with the Flashinfer backend.

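A minimal usage sketch of that constructor argument; the model path is a placeholder and the `generate` call follows the LLM API examples elsewhere in these docs:

```python
from tensorrt_llm import LLM

llm = LLM(
    model="/path/to/hf_or_trtllm_checkpoint",  # placeholder checkpoint path
    attn_backend="flashinfer",                 # select the Flashinfer backend
)
for output in llm.generate(["Hello, my name is"]):
    print(output.outputs[0].text)
```
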
The vanilla backend, `VanillaAttention`, is a reference implementation designed primarily for inflight batching and linear KV cache support. While it serves as a useful baseline, it is not recommended for production use due to its limited optimizations.

In contrast, the Flashinfer backend, `FlashInferAttention`, is performance-optimized and supports both inflight batching and paged KV cache. It also includes the following advanced features:

1. **FP8 Quantization**: This feature enables the quantization of inputs and KV cache into FP8 format, significantly reducing memory usage and improving computational throughput.
2. **RoPE Fusion**: By integrating rotary position embedding (RoPE) directly into the attention computation, this feature enhances efficiency and reduces overhead.

The TRT-LLM backend, `TrtllmAttention`, is the default backend. It supports all the features available in the Flashinfer backend and is further optimized for performance, making it the recommended choice for production environments. Additionally, it offers the following advanced features:

1. **Fused QKV Input**: It can accept a single fused QKV tensor as input, which is more efficient than separate Q, K, and V tensors.
2. **FP8 Output**: It can output the attention result in FP8 format, fusing quantization into the attention computation.

## Implement a New Attention Backend

You can implement a new attention backend to integrate other attention libraries. An attention backend consists of an `AttentionBackend` class and an `AttentionMetadata` class. Three stages of the PyTorch flow involve the attention backend (a minimal lifecycle sketch follows the list):

1. Model construction: During the model's `__init__`, call `AttentionBackend.__init__` to create an attention backend for each layer.
2. Metadata preparation: Before each forward step of the model:
   1. If the metadata is uninitialized, call `AttentionMetadata.__init__` to create the attention metadata.
   2. If using CUDA graphs, call `AttentionMetadata.create_cuda_graph_metadata` to convert the metadata to CUDA graph metadata, which pre-allocates all tensors and can be used to capture CUDA graphs. Do not re-allocate any tensors stored inside `AttentionMetadata` after the initial warmup run when using CUDA graphs.
   3. To prepare parameters of the input and KV cache, call `AttentionMetadata.prepare` to convert from the existing metadata and KV cache manager.
3. Single step forward: During the forward pass of each attention layer, call `AttentionBackend.forward` to perform the attention operation. The `AttentionMetadata` is provided as a forward argument.

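The sketch below illustrates this lifecycle with stand-in classes rather than the real TensorRT LLM interfaces; `MyBackend`, `MyMetadata`, and the loop structure are purely illustrative:

```python
# Illustrative pseudostructure only; the real classes live in
# tensorrt_llm/_torch/attention_backend/ and have richer signatures.
class MyMetadata:
    def __init__(self, max_num_requests: int):
        self.max_num_requests = max_num_requests
        self.seq_lens = None

    def prepare(self, seq_lens):
        # Fill predefined and custom fields from the scheduled batch.
        self.seq_lens = list(seq_lens)


class MyBackend:
    def __init__(self, layer_idx: int, num_heads: int, head_dim: int):
        self.layer_idx = layer_idx
        self.num_heads = num_heads
        self.head_dim = head_dim

    def forward(self, q, k, v, metadata: MyMetadata):
        ...  # run the attention kernel using the shared metadata


# 1. Model construction: one backend per attention layer.
backends = [MyBackend(i, num_heads=32, head_dim=128) for i in range(2)]

# 2. Metadata preparation: once per forward step, shared by all layers.
metadata = MyMetadata(max_num_requests=8)
metadata.prepare(seq_lens=[5, 3])

# 3. Single-step forward: each layer calls its backend with the metadata.
for backend in backends:
    backend.forward(q=None, k=None, v=None, metadata=metadata)
```
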
### Implement `AttentionMetadata`

The `AttentionMetadata` class stores metadata from the batched input and KV cache for the attention backend. It contains the following predefined fields:

| Field | Type | Description |
| --- | --- | --- |
| max_num_requests | int | The max number of requests in a single batch. |
| num_contexts | int | The number of context-phase sequences in the batch. |
| num_generations | int | The number of generation-phase sequences in the batch. |
| max_num_tokens | int | The max number of tokens in all requests in a single batch. |
| num_tokens | int | The number of tokens in the batch. |
| num_ctx_tokens | int | The number of tokens in sequences in the context phase. |
| kv_cache_manager | KVCacheManager | The KV cache manager. |
| is_cuda_graph | bool | Whether CUDA graph is enabled. |
| seq_lens | Tensor | The length of each sequence in the batch. The shape is (batch_size), and it is located in CPU memory. |
| seq_lens_cuda | Tensor | A copy of `seq_lens` stored on the GPU. |
| context_lens | Tensor | The length of each context-phase sequence in the batch. The shape is (`num_contexts`). |
| position_ids | Optional[Tensor] | The position of each token in each sequence. May be None if positional embedding is applied outside of the backend. |
| request_ids | List[int] | The request ID of each sequence in the batch. |
| prompt_lens | List[int] | The prompt length of each sequence in the batch. |
| kv_cache_params | KVCacheParams | The parameters for the KV cache. |

During `AttentionMetadata.__init__`, you can initialize additional fields for the new attention metadata. For example, the Flashinfer metadata initializes its `decode_wrapper` at this point. During `AttentionMetadata.prepare`, the runtime fills all predefined fields, and you can fill your customized fields based on them. For example, the Flashinfer metadata fills `qo_indptr` by combining `context_lens` and `num_generations`.

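A hedged sketch of how a custom metadata class might derive its own field in `prepare`; the class and the `qo_indptr`-style field are illustrative stand-ins, not the actual Flashinfer implementation:

```python
# Illustrative only: a stand-in metadata class that derives a prefix-sum
# index array from the predefined fields, similar in spirit to qo_indptr.
import torch


class MyAttentionMetadata:
    def __init__(self):
        # Predefined fields are filled by the runtime before prepare().
        self.context_lens = None
        self.num_generations = 0
        # Custom field derived in prepare().
        self.qo_indptr = None

    def prepare(self):
        # Context requests contribute their full length; generation requests
        # contribute one query token each.
        lens = torch.cat([
            self.context_lens,
            torch.ones(self.num_generations, dtype=torch.int32),
        ])
        self.qo_indptr = torch.cat([
            torch.zeros(1, dtype=torch.int32),
            torch.cumsum(lens, dim=0, dtype=torch.int32),
        ])


meta = MyAttentionMetadata()
meta.context_lens = torch.tensor([5, 3], dtype=torch.int32)
meta.num_generations = 2
meta.prepare()
print(meta.qo_indptr)  # tensor([ 0,  5,  8,  9, 10], dtype=torch.int32)
```
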
### Implement `AttentionBackend`

The `AttentionBackend` delegates the attention operation to the backend implementation.

Its `__init__` accepts the following arguments:

| Field | Type | Description |
| --- | --- | --- |
| layer_idx | int | The index of the attention layer in the model. |
| num_heads | int | The number of query heads. |
| head_dim | int | The size of each attention head (`hidden_size // num_heads`). |
| num_kv_heads | Optional[int] | The number of KV heads. Defaults to `num_heads` if None. |
| quant_config | QuantConfig | Optional quantization configuration. If None, no quantization is applied. |
| pos_embd_params | PositionalEmbeddingParams | Optional parameters defining how positional embedding should be applied. If None, positional embedding should be applied by the model before calling the backend. Otherwise, the backend is in charge of applying positional embedding and may cache K without embedding it first. |

Its `forward` accepts the following arguments:

| Field | Type | Description |
| --- | --- | --- |
| q | Tensor | Query tensor with shape `(num_tokens, num_heads * head_dim)`. |
| k | Tensor | Key tensor with shape `(num_tokens, num_kv_heads * head_dim)`. |
| v | Tensor | Value tensor with shape `(num_tokens, num_kv_heads * head_dim)`. |
| metadata | AttentionMetadata | Metadata for the attention operation. |
| attention_mask | AttentionMask | Optional attention mask. If None, a causal mask is applied. |

For example, the Flashinfer backend calls `append_paged_kv_cache` and then `wrapper.run` to perform the attention operation.

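A minimal, hedged sketch of a backend-style `forward` that consumes the packed `(num_tokens, num_heads * head_dim)` layout described above; it is a stand-in class with no KV cache handling and a causal mask only:

```python
# Illustrative stand-in, not a TensorRT LLM class: per-sequence causal SDPA
# over packed inputs.
import torch
import torch.nn.functional as F


class NaivePackedBackend:
    def __init__(self, num_heads, num_kv_heads, head_dim):
        self.num_heads, self.num_kv_heads, self.head_dim = num_heads, num_kv_heads, head_dim

    def forward(self, q, k, v, seq_lens):
        outputs, start = [], 0
        for length in seq_lens:
            qs = q[start:start + length].view(length, self.num_heads, self.head_dim)
            ks = k[start:start + length].view(length, self.num_kv_heads, self.head_dim)
            vs = v[start:start + length].view(length, self.num_kv_heads, self.head_dim)
            # Broadcast KV heads to query heads for GQA/MQA.
            rep = self.num_heads // self.num_kv_heads
            ks, vs = ks.repeat_interleave(rep, dim=1), vs.repeat_interleave(rep, dim=1)
            out = F.scaled_dot_product_attention(
                qs.transpose(0, 1), ks.transpose(0, 1), vs.transpose(0, 1), is_causal=True)
            outputs.append(out.transpose(0, 1).reshape(length, -1))
            start += length
        return torch.cat(outputs)  # (num_tokens, num_heads * head_dim)


backend = NaivePackedBackend(num_heads=8, num_kv_heads=2, head_dim=64)
seq_lens = [5, 3]
q = torch.randn(sum(seq_lens), 8 * 64)
k = torch.randn(sum(seq_lens), 2 * 64)
v = torch.randn(sum(seq_lens), 2 * 64)
print(backend.forward(q, k, v, seq_lens).shape)  # torch.Size([8, 512])
```
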
## The Features of the `TrtllmAttention` Backend

The following sections introduce some features of the default `TrtllmAttention` backend.

### Packed Tensors

The attention operator in the `TrtllmAttention` backend supports packed (non-padded) QKV inputs. A naive layout for the QKV inputs pads the sequences that are shorter than `max_sequence_length` to the maximum length, which results in excessive memory consumption as well as unneeded computation on padding tokens (in the various matrix multiplications that surround the MHA block). To avoid that problem, TensorRT LLM supports a mode without padding in which the tokens of the different sequences are packed together and the user provides the operator with a 1D tensor containing the lengths of the different sequences.

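A small illustration of the two layouts in plain PyTorch, with made-up token IDs:

```python
# Two sequences of lengths 5 and 3 (token IDs are made up).
import torch

seq_a = torch.tensor([11, 12, 13, 14, 15])
seq_b = torch.tensor([21, 22, 23])

# Padded layout: (batch_size, max_sequence_length), with 2 wasted slots.
padded = torch.nn.utils.rnn.pad_sequence([seq_a, seq_b], batch_first=True)
print(padded.shape)                     # torch.Size([2, 5])

# Packed layout: one flat token dimension plus per-sequence lengths.
packed = torch.cat([seq_a, seq_b])      # shape (8,)
seq_lens = torch.tensor([5, 3])         # 1D tensor of sequence lengths
print(packed.shape, seq_lens.tolist())  # torch.Size([8]) [5, 3]
```
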
### Context and Generation Phases

The `TrtllmAttention` backend encapsulates different implementations for the context and generation phases into a single custom torch op.

#### Context Phase

An unoptimized context-phase implementation maps to a sequence of GPU kernels that store the intermediate `Q*K^T` tensor in memory before calling the softmax operator. It is the slowest approach, and its memory footprint is significant (it grows quadratically with the sequence length).

Instead, the `TrtllmAttention` backend performs the MHA/MQA block with a single fused kernel. For short sequences, that kernel uses a vanilla implementation of MHA/MQA. For longer sequences, it uses the Flash Attention algorithm as described in [FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness](https://arxiv.org/abs/2205.14135) and [FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning](https://arxiv.org/abs/2307.08691).

Currently, the implementation triggers extra kernels that apply pre-processing to the elements (such as RoPE) and populate the KV cache (see below). In a future release, the number of such kernels may be reduced to improve the overall performance.

#### FP8 Context FMHA

When FP8 quantization is activated, the attention can be further accelerated by enabling FP8 Context FMHA.

FP8 Paged Context FMHA is also supported with the FP8 quantization workflow; specify `use_paged_context_fmha = True` for the attention operator.

Be aware that this feature is only supported on Ada, Hopper, and later architectures.

#### Generation Phase

The generation phase is implemented using a single kernel called the masked multi-head attention kernel in TensorRT LLM. That kernel applies pre-processing to the Q, K, and V elements on the fly: it adds the QKV bias, applies RoPE, and performs dequantization and quantization. TensorRT LLM will continue to add (or enable) additional features in future releases, such as support for IA3.

The masked MHA kernel has a special version that distributes the work across multiple CUDA thread blocks on the GPU for cases where GPU occupancy is low. That mode, called multi-block, is always enabled. NVIDIA recommends that users test that mode in scenarios where both the batch size and the number of heads in the model are relatively small. The definition of "small" in that context is hard to quantify because it depends on the GPU model; however, NVIDIA currently recommends testing that mode when `batch_size * num_heads` is less than the number of multiprocessors on the GPU. This guidance may change in the future.

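As a rough check of that rule of thumb (the threshold is the heuristic quoted above, not an API; only the device-property query is real PyTorch):

```python
# Rough sketch of the batch_size * num_heads vs. SM count rule of thumb.
import torch

batch_size, num_heads = 4, 32

if torch.cuda.is_available():
    sm_count = torch.cuda.get_device_properties(0).multi_processor_count
    if batch_size * num_heads < sm_count:
        print(f"{batch_size * num_heads} < {sm_count} SMs: worth testing multi-block mode")
    else:
        print(f"{batch_size * num_heads} >= {sm_count} SMs: occupancy is likely already high")
```
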
Note that even if the multi-block mode is enabled, the attention operator does not immediately trigger the multi-block version of the GPU kernel. There is a minimum number of tokens (input + generated) required for the multi-block version to become more efficient than the "vanilla" implementation that uses a single CUDA thread block per head. That threshold is controlled by an internal heuristic.

Also note that because the masked MHA kernels use shared memory proportional to the sequence length, there are cases where the GPU's shared memory is not sufficient unless multi-block mode is used. In those cases, multi-block mode is forced on and a warning message is printed in the log.

#### XQA Optimization

XQA is another optimization for MQA/GQA in the generation phase. It currently supports only a limited number of model configurations, such as the LLAMA2 70B model.

Support matrix of the XQA optimization:

- FP16 / BF16 compute data type.
- FP16 / BF16 / FP8 / INT8 KV cache data type.
- Paged KV cache (8 / 16 / 32 / 64 / 128 tokens per block).

XQA is enabled by default. Note that a heuristic is also used to decide whether to run the XQA kernel or the masked MHA kernel for better performance. If you want to use the XQA kernel whenever possible, set `TRTLLM_FORCE_XQA=1` to force it whenever the model configuration is supported. Supported configurations can be found using the `shouldUse` function of the `DecoderXQARunner` class in `cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/decoderXQARunner.h`.

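For example, the variable can be set in the launch environment (or from Python before TensorRT LLM runs its kernels); whether XQA is actually used still depends on the support matrix above:

```python
# Force the XQA kernel whenever the model configuration supports it.
import os

os.environ["TRTLLM_FORCE_XQA"] = "1"  # set before launching TensorRT LLM
```
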
### In-flight Batching

TensorRT LLM supports in-flight batching of requests (also known as continuous batching or iteration-level batching) for higher serving throughput. With this feature, sequences in the context phase can be processed together with sequences in the generation phase. The purpose of that technique is to better interleave requests to reduce latency as well as make better use of the GPUs. For efficiency reasons (1), the support for in-flight batching ***requires the input tensors to be packed (no padding)***.

***In the current implementation, the sequences that are going through the context phase must come before the sequences in the generation phase in the input tensor. For example, for sequences `S0`, `S1`, and `S2`, if `S0` and `S2` are in the context phase (and `S1` in generation), the tokens from `S0` and `S2` must appear before the tokens of `S1` in the input tensor.***

*(1) Padding the generation-phase sequences, which contain a single token each, to the length of the maximum input sequence is an inefficient use of resources.*

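A small illustration of the required ordering for the packed input (hypothetical token IDs; `S0` and `S2` are context requests, `S1` is a generation request contributing one token):

```python
# Context-phase tokens (S0, S2) must precede generation-phase tokens (S1).
import torch

s0_ctx = torch.tensor([101, 102, 103])   # S0: 3 context tokens
s2_ctx = torch.tensor([301, 302])        # S2: 2 context tokens
s1_gen = torch.tensor([205])             # S1: 1 generation token

input_ids = torch.cat([s0_ctx, s2_ctx, s1_gen])  # packed, context first
seq_lens = torch.tensor([3, 2, 1])
num_contexts, num_generations = 2, 1
print(input_ids.tolist())  # [101, 102, 103, 301, 302, 205]
```
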
### Chunked Context

Without this feature, all context tokens are processed at once. Chunked context splits the context into several chunks, so that context chunks can be batched with more generation-phase tokens, which is expected to increase the total throughput. Chunking contexts also removes constraints on the input length. Except for the last one, the size of each context chunk must be an integer multiple of the KV cache block size.

> To enable this feature, the FMHA paged KV cache also needs to be enabled.

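A sketch of how a context might be split under that constraint; the chunk budget and block size below are made-up numbers:

```python
# Split a 1000-token context with a 64-token KV cache block size and a
# 512-token chunk budget: every chunk but the last is a multiple of 64.
kv_block_size = 64
chunk_budget = 512
context_len = 1000

chunk = (chunk_budget // kv_block_size) * kv_block_size  # 512
chunks, remaining = [], context_len
while remaining > chunk:
    chunks.append(chunk)
    remaining -= chunk
chunks.append(remaining)  # the last chunk may be any size
print(chunks)  # [512, 488]
```
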
</section>
<section id="kv-cache">
<h3>KV Cache<a class="headerlink" href="#kv-cache" title="Link to this heading">#</a></h3>
<p>In the generation phase, a common optimization is to provide the MHA kernel
with a cache containing the values of the past K and V elements that have
already been computed. That cache is known as the KV cache. TensorRT LLM uses
that technique to accelerate its generation phase. In TensorRT LLM, there is
one KV cache per Transformer layer, which means that there are as many KV
caches as layers in a model. The current version of TensorRT LLM supports two
different types of KV caches: <strong>contiguous</strong> and <strong>paged</strong> KV caches.</p>
<section id="contiguous-kv-cache">
<h4>Contiguous KV Cache<a class="headerlink" href="#contiguous-kv-cache" title="Link to this heading">#</a></h4>
<p>The contiguous KV cache is a monolithic tensor. Its shape is:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="p">[</span><span class="n">max_batch_size</span> <span class="o">*</span> <span class="n">max_beam_width</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="n">num_heads</span><span class="p">,</span> <span class="n">max_seqlen</span><span class="p">,</span> <span class="n">hidden_dim_per_head</span><span class="p">]</span><span class="o">.</span>
</pre></div>
</div>
<p>That implementation uses much more memory than needed when sequences are
shorter than the maximum sequence length (and even if a sequence eventually gets close to that
limit after generating many output tokens, it may take many steps to
reach that point).</p>
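<p>A back-of-the-envelope calculation shows how much memory one layer reserves regardless of the actual sequence lengths (illustrative numbers only):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Bytes reserved for one layer's contiguous KV cache (illustrative values).
max_batch_size = 8
max_beam_width = 1
num_heads = 32           # number of KV heads
max_seqlen = 4096
hidden_dim_per_head = 128
bytes_per_elem = 2       # FP16

kv_cache_bytes = (max_batch_size * max_beam_width * 2 * num_heads
                  * max_seqlen * hidden_dim_per_head * bytes_per_elem)
print(f"{kv_cache_bytes / 2**30:.2f} GiB per layer")  # 0.50 GiB per layer

# The full buffer is reserved even if the actual sequences are much
# shorter than max_seqlen.
</pre></div>
</div>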
</section>
<section id="paged-kv-cache">
<h4>Paged KV Cache<a class="headerlink" href="#paged-kv-cache" title="Link to this heading">#</a></h4>
<p>The paged KV cache decomposes the KV cache into blocks that are distributed to
the different requests by a cache manager during processing. That cache manager
keeps track of the sequences, allocates new blocks from a pool, and recycles those
blocks when required. See the implementation of
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ba1426/tensorrt_llm/_torch/pyexecutor/resource_manager.py"><code class="docutils literal notranslate"><span class="pre">KVCacheManager</span></code></a>.</p>
</section>
<section id="int8-fp8-kv-caches">
<h4>INT8/FP8 KV Caches<a class="headerlink" href="#int8-fp8-kv-caches" title="Link to this heading">#</a></h4>
<p>In its current implementation, even if the rest of the network runs in INT8 or
FP8, the attention operator works with FP32, FP16, and BFloat16 inputs and
outputs. However, TensorRT LLM supports INT8 and FP8
(<code class="docutils literal notranslate"><span class="pre">QuantMode.INT8_KV_CACHE</span></code> and
<code class="docutils literal notranslate"><span class="pre">QuantMode.FP8_KV_CACHE</span></code>) KV caches.</p>
<p>The attention operator populates the KV cache. When INT8 or FP8 KV caches
are enabled, the input values have to be quantized to 8 bits using a scaling
factor. For quantization, the scaling factor is stored in the
<code class="docutils literal notranslate"><span class="pre">kv_cache_scaling_factor</span></code> tensor. Its shape is <code class="docutils literal notranslate"><span class="pre">[1]</span></code> and only per-tensor
quantization is supported in the current version. Quantization uses the inverse of the scale:
the kernel multiplies as <code class="docutils literal notranslate"><span class="pre">fp_value</span> <span class="pre">*</span> <span class="pre">(1.0</span> <span class="pre">/</span> <span class="pre">kv_cache_scaling_factor)</span></code>.</p>
<p>During generation, the values read from the cache are dequantized on-the-fly in
the MHA/MQA kernel. Dequantization is defined as
<code class="docutils literal notranslate"><span class="pre">quantized_value</span> <span class="pre">*</span> <span class="pre">kv_cache_scaling_factor</span></code>.</p>
</section>
</section>
<section id="sliding-window-attention-cyclic-rolling-buffer-kv-cache">
<h3>Sliding Window Attention, Cyclic (Rolling Buffer) KV Cache<a class="headerlink" href="#sliding-window-attention-cyclic-rolling-buffer-kv-cache" title="Link to this heading">#</a></h3>
<p>TensorRT LLM has a feature called <code class="docutils literal notranslate"><span class="pre">Cyclic</span> <span class="pre">KV</span> <span class="pre">Cache</span></code>, which treats the KV cache
as a circular buffer. This means that it only stores the KV cache for the last N
tokens, where N is determined by the <code class="docutils literal notranslate"><span class="pre">attention_window_size</span></code> parameter in
<code class="docutils literal notranslate"><span class="pre">TrtllmAttention.forward</span></code>. When the cache is full, the KV cache of new tokens
overwrites the “least recently used” entries.</p>
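<p>The circular-buffer behavior can be pictured with a toy indexing example (illustrative only, not the actual kernel logic):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>attention_window_size = 4

# Cache slot used for each token position in a cyclic (rolling buffer) KV cache.
for position in range(7):
    slot = position % attention_window_size
    print(f"token {position} is stored in slot {slot}")
# Tokens 4, 5 and 6 overwrite the slots of tokens 0, 1 and 2,
# the "least recently used" entries.
</pre></div>
</div>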
<p>In the context phase, if the input length surpasses the <code class="docutils literal notranslate"><span class="pre">attention_window_size</span></code>,
<code class="docutils literal notranslate"><span class="pre">Sliding</span> <span class="pre">Window</span> <span class="pre">Attention</span></code> is activated; <code class="docutils literal notranslate"><span class="pre">attention_window_size</span></code> then also
serves as the sliding window size.</p>
<p>This feature helps to reduce the memory footprint of the KV cache when
dealing with very long sequences.</p>
<p>Note that the cyclic KV cache feature currently does not work with beam search, as
the context KV cache is shared across beams.</p>
</section>
<section id="streamingllm">
<h3>StreamingLLM<a class="headerlink" href="#streamingllm" title="Link to this heading">#</a></h3>
<p>The StreamingLLM feature uses window attention to perform efficient and stable LLM
inference on long texts, which means that only <code class="docutils literal notranslate"><span class="pre">N</span></code> tokens need to be stored in the KV cache.
Similar to the cyclic KV cache feature in TensorRT LLM, the <code class="docutils literal notranslate"><span class="pre">attention_window_size</span></code>
parameter is used to determine <code class="docutils literal notranslate"><span class="pre">N</span></code>. Different from the cyclic KV cache feature,
the first <code class="docutils literal notranslate"><span class="pre">S</span></code> tokens, called sink tokens, are always kept in the attention window,
where <code class="docutils literal notranslate"><span class="pre">S</span></code> is determined by the <code class="docutils literal notranslate"><span class="pre">sink_token_length</span></code> parameter.
In the context phase, however, the self-attention is dense in the official implementation of
StreamingLLM: it uses all of the tokens for computation and only saves <code class="docutils literal notranslate"><span class="pre">N</span></code> tokens
to the KV cache.</p>
<p>In addition, the relative position embedding is also changed in StreamingLLM.
When determining the relative distance and adding positional information to tokens,
StreamingLLM uses the positions within the cache rather than those in the original text.</p>
<p><code class="docutils literal notranslate"><span class="pre">sink_token_length</span></code> is also used to enable this feature.</p>
</section>
<section id="beam-search">
<h3>Beam-Search<a class="headerlink" href="#beam-search" title="Link to this heading">#</a></h3>
<p>The attention operator supports beam-search. In the context phase, a single
beam is computed per input sequence. In the generation phase, the MHA/MQA/GQA
kernel uses an additional tensor to reconstruct the correct path for each beam.
That tensor is called the <code class="docutils literal notranslate"><span class="pre">cache_indirection</span></code>. Its shape is <code class="docutils literal notranslate"><span class="pre">[batch_size,</span> <span class="pre">beam_width,</span> <span class="pre">max_seqlen]</span></code>.</p>
<p>For a sequence <code class="docutils literal notranslate"><span class="pre">si</span></code>, a beam <code class="docutils literal notranslate"><span class="pre">bi</span></code> and a token <code class="docutils literal notranslate"><span class="pre">ti</span></code>, the element
<code class="docutils literal notranslate"><span class="pre">cache_indirection[si][bi][ti]</span></code> is an integer between <code class="docutils literal notranslate"><span class="pre">0</span></code> and <code class="docutils literal notranslate"><span class="pre">beam_width-1</span></code>
that indicates which path in the beam to read the K and V elements from in the
KV cache. This tensor is populated in the sampling stage.</p>
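<p>Conceptually, the kernel uses <code class="docutils literal notranslate"><span class="pre">cache_indirection</span></code> as a gather index into the per-beam KV cache (a simplified sketch; the other tensor names are illustrative):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import numpy as np

batch_size, beam_width, max_seqlen, head_dim = 1, 2, 4, 8

# Per-beam K cache and the indirection table filled by the sampling stage.
k_cache = np.random.rand(batch_size, beam_width, max_seqlen, head_dim)
cache_indirection = np.zeros((batch_size, beam_width, max_seqlen), dtype=np.int64)
cache_indirection[0, 1, 2] = 0  # beam 1 reads token 2 from beam 0's path

def gather_k(si, bi, ti):
    # Pick which beam's cache entry to read for this (sequence, beam, token).
    src_beam = cache_indirection[si, bi, ti]
    return k_cache[si, src_beam, ti]

print(gather_k(0, 1, 2).shape)  # (8,)
</pre></div>
</div>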
</section>
<section id="input-qkv-tensor">
<h3>Input QKV tensor<a class="headerlink" href="#input-qkv-tensor" title="Link to this heading">#</a></h3>
<p>The input QKV tensor packs the Q, K and V tensors (concatenated along the last
dimension) after the projection of the hidden states. It is a 3D tensor. RoPE
and quantization to INT8 or FP8 (when needed) are performed by the GPT
attention operator.</p>
<p>In packed mode, its shape is <code class="docutils literal notranslate"><span class="pre">[num_tokens,</span> <span class="pre">3</span> <span class="pre">*</span> <span class="pre">hidden_dim]</span></code> where
<code class="docutils literal notranslate"><span class="pre">num_tokens</span></code> is the total number of tokens in the batch. For the sequences in
context phase, the number of tokens of a sequence corresponds to its input
length (even if the beam width is greater than <code class="docutils literal notranslate"><span class="pre">1</span></code> for beam search). For the
sequences in generation phase, there are <code class="docutils literal notranslate"><span class="pre">beam_width</span></code> tokens per sequence. The
beam width can be different for each sequence.</p>
<p>The following pseudocode explains how the number of tokens is computed:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">num_tokens</span> <span class="o">=</span> <span class="mi">0</span>

<span class="c1"># Add the length of each sequence in context phase.</span>
<span class="k">for</span> <span class="n">seq</span> <span class="ow">in</span> <span class="n">context_phase</span><span class="p">:</span>
    <span class="n">num_tokens</span> <span class="o">+=</span> <span class="n">seq</span><span class="o">.</span><span class="n">length</span>

<span class="c1"># Add the width of the beam for each sequence in generation phase.</span>
<span class="k">for</span> <span class="n">seq</span> <span class="ow">in</span> <span class="n">generation_phase</span><span class="p">:</span>
    <span class="n">num_tokens</span> <span class="o">+=</span> <span class="n">seq</span><span class="o">.</span><span class="n">beam_width</span>
</pre></div>
</div>
</section>
<section id="rotary-positional-embedding-rope">
<h3>Rotary Positional Embedding (RoPE)<a class="headerlink" href="#rotary-positional-embedding-rope" title="Link to this heading">#</a></h3>
<p>The attention operator can perform the computation of the Rotary
Positional Embedding (RoPE). When that operation is enabled, that is, when
<code class="docutils literal notranslate"><span class="pre">rotary_embedding_dim</span></code> is set to a value greater than 0, it is fused with other
operations. The operator supports the GPT-NeoX and GPT-J forms of RoPE by
setting <code class="docutils literal notranslate"><span class="pre">position_embedding_type</span></code> to <code class="docutils literal notranslate"><span class="pre">PositionEmbeddingType.rope_gpt_neox</span></code>
or <code class="docutils literal notranslate"><span class="pre">PositionEmbeddingType.rope_gptj</span></code>.</p>
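<p>For reference, the GPT-NeoX style rotation applied to one head's query or key vector can be sketched in NumPy (an illustration of the math, not the fused kernel):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import numpy as np

def rope_gpt_neox(x, position, base=10000.0):
    """Rotate one head's vector x (even rotary_embedding_dim) by its position."""
    dim = x.shape[-1]
    inv_freq = 1.0 / (base ** (np.arange(0, dim, 2) / dim))
    angles = position * inv_freq               # one angle per channel pair
    cos, sin = np.cos(angles), np.sin(angles)
    x1, x2 = x[: dim // 2], x[dim // 2 :]      # GPT-NeoX pairs (i, i + dim/2)
    return np.concatenate([x1 * cos - x2 * sin, x1 * sin + x2 * cos])

q = np.random.rand(64).astype(np.float32)
print(rope_gpt_neox(q, position=7).shape)      # (64,)
</pre></div>
</div>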
</section>
<section id="alibi">
<h3>ALiBi<a class="headerlink" href="#alibi" title="Link to this heading">#</a></h3>
<p>The attention operator can apply ALiBi to the result of the <code class="docutils literal notranslate"><span class="pre">Q*K^T</span></code>
product. The bias is computed on-the-fly from the ALiBi slopes in the optimized
kernel.</p>
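<p>As a reminder of what that bias looks like, here is the standard ALiBi construction for a power-of-two number of heads (an illustrative sketch, not the kernel code):</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import numpy as np

def alibi_slopes(num_heads):
    # Standard slopes for a power-of-two head count: 2^(-8 * h / num_heads).
    return 2.0 ** (-8.0 * np.arange(1, num_heads + 1) / num_heads)

def alibi_bias(num_heads, q_len, k_len):
    # Bias added to Q*K^T: slope * (key position - query position), clamped at 0.
    slopes = alibi_slopes(num_heads)                         # [num_heads]
    distance = np.arange(k_len)[None, :] - np.arange(q_len)[:, None]
    return slopes[:, None, None] * np.minimum(distance, 0)   # [num_heads, q_len, k_len]

print(alibi_bias(num_heads=8, q_len=4, k_len=4)[0])
</pre></div>
</div>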
</section>
<section id="scaling-factor-s">
<h3>Scaling factor(s)<a class="headerlink" href="#scaling-factor-s" title="Link to this heading">#</a></h3>
<p>In MHA, the output of the <code class="docutils literal notranslate"><span class="pre">Q*K^T</span></code> product is scaled by a constant value that
is computed as:</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">norm_factor</span> <span class="o">=</span> <span class="mf">1.</span><span class="n">f</span> <span class="o">/</span> <span class="p">(</span><span class="n">q_scaling</span> <span class="o">*</span> <span class="n">sqrt</span><span class="p">(</span><span class="n">head_size</span><span class="p">))</span><span class="o">.</span>
</pre></div>
</div>
</section>
<section id="cross-attention">
<h3>Cross Attention<a class="headerlink" href="#cross-attention" title="Link to this heading">#</a></h3>
<p>On top of the MHA used as self-attention by GPT-style decoder-only models, the attention operator also supports cross attention.</p>
<p>This enables the attention operator to be used more broadly as a generic decoder component. For example, the Encoder-Decoder model uses it to build both the self-attention and cross-attention modules in its decoder.</p>
</section>
</section>
</section>
</article>

<footer class="prev-next-footer d-print-none">

<div class="prev-next-area">
<a class="left-prev"
href="feature-combination-matrix.html"
title="previous page">
<i class="fa-solid fa-angle-left"></i>
<div class="prev-next-info">
<p class="prev-next-subtitle">previous</p>
<p class="prev-next-title">Feature Combination Matrix</p>
</div>
</a>
<a class="right-next"
href="disagg-serving.html"
title="next page">
<div class="prev-next-info">
<p class="prev-next-subtitle">next</p>
<p class="prev-next-title">Disaggregated Serving</p>
</div>
<i class="fa-solid fa-angle-right"></i>
</a>
</div>
</footer>

</div>

<dialog id="pst-secondary-sidebar-modal"></dialog>
<div id="pst-secondary-sidebar" class="bd-sidebar-secondary bd-toc"><div class="sidebar-secondary-items sidebar-secondary__inner">

<div class="sidebar-secondary-item">
<div
id="pst-page-navigation-heading-2"
class="page-toc tocsection onthispage">
<i class="fa-solid fa-list"></i> On this page
</div>
<nav class="bd-toc-nav page-toc" aria-labelledby="pst-page-navigation-heading-2">
<ul class="visible nav section-nav flex-column">
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#attention-backends">Attention Backends</a></li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#implement-a-new-attention-backend">Implement a New Attention Backend</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implement-attentionmetadata">Implement <code class="docutils literal notranslate"><span class="pre">AttentionMetadata</span></code></a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#implement-attentionbackend">Implement <code class="docutils literal notranslate"><span class="pre">AttentionBackend</span></code></a></li>
</ul>
</li>
<li class="toc-h2 nav-item toc-entry"><a class="reference internal nav-link" href="#the-features-of-the-trtllmattention-backend">The Features of the <code class="docutils literal notranslate"><span class="pre">TrtllmAttention</span></code> Backend</a><ul class="nav section-nav flex-column">
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#packed-tensors">Packed Tensors</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#context-and-generation-phases">Context and Generation Phases</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#context-phase">Context Phase</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#fp8-context-fmha">FP8 Context FMHA</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#generation-phase">Generation Phase</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#xqa-optimization">XQA Optimization</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#in-flight-batching">In-flight Batching</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#chunked-context">Chunked Context</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#kv-cache">KV Cache</a><ul class="nav section-nav flex-column">
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#contiguous-kv-cache">Contiguous KV Cache</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#paged-kv-cache">Paged KV Cache</a></li>
<li class="toc-h4 nav-item toc-entry"><a class="reference internal nav-link" href="#int8-fp8-kv-caches">INT8/FP8 KV Caches</a></li>
</ul>
</li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#sliding-window-attention-cyclic-rolling-buffer-kv-cache">Sliding Window Attention, Cyclic (Rolling Buffer) KV Cache</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#streamingllm">StreamingLLM</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#beam-search">Beam-Search</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#input-qkv-tensor">Input QKV tensor</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#rotary-positional-embedding-rope">Rotary Positional Embedding (RoPE)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#alibi">ALiBi</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#scaling-factor-s">Scaling factor(s)</a></li>
<li class="toc-h3 nav-item toc-entry"><a class="reference internal nav-link" href="#cross-attention">Cross Attention</a></li>
</ul>
</li>
</ul>
</nav></div>

</div></div>
</div>
<footer class="bd-footer-content">

</footer>

</main>
</div>
</div>

<!-- Scripts loaded after <body> so the DOM is not blocked -->
<script defer src="../_static/scripts/bootstrap.js?digest=8878045cc6db502f8baf"></script>
<script defer src="../_static/scripts/pydata-sphinx-theme.js?digest=8878045cc6db502f8baf"></script>

<footer class="bd-footer">
<div class="bd-footer__inner bd-page-width">

<div class="footer-items__start">

<div class="footer-item">
<a class="footer-brand logo" href="https://www.nvidia.com">
<img src="../_static/nvidia-logo-horiz-rgb-1c-blk-for-screen.svg" class="logo__image only-light" alt="NVIDIA"/>
<img src="../_static/nvidia-logo-horiz-rgb-1c-wht-for-screen.svg" class="logo__image only-dark" alt="NVIDIA"/>
</a></div>

<div class="footer-item">

<div class="footer-links">

<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/">Privacy Policy</a>

<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/">Your Privacy Choices</a>

<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/">Terms of Service</a>

<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/">Accessibility</a>

<a class="external" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/">Corporate Policies</a>

<a class="external" href="https://www.nvidia.com/en-us/product-security/">Product Security</a>

<a class="external" href="https://www.nvidia.com/en-us/contact/">Contact</a>

</div>
</div>

<div class="footer-item">

<p class="copyright">

Copyright © 2025, NVIDIA.
<br/>

</p>
</div>

<div class="footer-item">
<div class="extra_footer">

<p>Last updated on December 15, 2025.</p>

<p>This page is generated by TensorRT-LLM commit <a href="https://github.com/NVIDIA/TensorRT-LLM/tree/9ba1426">9ba1426</a>.</p>

</div></div>

</div>

</div>

</footer>
</body>
</html>