
<!DOCTYPE html>
<html class="writer-html5" lang="en" data-content_root="../">
<head>
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>Useful Build-Time Flags &mdash; tensorrt_llm documentation</title>
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=b86133f3" />
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=e59714d7" />
<link rel="stylesheet" type="text/css" href="../_static/copybutton.css?v=76b2166b" />
<script src="../_static/jquery.js?v=5d32c60e"></script>
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
<script src="../_static/doctools.js?v=9bcbadda"></script>
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
<script src="../_static/clipboard.min.js?v=a7894cd8"></script>
<script src="../_static/copybutton.js?v=65e89d2a"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="../index.html" class="icon icon-home">
tensorrt_llm
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
<li class="toctree-l1"><a class="reference internal" href="../key-features.html">Key Features</a></li>
<li class="toctree-l1"><a class="reference internal" href="../torch.html">PyTorch Backend</a></li>
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
<li class="toctree-l1"><a class="reference internal" href="../installation/grace-hopper.html">Installing on Grace Hopper</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../llm-api/index.html">API Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api/reference.html">API Reference</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">LLM API Examples</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/customization.html">Common Customizations</a></li>
<li class="toctree-l1"><a class="reference internal" href="../llm-api-examples/llm_api_examples.html">Examples</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Model Definition API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.quantization.html">Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="../python-api/tensorrt_llm.runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Command-Line Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-build.html">trtllm-build</a></li>
<li class="toctree-l1"><a class="reference internal" href="../commands/trtllm-serve.html">trtllm-serve</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/executor.html">Executor API</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/graph-rewriting.html">Graph Rewriting Module</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html">Inference Request</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/inference-request.html#responses">Responses</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="perf-overview.html">Overview</a></li>
<li class="toctree-l1"><a class="reference internal" href="perf-benchmarking.html">Benchmarking</a></li>
<li class="toctree-l1"><a class="reference internal" href="introduction.html">Best Practices</a></li>
<li class="toctree-l1"><a class="reference internal" href="perf-analysis.html">Performance Analysis</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="../index.html">tensorrt_llm</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">Useful Build-Time Flags</li>
<li class="wy-breadcrumbs-aside">
<a href="../_sources/performance/useful-build-time-flags.md.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="useful-build-time-flags">
<span id="id1"></span><h1>Useful Build-Time Flags<a class="headerlink" href="#useful-build-time-flags" title="Link to this heading"></a></h1>
<p>This page presents several build-time flags, set via the LLM-API's <code class="docutils literal notranslate"><span class="pre">BuildConfig</span></code> class, that you can enable to improve upon the baseline performance. Build-time refers to the fact that these flags affect how the TensorRT-LLM engine is built and cannot be changed without rebuilding the engine. For each flag, there is an explanation of what it does, a description of how to enable it, and an example of running it through the benchmarking flow described in <a class="reference internal" href="benchmarking-default-performance.html"><span class="std std-doc">Benchmarking Default Performance</span></a> to showcase its impact on performance. All options compatible with <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> can be found in the Command Line Reference section of the docs.</p>
<blockquote>
<div><p>Disclaimer: While performance numbers shown here are real, they are only for demonstration purposes. Differences in environment, SKU, interconnect, and workload can all significantly affect performance and lead to your results differing from what is shown here.</p>
</div></blockquote>
<section id="multiple-profiles">
<h2>Multiple Profiles<a class="headerlink" href="#multiple-profiles" title="Link to this heading"></a></h2>
<p>TensorRT-LLM is built on TensorRT, which handles engine building through “optimization profiles” defining min, optimal, and max input tensor shapes. TensorRT optimizes for the optimal shape while supporting the range between min and max.</p>
<p>TensorRT-LLM abstracts away the need to create optimization profiles, although flags like max_batch_size and max_num_tokens (covered later) influence how they are created. By default, only one profile is created.</p>
<p>During inference serving, varying request loads can pose different tensor shapes to the engine. TensorRT addresses this by allowing multiple profiles, which TensorRT-LLM supports via the BuildConfig option in the LLM-API. Enabling multiple profiles increases build times but has no performance downsides, so it is recommended for production builds.</p>
<p>The only thing to watch out for is that enabling this can lead to slightly different outputs when the same prompt is run multiple times, as different profiles, and consequently different kernels, might be used depending on the request load. However, this variance should not affect output quality, so it is safe to enable this flag as long as you don't need completely deterministic outputs.</p>
<section id="enabling-building-with-multiple-profiles">
<h3>Enabling building with multiple profiles<a class="headerlink" href="#enabling-building-with-multiple-profiles" title="Link to this heading"></a></h3>
<p>Below is an example of how you can modify the baseline example to enable multiple profiles.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="kn">from</span><span class="w"> </span><span class="nn">tensorrt_llm</span><span class="w"> </span><span class="kn">import</span> <span class="n">LLM</span><span class="p">,</span> <span class="n">BuildConfig</span>
<span class="k">def</span><span class="w"> </span><span class="nf">main</span><span class="p">():</span>
<span class="n">build_config</span> <span class="o">=</span> <span class="n">BuildConfig</span><span class="p">()</span>
<span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">multiple_profiles</span> <span class="o">=</span> <span class="kc">True</span>
<span class="n">llm</span> <span class="o">=</span> <span class="n">LLM</span><span class="p">(</span>
<span class="n">model</span><span class="o">=</span><span class="s2">&quot;/scratch/Llama-3.3-70B-Instruct&quot;</span><span class="p">,</span>
<span class="n">tensor_parallel_size</span><span class="o">=</span><span class="mi">4</span><span class="p">,</span>
<span class="n">build_config</span><span class="o">=</span><span class="n">build_config</span>
<span class="p">)</span>
<span class="n">llm</span><span class="o">.</span><span class="n">save</span><span class="p">(</span><span class="s2">&quot;build_flags_multiple_profiles&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="vm">__name__</span> <span class="o">==</span> <span class="s1">&#39;__main__&#39;</span><span class="p">:</span>
<span class="n">main</span><span class="p">()</span>
</pre></div>
</div>
<p>If you are using the <a class="reference internal" href="benchmarking-default-performance.html#building-and-saving-engines-via-cli"><span class="std std-ref">CLI flow for building engines</span></a>, pass <code class="docutils literal notranslate"><span class="pre">--multiple_profiles</span></code> to <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> to enable the feature.</p>
</section>
<section id="performance-with-multiple-profiles">
<h3>Performance with multiple profiles<a class="headerlink" href="#performance-with-multiple-profiles" title="Link to this heading"></a></h3>
<p>Baseline refers to the engine that was benchmarked in the previous Benchmarking Default Performance page.</p>
<table class="docutils align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Metric</p></th>
<th class="head"><p>Baseline</p></th>
<th class="head"><p>Multiple Profiles ON</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Token Throughput (tokens/sec)</p></td>
<td><p>1564.3040</p></td>
<td><p>1861.0881</p></td>
</tr>
<tr class="row-odd"><td><p>Request Throughput (req/sec)</p></td>
<td><p>0.7638</p></td>
<td><p>0.9087</p></td>
</tr>
<tr class="row-even"><td><p>Average Time To First Token (ms)</p></td>
<td><p>147.6976</p></td>
<td><p>145.8958</p></td>
</tr>
<tr class="row-odd"><td><p>Average Inter-Token Latency (ms)</p></td>
<td><p>31.3276</p></td>
<td><p>19.6452</p></td>
</tr>
</tbody>
</table>
<p>As you can see, enabling multiple profiles significantly improves throughput and inter-token latency, while time to first token is essentially unchanged.</p>
</section>
</section>
<section id="paged-context-attention">
<h2>Paged Context Attention<a class="headerlink" href="#paged-context-attention" title="Link to this heading"></a></h2>
<p>By default, all of the tokens in the prompt of a new request are processed in one iteration as the context phase. Enabling paged context attention allows TensorRT-LLM to break the context phase into chunks and handle the prompt over several iterations. This is particularly useful for workloads with large input lengths. In the worst case, this feature can cause a small performance hit in benchmarking runs (&lt;2%), so it can be safely enabled. This feature is discussed further in the <a class="reference internal" href="tuning-max-batch-size-and-max-num-tokens.html#revisiting-paged-context-attention-and-context-chunking"><span class="std std-ref">next page</span></a> of the guide.</p>
<section id="enabling-paged-context-attention">
<h3>Enabling Paged Context Attention<a class="headerlink" href="#enabling-paged-context-attention" title="Link to this heading"></a></h3>
<p>Add the following line to the multiple profiles example from above to enable paged context attention.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">use_paged_context_fmha</span><span class="o">=</span><span class="kc">True</span>
</pre></div>
</div>
<p>If you are using the <a class="reference internal" href="benchmarking-default-performance.html#building-and-saving-engines-via-cli"><span class="std std-ref">CLI flow for building engines</span></a>, pass <code class="docutils literal notranslate"><span class="pre">--use_paged_context_fmha</span></code> to <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> to enable the feature.</p>
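<p>For reference, below is a minimal sketch of the cumulative build script with both multiple profiles and paged context attention enabled. It reuses the assumed checkpoint path and tensor-parallel size from the earlier example; the engine directory name is illustrative.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Minimal sketch: cumulative build-time flags covered so far in this guide.
from tensorrt_llm import LLM, BuildConfig

def main():
    build_config = BuildConfig()
    build_config.plugin_config.multiple_profiles = True
    build_config.plugin_config.use_paged_context_fmha = True

    llm = LLM(
        model="/scratch/Llama-3.3-70B-Instruct",  # assumed checkpoint path from the earlier example
        tensor_parallel_size=4,
        build_config=build_config
    )
    llm.save("build_flags_paged_context")  # illustrative engine directory name

if __name__ == '__main__':
    main()
</pre></div>
</div>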
</section>
<section id="performance">
<h3>Performance<a class="headerlink" href="#performance" title="Link to this heading"></a></h3>
<p>Paged Context OFF refers to the same engine shown as Multiple Profiles ON in the previous example.</p>
<table class="docutils align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Metric</p></th>
<th class="head"><p>Paged Context OFF</p></th>
<th class="head"><p>Paged Context ON</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Token Throughput (tokens/sec)</p></td>
<td><p>1861.0881</p></td>
<td><p>1866.6684</p></td>
</tr>
<tr class="row-odd"><td><p>Request Throughput (req/sec)</p></td>
<td><p>0.9087</p></td>
<td><p>0.9115</p></td>
</tr>
<tr class="row-even"><td><p>Average Time To First Token (ms)</p></td>
<td><p>145.8958</p></td>
<td><p>145.4089</p></td>
</tr>
<tr class="row-odd"><td><p>Average Inter-Token Latency (ms)</p></td>
<td><p>19.6452</p></td>
<td><p>19.6523</p></td>
</tr>
</tbody>
</table>
<p>In this case, enabling paged context attention provides a small boost to performance, but a rerun of our tests found this to be within run-to-run variance of around 10 tok/s for token throughput and 2 ms for average time to first token (ITL was stable to within 1 ms, and request throughput tracked token throughput directly). In other cases, naively enabling it might actually cause a small hit to performance. However, further guidance on how to reason about this flag, and why we recommend enabling it, is given on the <a class="reference internal" href="tuning-max-batch-size-and-max-num-tokens.html#revisiting-paged-context-attention-and-context-chunking"><span class="std std-ref">next page</span></a>, as it is closely intertwined with how TensorRT-LLM schedules requests as well as the max num tokens flag.</p>
</section>
</section>
<section id="gemm-plugin">
<h2>GEMM Plugin<a class="headerlink" href="#gemm-plugin" title="Link to this heading"></a></h2>
<p>TensorRT allows you to add “plugins”, or custom kernels, that are used instead of the kernels that TensorRT selects for particular operations. TensorRT-LLM has a host of custom plugins that are specifically tailored to speed up supported modules. The GEMM plugin utilizes NVIDIA cuBLASLt and some custom kernels to perform GEMM operations. For FP16 and BF16 models, it is recommended to enable it for better performance and lower GPU memory usage. For FP8 models, it is recommended to leave it disabled.</p>
<section id="enabling-gemm-plugin">
<h3>Enabling GEMM Plugin<a class="headerlink" href="#enabling-gemm-plugin" title="Link to this heading"></a></h3>
<p>Add the following line to the example above to enable the GEMM plugin.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">gemm_plugin</span> <span class="o">=</span> <span class="s1">&#39;auto&#39;</span>
</pre></div>
</div>
<p>If you are using the <a class="reference internal" href="benchmarking-default-performance.html#building-and-saving-engines-via-cli"><span class="std std-ref">CLI flow for building engines</span></a>, pass <code class="docutils literal notranslate"><span class="pre">--gemm_plugin</span> <span class="pre">auto</span></code> to <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> to enable the feature. <code class="docutils literal notranslate"><span class="pre">'auto'</span></code> tells the GEMM plugin to use the same data type as the model (FP16, BF16, and so on). It is fine to leave it on auto unless you are trying to do mixed precision.</p>
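<p>As a rough illustration of the precision guidance above, the hedged sketch below only turns the plugin on for FP16/BF16 checkpoints and leaves it at its default (disabled) otherwise. The <code class="docutils literal notranslate"><span class="pre">model_dtype</span></code> variable is an assumption you would set to match your checkpoint; it continues the build script from the multiple profiles example.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch only: enable the GEMM plugin for FP16/BF16 models and leave it at its
# default (disabled) for FP8, per the recommendation above.
model_dtype = "bfloat16"  # assumed; set this to match your checkpoint
if model_dtype in ("float16", "bfloat16"):
    build_config.plugin_config.gemm_plugin = 'auto'
</pre></div>
</div>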
</section>
<section id="performance-with-gemm-plugin">
<h3>Performance with GEMM Plugin<a class="headerlink" href="#performance-with-gemm-plugin" title="Link to this heading"></a></h3>
<p>GEMM Plugin OFF refers to the same engine shown as Paged Context ON in the previous example.</p>
<table class="docutils align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Metric</p></th>
<th class="head"><p>GEMM Plugin OFF</p></th>
<th class="head"><p>GEMM Plugin ON</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Token Throughput (tokens/sec)</p></td>
<td><p>1866.6684</p></td>
<td><p>2033.2640</p></td>
</tr>
<tr class="row-odd"><td><p>Request Throughput (req/sec)</p></td>
<td><p>0.9115</p></td>
<td><p>0.9928</p></td>
</tr>
<tr class="row-even"><td><p>Average Time To First Token (ms)</p></td>
<td><p>145.4089</p></td>
<td><p>147.8307</p></td>
</tr>
<tr class="row-odd"><td><p>Average Inter-Token Latency (ms)</p></td>
<td><p>19.6523</p></td>
<td><p>15.4133</p></td>
</tr>
</tbody>
</table>
<p>In this case, the GEMM plugin greatly improves throughput as well as ITL, with a slight hit to TTFT.</p>
</section>
</section>
<section id="reduce-norm-fusion-plugin-for-llama-models">
<h2>Reduce Norm Fusion Plugin for Llama models:<a class="headerlink" href="#reduce-norm-fusion-plugin-for-llama-models" title="Link to this heading"></a></h2>
<p>TensorRT-LLM has custom kernels for AllReduce operations that are enabled by default. This feature extends that functionality by fusing the ResidualAdd and LayerNorm kernels that run after AllReduce into the AllReduce kernel, resulting in a single kernel that handles those operations and improves end-to-end performance. This feature is currently only available for Llama models. It is most beneficial in workloads that are generation-phase heavy. For extremely context-phase-heavy workloads, it is worth checking performance with and without it. Additionally, since this is an optimization for AllReduce, it is only beneficial for cases with tensor parallelism. For scenarios using only pipeline parallelism, this should stay disabled, since pipeline parallelism doesn't require any AllReduce operations.</p>
<section id="enabling-reduce-norm-fusion-plugin">
<h3>Enabling Reduce Norm Fusion Plugin<a class="headerlink" href="#enabling-reduce-norm-fusion-plugin" title="Link to this heading"></a></h3>
<p>Add the following line to the example above to enable the reduce norm fusion plugin.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">reduce_fusion</span> <span class="o">=</span> <span class="kc">True</span>
</pre></div>
</div>
<p>If you are using the <a class="reference internal" href="benchmarking-default-performance.html#building-and-saving-engines-via-cli"><span class="std std-ref">CLI flow for building engines</span></a>, pass <code class="docutils literal notranslate"><span class="pre">--reduce_fusion</span> <span class="pre">enable</span></code> to <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code> to enable the feature.</p>
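<p>Since reduce norm fusion is an AllReduce optimization, it only pays off with tensor parallelism. Below is a minimal sketch pairing the flag with a tensor-parallel build, reusing the assumed model path and tensor-parallel size from the earlier examples.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Sketch (continuing the build script from the multiple profiles example):
# reduce_fusion fuses ResidualAdd + LayerNorm into the AllReduce kernel,
# so it is only beneficial when tensor_parallel_size &gt; 1.
build_config.plugin_config.reduce_fusion = True

llm = LLM(
    model="/scratch/Llama-3.3-70B-Instruct",  # assumed Llama checkpoint from earlier
    tensor_parallel_size=4,
    build_config=build_config
)
</pre></div>
</div>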
</section>
<section id="performance-with-reduce-norm-fusion">
<h3>Performance with Reduce Norm Fusion<a class="headerlink" href="#performance-with-reduce-norm-fusion" title="Link to this heading"></a></h3>
<p>Reduce Fusion OFF refers to the same engine shown as GEMM Plugin ON in the previous example.</p>
<table class="docutils align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Metric</p></th>
<th class="head"><p>REDUCE FUSION OFF</p></th>
<th class="head"><p>REDUCE FUSION ON</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Token Throughput (tokens/sec)</p></td>
<td><p>2033.2640</p></td>
<td><p>2044.2628</p></td>
</tr>
<tr class="row-odd"><td><p>Request Throughput (req/sec)</p></td>
<td><p>0.9928</p></td>
<td><p>0.9982</p></td>
</tr>
<tr class="row-even"><td><p>Average Time To First Token (ms)</p></td>
<td><p>147.8307</p></td>
<td><p>146.6628</p></td>
</tr>
<tr class="row-odd"><td><p>Average Inter-Token Latency (ms)</p></td>
<td><p>15.4133</p></td>
<td><p>14.4493</p></td>
</tr>
</tbody>
</table>
<p>For the ISL/OSL pair of 2048/2048, enabling the reduce norm fusion plugin slightly improves performance all around. However, test reruns found that, accounting for run-to-run variance, the two configurations performed on par in the worst case. Again, this flag's effectiveness depends on the workload, so users should check whether it provides a meaningful performance boost in their case.</p>
</section>
</section>
<section id="pipeline-parallel-reduce-scatter-optimization">
<h2>Pipeline Parallel Reduce Scatter Optimization<a class="headerlink" href="#pipeline-parallel-reduce-scatter-optimization" title="Link to this heading"></a></h2>
<p>This feature adds a pipeline parallelism optimization with ReduceScatter + AllGather, targeting large mixture-of-experts models.
It can be enabled via the LLM-API as follows:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span> <span class="n">build_config</span><span class="o">.</span><span class="n">plugin_config</span><span class="o">.</span><span class="n">pp_reduce_scatter</span> <span class="o">=</span> <span class="kc">True</span>
</pre></div>
</div>
<p>If you are using the <a class="reference internal" href="benchmarking-default-performance.html#building-and-saving-engines-via-cli"><span class="std std-ref">CLI flow for building engines</span></a>, you can enable this feature by adding <code class="docutils literal notranslate"><span class="pre">--pp_reduce_scatter</span></code> to <code class="docutils literal notranslate"><span class="pre">trtllm-build</span></code>.</p>
<p>As the Llama model is not a MoE model, this flag was not included as part of the case study.</p>
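<p>For completeness, below is a hypothetical sketch of how this flag could be combined with pipeline parallelism for an MoE checkpoint. The model path, pipeline-parallel size, and output directory name are illustrative assumptions and were not part of the case study.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Hypothetical sketch: pp_reduce_scatter targets large MoE models that are run
# with pipeline parallelism. Checkpoint path and parallelism values are assumed.
from tensorrt_llm import LLM, BuildConfig

build_config = BuildConfig()
build_config.plugin_config.pp_reduce_scatter = True

llm = LLM(
    model="/scratch/Mixtral-8x7B-Instruct-v0.1",  # assumed MoE checkpoint
    pipeline_parallel_size=2,                     # assumed
    build_config=build_config
)
llm.save("build_flags_pp_reduce_scatter")  # illustrative engine directory name
</pre></div>
</div>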
</section>
<section id="conclusion">
<h2>Conclusion<a class="headerlink" href="#conclusion" title="Link to this heading"></a></h2>
<p>Overall, enabling these flags can greatly boost performance. However, the degree to which they are effective can vary from workload to workload, and it is recommended that you run sanity checks on your workloads to verify performance.</p>
<p>The case-study example showed that enabling these flags provided the following performance uplifts over the baseline numbers: significant boosts in Token Throughput, Request Throughput, and Average Inter-Token Latency, while TTFT remained largely unchanged.</p>
<table class="docutils align-default">
<thead>
<tr class="row-odd"><th class="head"><p>Metric</p></th>
<th class="head"><p>Baseline</p></th>
<th class="head"><p>Build-Time Flags ON</p></th>
<th class="head"><p>% Improvement</p></th>
</tr>
</thead>
<tbody>
<tr class="row-even"><td><p>Token Throughput (tokens/sec)</p></td>
<td><p>1564.3040</p></td>
<td><p>2044.2628</p></td>
<td><p>30.68</p></td>
</tr>
<tr class="row-odd"><td><p>Request Throughput (req/sec)</p></td>
<td><p>0.7638</p></td>
<td><p>0.9982</p></td>
<td><p>30.69</p></td>
</tr>
<tr class="row-even"><td><p>Average Time To First Token (ms)</p></td>
<td><p>147.6976</p></td>
<td><p>146.6628</p></td>
<td><p>0.70</p></td>
</tr>
<tr class="row-odd"><td><p>Average Inter-Token Latency (ms)</p></td>
<td><p>31.3276</p></td>
<td><p>14.4493</p></td>
<td><p>53.88</p></td>
</tr>
</tbody>
</table>
<section id="summary-of-configuration-option-recommendations">
<h3>Summary of Configuration Option Recommendations:<a class="headerlink" href="#summary-of-configuration-option-recommendations" title="Link to this heading"></a></h3>
<ol class="arabic simple">
<li><p>Multiple profiles: Always enable. It may increase build times a little but will only ever help performance. Enabling it might cause the engine to produce slightly different outputs when the same prompt is run multiple times, depending on request load, but this should not affect output quality; see the <a class="reference internal" href="#multiple-profiles"><span class="std std-ref">Multiple Profiles section</span></a> for an explanation.</p></li>
<li><p>Paged Context Attention: In the worst case, it may hurt performance a little initially, but it typically helps with request scheduling and boosts performance after further tuning of max batch size and max num tokens. This topic is discussed further on the next page.</p></li>
<li><p>GEMM Plugin: It is recommended to enable it for FP16 and BF16 models, as it usually helps. However, it is a good idea to benchmark your workload and double-check that it is helping.</p></li>
<li><p>Reduce Fusion: This feature is only supported on Llama and Mistral/Mixtral models. Its effectiveness is workload dependent, so it is recommended that you benchmark your workload with and without it and compare the results. A combined sketch with these recommendations applied follows this list.</p></li>
</ol>
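<p>Putting these recommendations together, below is a minimal combined sketch of a build script with the case-study flags enabled for an FP16/BF16 Llama model using tensor parallelism. The checkpoint path, tensor-parallel size, and engine directory name are the same assumptions used throughout this page.</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Combined sketch of the build-time flags recommended on this page.
from tensorrt_llm import LLM, BuildConfig

def main():
    build_config = BuildConfig()
    build_config.plugin_config.multiple_profiles = True       # always recommended
    build_config.plugin_config.use_paged_context_fmha = True  # helps scheduling; tune with max num tokens
    build_config.plugin_config.gemm_plugin = 'auto'           # FP16/BF16 models; leave disabled for FP8
    build_config.plugin_config.reduce_fusion = True           # Llama models with tensor parallelism

    llm = LLM(
        model="/scratch/Llama-3.3-70B-Instruct",  # assumed checkpoint path
        tensor_parallel_size=4,
        build_config=build_config
    )
    llm.save("build_flags_all_on")  # illustrative engine directory name

if __name__ == '__main__':
    main()
</pre></div>
</div>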
</section>
</section>
</section>
</div>
</div>
<footer>
<hr/>
<div role="contentinfo">
<div class="footer">
<p>
Copyright © 2024 NVIDIA Corporation
</p>
<p>
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
data-cms-ai="0">Privacy Policy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
data-cms-ai="0">Manage My Privacy</a> |
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
data-cms-ai="0">Do Not Sell or Share My Data</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
rel="noopener" data-cms-ai="0">Terms of Service</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
data-cms-ai="0">Accessibility</a> |
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
data-cms-ai="0">Product Security</a> |
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
data-cms-ai="0">Contact</a>
</p>
</div>
</div>
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>