<!DOCTYPE html>
<html class="writer-html5" lang="en" >
<head>
<meta charset="utf-8" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
<title>TensorRT-LLM Architecture &mdash; tensorrt_llm documentation</title>
<link rel="stylesheet" href="_static/pygments.css" type="text/css" />
<link rel="stylesheet" href="_static/css/theme.css" type="text/css" />
<!--[if lt IE 9]>
<script src="_static/js/html5shiv.min.js"></script>
<![endif]-->
<script src="_static/jquery.js?v=5d32c60e"></script>
<script src="_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
<script data-url_root="./" id="documentation_options" src="_static/documentation_options.js?v=b3ba4146"></script>
<script src="_static/doctools.js?v=888ff710"></script>
<script src="_static/sphinx_highlight.js?v=4825356b"></script>
<script src="_static/js/theme.js"></script>
<link rel="index" title="Index" href="genindex.html" />
<link rel="search" title="Search" href="search.html" />
<link rel="next" title="C++ GPT Runtime" href="gpt_runtime.html" />
<link rel="prev" title="Welcome to TensorRT-LLMs documentation!" href="index.html" />
</head>
<body class="wy-body-for-nav">
<div class="wy-grid-for-nav">
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
<div class="wy-side-scroll">
<div class="wy-side-nav-search" >
<a href="index.html" class="icon icon-home">
tensorrt_llm
</a>
<div role="search">
<form id="rtd-search-form" class="wy-form" action="search.html" method="get">
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
<input type="hidden" name="check_keywords" value="yes" />
<input type="hidden" name="area" value="default" />
</form>
</div>
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
<p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
<ul class="current">
<li class="toctree-l1 current"><a class="current reference internal" href="#">TensorRT-LLM Architecture</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#model-definition">Model Definition</a></li>
<li class="toctree-l2"><a class="reference internal" href="#compilation">Compilation</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#weight-bindings">Weight Bindings</a></li>
<li class="toctree-l3"><a class="reference internal" href="#pattern-matching-and-fusion">Pattern-Matching and Fusion</a></li>
<li class="toctree-l3"><a class="reference internal" href="#plugins">Plugins</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#runtime">Runtime</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
</ul>
</li>
<li class="toctree-l2"><a class="reference internal" href="#in-flight-batching">In-flight Batching</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="gpt_runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="batch_manager.html">The Batch Manager in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="gpt_attention.html">Multi-head, Multi-query and Group-query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="precision.html">Numerical Precision</a></li>
<li class="toctree-l1"><a class="reference internal" href="performance.html">Performance of TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="installation.html">Build From Sources</a></li>
<li class="toctree-l1"><a class="reference internal" href="2023-05-19-how-to-debug.html">How to debug</a></li>
<li class="toctree-l1"><a class="reference internal" href="2023-05-17-how-to-add-a-new-model.html">How to add a new model</a></li>
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Python API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.layers.html">Layers</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.functional.html">Functionals</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.models.html">Models</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.plugin.html">Plugin</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.quantization.html">Qunatization</a></li>
<li class="toctree-l1"><a class="reference internal" href="python-api/tensorrt_llm.runtime.html">Runtime</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="_cpp_gen/runtime.html">Runtime</a></li>
</ul>
</div>
</div>
</nav>
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
<a href="index.html">tensorrt_llm</a>
</nav>
<div class="wy-nav-content">
<div class="rst-content">
<div role="navigation" aria-label="Page navigation">
<ul class="wy-breadcrumbs">
<li><a href="index.html" class="icon icon-home" aria-label="Home"></a></li>
<li class="breadcrumb-item active">TensorRT-LLM Architecture</li>
<li class="wy-breadcrumbs-aside">
<a href="_sources/architecture.md.txt" rel="nofollow"> View page source</a>
</li>
</ul>
<hr/>
</div>
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
<section id="tensorrt-llm-architecture">
<h1>TensorRT-LLM Architecture<a class="headerlink" href="#tensorrt-llm-architecture" title="Permalink to this heading"></a></h1>
<p>TensorRT-LLM is a toolkit to assemble optimized solutions to perform Large
Language Model (LLM) inference. It offers a Python API to define models and
compile efficient <a class="reference external" href="https://developer.nvidia.com/tensorrt">TensorRT</a> engines for
NVIDIA GPUs. It also contains Python and C++ components to build runtimes to
execute those engines as well as backends for the <a class="reference external" href="https://developer.nvidia.com/nvidia-triton-inference-server">Triton Inference
Server</a> to easily
create web-based services for LLMs. TensorRT-LLM supports multi-GPU and
multi-node configurations (through MPI).</p>
<p>As a user, the very first step to create an inference solution is to either
define your own model or select a pre-defined network architecture (see
<a class="reference internal" href="#"><span class="xref myst">here</span></a> for the list of models supported by TensorRT-LLM). Once defined, that
model must be trained using a training framework (training is outside of the
scope of TensorRT-LLM). For pre-defined models, checkpoints can be downloaded
from various providers. To illustrate that point, a lot of examples in
TensorRT-LLM use model weights obtained from the
<a class="reference external" href="https://huggingface.co">HuggingFace</a> hub and trained using <a class="reference external" href="https://developer.nvidia.com/nemo">NVIDIA
NeMo</a> or <a class="reference external" href="https://pytorch.org">PyTorch</a>.</p>
<p>Equipped with the model definition and the weights, a user must use
TensorRT-LLM's Python API to recreate the model in a way that can be compiled
by TensorRT into an efficient engine. For ease of use, TensorRT-LLM already
supports a handful of standard models.</p>
<p>Together with the Python API to describe models, TensorRT-LLM provides users
with components to create a runtime that executes the efficient TensorRT
engine. Runtime components offer beam-search, along with extensive sampling
functionalities such as top-K and top-P sampling. The exhaustive list can be
found in the documentation of the <a class="reference internal" href="gpt_runtime.html"><span class="std std-doc">Runtime</span></a>. The C++ runtime
is the recommended runtime.</p>
<p>TensorRT-LLM also includes Python and C++ backends for NVIDIA Triton Inference
Server to assemble solutions for LLM online serving. The C++ backend implements
in-flight batching as explained in the <a class="reference internal" href="batch_manager.html"><span class="std std-doc">Batch Manager</span></a>
documentation and is the recommended backend.</p>
<section id="model-definition">
<h2>Model Definition<a class="headerlink" href="#model-definition" title="Permalink to this heading"></a></h2>
<p>As mentioned above, TensorRT-LLM has a Python API that can be used to define
Large Language Models. This API is built on top of the powerful
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html#">TensorRT Python API</a>
to create graph representations of deep neural networks in TensorRT. To become
familiar with the core concepts of the TensorRT API, refer to the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/coreConcepts.html">Core Concepts</a>
section of the TensorRT documentation before proceeding further.</p>
<p>In TensorRT-LLM, the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0/tensorrt_llm/builder.py"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm.Builder</span></code></a> class
contains a
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder"><code class="docutils literal notranslate"><span class="pre">tensorrt.Builder</span></code></a>
object. That instance is used in the <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.Builder.create_network</span></code>
method to create an instance of the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition"><code class="docutils literal notranslate"><span class="pre">tensorrt.INetworkDefinition</span></code></a>
class. The <code class="docutils literal notranslate"><span class="pre">INetworkDefinition</span></code> object can then be populated using the free
functions defined in the
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0/tensorrt_llm/functional.py"><code class="docutils literal notranslate"><span class="pre">tensorrt_llm.functional</span></code></a>.</p>
<p>A simple example of such a free function is <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.activation</span></code> that inserts a
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Layers.html#tensorrt.IActivationLayer"><code class="docutils literal notranslate"><span class="pre">tensorrt.IActivationLayer</span></code></a>
node in the graph of the model:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># In tensorrt_llm.functional:</span>
<span class="k">def</span> <span class="nf">activation</span><span class="p">(</span><span class="nb">input</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">act_type</span><span class="p">:</span> <span class="n">trt</span><span class="o">.</span><span class="n">ActivationType</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
<span class="n">layer</span> <span class="o">=</span> <span class="n">default_trtnet</span><span class="p">()</span><span class="o">.</span><span class="n">add_activation</span><span class="p">(</span><span class="nb">input</span><span class="o">.</span><span class="n">trt_tensor</span><span class="p">,</span> <span class="n">act_type</span><span class="p">)</span> <span class="c1"># default_trtnet() -&gt; INetworkDefinition</span>
<span class="k">return</span> <span class="n">_create_tensor</span><span class="p">(</span><span class="n">layer</span><span class="o">.</span><span class="n">get_output</span><span class="p">(</span><span class="mi">0</span><span class="p">),</span> <span class="n">layer</span><span class="p">)</span>
</pre></div>
</div>
<p>To make it even easier for users, a few of the most standard activation
functions found in LLMs are derived from that function:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># In tensorrt_llm.functional:</span>
<span class="n">relu</span> <span class="o">=</span> <span class="n">partial</span><span class="p">(</span><span class="n">activation</span><span class="p">,</span> <span class="n">act_type</span><span class="o">=</span><span class="n">trt</span><span class="o">.</span><span class="n">ActivationType</span><span class="o">.</span><span class="n">RELU</span><span class="p">)</span>
<span class="n">sigmoid</span> <span class="o">=</span> <span class="n">partial</span><span class="p">(</span><span class="n">activation</span><span class="p">,</span> <span class="n">act_type</span><span class="o">=</span><span class="n">trt</span><span class="o">.</span><span class="n">ActivationType</span><span class="o">.</span><span class="n">SIGMOID</span><span class="p">)</span>
</pre></div>
</div>
<p>Specialized activation functions can be used to assemble more advanced
functions such as the <code class="docutils literal notranslate"><span class="pre">silu</span></code> activation:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># In tensorrt_llm.functional:</span>
<span class="k">def</span> <span class="nf">silu</span><span class="p">(</span><span class="nb">input</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span><span class="p">:</span>
<span class="k">return</span> <span class="nb">input</span> <span class="o">*</span> <span class="n">sigmoid</span><span class="p">(</span><span class="nb">input</span><span class="p">)</span>
</pre></div>
</div>
<p>When TensorRT-LLM's Python API is used, a graph of the network is
assembled. The graph can later be traversed or transformed using the graph
traversal API exposed by the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/LayerBase.html#tensorrt.ILayer"><code class="docutils literal notranslate"><span class="pre">tensorrt.ILayer</span></code></a>
class. That graph will also be optimized by TensorRT during the compilation of
the engine, as explained in the next section.</p>
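<p>Putting those pieces together, a small network can be assembled by creating a
builder, creating a network, and calling the free functions of
<code class="docutils literal notranslate"><span class="pre">tensorrt_llm.functional</span></code> while that network is active. The snippet below is a
minimal sketch of that flow; it assumes the <code class="docutils literal notranslate"><span class="pre">net_guard</span></code> context manager and the
<code class="docutils literal notranslate"><span class="pre">tensorrt_llm.Tensor</span></code> input helper used throughout the TensorRT-LLM examples, and
the input name, dtype and shape are purely illustrative:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>import tensorrt as trt

import tensorrt_llm
from tensorrt_llm.functional import matmul, relu

# Create the TensorRT-LLM builder and an (empty) network to populate.
builder = tensorrt_llm.Builder()        # wraps a tensorrt.Builder
network = builder.create_network()      # wraps a tensorrt.INetworkDefinition

with tensorrt_llm.net_guard(network):   # make this network the active graph
    # Declare a network input; the name, dtype and shape are illustrative.
    x = tensorrt_llm.Tensor(name='x', dtype=trt.float16, shape=[1, 16, 16])
    # Each functional call inserts one or more TensorRT layers into the graph.
    y = relu(matmul(x, x))
    # Mark the tensor the compiled engine should return.
    y.mark_output('y', trt.float16)
</pre></div>
</div>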
</section>
<section id="compilation">
<h2>Compilation<a class="headerlink" href="#compilation" title="Permalink to this heading"></a></h2>
<p>Once populated, the instance of the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition"><code class="docutils literal notranslate"><span class="pre">tensorrt.INetworkDefinition</span></code></a>
can be compiled into an efficient engine by the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder"><code class="docutils literal notranslate"><span class="pre">tensorrt.Builder</span></code></a>.
In TensorRT-LLM, this is done through the <code class="docutils literal notranslate"><span class="pre">build_engine</span></code> member function of the
<code class="docutils literal notranslate"><span class="pre">tensorrt_llm.Builder</span></code> class that calls the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder.build_serialized_network"><code class="docutils literal notranslate"><span class="pre">build_serialized_network</span></code></a>
method of the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder"><code class="docutils literal notranslate"><span class="pre">tensorrt.Builder</span></code></a>
object. That call, if everything works as expected, produces an instance of the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/FoundationalTypes/HostMemory.html#tensorrt.IHostMemory"><code class="docutils literal notranslate"><span class="pre">tensorrt.IHostMemory</span></code></a>
class. That object is an optimized TensorRT engine that can be stored as a
binary file.</p>
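<p>In code, the compilation step boils down to a single call on the builder and a
write of the resulting buffer to disk. The snippet below is a minimal sketch of
that step; the <code class="docutils literal notranslate"><span class="pre">create_builder_config</span></code> arguments and the output file name are
illustrative assumptions:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Continuing from the `builder` and populated `network` of the previous section.
# (In a real model, the weights must be bound first; see Weight Bindings below.)
builder_config = builder.create_builder_config(name='gpt', precision='float16')

# Calls tensorrt.Builder.build_serialized_network under the hood and returns a
# tensorrt.IHostMemory buffer holding the optimized engine.
engine = builder.build_engine(network, builder_config)

# The buffer can be stored as a binary file and reloaded by the runtime later.
with open('gpt.engine', 'wb') as f:
    f.write(engine)
</pre></div>
</div>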
<section id="weight-bindings">
<h3>Weight Bindings<a class="headerlink" href="#weight-bindings" title="Permalink to this heading"></a></h3>
<p>TensorRT engines embed the network weights, which must be known at compile time.
For that reason, the weights must be bound to parameters in the model
definition before calling <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.Builder.build_engine</span></code>. That leads to code like:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># The Linear operator exposes two parameters (see tensorrt_llm/layers/linear.py):</span>
<span class="k">class</span> <span class="nc">Linear</span><span class="p">(</span><span class="n">Module</span><span class="p">):</span>
<span class="k">def</span> <span class="fm">__init__</span><span class="p">(</span><span class="bp">self</span><span class="p">,</span> <span class="o">...</span><span class="p">):</span>
<span class="bp">self</span><span class="o">.</span><span class="n">weight</span> <span class="o">=</span> <span class="n">Parameter</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">out_features</span><span class="p">,</span> <span class="bp">self</span><span class="o">.</span><span class="n">in_features</span><span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">)</span>
<span class="bp">self</span><span class="o">.</span><span class="n">bias</span> <span class="o">=</span> <span class="n">Parameter</span><span class="p">(</span><span class="n">shape</span><span class="o">=</span><span class="p">(</span><span class="bp">self</span><span class="o">.</span><span class="n">out_features</span><span class="p">,</span> <span class="p">),</span> <span class="n">dtype</span><span class="o">=</span><span class="n">dtype</span><span class="p">)</span>
<span class="c1"># The parameters are bound to the weights before compiling the model. See examples/gpt/weight.py:</span>
<span class="n">tensorrt_llm_gpt</span><span class="o">.</span><span class="n">layers</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">mlp</span><span class="o">.</span><span class="n">fc</span><span class="o">.</span><span class="n">weight</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">fromfile</span><span class="p">(</span><span class="o">...</span><span class="p">)</span>
<span class="n">tensorrt_llm_gpt</span><span class="o">.</span><span class="n">layers</span><span class="p">[</span><span class="n">i</span><span class="p">]</span><span class="o">.</span><span class="n">mlp</span><span class="o">.</span><span class="n">fc</span><span class="o">.</span><span class="n">bias</span><span class="o">.</span><span class="n">value</span> <span class="o">=</span> <span class="n">fromfile</span><span class="p">(</span><span class="o">...</span><span class="p">)</span>
</pre></div>
</div>
<p>Note that TensorRT can also
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#refitting-engine-c">refit</a>
engines to update the weights after compilation. This feature is available to
TensorRT-LLM users through the <code class="docutils literal notranslate"><span class="pre">refit_engine</span></code> method in the
<code class="docutils literal notranslate"><span class="pre">tensorrt_llm.Builder</span></code> class.</p>
</section>
<section id="pattern-matching-and-fusion">
<h3>Pattern-Matching and Fusion<a class="headerlink" href="#pattern-matching-and-fusion" title="Permalink to this heading"></a></h3>
<p>One of the key steps performed by TensorRT when it compiles the network graph
is the fusion of operations. Fusion is a well-known technique to improve
efficiency when executing LLMs. It helps reduce the amount of data transferred
between the memory (DRAM) and the compute cores (CUDA cores as well as Tensor
Cores located on the <a class="reference external" href="https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#introduction">Streaming
Multiprocessors</a>
of a GPU). It also removes kernel launch overhead (each time a kernel is
launched on the GPU, there is a small additional CPU cost that is called the
launch overhead). A classical example is the fusion of the activation function
with the matrix multiplication (matmul) that usually precedes it in the
network.</p>
<p>In TensorRT-LLM, when defining the model, such a sequence can be written as:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="n">c</span> <span class="o">=</span> <span class="n">tensorrt_llm</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">matmul</span><span class="p">(</span><span class="n">a</span><span class="p">,</span> <span class="n">b</span><span class="p">)</span>
<span class="n">c</span> <span class="o">=</span> <span class="n">tensorrt_llm</span><span class="o">.</span><span class="n">functional</span><span class="o">.</span><span class="n">relu</span><span class="p">(</span><span class="n">c</span><span class="p">)</span>
</pre></div>
</div>
<p>During inference, if the above sequence is executed without fusion, the <code class="docutils literal notranslate"><span class="pre">c</span></code>
tensor has to be written to global memory at the end of the <code class="docutils literal notranslate"><span class="pre">matmul</span></code>, read from
that same memory in <code class="docutils literal notranslate"><span class="pre">relu</span></code> and written again after <code class="docutils literal notranslate"><span class="pre">relu</span></code>. If no other
operation uses the intermediate values between <code class="docutils literal notranslate"><span class="pre">matmul</span></code> and <code class="docutils literal notranslate"><span class="pre">relu</span></code>, it is
suboptimal. That is why, during compilation, TensorRT will identify that
pattern and automatically produce a GPU kernel that applies <code class="docutils literal notranslate"><span class="pre">relu</span></code> at the end
of <code class="docutils literal notranslate"><span class="pre">matmul</span></code> without an intermediate step through global memory. With that
optimization, the <code class="docutils literal notranslate"><span class="pre">c</span></code> tensor is written only once (after <code class="docutils literal notranslate"><span class="pre">relu</span></code>) instead of
twice, and is not read between the two operations.</p>
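<p>The savings are easy to quantify. With the unfused sequence, the <code class="docutils literal notranslate"><span class="pre">c</span></code> tensor
crosses global memory three times (written after <code class="docutils literal notranslate"><span class="pre">matmul</span></code>, read by <code class="docutils literal notranslate"><span class="pre">relu</span></code>,
written again after <code class="docutils literal notranslate"><span class="pre">relu</span></code>); with the fused kernel it is written once. The
short computation below illustrates the difference for a tensor of an
arbitrarily chosen, purely illustrative size:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span># Illustrative size only: a [batch=8, seq_len=1024, hidden=4096] FP16 tensor.
num_elements = 8 * 1024 * 4096
tensor_bytes = num_elements * 2      # 2 bytes per FP16 element: 64 MiB

unfused_traffic = 3 * tensor_bytes   # write after matmul + read by relu + write after relu
fused_traffic = 1 * tensor_bytes     # single write after the fused matmul+relu kernel

print(unfused_traffic / 2**20, fused_traffic / 2**20)   # 192.0 vs. 64.0 MiB
</pre></div>
</div>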
<p>The process of identifying the sequences of operations that can be fused is
called <em>pattern-matching</em>. TensorRT has a powerful pattern-matching algorithm
that can identify a lot of possible fusions. All the identified patterns are
converted into more efficient kernels by an advanced kernel compiler.</p>
</section>
<section id="plugins">
<h3>Plugins<a class="headerlink" href="#plugins" title="Permalink to this heading"></a></h3>
<p>The number of possible fusions is almost infinite and some useful fusions
involve very advanced modifications of the graph. A well-known example
is the <a class="reference external" href="https://arxiv.org/abs/2205.14135">Flash-Attention</a> technique to
optimize the <a class="reference external" href="https://arxiv.org/abs/1706.03762">Multihead-Attention</a> block
found in many LLMs. Flash-Attention requires modifications to the arithmetic
performed in the sequence <code class="docutils literal notranslate"><span class="pre">BMM-Softmax-BMM</span></code> (where <code class="docutils literal notranslate"><span class="pre">BMM</span></code> stands for Batched
Matrix-Matrix product) and the interleaving of the <code class="docutils literal notranslate"><span class="pre">for</span></code>-loops of the two
batched matrix products. That's non-trivial and not necessarily something
you can expect a compiler to “discover” on its own (or it might require
support for a <a class="reference external" href="https://en.wikipedia.org/wiki/Polytope_model">polyhedral
model</a>).</p>
<p>As a result, even if TensorRT has a powerful pattern-matching algorithm and
supports a lot of possible fusions, there is always the risk that it cannot
identify uncommon and/or very advanced patterns. To overcome that inevitable
limitation, TensorRT offers a powerful mechanism known as
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Plugin/pyPlugin.html">plugins</a>.</p>
<p>The plugins are nodes inserted in the network graph definition that map to user-defined
GPU kernels. TensorRT-LLM uses a number of such plugins. They can be found in
the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0//cpp/tensorrt_llm/plugins"><code class="docutils literal notranslate"><span class="pre">cpp/tensorrt_llm/plugins</span></code></a> directory.</p>
<p>Plugins are written in C++ and follow a well-defined interface described in the
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#extending">Extending TensorRT with Custom Layers</a>
section of the TensorRT
<a class="reference external" href="https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html">Developer Guide</a>.
When executed within a TensorRT engine, plugins trigger the execution of
their encapsulated GPU kernels. A fairly simple example of a plugin is the
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0//cpp/tensorrt_llm/plugins/quantizeTensorPlugin"><code class="docutils literal notranslate"><span class="pre">QuantizeTensorPlugin</span></code></a> that
triggers a CUDA kernel in the <code class="docutils literal notranslate"><span class="pre">QuantizeTensorPlugin::enqueue</span></code> member function:</p>
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="c1">// In cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp:</span>
<span class="kt">int</span><span class="w"> </span><span class="nf">QuantizeTensorPlugin::enqueue</span><span class="p">(...)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="k">if</span><span class="w"> </span><span class="p">(</span><span class="n">inputDesc</span><span class="p">[</span><span class="mi">0</span><span class="p">].</span><span class="n">type</span><span class="w"> </span><span class="o">==</span><span class="w"> </span><span class="n">DataType</span><span class="o">::</span><span class="n">kFLOAT</span><span class="p">)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">invokeQuantization</span><span class="o">&lt;</span><span class="kt">float</span><span class="o">&gt;</span><span class="p">(...);</span>
<span class="w"> </span><span class="p">}</span><span class="w"> </span><span class="k">else</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="n">invokeQuantization</span><span class="o">&lt;</span><span class="n">half</span><span class="o">&gt;</span><span class="p">(...);</span>
<span class="w"> </span><span class="p">}</span>
<span class="w"> </span><span class="k">return</span><span class="w"> </span><span class="mi">0</span><span class="p">;</span>
<span class="p">}</span>
<span class="c1">// In cpp/tensorrt_llm/kernels/quantization.cu:</span>
<span class="k">template</span><span class="w"> </span><span class="o">&lt;</span><span class="k">typename</span><span class="w"> </span><span class="nc">T</span><span class="o">&gt;</span>
<span class="kt">void</span><span class="w"> </span><span class="n">invokeQuantization</span><span class="p">(...)</span><span class="w"> </span><span class="p">{</span>
<span class="w"> </span><span class="c1">// The standard &lt;&lt;&lt; &gt;&gt;&gt; construct to launch CUDA kernels</span>
<span class="w"> </span><span class="n">quantizedKernel</span><span class="o">&lt;&lt;&lt;</span><span class="n">grid</span><span class="p">,</span><span class="w"> </span><span class="n">block</span><span class="p">,</span><span class="w"> </span><span class="mi">0</span><span class="p">,</span><span class="w"> </span><span class="n">stream</span><span class="o">&gt;&gt;&gt;</span><span class="p">(...);</span>
<span class="p">}</span>
</pre></div>
</div>
<p>For more details on how TensorRT-LLM implements the GPT Attention operator, see
the <a class="reference internal" href="gpt_attention.html"><span class="std std-doc">Multi-head, Multi-query and Group-query Attention</span></a> document.</p>
</section>
</section>
<section id="runtime">
<h2>Runtime<a class="headerlink" href="#runtime" title="Permalink to this heading"></a></h2>
<p>TensorRT-LLM includes an API to implement Python and C++ runtimes. The role of
the runtime components is to load the TensorRT engines and drive their
execution. Typically, for an auto-regressive model like GPT, the runtime is in
charge of loading the engine that implements both the processing of the input
sequence and the body of the generation loop. See the <a class="reference internal" href="gpt_runtime.html"><span class="std std-doc">GPT C++
Runtime</span></a> document for details on the C++ Runtime.</p>
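<p>Conceptually, the runtime first runs the engine on the whole input sequence
(the context phase) and then iterates the generation loop, feeding each newly
produced token back into the engine. The Python below is a conceptual sketch of
that loop only; the <code class="docutils literal notranslate"><span class="pre">session</span></code> object and its methods are hypothetical, not the
actual <code class="docutils literal notranslate"><span class="pre">tensorrt_llm.runtime</span></code> API:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>def generate(session, input_ids, max_new_tokens, end_id):
    # Context phase: process the full input sequence in a single engine run and
    # obtain the logits used to sample the first generated token.
    logits = session.run_context(input_ids)
    output_ids = list(input_ids)

    # Generation loop: one engine run per new token.
    for _ in range(max_new_tokens):
        next_token = session.sample(logits)   # greedy, top-K, top-P, beam search...
        output_ids.append(next_token)
        if next_token == end_id:
            break
        logits = session.run_step(next_token)
    return output_ids
</pre></div>
</div>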
<section id="multi-gpu-and-multi-node-support">
<h3>Multi-GPU and Multi-Node Support<a class="headerlink" href="#multi-gpu-and-multi-node-support" title="Permalink to this heading"></a></h3>
<p>Even though TensorRT is designed for single-GPU systems, TensorRT-LLM adds
support for systems with multiple GPUs and nodes. That support is enabled
using TensorRT plugins that wrap communication primitives from the
<a class="reference external" href="https://developer.nvidia.com/nccl">NCCL</a> library, as well as a custom
plugin that optimizes the All-Reduce primitive when All-to-all
connections exist between GPUs (through NVSwitch in DGX systems).</p>
<p>The communication plugins can be found in
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0/cpp/tensorrt_llm/plugins/ncclPlugin">cpp/tensorrt_llm/plugins/ncclPlugin</a>
and the multi-GPU functions are exposed in the TensorRT-LLM Python API
as:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span><span class="c1"># In tensorrt_llm/functional.py:</span>
<span class="c1"># Collectives.</span>
<span class="k">def</span> <span class="nf">allreduce</span><span class="p">(</span><span class="n">tensor</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">group</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Tensor</span>
<span class="k">def</span> <span class="nf">allgather</span><span class="p">(</span><span class="n">tensor</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">group</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="nb">int</span><span class="p">])</span> <span class="o">-&gt;</span> <span class="n">Tensor</span>
<span class="c1"># Point-to-point communication primitives.</span>
<span class="k">def</span> <span class="nf">send</span><span class="p">(</span><span class="n">tensor</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">tgt</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span>
<span class="k">def</span> <span class="nf">recv</span><span class="p">(</span><span class="n">tensor</span><span class="p">:</span> <span class="n">Tensor</span><span class="p">,</span> <span class="n">src</span><span class="p">:</span> <span class="nb">int</span><span class="p">)</span> <span class="o">-&gt;</span> <span class="n">Tensor</span>
</pre></div>
</div>
<p>The multi-GPU support can be enabled through two different modes of model
parallelism: Tensor Parallelism and Pipeline Parallelism. The former mode
splits each layer of the model across the GPUs. Each GPU runs the
entire network and synchronizes with its siblings when needed. Pipeline
Parallelism distributes the different layers across the GPUs. Each GPU runs a
subset of the entire model and communication happens at the boundaries of those
subsets of layers. Tensor Parallelism usually leads to more balanced execution
but requires more memory bandwidth between the GPUs. Pipeline Parallelism
reduces the need for high-bandwidth communication but may incur load-balancing
issues and may be less efficient in terms of GPU utilization.</p>
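<p>The collectives above are how these modes are expressed in the graph. For
example, a tensor-parallel linear layer can keep one shard of its weight matrix
on each GPU, compute a partial matrix multiplication locally, and sum the
partial results across the group with <code class="docutils literal notranslate"><span class="pre">allreduce</span></code>. The sketch below
illustrates that pattern under the assumption that the activations and weights
have already been sharded along the reduction dimension:</p>
<div class="highlight-python notranslate"><div class="highlight"><pre><span></span>from tensorrt_llm.functional import allreduce, matmul

def row_parallel_linear(x_shard, weight_shard, tp_group):
    # Each rank multiplies its slice of the activations by its slice of the
    # weight matrix, producing a partial sum of the full output.
    partial = matmul(x_shard, weight_shard)
    # One All-Reduce across the tensor-parallel group recovers the same result
    # as the unsharded matmul.
    return allreduce(partial, tp_group)
</pre></div>
</div>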
</section>
</section>
<section id="in-flight-batching">
<h2>In-flight Batching<a class="headerlink" href="#in-flight-batching" title="Permalink to this heading"></a></h2>
<p>TensorRT-LLM supports in-flight batching of requests (also known as continuous
batching or iteration-level batching) for higher serving throughput. See the
<a class="reference internal" href="batch_manager.html"><span class="std std-doc">Batch Manager</span></a> document for more details.</p>
</section>
</section>
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="index.html" class="btn btn-neutral float-left" title="Welcome to TensorRT-LLMs documentation!" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="gpt_runtime.html" class="btn btn-neutral float-right" title="C++ GPT Runtime" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<p>&#169; Copyright 2023, NVIDIA.</p>
</div>
Built with <a href="https://www.sphinx-doc.org/">Sphinx</a> using a
<a href="https://github.com/readthedocs/sphinx_rtd_theme">theme</a>
provided by <a href="https://readthedocs.org">Read the Docs</a>.
</footer>
</div>
</div>
</section>
</div>
<script>
jQuery(function () {
SphinxRtdTheme.Navigation.enable(true);
});
</script>
</body>
</html>