<section id="tensorrt-llm-architecture">
|
||
<h1>TensorRT-LLM Architecture<a class="headerlink" href="#tensorrt-llm-architecture" title="Permalink to this heading"></a></h1>
TensorRT-LLM is a toolkit to assemble optimized solutions to perform Large Language Model (LLM) inference. It offers a Python API to define models and compile efficient [TensorRT](https://developer.nvidia.com/tensorrt) engines for NVIDIA GPUs. It also contains Python and C++ components to build runtimes to execute those engines as well as backends for the [Triton Inference Server](https://developer.nvidia.com/nvidia-triton-inference-server) to easily create web-based services for LLMs. TensorRT-LLM supports multi-GPU and multi-node configurations (through MPI).

As a user, the very first step to create an inference solution is to either define your own model or select a pre-defined network architecture (see the list of models supported by TensorRT-LLM). Once defined, that model must be trained using a training framework (training is outside the scope of TensorRT-LLM). For pre-defined models, checkpoints can be downloaded from various providers. To illustrate that point, many of the examples in TensorRT-LLM use model weights obtained from the [HuggingFace](https://huggingface.co) hub and trained using [NVIDIA NeMo](https://developer.nvidia.com/nemo) or [PyTorch](https://pytorch.org).

Equipped with the model definition and the weights, a user must use TensorRT-LLM's Python API to recreate the model in a way that can be compiled by TensorRT into an efficient engine. For ease of use, TensorRT-LLM already supports a handful of standard models.

Together with the Python API to describe models, TensorRT-LLM provides users with components to create a runtime that executes the efficient TensorRT engine. Runtime components offer beam search, along with extensive sampling functionalities such as top-K and top-P sampling. The exhaustive list can be found in the documentation of the [Runtime](gpt_runtime.html). The C++ runtime is the recommended runtime.

TensorRT-LLM also includes Python and C++ backends for NVIDIA Triton Inference Server to assemble solutions for LLM online serving. The C++ backend implements in-flight batching as explained in the [Batch Manager](batch_manager.html) documentation and is the recommended backend.

## Model Definition

As mentioned above, TensorRT-LLM has a Python API that can be used to define Large Language Models. This API is built on top of the powerful [TensorRT Python API](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/index.html#) to create graph representations of deep neural networks in TensorRT. To become familiar with the core concepts of the TensorRT API, refer to the [Core Concepts](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/coreConcepts.html) section of the TensorRT documentation before proceeding further.

In TensorRT-LLM, the [`tensorrt_llm.Builder`](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0/tensorrt_llm/builder.py) class contains a [`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder) object. That instance is used in the `tensorrt_llm.Builder.create_network` method to create an instance of the [`tensorrt.INetworkDefinition`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition) class. The `INetworkDefinition` object can then be populated using the free functions defined in [`tensorrt_llm.functional`](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0/tensorrt_llm/functional.py).

A simple example of such a free function is `tensorrt_llm.activation` that inserts a [`tensorrt.IActivationLayer`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Layers.html#tensorrt.IActivationLayer) node in the graph of the model:

```python
# In tensorrt_llm.functional:

def activation(input: Tensor, act_type: trt.ActivationType) -> Tensor:
    layer = default_trtnet().add_activation(input.trt_tensor, act_type)   # default_trtnet() -> INetworkDefinition
    return _create_tensor(layer.get_output(0), layer)
```

To make it even easier for users, a few of the most standard activation functions found in LLMs are derived from that function:

```python
# In tensorrt_llm.functional:

relu    = partial(activation, act_type=trt.ActivationType.RELU)
sigmoid = partial(activation, act_type=trt.ActivationType.SIGMOID)
```

Specialized activation functions can be used to assemble more advanced functions such as the `silu` activation:

```python
# In tensorrt_llm.functional:

def silu(input: Tensor) -> Tensor:
    return input * sigmoid(input)
```

When TensorRT-LLM's Python API is used, a graph of the network is assembled. The graph can later be traversed or transformed using the graph traversal API exposed by the [`tensorrt.ILayer`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/LayerBase.html#tensorrt.ILayer) class. That graph will also be optimized by TensorRT during the compilation of the engine, as explained in the next section.

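Putting those pieces together, a minimal sketch of assembling a tiny graph could look like the following. It assumes the `net_guard` context manager, the `Tensor` constructor arguments, and `mark_output` behave as in the release/0.5.0 example scripts; it is an illustration, not an excerpt from the repository:

```python
import tensorrt as trt
import tensorrt_llm
from tensorrt_llm.functional import Tensor

# Create the TensorRT-LLM builder; it wraps a tensorrt.Builder instance.
builder = tensorrt_llm.Builder()
network = builder.create_network()

# Populate the tensorrt.INetworkDefinition through the functional free functions.
# net_guard makes `network` the default network returned by default_trtnet().
with tensorrt_llm.net_guard(network):
    x = Tensor(name='x', dtype=trt.float32, shape=[8, 64])
    y = tensorrt_llm.functional.relu(x)
    y.mark_output('y', trt.float32)
```
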
## Compilation

Once populated, the instance of the [`tensorrt.INetworkDefinition`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Graph/Network.html#tensorrt.INetworkDefinition) can be compiled into an efficient engine by the [`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder). In TensorRT-LLM, that is done through the `build_engine` member function of the `tensorrt_llm.Builder` class, which calls the [`build_serialized_network`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder.build_serialized_network) method of the [`tensorrt.Builder`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Core/Builder.html#tensorrt.Builder) object. If everything works as expected, that call produces an instance of the [`tensorrt.IHostMemory`](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/FoundationalTypes/HostMemory.html#tensorrt.IHostMemory) class. That object is an optimized TensorRT engine that can be stored as a binary file.

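As a hedged sketch of that flow, continuing the example above (the `create_builder_config` arguments and the way the buffer is written to disk are assumptions modeled on the release/0.5.0 example scripts):

```python
# Hypothetical builder configuration; the available options depend on the release.
builder_config = builder.create_builder_config(name='demo', precision='float32')

# Compile the populated network; on success this returns a tensorrt.IHostMemory buffer.
engine = builder.build_engine(network, builder_config)

# The buffer holds the serialized engine and can be written to disk as-is.
with open('demo.engine', 'wb') as f:
    f.write(bytearray(engine))
```
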
### Weight Bindings

TensorRT engines embed the network weights, which must be known at compilation time. For that reason, the weights must be bound to the parameters of the model definition before calling `tensorrt_llm.Builder.build_engine`. That leads to code like:

```python
# The Linear operator exposes two parameters (see tensorrt_llm/layers/linear.py):
class Linear(Module):
    def __init__(self, ...):
        self.weight = Parameter(shape=(self.out_features, self.in_features), dtype=dtype)
        self.bias = Parameter(shape=(self.out_features, ), dtype=dtype)

# The parameters are bound to the weights before compiling the model. See examples/gpt/weight.py:
tensorrt_llm_gpt.layers[i].mlp.fc.weight.value = fromfile(...)
tensorrt_llm_gpt.layers[i].mlp.fc.bias.value = fromfile(...)
```

Note that TensorRT can also [refit](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#refitting-engine-c) engines to update the weights after compilation. This feature is available to TensorRT-LLM users through the `refit_engine` method in the `tensorrt_llm.Builder` class.

### Pattern-Matching and Fusion

One of the key steps performed by TensorRT when it compiles the network graph is the fusion of operations. Fusion is a well-known technique to improve efficiency when executing LLMs. It helps reduce the amount of data transferred between the memory (DRAM) and the compute cores (CUDA cores as well as Tensor Cores located on the [Streaming Multiprocessors](https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#introduction) of a GPU). It also removes kernel launch overhead (each time a kernel is launched on the GPU, there is a small additional CPU cost that is called the launch overhead). A classical example is the fusion of the activation function with the matrix multiplication (matmul) that usually precedes it in the network.

In TensorRT-LLM, when defining the model, such a sequence can be written as:

```python
c = tensorrt_llm.functional.matmul(a, b)
c = tensorrt_llm.functional.relu(c)
```

During inference, if the above sequence is executed without fusion, the `c` tensor has to be written to global memory at the end of the `matmul`, read from that same memory in `relu`, and written again after `relu`. If no other operation uses the intermediate values between `matmul` and `relu`, it is suboptimal. That is why, during compilation, TensorRT will identify that pattern and automatically produce a GPU kernel that applies `relu` at the end of `matmul` without an intermediate step through global memory. With that optimization, the `c` tensor is written only once (after `relu`) instead of twice, and is not read between the two operations.

The process of identifying the sequences of operations that can be fused is called *pattern-matching*. TensorRT has a powerful pattern-matching algorithm that can identify a lot of possible fusions. All the identified patterns are converted into more efficient kernels by an advanced kernel compiler.

### Plugins

The number of possible fusions is almost infinite and some useful fusions involve very advanced modifications of the graph. A well-known example is the [Flash-Attention](https://arxiv.org/abs/2205.14135) technique to optimize the [Multihead-Attention](https://arxiv.org/abs/1706.03762) block found in many LLMs. Flash-Attention requires modifications to the arithmetic performed in the sequence `BMM-Softmax-BMM` (where `BMM` stands for Batched Matrix-Matrix product) and the interleaving of the `for`-loops of the two batched matrix products. That's non-trivial and not necessarily something you can expect a compiler to "discover" on its own (or it might require the support for a [polyhedral model](https://en.wikipedia.org/wiki/Polytope_model)).

As a result, even if TensorRT has a powerful pattern-matching algorithm and supports a lot of possible fusions, there is always the risk that it cannot identify uncommon and/or very advanced patterns. To overcome that inevitable limitation, TensorRT offers a powerful mechanism known as [plugins](https://docs.nvidia.com/deeplearning/tensorrt/api/python_api/infer/Plugin/pyPlugin.html).

The plugins are nodes inserted in the network graph definition that map to user-defined GPU kernels. TensorRT-LLM uses a number of such plugins. They can be found in the [`cpp/tensorrt_llm/plugins`](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0//cpp/tensorrt_llm/plugins) directory.

Plugins are written in C++ and follow a well-defined interface described in the [Extending TensorRT with Custom Layers](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html#extending) section of the TensorRT [Developer Guide](https://docs.nvidia.com/deeplearning/tensorrt/developer-guide/index.html). When executed within a TensorRT engine, plugins trigger the execution of their encapsulated GPU kernels. A fairly simple example of plugins is the [`QuantizeTensorPlugin`](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0//cpp/tensorrt_llm/plugins/quantizeTensorPlugin) that triggers a CUDA kernel in the `QuantizeTensorPlugin::enqueue` member function:

```cpp
// In cpp/tensorrt_llm/plugins/quantizeTensorPlugin/quantizeTensorPlugin.cpp:

int QuantizeTensorPlugin::enqueue(...) {
    if (inputDesc[0].type == DataType::kFLOAT) {
        invokeQuantization<float>(...);
    } else {
        invokeQuantization<half>(...);
    }
    return 0;
}

// In cpp/tensorrt_llm/kernels/quantization.cu:

template <typename T>
void invokeQuantization(...) {
    // The standard <<< >>> construct to launch CUDA kernels
    quantizedKernel<<<grid, block, 0, stream>>>(...);
}
```

For more details on how TensorRT-LLM implements the GPT Attention operator, see the [Multi-head, Multi-query and Group-query Attention](gpt_attention.html) document.

## Runtime

TensorRT-LLM includes an API to implement Python and C++ runtimes. The role of the runtime components is to load the TensorRT engines and drive their execution. Typically, for an auto-regressive model like GPT, the runtime is in charge of loading the engine that implements both the processing of the input sequence as well as the body of the generation loop. See the [GPT C++ Runtime](gpt_runtime.html) document for details on the C++ Runtime.

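For reference, the Python runtime in the release/0.5.0 examples is driven roughly as sketched below; the class and method names are taken from those example scripts, and the exact signatures are assumptions here:

```python
from tensorrt_llm.runtime import GenerationSession, SamplingConfig

# Hypothetical setup: model_config, engine_buffer and runtime_mapping come from the
# build step and describe the engine being loaded.
session = GenerationSession(model_config, engine_buffer, runtime_mapping)
session.setup(batch_size=1, max_context_length=128, max_new_tokens=64)

# Run the context phase and the generation loop on tokenized inputs.
sampling = SamplingConfig(end_id=50256, pad_id=50256, top_k=1)
output_ids = session.decode(input_ids, input_lengths, sampling)
```
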
### Multi-GPU and Multi-Node Support

Even though TensorRT is designed for single-GPU systems, TensorRT-LLM adds support for systems with multiple GPUs and nodes. That support is enabled through TensorRT plugins that wrap communication primitives from the [NCCL](https://developer.nvidia.com/nccl) library, as well as a custom plugin that optimizes the All-Reduce primitive when all-to-all connectivity between GPUs is available (through NVSwitch in DGX systems).

The communication plugins can be found in [cpp/tensorrt_llm/plugins/ncclPlugin](https://github.com/NVIDIA/TensorRT-LLM/tree/release/0.5.0/cpp/tensorrt_llm/plugins/ncclPlugin) and the multi-GPU functions are exposed in the TensorRT-LLM Python API as:

```python
# In tensorrt_llm/functional.py:

# Collectives.
def allreduce(tensor: Tensor, group: List[int]) -> Tensor
def allgather(tensor: Tensor, group: List[int]) -> Tensor

# Point-to-point communication primitives.
def send(tensor: Tensor, tgt: int) -> Tensor
def recv(tensor: Tensor, src: int) -> Tensor
```

The multi-GPU support can be enabled through two different modes of model parallelism: Tensor Parallelism and Pipeline Parallelism. The former mode splits each layer of the model across the GPUs: every GPU runs the entire network on its shard of the weights and synchronizes with its siblings when needed. Pipeline Parallelism distributes the different layers to the GPUs: each GPU runs a subset of the entire model and communications happen at the boundary of those subsets of layers. Tensor Parallelism usually leads to more balanced executions but requires more memory bandwidth between the GPUs. Pipeline Parallelism reduces the need for high-bandwidth communication but may incur load-balancing issues and may be less efficient in terms of GPU utilization.

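For instance, a Tensor-Parallel (row-parallel) linear layer can be sketched with the primitives above; the tensor names, shard shapes, and the `tp_size` variable are illustrative assumptions:

```python
# Each rank multiplies its local activations by its 1/tp_size slice of the weight,
# then the partial results are summed across the tensor-parallel group.
partial = tensorrt_llm.functional.matmul(x_local, weight_shard)
output = tensorrt_llm.functional.allreduce(partial, group=list(range(tp_size)))
```
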
## In-flight Batching

TensorRT-LLM supports in-flight batching of requests (also known as continuous batching or iteration-level batching) for higher serving throughput. See the [Batch Manager](batch_manager.html) document for more details.