mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-23 12:12:39 +08:00
424 lines
37 KiB
HTML
424 lines
37 KiB
HTML
<!DOCTYPE html>
|
||
<html class="writer-html5" lang="en" data-content_root="../">
|
||
<head>
|
||
<meta charset="utf-8" /><meta name="viewport" content="width=device-width, initial-scale=1" />
|
||
|
||
<meta name="viewport" content="width=device-width, initial-scale=1.0" />
|
||
<title>C++ GPT Runtime — tensorrt_llm documentation</title>
|
||
<link rel="stylesheet" type="text/css" href="../_static/pygments.css?v=80d5e7a1" />
|
||
<link rel="stylesheet" type="text/css" href="../_static/css/theme.css?v=19f00094" />
|
||
|
||
|
||
<!--[if lt IE 9]>
|
||
<script src="../_static/js/html5shiv.min.js"></script>
|
||
<![endif]-->
|
||
|
||
<script src="../_static/jquery.js?v=5d32c60e"></script>
|
||
<script src="../_static/_sphinx_javascript_frameworks_compat.js?v=2cd50e6c"></script>
|
||
<script src="../_static/documentation_options.js?v=5929fcd5"></script>
|
||
<script src="../_static/doctools.js?v=9a2dae69"></script>
|
||
<script src="../_static/sphinx_highlight.js?v=dc90522c"></script>
|
||
<script src="../_static/js/theme.js"></script>
|
||
<link rel="index" title="Index" href="../genindex.html" />
|
||
<link rel="search" title="Search" href="../search.html" />
|
||
<link rel="next" title="Graph Rewriting Module" href="graph-rewriting.html" />
|
||
<link rel="prev" title="Multi-Head, Multi-Query, and Group-Query Attention" href="gpt-attention.html" />
|
||
</head>
|
||
|
||
<body class="wy-body-for-nav">
|
||
<div class="wy-grid-for-nav">
|
||
<nav data-toggle="wy-nav-shift" class="wy-nav-side">
|
||
<div class="wy-side-scroll">
|
||
<div class="wy-side-nav-search" >
|
||
|
||
|
||
|
||
<a href="../index.html" class="icon icon-home">
|
||
tensorrt_llm
|
||
</a>
|
||
<div role="search">
|
||
<form id="rtd-search-form" class="wy-form" action="../search.html" method="get">
|
||
<input type="text" name="q" placeholder="Search docs" aria-label="Search docs" />
|
||
<input type="hidden" name="check_keywords" value="yes" />
|
||
<input type="hidden" name="area" value="default" />
|
||
</form>
|
||
</div>
|
||
</div><div class="wy-menu wy-menu-vertical" data-spy="affix" role="navigation" aria-label="Navigation menu">
|
||
<p class="caption" role="heading"><span class="caption-text">Getting Started</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../overview.html">Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../quick-start-guide.html">Quick Start Guide</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../release-notes.html">Release Notes</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Installation</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/linux.html">Installing on Linux</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-linux.html">Building from Source Code on Linux</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/windows.html">Installing on Windows</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../installation/build-from-source-windows.html">Building from Source Code on Windows</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Architecture</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/overview.html">TensorRT-LLM Architecture</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html">Model Definition</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#compilation">Compilation</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#runtime">Runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/core-concepts.html#multi-gpu-and-multi-node-support">Multi-GPU and Multi-Node Support</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/checkpoint.html">TensorRT-LLM Checkpoint</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/workflow.html">TensorRT-LLM Build Workflow</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
||
<ul class="current">
|
||
<li class="toctree-l1"><a class="reference internal" href="gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
||
<li class="toctree-l1 current"><a class="current reference internal" href="#">C++ GPT Runtime</a><ul>
|
||
<li class="toctree-l2"><a class="reference internal" href="#overview">Overview</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#model-configuration">Model Configuration</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#world-configuration">World Configuration</a></li>
|
||
<li class="toctree-l3"><a class="reference internal" href="#sampling-parameters">Sampling Parameters</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#the-session">The Session</a><ul>
|
||
<li class="toctree-l3"><a class="reference internal" href="#internal-components">Internal Components</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#in-flight-batching-support">In-flight Batching Support</a></li>
|
||
<li class="toctree-l2"><a class="reference internal" href="#know-issues-and-future-changes">Known Issues and Future Changes</a></li>
|
||
</ul>
|
||
</li>
|
||
<li class="toctree-l1"><a class="reference internal" href="graph-rewriting.html">Graph Rewriting Module</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="batch-manager.html">The Batch Manager in TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="inference-request.html">Inference Request</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="inference-request.html#responses">Responses</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="lora.html">Run gpt-2b + LoRA using GptManager / cpp runtime</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-overview.html">Overview</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-best-practices.html">Best Practices for Tuning the Performance of TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../performance/perf-analysis.html">Performance Analysis</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Reference</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/troubleshooting.html">Troubleshooting</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/support-matrix.html">Support Matrix</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/precision.html">Numerical Precision</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../reference/memory.html">Memory Usage of TensorRT-LLM</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">C++ API</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/executor.html">Executor</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../_cpp_gen/runtime.html">Runtime</a></li>
|
||
</ul>
|
||
<p class="caption" role="heading"><span class="caption-text">Blogs</span></p>
|
||
<ul>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H100vsA100.html">H100 has 4.6x A100 Performance in TensorRT-LLM, achieving 10,000 tok/s at 100ms to first token</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/H200launch.html">H200 achieves nearly 12,000 tokens/sec on Llama2-13B with TensorRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/Falcon180B-H200.html">Falcon-180B on a single H200 GPU with INT4 AWQ, and 6.7x faster Llama-70B over A100</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/quantization-in-TRT-LLM.html">Speed up inference with SOTA quantization techniques in TRT-LLM</a></li>
|
||
<li class="toctree-l1"><a class="reference internal" href="../blogs/XQA-kernel.html">New XQA-kernel provides 2.4x more Llama-70B throughput within the same latency budget</a></li>
|
||
</ul>
|
||
|
||
</div>
|
||
</div>
|
||
</nav>
|
||
|
||
<section data-toggle="wy-nav-shift" class="wy-nav-content-wrap"><nav class="wy-nav-top" aria-label="Mobile navigation menu" >
|
||
<i data-toggle="wy-nav-top" class="fa fa-bars"></i>
|
||
<a href="../index.html">tensorrt_llm</a>
|
||
</nav>
|
||
|
||
<div class="wy-nav-content">
|
||
<div class="rst-content">
|
||
<div role="navigation" aria-label="Page navigation">
|
||
<ul class="wy-breadcrumbs">
|
||
<li><a href="../index.html" class="icon icon-home" aria-label="Home"></a></li>
|
||
<li class="breadcrumb-item active">C++ GPT Runtime</li>
|
||
<li class="wy-breadcrumbs-aside">
|
||
<a href="../_sources/advanced/gpt-runtime.md.txt" rel="nofollow"> View page source</a>
|
||
</li>
|
||
</ul>
|
||
<hr/>
|
||
</div>
|
||
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
|
||
<div itemprop="articleBody">
|
||
|
||
<section id="c-gpt-runtime">
|
||
<span id="gpt-runtime"></span><h1>C++ GPT Runtime<a class="headerlink" href="#c-gpt-runtime" title="Link to this heading"></a></h1>
|
||
<p>TensorRT-LLM includes a C++ component to execute TensorRT engines built with
|
||
the Python API as described in the <a class="reference internal" href="../architecture/overview.html#architecture-overview"><span class="std std-ref">TensorRT-LLM Architecture</span></a> section.
|
||
That component is called the C++ runtime.</p>
|
||
<p>The API of the C++ runtime is composed of the classes declared in
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/cpp/include/tensorrt_llm/runtime"><code class="docutils literal notranslate"><span class="pre">cpp/include/tensorrt_llm/runtime</span></code></a> and
|
||
implemented in <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/cpp/tensorrt_llm/runtime"><code class="docutils literal notranslate"><span class="pre">cpp/tensorrt_llm/runtime</span></code></a>.</p>
|
||
<p>Even if the different components described in that document mention GPT in
|
||
their name, they are not restricted to this specific model. Those classes can
|
||
be used to implement auto-regressive models like BLOOM, GPT-J, GPT-NeoX or
|
||
LLaMA, for example.</p>
|
||
<p>Complete support of encoder-decoder models, like T5, will be added to
|
||
TensorRT-LLM in a future release. An experimental version, only in Python for
|
||
now, can be found in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/examples/enc_dec"><code class="docutils literal notranslate"><span class="pre">examples/enc_dec</span></code></a> folder.</p>
|
||
<section id="overview">
|
||
<h2>Overview<a class="headerlink" href="#overview" title="Link to this heading"></a></h2>
|
||
<p>Runtime models are described by an instance of the
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/modelConfig.h"><code class="docutils literal notranslate"><span class="pre">ModelConfig</span></code></a>
|
||
class and a pointer to the TensorRT engine that must be
|
||
executed to perform the inference.
|
||
The environment is configured through the
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/worldConfig.h"><code class="docutils literal notranslate"><span class="pre">WorldConfig</span></code></a>
|
||
(that name comes from
|
||
<a class="reference external" href="https://en.wikipedia.org/wiki/Message_Passing_Interface">MPI</a> and its “famous”
|
||
<code class="docutils literal notranslate"><span class="pre">MPI_COMM_WORLD</span></code> default communicator).
|
||
The <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/samplingConfig.h"><code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code></a>
|
||
class encapsulates parameters that control the
|
||
<a class="reference external" href="https://huggingface.co/blog/how-to-generate">generation</a> of new tokens.</p>
|
||
<section id="model-configuration">
|
||
<h3>Model Configuration<a class="headerlink" href="#model-configuration" title="Link to this heading"></a></h3>
|
||
<p>The model configuration is an instance of the
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/modelConfig.h"><code class="docutils literal notranslate"><span class="pre">ModelConfig</span></code></a> class.
|
||
That class encapsulates the following parameters (they are declared as private
|
||
member variables and exposed through getters and setters):</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">vocabSize</span></code>, the size of the vocabulary,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">numLayers</span></code>, the number of layers in the model,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">numHeads</span></code>, the number of heads in the attention block,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">numKvHeads</span></code>, the number of heads for K and V in the attention component.
|
||
When the number of K/V heads is the same as the number of (Q) heads, the
|
||
model uses multi-head attention. When the number of K/V heads is 1, it uses
|
||
multi-query attention. Otherwise, it uses group-query attention. Refer to <a class="reference internal" href="gpt-attention.html#gpt-attention"><span class="std std-ref">Multi-Head, Multi-Query, and Group-Query Attention</span></a> for more information,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">hiddenSize</span></code>, the size of the hidden dimension,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">dataType</span></code>, the datatype that was used to build the TensorRT engine and that
|
||
must be used to run the model during inference,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">useGptAttentionPlugin</span></code>, indicates if the <a class="reference internal" href="gpt-attention.html#gpt-attention"><span class="std std-ref">Multi-Head, Multi-Query, and Group-Query Attention</span></a> operator was compiled using the
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/main/cpp/tensorrt_llm/plugins/gptAttentionPlugin">GPT Attention plugin</a>,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">inputPacked</span></code>, indicates that the input must be packed (or padded when set
|
||
to <code class="docutils literal notranslate"><span class="pre">false</span></code>). For performance reasons, it is recommended to always use packed,
|
||
even if its default is set to <code class="docutils literal notranslate"><span class="pre">false</span></code> (will be changed in a future release).
|
||
Refer to <a class="reference internal" href="gpt-attention.html#gpt-attention"><span class="std std-ref">Multi-Head, Multi-Query, and Group-Query Attention</span></a> for more information,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">pagedKvCache</span></code>, indicates if the K/V cache uses paging.
|
||
Refer to <a class="reference internal" href="gpt-attention.html#gpt-attention"><span class="std std-ref">Multi-Head, Multi-Query, and Group-Query Attention</span></a> for more information,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">tokensPerBlock</span></code>, is the number of tokens in each block of the K/V cache.
|
||
It’s relevant when the paged K/V cache is enabled. By default, the value is
|
||
64. Refer to <a class="reference internal" href="gpt-attention.html#gpt-attention"><span class="std std-ref">Multi-Head, Multi-Query, and Group-Query Attention</span></a> for more information,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">quantMode</span></code>, controls the quantization method. Refer to <a class="reference internal" href="../reference/precision.html#precision"><span class="std std-ref">Numerical Precision</span></a> for more information.</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">maxBatchSize</span></code>, indicates the maximum batch size that the TensorRT engine
|
||
was built for,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">maxInputLen</span></code>, the maximum size of the input sequences,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">maxSequenceLen</span></code>, the maximum total size (input+output) of the sequences.</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="world-configuration">
|
||
<h3>World Configuration<a class="headerlink" href="#world-configuration" title="Link to this heading"></a></h3>
|
||
<p>Familiarity with
|
||
<a class="reference external" href="https://en.wikipedia.org/wiki/Message_Passing_Interface">MPI</a>, is not required
|
||
to utilize the TensorRT-LLM C++ runtime. There are two main things
|
||
you need to know:</p>
|
||
<ul class="simple">
|
||
<li><p>The C++ Runtime in TensorRT-LLM uses
|
||
<a class="reference external" href="https://en.wikipedia.org/wiki/Process_(computing)">processes</a> to execute
|
||
TensorRT engines on the different GPUs. Those GPUs can be located on a single
|
||
node as well as on different nodes in a cluster. Each process is called a
|
||
<em>rank</em> in MPI.</p></li>
|
||
<li><p>The ranks are grouped in communication groups. The
|
||
TensorRT-LLM C++ Runtime calls that group the <em>world</em>.</p></li>
|
||
</ul>
|
||
<p>The world configuration is an instance of the
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/worldConfig.h"><code class="docutils literal notranslate"><span class="pre">WorldConfig</span></code></a>
|
||
class, which encapsulates the following parameters:</p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">tensorParallelism</span></code>, the number of ranks that collaborate together to
|
||
implement Tensor Parallelism (TP). With TP, each GPU performs computations for
|
||
all the layers of the model. Some of those computations are distributed
|
||
across the GPU. TP is more balanced than Pipeline Parallelism (PP), in most cases, but
|
||
requires higher bandwidth between the GPUs. It is the recommended setting in
|
||
the presence of NVLINK between GPUs,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">pipelineParallelism</span></code>, the number of ranks that collaborate together to
|
||
implement Pipeline Parallelism (PP). With PP, each GPU works on a subset of
|
||
consecutive layers. Communications between the GPUs happen only at the
|
||
boundaries of the subsets of layers. It is harder to guarantee the full
|
||
utilization of the GPUs with PP but it requires less memory bandwidth. It
|
||
is the recommended setting in the absence of NVLINK between GPUs,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">rank</span></code>, the unique identifier of the rank,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">gpusPerNode</span></code>, indicates the number of GPUs on each node. Having that
|
||
information allows the C++ runtime to optimize communications between GPUs in
|
||
a node (like taking advantage of the
|
||
<a class="reference external" href="https://www.nvidia.com/en-us/data-center/nvlink/">NVLINK</a>
|
||
interconnect between GPUs of an A100
|
||
<a class="reference external" href="https://www.nvidia.com/en-us/data-center/dgx-platform/">DGX</a>
|
||
node).</p></li>
|
||
</ul>
|
||
</section>
|
||
<section id="sampling-parameters">
|
||
<h3>Sampling Parameters<a class="headerlink" href="#sampling-parameters" title="Link to this heading"></a></h3>
|
||
<p>The <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/samplingConfig.h"><code class="docutils literal notranslate"><span class="pre">SamplingConfig</span></code></a>
|
||
class encapsulates parameters that control the
|
||
<a class="reference external" href="https://huggingface.co/blog/how-to-generate">generation</a> of new tokens.
|
||
Except for the <code class="docutils literal notranslate"><span class="pre">beamWidth</span></code> parameter, all the fields are optional and the
|
||
runtime will use a default value if no values are provided by the user. For
|
||
vector fields, the TensorRT-LLM runtime supports one value per sequence (that is,
|
||
the vector contains <code class="docutils literal notranslate"><span class="pre">batchSize</span></code> values). If all the sequences use the same
|
||
value for a given parameter, the vector can be limited to a single element
|
||
(that is, <code class="docutils literal notranslate"><span class="pre">size()</span> <span class="pre">==</span> <span class="pre">1</span></code>).</p>
|
||
<p><em><strong>General</strong></em></p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">temperature</span></code>, a vector of floating-point numbers to control the
|
||
modulation of logits when sampling new tokens. It can have any value <code class="docutils literal notranslate"><span class="pre">>=</span> <span class="pre">0.0f</span></code>. The default value is <code class="docutils literal notranslate"><span class="pre">1.0f</span></code> (no modulation).</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">minLength</span></code>, a vector of integers to set a lower-bound on the number of tokens
|
||
generated. It can have any value <code class="docutils literal notranslate"><span class="pre">>=</span> <span class="pre">0</span></code>. Value <code class="docutils literal notranslate"><span class="pre">0</span></code> has no effect, the first generated token can be EOS. The default value is <code class="docutils literal notranslate"><span class="pre">1</span></code> (at least one non-EOS token is generated).</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">repetitionPenalty</span></code>, a vector of floating-point numbers to penalize tokens
|
||
(irrespective of the number of appearances). It is multiplicative penalty. It can have any value <code class="docutils literal notranslate"><span class="pre">></span> <span class="pre">0.0f</span></code>. Repetition penalty <code class="docutils literal notranslate"><span class="pre"><</span> <span class="pre">1.0f</span></code> encourages repetition, <code class="docutils literal notranslate"><span class="pre">></span> <span class="pre">1.0f</span></code> discourages it. The default value is <code class="docutils literal notranslate"><span class="pre">1.0f</span></code> (no effect).</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">presencePenalty</span></code>, a vector of floating-point numbers to penalize tokens
|
||
already present in the sequence (irrespective of the number of appearances). It is additive penalty.
|
||
It can have any value, values <code class="docutils literal notranslate"><span class="pre"><</span> <span class="pre">0.0f</span></code> encourage repetition, <code class="docutils literal notranslate"><span class="pre">></span> <span class="pre">0.f</span></code> discourage it. The default value is <code class="docutils literal notranslate"><span class="pre">0.0f</span></code> (no effect).</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">frequencyPenalty</span></code>, a vector of floating-point numbers to penalize tokens
|
||
already present in the sequence (dependent on the number of appearances). It is additive penalty. It can have any value, values <code class="docutils literal notranslate"><span class="pre"><</span> <span class="pre">0.0f</span></code> encourage repetition, <code class="docutils literal notranslate"><span class="pre">></span> <span class="pre">0.0f</span></code> discourage it.
|
||
The default value is <code class="docutils literal notranslate"><span class="pre">0.0f</span></code> (no effect).</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">noRepeatNgramSize</span></code>, a vector of integers. It can have any value <code class="docutils literal notranslate"><span class="pre">></span> <span class="pre">0</span></code>. If set to int <code class="docutils literal notranslate"><span class="pre">></span> <span class="pre">0</span></code>, all ngrams of that size can only occur once.</p></li>
|
||
</ul>
|
||
<p>The parameters <code class="docutils literal notranslate"><span class="pre">repetitionPenalty</span></code>, <code class="docutils literal notranslate"><span class="pre">presencePenalty</span></code>, and <code class="docutils literal notranslate"><span class="pre">frequencyPenalty</span></code> are not mutually
|
||
exclusive.</p>
|
||
<p><em><strong>Sampling</strong></em></p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">randomSeed</span></code>, a vector of 64-bit integers to control the random seed used by
|
||
the random number generator in sampling. Its default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">topK</span></code>, a vector of integers to control the number of logits to sample from.
|
||
Must be in range of <code class="docutils literal notranslate"><span class="pre">[0,</span> <span class="pre">1024]</span></code>. Its default value is <code class="docutils literal notranslate"><span class="pre">0</span></code>.
|
||
Note that if different values are provided for the
|
||
different sequences in the batch, the performance of the implementation will
|
||
depend on the largest value. For efficiency reasons, we recommend to batch
|
||
requests with similar <code class="docutils literal notranslate"><span class="pre">topK</span></code> values together,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">topP</span></code>, a vector of floating-point values to control the top-P probability
|
||
to sample from. Must be in range of <code class="docutils literal notranslate"><span class="pre">[0.f,</span> <span class="pre">1.f]</span></code>. Its default value is <code class="docutils literal notranslate"><span class="pre">0.f</span></code>,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">topPDecay</span></code>, <code class="docutils literal notranslate"><span class="pre">topPMin</span></code> and <code class="docutils literal notranslate"><span class="pre">topPResetIds</span></code>, vectors to control the decay in
|
||
the <code class="docutils literal notranslate"><span class="pre">topP</span></code> algorithm. The <code class="docutils literal notranslate"><span class="pre">topP</span></code> values are modulated by
|
||
a decay that exponentially depends on the length of the sequence as explained in
|
||
<a class="reference external" href="https://arxiv.org/abs/2206.04624"><em>Factuality Enhanced Language Models for Open-Ended Text Generation</em></a>.
|
||
<code class="docutils literal notranslate"><span class="pre">topPDecay</span></code> is the decay, <code class="docutils literal notranslate"><span class="pre">topPMin</span></code> is the lower-bound and <code class="docutils literal notranslate"><span class="pre">topPResetIds</span></code>
|
||
indicates where to reset the decay.
|
||
<code class="docutils literal notranslate"><span class="pre">topPDecay</span></code>, <code class="docutils literal notranslate"><span class="pre">topPMin</span></code> must be in ranges of <code class="docutils literal notranslate"><span class="pre">(0.f,</span> <span class="pre">1.f]</span></code> and <code class="docutils literal notranslate"><span class="pre">(0.f,</span> <span class="pre">1.f]</span></code> respectively.
|
||
Defaults are <code class="docutils literal notranslate"><span class="pre">1.f</span></code>, <code class="docutils literal notranslate"><span class="pre">1.0e-6f</span></code> and <code class="docutils literal notranslate"><span class="pre">-1</span></code>,</p></li>
|
||
</ul>
|
||
<p>If both <code class="docutils literal notranslate"><span class="pre">topK</span></code> and <code class="docutils literal notranslate"><span class="pre">topP</span></code> fields are set, the <code class="docutils literal notranslate"><span class="pre">topK</span></code> method will be run for
|
||
sequences with a <code class="docutils literal notranslate"><span class="pre">topK</span></code> value greater than <code class="docutils literal notranslate"><span class="pre">0</span></code>. In that case, the <code class="docutils literal notranslate"><span class="pre">topP</span></code>
|
||
value for that sequence also influences the result. If the <code class="docutils literal notranslate"><span class="pre">topK</span></code> values for
|
||
some sequences are <code class="docutils literal notranslate"><span class="pre">0</span></code>, the <code class="docutils literal notranslate"><span class="pre">topP</span></code> method will be used for those remaining
|
||
sequences. If both <code class="docutils literal notranslate"><span class="pre">topK</span></code> and <code class="docutils literal notranslate"><span class="pre">topP</span></code> are zero, greedy search is performed.</p>
|
||
<p><em><strong>Beam-search</strong></em></p>
|
||
<ul class="simple">
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">beamWidth</span></code>, is the width used for the <a class="reference external" href="https://en.wikipedia.org/wiki/Beam_search">beam
|
||
search</a> sampling algorithm. There
|
||
is no explicit upper-bound on the beam width but increasing the beam width
|
||
will likely increase the latency. Use <code class="docutils literal notranslate"><span class="pre">1</span></code> to disable beam-search,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">beamSearchDiversityRate</span></code>, a floating-point value that controls the
|
||
diversity in beam-search. It can have any value <code class="docutils literal notranslate"><span class="pre">>=</span> <span class="pre">0.0f</span></code>. The default value is <code class="docutils literal notranslate"><span class="pre">0.f</span></code>,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">lengthPenalty</span></code>, a floating-point value that controls how to penalize the
|
||
longer sequences in beam-search (the log-probability of a sequence will be
|
||
penalized by a factor that depends on <code class="docutils literal notranslate"><span class="pre">1.f</span> <span class="pre">/</span> <span class="pre">(length</span> <span class="pre">^</span> <span class="pre">lengthPenalty)</span></code>). The
|
||
default value is <code class="docutils literal notranslate"><span class="pre">0.f</span></code>,</p></li>
|
||
<li><p><code class="docutils literal notranslate"><span class="pre">earlyStopping</span></code>, an integer value that controls whether the generation process
|
||
finishes once <code class="docutils literal notranslate"><span class="pre">beamWidth</span></code> sentences are generated (end up with <code class="docutils literal notranslate"><span class="pre">end_token</span></code>).
|
||
Default value <code class="docutils literal notranslate"><span class="pre">1</span></code> means <code class="docutils literal notranslate"><span class="pre">earlyStopping</span></code> is enabled, value <code class="docutils literal notranslate"><span class="pre">0</span></code> means <code class="docutils literal notranslate"><span class="pre">earlyStopping</span></code>
|
||
is disabled; other values mean the generation process depends on
|
||
<code class="docutils literal notranslate"><span class="pre">length_penalty</span></code>.
|
||
The <code class="docutils literal notranslate"><span class="pre">beamWidth</span></code> parameter is a scalar value. It means that in this release of
|
||
TensorRT-LLM, it is not possible to specify a different width for each input
|
||
sequence. This limitation is likely to be removed in a future release.</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
<section id="the-session">
|
||
<h2>The Session<a class="headerlink" href="#the-session" title="Link to this heading"></a></h2>
|
||
<p><em>The runtime session is deprecated in favor of the <a class="reference internal" href="../executor.html#executor"><span class="std std-ref">Executor API</span></a>.
|
||
It will be removed in a future release of TensorRT-LLM.</em></p>
|
||
<p>An example of how to use the <code class="docutils literal notranslate"><span class="pre">GptSession</span></code> to run a GPT-like auto-regressive model can be found in
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tests/runtime/gptSessionTest.cpp"><code class="docutils literal notranslate"><span class="pre">cpp/tests/runtime/gptSessionTest.cpp</span></code></a>.</p>
|
||
<section id="internal-components">
|
||
<h3>Internal Components<a class="headerlink" href="#internal-components" title="Link to this heading"></a></h3>
|
||
<p>The <code class="docutils literal notranslate"><span class="pre">GptSession</span></code> class encapsulates two main components. The
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/tensorrt_llm/runtime/tllmRuntime.h"><code class="docutils literal notranslate"><span class="pre">TllmRuntime</span></code></a> is in charge of the
|
||
execution of the TensorRT engine. The
|
||
<a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/blob/main/cpp/include/tensorrt_llm/runtime/gptDecoder.h"><code class="docutils literal notranslate"><span class="pre">GptDecoder</span></code></a>
|
||
does the generation of the tokens from the logits. The <code class="docutils literal notranslate"><span class="pre">TllmRuntime</span></code> class is
|
||
an internal component and you are not expected to use that class directly.
|
||
The <code class="docutils literal notranslate"><span class="pre">GptDecoder</span></code> can be used directly to implement custom generation loop
|
||
and for use cases that cannot be satisfied by the implementation in
|
||
<code class="docutils literal notranslate"><span class="pre">GptSession</span></code>.</p>
|
||
</section>
|
||
</section>
|
||
<section id="in-flight-batching-support">
|
||
<h2>In-flight Batching Support<a class="headerlink" href="#in-flight-batching-support" title="Link to this heading"></a></h2>
|
||
<p>In-flight batching is supported using separate decoders per
|
||
request. The biggest difference compared to using a single decoder is in how
|
||
the token generation from logits is managed. A batch is split into <code class="docutils literal notranslate"><span class="pre">batchSize</span></code>
|
||
individual requests and kernels are issued using separated CUDA streams.
|
||
This behavior may be revisited in a future release to maintain the structure
|
||
of the batch and improve efficiency.</p>
|
||
</section>
|
||
<section id="know-issues-and-future-changes">
|
||
<h2>Known Issues and Future Changes<a class="headerlink" href="#know-issues-and-future-changes" title="Link to this heading"></a></h2>
|
||
<ul class="simple">
|
||
<li><p>In the current release of TensorRT-LLM, the C++ and Python runtimes are two
|
||
separate software components and the C++ runtime is being more actively
|
||
developed (with features like in-flight batching). An objective, for a
|
||
future release, could be to rebuild the Python runtime on top of the C++
|
||
one.</p></li>
|
||
</ul>
|
||
</section>
|
||
</section>
|
||
|
||
|
||
</div>
|
||
</div>
|
||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||
<a href="gpt-attention.html" class="btn btn-neutral float-left" title="Multi-Head, Multi-Query, and Group-Query Attention" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||
<a href="graph-rewriting.html" class="btn btn-neutral float-right" title="Graph Rewriting Module" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||
</div>
|
||
|
||
<hr/>
|
||
|
||
<div role="contentinfo">
|
||
<!-- NOTE(review): removed a leaked Jinja2 object repr ("jinja2.runtime.BlockReference object at 0x7fedf3045180") that the theme's contentinfo block emitted instead of rendered content; regenerate the docs to restore the intended footer block -->
|
||
|
||
<div class="footer">
|
||
<p>
|
||
Copyright © 2024 NVIDIA Corporation
|
||
</p>
|
||
<p>
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-policy/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Privacy Policy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/privacy-center/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Manage My Privacy</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/preferences/start/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Do Not Sell or Share My Data</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/terms-of-service/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Terms of Service</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/accessibility/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Accessibility</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/about-nvidia/company-policies/" target="_blank"
|
||
rel="noopener" data-cms-ai="0">Corporate Policies</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/product-security/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Product Security</a> |
|
||
<a class="Link" href="https://www.nvidia.com/en-us/contact/" target="_blank" rel="noopener"
|
||
data-cms-ai="0">Contact</a>
|
||
</p>
|
||
</div>
|
||
|
||
|
||
</div>
|
||
|
||
|
||
|
||
</footer>
|
||
</div>
|
||
</div>
|
||
</section>
|
||
</div>
|
||
<script>
|
||
jQuery(function () {
|
||
SphinxRtdTheme.Navigation.enable(true);
|
||
});
|
||
</script>
|
||
|
||
</body>
|
||
</html> |