<section id="the-batch-manager-in-tensorrt-llm">
|
||
<h1>The Batch Manager in TensorRT-LLM<a class="headerlink" href="#the-batch-manager-in-tensorrt-llm" title="Link to this heading"></a></h1>
|
||
<p>TensorRT-LLM relies on a component, called the Batch Manager, to support
|
||
in-flight batching of requests (also known in the community as continuous
|
||
batching or iteration-level batching). That technique that aims at reducing
|
||
wait times in queues, eliminating the need for padding requests and allowing
|
||
for higher GPU utilization.</p>
|
||
<p>In more details, this feature allows for the inclusion of newly arrived
|
||
requests and the return of newly completed requests at each iteration of the
|
||
token generation loop. In-flight batching is accessed via a TensorRT-LLM component
|
||
called the <em>Batch Manager</em>. That batch manager exposes hooks for the user to
|
||
register function pointers to define how TensorRT-LLM reads in new requests and
|
||
how it returns completed requests to the user.</p>
|
||
<section id="the-batch-manager-api">
|
||
<h2>The Batch Manager API<a class="headerlink" href="#the-batch-manager-api" title="Link to this heading"></a></h2>
|
||
<p>A software component (called the client in the text that follows) can interact
|
||
with the batch manager using two mandatory, and several optional callbacks. Their signatures are defined
|
||
in the <a class="reference external" href="https://github.com/NVIDIA/TensorRT-LLM/tree/rel/cpp/include/tensorrt_llm/batch_manager/callbacks.h"><code class="docutils literal notranslate"><span class="pre">callbacks.h</span></code></a> file.</p>
|
||
<p>These callbacks are invoked in the generation loop at regular intervals and serve a variety of functions described below.</p>
|
||
<section id="get-and-send-callbacks">
|
||
<h3>Get and Send Callbacks<a class="headerlink" href="#get-and-send-callbacks" title="Link to this heading"></a></h3>
|
||
<p>The entry point to pass new requests to the batch manager is a callback of type
|
||
<code class="docutils literal notranslate"><span class="pre">GetInferenceRequestsCallback</span></code>. An implementation of that callback must return
|
||
a list of requests (<code class="docutils literal notranslate"><span class="pre">std::list<std::shared_ptr<InferenceRequest></span></code>) to be
|
||
processed by the batch manager. It takes a parameter indicating the maximum
|
||
number of requests that can be accepted (a negative value indicates that an
|
||
unbounded number of requests can be accepted). The complete signature of that
|
||
callback is:</p>
|
||
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="k">using</span><span class="w"> </span><span class="n">GetInferenceRequestsCallback</span><span class="w"> </span><span class="o">=</span><span class="w"> </span><span class="n">std</span><span class="o">::</span><span class="n">function</span><span class="o"><</span><span class="n">std</span><span class="o">::</span><span class="n">list</span><span class="o"><</span><span class="n">std</span><span class="o">::</span><span class="n">shared_ptr</span><span class="o"><</span><span class="n">InferenceRequest</span><span class="o">>></span><span class="p">(</span><span class="kt">int32_t</span><span class="p">)</span><span class="o">></span><span class="p">;</span>
|
||
</pre></div>
|
||
</div>
|
||
For each new request, the client must provide the batch manager with its input tensors and a 64-bit unsigned number (`uint64_t`) that will uniquely identify the request. That identifier is called the *request ID* in the text that follows (and in the code of the batch manager). The input tensors are collected in a map (`std::map<std::string, Tensor>`) that associates input names to tensors. See [`InferenceRequest.h`](https://github.com/NVIDIA/TensorRT-LLM/tree/rel/cpp/include/tensorrt_llm/batch_manager/InferenceRequest.h) for more details.
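For illustration, a client could back `GetInferenceRequestsCallback` with a simple thread-safe queue of pending requests, returning at most the number of requests the batch manager is willing to accept. The queue and its locking in the sketch below are assumptions for illustration, not part of the batch manager API:

```cpp
#include <deque>
#include <list>
#include <memory>
#include <mutex>

// Hypothetical client-side queue of requests waiting to be handed to the batch manager.
std::mutex pendingMutex;
std::deque<std::shared_ptr<InferenceRequest>> pendingRequests;

// Returns up to `maxNbRequests` pending requests; a negative value means "no limit".
std::list<std::shared_ptr<InferenceRequest>> getInferenceRequestsCb(int32_t maxNbRequests)
{
    std::list<std::shared_ptr<InferenceRequest>> newRequests;
    std::lock_guard<std::mutex> lock(pendingMutex);
    while (!pendingRequests.empty()
        && (maxNbRequests < 0 || static_cast<int32_t>(newRequests.size()) < maxNbRequests))
    {
        newRequests.push_back(pendingRequests.front());
        pendingRequests.pop_front();
    }
    return newRequests;
}
```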
Responses are delivered to the client through a callback of type `SendResponseCallback`. A conforming callback must accept the 64-bit request ID that uniquely identifies the request, the list of output tensors, a boolean (identifying the last response for the request when set to `true`) and a potentially non-empty error message. A non-empty error message indicates that an error has been encountered. In that case, the boolean indicating that this is the last response will be set to `true`, and the callback must properly handle the error. Its signature is:

```cpp
using SendResponseCallback = std::function<void(uint64_t, std::list<std::shared_ptr<Tensor>> const&, bool, const std::string&)>;
```
Note that the batch manager will reject any request sent using the `GetInferenceRequestsCallback` callback if the request ID passed by the client corresponds to the request ID of a request that is being processed by the batch manager. A request ID can be reused after it appears in a call to the `SendResponseCallback` callback marked as final (third argument set to `true`).
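A minimal sketch of a conforming `SendResponseCallback`, assuming a hypothetical client-side `WorkItemStore` for bookkeeping; a real server would forward the output tensors to its network layer instead:

```cpp
#include <cstdint>
#include <list>
#include <memory>
#include <string>

// Hypothetical bookkeeping structure owned by the serving layer.
struct WorkItemStore
{
    void recordOutput(uint64_t requestId, std::list<std::shared_ptr<Tensor>> const& outputs);
    void markFailed(uint64_t requestId, std::string const& errMsg);
    void retire(uint64_t requestId); // After this, the request ID may be reused.
};

WorkItemStore workItems;

void sendResponseCb(uint64_t requestId, std::list<std::shared_ptr<Tensor>> const& outputTensors,
    bool finalResponse, const std::string& errMsg)
{
    if (!errMsg.empty())
    {
        // A non-empty error message implies this is the last response for the request.
        workItems.markFailed(requestId, errMsg);
    }
    else
    {
        // With streaming enabled this may be a single token; otherwise the full response.
        workItems.recordOutput(requestId, outputTensors);
    }
    if (finalResponse)
    {
        workItems.retire(requestId);
    }
}
```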
### Request Interruption

The batch manager allows users to stop the execution of requests currently in-flight. The set of request IDs to be stopped can be passed to the batch manager through the callback:

```cpp
using PollStopSignalCallback = std::function<std::unordered_set<uint64_t>()>;
```

When an active request appears in the set of requests to be interrupted, the batch manager will ensure that it is properly stopped.
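A minimal sketch of how a client might implement `PollStopSignalCallback` on top of a mutex-protected set of cancelled request IDs; the set and the `requestCancellation` helper are client-side assumptions:

```cpp
#include <cstdint>
#include <mutex>
#include <unordered_set>

// Hypothetical client-side set of request IDs whose cancellation has been requested.
std::mutex stopMutex;
std::unordered_set<uint64_t> stoppedIds;

// Called by the serving layer when a client cancels a request.
void requestCancellation(uint64_t requestId)
{
    std::lock_guard<std::mutex> lock(stopMutex);
    stoppedIds.insert(requestId);
}

// Polled by the batch manager during the generation loop.
std::unordered_set<uint64_t> pollStopSignalCb()
{
    std::lock_guard<std::mutex> lock(stopMutex);
    return stoppedIds;
}
```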
### Statistics

The batch manager can report execution statistics when provided with the following callback:

```cpp
using ReturnBatchManagerStatsCallback = std::function<void(const std::string&)>;
```

The statistics are packaged as a JSON string. That string contains the following fields:

- `Timestamp`, the timestamp of the request (obtained using `std::put_time(&tm, "%m-%d-%Y %H:%M:%S")`)
- `Iteration Counter`, a global step counter value that increases monotonically over time
- `Active Request Count`, the number of active requests in the batch manager
- `Max Request Count`, the maximum number of requests the batch manager can support at a time

When using the paged KV cache, the following statistics are reported:

- `Max KV cache blocks`, the maximum number of KV cache blocks per GPU
- `Free KV cache blocks`, the number of free KV cache blocks per GPU
- `Used KV cache blocks`, the number of used KV cache blocks per GPU
- `Tokens per KV cache block`, the number of tokens per KV cache block
- `Scheduled Requests`, the number of requests scheduled this iteration

When using in-flight batching, the following additional statistics are reported per step/iteration:

- `Scheduled Requests`, the total number of requests scheduled
- `Context Requests`, the number of requests in the context phase
- `Generation Requests`, the number of requests in the generation phase
- `Total Context Tokens`, the total number of tokens across requests in the context phase
- `MicroBatch ID`, the micro batch ID

When using V1 batching, the following additional statistics are reported per V1 iteration:

- `Scheduled Requests`, the total number of requests scheduled
- `Context Requests`, the number of requests in the context phase
- `Total Generation Tokens`, the total number of tokens generated
- `Total Context Tokens`, the total number of tokens across requests in the context phase
- `Empty Generation Slots`, the total number of padded slots during the generation phase
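As a simple illustration, a statistics callback can do as little as writing each JSON string to standard output; a real deployment would more likely feed the string into its metrics or logging pipeline:

```cpp
#include <iostream>
#include <string>

// Invoked by the batch manager at the end of iterations that have active requests.
void returnBatchManagerStatsCb(const std::string& statsJson)
{
    // statsJson contains fields such as "Timestamp", "Iteration Counter",
    // "Active Request Count" and "Max Request Count" described above.
    std::cout << statsJson << std::endl;
}
```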
### GptManager Design

The batch manager is designed to integrate into an inference server that executes a pool of active work items populated by the stream of requests received by the server. GptManager assumes a GPT-style autoregressive model architecture. GptManager spawns a worker thread in its constructor that then persistently runs the token generation loop. The worker thread invokes `GetInferenceRequestsCallback` at the start of each loop iteration, which is intended to read new requests. It invokes `SendResponseCallback` at the end of each iteration when one or more requests have generated a response to send back to the user. This response can be a single token, in the case of requests that have streaming mode enabled, or the full response when streaming mode is disabled. `PollStopSignalCallback` and `ReturnBatchManagerStatsCallback`, if provided, are both invoked at the end of each iteration. `ReturnBatchManagerStatsCallback` is not called when the system has no active requests.

The server can safely retire requests from its pool of work items when notified of completion (via the `final_response` boolean argument) by the batch manager in `SendResponseCallback`. All TensorRT-LLM internal state related to that request will have been freed before this point. An instance of the batch manager to serve an auto-regressive model like GPT can be created as follows:
<div class="highlight-cpp notranslate"><div class="highlight"><pre><span></span><span class="cp">#include</span><span class="w"> </span><span class="cpf"><tensorrt_llm/batch_manager/GptManager.h></span>
|
||
|
||
<span class="k">using</span><span class="w"> </span><span class="k">namespace</span><span class="w"> </span><span class="nn">tensorrt_llm</span><span class="o">::</span><span class="nn">batch_manager</span><span class="p">;</span>
|
||
|
||
<span class="n">GptManager</span><span class="w"> </span><span class="n">batchManager</span><span class="p">(</span><span class="n">pathToTrtEngine</span><span class="p">,</span><span class="w"> </span><span class="c1">// Path to the TensorRT engine of the model,</span>
|
||
<span class="w"> </span><span class="n">TrtGptModelType</span><span class="o">::</span><span class="n">InflightBatching</span><span class="p">,</span><span class="w"> </span><span class="c1">// Use in-flight batching,</span>
|
||
<span class="w"> </span><span class="n">maxBeamWidth</span><span class="p">,</span><span class="w"> </span><span class="c1">// Maximum beam width (must be >= 1),</span>
|
||
<span class="w"> </span><span class="n">schedulerPolicy</span><span class="p">,</span><span class="w"> </span><span class="c1">// Scheduling policy (see below),</span>
|
||
<span class="w"> </span><span class="n">maxNumRequests</span><span class="p">,</span><span class="w"> </span><span class="c1">// Maximum number of requests,</span>
|
||
<span class="w"> </span><span class="n">getInferenceRequestsCb</span><span class="p">,</span><span class="w"> </span><span class="c1">// The Get callback (see above),</span>
|
||
<span class="w"> </span><span class="n">sendResponseCb</span><span class="p">);</span><span class="w"> </span><span class="c1">// The Send callback (see above).</span>
|
||
</pre></div>
|
||
</div>
The scheduler policy helps the batch manager adjust how requests are scheduled for execution. The batch manager can try to maximize the utilization of the GPUs by aggressively scheduling requests (`schedulerPolicy` set to `MAX_UTILIZATION`), at the risk of having to pause requests if it runs short on memory for KV caches. Note that any paused request will be automatically resumed, and the only user-visible effect may be increased latency. The batch manager can also adopt a more conservative approach and schedule requests only when it knows that the memory allocation will be sufficient to process all active requests even in the worst case of KV cache consumption. That mode corresponds to a `schedulerPolicy` set to `GUARANTEED_NO_EVICT`.

The `GptManager`'s worker thread terminates when the `GptManager` destructor is called and there are no more active requests.
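To illustrate the work-item pattern described above, the sketch below enqueues a request for the batch manager (reusing the hypothetical `pendingRequests` queue and `sendResponseCb` from the earlier sketches) and blocks the server-side handler until the final response retires it; all helper names are illustrative assumptions, not part of the GptManager API:

```cpp
#include <condition_variable>
#include <cstdint>
#include <memory>
#include <mutex>
#include <unordered_set>

// Hypothetical completion tracking shared between the server threads and sendResponseCb.
std::mutex completionMutex;
std::condition_variable completionCv;
std::unordered_set<uint64_t> completedIds;

// Called from sendResponseCb when the final response for a request has been delivered.
void notifyCompletion(uint64_t requestId)
{
    {
        std::lock_guard<std::mutex> lock(completionMutex);
        completedIds.insert(requestId);
    }
    completionCv.notify_all();
}

// Server-side handler for one incoming request.
void handleClientRequest(std::shared_ptr<InferenceRequest> request, uint64_t requestId)
{
    // Hand the request to the batch manager; it will be picked up by
    // getInferenceRequestsCb at the start of a generation-loop iteration.
    {
        std::lock_guard<std::mutex> lock(pendingMutex);
        pendingRequests.push_back(request);
    }

    // Wait until sendResponseCb has reported the final response; by then the batch
    // manager has freed all internal state for this request and the ID may be reused.
    std::unique_lock<std::mutex> lock(completionMutex);
    completionCv.wait(lock, [&] { return completedIds.count(requestId) > 0; });
}
```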
### Multi-GPU execution

When running on multiple GPUs using either tensor or pipeline parallelism, it is assumed that the server launches as many processes as there are GPU ranks, and that each process runs its own instance of `GptManager`. The number of GPUs visible on a given node can be controlled using the `CUDA_VISIBLE_DEVICES` environment variable.

Care must be taken to ensure that all ranks see the same inputs at each iteration of the generation loop. In the TensorRT-LLM Triton backend, an MPI broadcast is performed in `GetInferenceRequestsCallback` to ensure that the same set of requests is seen by each of the MPI ranks. `ReturnBatchManagerStatsCallback` need only be called from a single rank; all ranks hold identical copies of the final results.
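The sketch below is a hedged illustration of that pattern (not the actual Triton backend code): rank 0 collects new requests and broadcasts a serialized copy to all other ranks from inside `GetInferenceRequestsCallback`. The serialization helpers and `collectPendingRequests` are hypothetical:

```cpp
#include <mpi.h>
#include <cstdint>
#include <list>
#include <memory>
#include <vector>

// Hypothetical helpers: convert a list of requests to/from a flat byte buffer.
std::vector<char> serializeRequests(std::list<std::shared_ptr<InferenceRequest>> const& requests);
std::list<std::shared_ptr<InferenceRequest>> deserializeRequests(std::vector<char> const& buffer);
// Hypothetical helper: pull up to maxNbRequests new requests from the serving layer.
std::list<std::shared_ptr<InferenceRequest>> collectPendingRequests(int32_t maxNbRequests);

std::list<std::shared_ptr<InferenceRequest>> getInferenceRequestsCb(int32_t maxNbRequests)
{
    int rank = 0;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    std::vector<char> buffer;
    if (rank == 0)
    {
        // Only rank 0 pulls new requests from the serving layer.
        auto newRequests = collectPendingRequests(maxNbRequests);
        buffer = serializeRequests(newRequests);
    }

    // Broadcast the payload size, then the payload, so every rank sees the same requests.
    int64_t size = static_cast<int64_t>(buffer.size());
    MPI_Bcast(&size, 1, MPI_INT64_T, 0, MPI_COMM_WORLD);
    buffer.resize(size);
    MPI_Bcast(buffer.data(), static_cast<int>(size), MPI_CHAR, 0, MPI_COMM_WORLD);

    return deserializeRequests(buffer);
}
```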
## In-flight Batching with the Triton Inference Server

A Triton Inference Server C++ backend is provided with TensorRT-LLM that includes the mechanisms needed to serve models using in-flight batching. That backend is also a good starting example of how to implement in-flight batching using the TensorRT-LLM batch manager.