Update gh-pages (#2651)

Kaiyu Xie 2025-01-03 15:12:39 +08:00 committed by GitHub
parent 4ad18fd144
commit f11aeed624
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
82 changed files with 1827 additions and 1030 deletions

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -110,6 +110,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -110,6 +110,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -2362,8 +2363,10 @@
<span class="n">out</span> <span class="o">=</span> <span class="n">record</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
<span class="c1"># Left the result_handler determine the final output dtype.</span>
<span class="c1"># NOTE: This will change the CompletionOutput._postprocess_result</span>
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">nvtx</span><span class="o">.</span><span class="n">range_push</span><span class="p">(</span><span class="s2">&quot;_result_handler&quot;</span><span class="p">)</span>
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_result_handler</span><span class="p">:</span>
<span class="n">out</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_result_handler</span><span class="p">(</span><span class="n">record</span><span class="p">)</span>
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">nvtx</span><span class="o">.</span><span class="n">range_pop</span><span class="p">()</span>
<span class="c1"># TODO: Keep only the diff token_ids and text in streaming mode when</span>
<span class="c1"># result_handler is not set</span>
@ -2388,6 +2391,7 @@
<span class="n">Input</span> <span class="o">=</span> <span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Input</span>
<span class="n">Output</span> <span class="o">=</span> <span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Output</span>
<span class="nd">@nvtx_range</span><span class="p">(</span><span class="s2">&quot;handle_single_input&quot;</span><span class="p">)</span>
<span class="k">async</span> <span class="k">def</span> <span class="nf">handle_single_input</span><span class="p">(</span>
<span class="nb">input</span><span class="p">:</span> <span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Input</span><span class="p">,</span>
<span class="n">batch</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Output</span><span class="p">]):</span>

View File

@ -110,6 +110,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -110,6 +110,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -110,6 +110,7 @@
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -4,6 +4,12 @@ Executor
.. Here are files in the cpp/include/executor
.. We manually add subsection to enable detailed description in the future
.. It is also doable to automatically generate this file and list all the modules in the conf.py
disaggServerUtil.h
__________________
.. doxygenfile:: disaggServerUtil.h
:project: TensorRT-LLM
executor.h
__________
@ -28,12 +34,6 @@ _______
.. doxygenfile:: types.h
:project: TensorRT-LLM
disaggServerUtil.h
__________________
.. doxygenfile:: disaggServerUtil.h
:project: TensorRT-LLM
version.h
_________

View File

@ -40,6 +40,12 @@ ________________
.. doxygenfile:: decodingOutput.h
:project: TensorRT-LLM
eagleBuffers.h
______________
.. doxygenfile:: eagleBuffers.h
:project: TensorRT-LLM
explicitDraftTokensBuffers.h
____________________________
@ -112,6 +118,12 @@ __________
.. doxygenfile:: ipcUtils.h
:project: TensorRT-LLM
lookaheadBuffers.h
__________________
.. doxygenfile:: lookaheadBuffers.h
:project: TensorRT-LLM
lookaheadModule.h
_________________
@ -172,6 +184,12 @@ _________
.. doxygenfile:: request.h
:project: TensorRT-LLM
runtimeDefaults.h
_________________
.. doxygenfile:: runtimeDefaults.h
:project: TensorRT-LLM
samplingConfig.h
________________
@ -202,21 +220,3 @@ _____________
.. doxygenfile:: worldConfig.h
:project: TensorRT-LLM
eagleBuffers.h
______________
.. doxygenfile:: eagleBuffers.h
:project: TensorRT-LLM
lookaheadBuffers.h
__________________
.. doxygenfile:: lookaheadBuffers.h
:project: TensorRT-LLM
runtimeDefaults.h
_________________
.. doxygenfile:: runtimeDefaults.h
:project: TensorRT-LLM

View File

@ -58,13 +58,13 @@ In the code above, the `requestId` assigned to a request by different executors
![disaggregated-service usage](images/disaggregated-service_usage.png)
An `orchestrator` is required in `disaggregated-service` to manage multiple executor instance and route requests to different executors, TRT-LLM provides class [DisaggExecutorOrchestrator](../../../cpp/include/tensorrt_llm/executor/disaggServerUtil.h) to help user to launch multiple executor instances, however, `DisaggExecutorOrchestrator` only routes requests to executors in a simple round-robin policy, users need to implement their own orchestrator for disaggregated-service based on their business.
An `orchestrator` is required in `disaggregated-service` to manage multiple executor instances and route requests to the appropriate executors. TRT-LLM provides the class `DisaggExecutorOrchestrator` in `cpp/include/tensorrt_llm/executor/disaggServerUtil.h` to help users launch multiple executor instances; however, `DisaggExecutorOrchestrator` only routes requests to executors with a simple round-robin policy, so users need to implement their own orchestrator for disaggregated-service based on their business logic.
TRT-LLM currently implements kvCache transfer using `CUDA-aware MPI`, and all executor processes involved need to share the same MPI world communicator. Therefore, TRT-LLM only supports launching multiple executors using `MPI`, and the `CommunicationMode` of the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for `disaggregated-service`. TRT-LLM will relax this restriction in a future version to make managing executors easier.
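The round-robin routing and the `requestId` bookkeeping described above can be illustrated with a small, hypothetical sketch; the executor objects and their `enqueue_request` method below are placeholders, not the real TensorRT-LLM executor API.

```python
# Hypothetical sketch of a round-robin orchestrator for a disaggregated service.
# The executor handles and enqueue_request() are placeholders, not real TRT-LLM APIs.
import itertools


class RoundRobinOrchestrator:
    """Route context-only and generation-only requests across two executor pools."""

    def __init__(self, context_executors, generation_executors):
        self._ctx_cycle = itertools.cycle(context_executors)
        self._gen_cycle = itertools.cycle(generation_executors)
        # A context executor and a generation executor may assign different
        # request ids to the same logical request, so keep the mapping here.
        self.ctx_to_gen_request_id = {}

    def submit_context_only(self, request):
        executor = next(self._ctx_cycle)
        return executor, executor.enqueue_request(request)

    def submit_generation_only(self, request, ctx_request_id):
        executor = next(self._gen_cycle)
        gen_request_id = executor.enqueue_request(request)
        self.ctx_to_gen_request_id[ctx_request_id] = gen_request_id
        return executor, gen_request_id
```

A production orchestrator would typically replace the plain round-robin choice with load- or KV-cache-aware scheduling, which is why implementing your own orchestrator is recommended.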
## Benchmarks
Please refer to [disaggServerBenchmark](../../../benchmarks/cpp/README.md#4.launch-C++-disaggServerBenchmark)
Please refer to `benchmarks/cpp/disaggServerBenchmark.cpp` and `benchmarks/cpp/README.md`
## Troubleshooting and FAQ

View File

@ -106,6 +106,7 @@ Welcome to TensorRT-LLM's Documentation!
advanced/expert-parallelism.md
advanced/kv-cache-reuse.md
advanced/speculative-decoding.md
advanced/disaggregated-service.md
.. toctree::
:maxdepth: 2

View File

@ -5,15 +5,15 @@ Examples
:maxdepth: 2
:caption: Scripts
llm_guided_decoding
llm_inference
llm_inference_async
llm_inference_async_streaming
llm_inference_customize
llm_inference_distributed
llm_logits_processor
llm_quantization
llm_guided_decoding
llm_lookahead_decoding
llm_medusa_decoding
llm_multilora
llm_quantization
llm_auto_parallel

View File

@ -21,7 +21,9 @@
<script src="../_static/copybutton.js?v=65e89d2a"></script>
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Overview" href="../performance/perf-overview.html" />
<link rel="prev" title="Speculative Sampling" href="speculative-decoding.html" />
</head>
<body class="wy-body-for-nav">
@ -100,7 +102,7 @@
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
<ul>
<ul class="current">
<li class="toctree-l1"><a class="reference internal" href="gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
<li class="toctree-l1"><a class="reference internal" href="gpt-runtime.html">C++ GPT Runtime</a></li>
<li class="toctree-l1"><a class="reference internal" href="executor.html">Executor API</a></li>
@ -111,6 +113,16 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Disaggregated-Service (experimental)</a><ul>
<li class="toctree-l2"><a class="reference internal" href="#usage">Usage</a></li>
<li class="toctree-l2"><a class="reference internal" href="#benchmarks">Benchmarks</a></li>
<li class="toctree-l2"><a class="reference internal" href="#troubleshooting-and-faq">Troubleshooting and FAQ</a><ul>
<li class="toctree-l3"><a class="reference internal" href="#general-faqs">General FAQs</a></li>
<li class="toctree-l3"><a class="reference internal" href="#debugging-faqs">Debugging FAQs</a></li>
</ul>
</li>
</ul>
</li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -202,12 +214,12 @@ This feature is currently experimental, and the related API is subjected to chan
<p>The generationExecutor will require data such as kvCache from the corresponding contextExecutor based on the <code class="docutils literal notranslate"><span class="pre">contextPhaseParams</span></code> attached to the request, so please make sure that the corresponding contextExecutor is not shut down before getting the generationExecutor's response.</p>
<p>In the code above, the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> assigned to a request by different executors may be different; it is the user's responsibility to manage the mapping of the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> for context-only requests to the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> for generation-only requests.</p>
<p><img alt="disaggregated-service usage" src="../_images/disaggregated-service_usage.png" /></p>
<p>An <code class="docutils literal notranslate"><span class="pre">orchestrator</span></code> is required in <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code> to manage multiple executor instance and route requests to different executors, TRT-LLM provides class <a class="reference download internal" download="" href="../_downloads/5b0862c3dcc6c1b12192056ed28ee5c1/disaggServerUtil.h"><span class="xref download myst">DisaggExecutorOrchestrator</span></a> to help user to launch multiple executor instances, however, <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> only routes requests to executors in a simple round-robin policy, users need to implement their own orchestrator for disaggregated-service based on their business.</p>
<p>An <code class="docutils literal notranslate"><span class="pre">orchestrator</span></code> is required in <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code> to manage multiple executor instance and route requests to different executors, TRT-LLM provides class <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> in <code class="docutils literal notranslate"><span class="pre">cpp/include/tensorrt_llm/executor/disaggServerUtil.h</span></code> to help user to launch multiple executor instances, however, <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> only routes requests to executors in a simple round-robin policy, users need to implement their own orchestrator for disaggregated-service based on their business.</p>
<p>TRT-LLM currently implements kvCache transfer using <code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code>, and all executor processes involved need to hold same MPI world communicator. Therefore, TRT-LLM only supports launching multiple executors using <code class="docutils literal notranslate"><span class="pre">MPI</span></code>, and the <code class="docutils literal notranslate"><span class="pre">CommunicationMode</span></code> of the executors must be set to <code class="docutils literal notranslate"><span class="pre">KLEADER</span></code> or <code class="docutils literal notranslate"><span class="pre">kORCHESTRATOR</span></code> with <code class="docutils literal notranslate"><span class="pre">SpawnProcesses=false</span></code> for <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code>, TRT-LLM will relax this restriction in future version to manage executors with greater ease.</p>
</section>
<section id="benchmarks">
<h2>Benchmarks<a class="headerlink" href="#benchmarks" title="Link to this heading"></a></h2>
<p>Please refer to <span class="xref myst">disaggServerBenchmark</span></p>
<p>Please refer to <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/disaggServerBenchmark.cpp</span></code> and <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/README.md</span></code></p>
</section>
<section id="troubleshooting-and-faq">
<h2>Troubleshooting and FAQ<a class="headerlink" href="#troubleshooting-and-faq" title="Link to this heading"></a></h2>
@ -250,12 +262,15 @@ If version of UCX =1.18, set <code class="docutils literal notranslate"><span cl
</div>
</div>
<footer>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="speculative-decoding.html" class="btn btn-neutral float-left" title="Speculative Sampling" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="../performance/perf-overview.html" class="btn btn-neutral float-right" title="Overview" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<div class="footer">
<p>

View File

@ -128,6 +128,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -118,6 +118,7 @@
</li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -147,6 +147,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -124,6 +124,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -122,6 +122,7 @@
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -122,6 +122,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -22,7 +22,7 @@
<script src="../_static/js/theme.js"></script>
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="Overview" href="../performance/perf-overview.html" />
<link rel="next" title="Disaggregated-Service (experimental)" href="disaggregated-service.html" />
<link rel="prev" title="KV cache reuse" href="kv-cache-reuse.html" />
</head>
@ -133,6 +133,7 @@
<li class="toctree-l2"><a class="reference internal" href="#lookahead-decoding">Lookahead Decoding</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -612,13 +613,13 @@ However, similar to any new model, you can follow the same approach to define yo
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="kv-cache-reuse.html" class="btn btn-neutral float-left" title="KV cache reuse" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="../performance/perf-overview.html" class="btn btn-neutral float-right" title="Overview" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
<a href="disaggregated-service.html" class="btn btn-neutral float-right" title="Disaggregated-Service (experimental)" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<div class="footer">
<p>

View File

@ -111,6 +111,7 @@
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -120,6 +120,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -132,6 +132,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -111,6 +111,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -116,6 +116,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -120,6 +120,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -112,6 +112,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -121,6 +121,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -172,11 +173,14 @@
<section id="trtllm-build">
<h1>trtllm-build<a class="headerlink" href="#trtllm-build" title="Link to this heading"></a></h1>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">usage</span><span class="p">:</span> <span class="n">trtllm</span><span class="o">-</span><span class="n">build</span> <span class="p">[</span><span class="o">-</span><span class="n">h</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">checkpoint_dir</span> <span class="n">CHECKPOINT_DIR</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">model_config</span> <span class="n">MODEL_CONFIG</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">build_config</span> <span class="n">BUILD_CONFIG</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">model_config</span> <span class="n">MODEL_CONFIG</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">build_config</span> <span class="n">BUILD_CONFIG</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_file</span> <span class="n">MODEL_CLS_FILE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_name</span> <span class="n">MODEL_CLS_NAME</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">output_dir</span> <span class="n">OUTPUT_DIR</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_name</span> <span class="n">MODEL_CLS_NAME</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">output_dir</span> <span class="n">OUTPUT_DIR</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_batch_size</span> <span class="n">MAX_BATCH_SIZE</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_input_len</span> <span class="n">MAX_INPUT_LEN</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">max_seq_len</span> <span class="n">MAX_SEQ_LEN</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_input_len</span> <span class="n">MAX_INPUT_LEN</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_seq_len</span> <span class="n">MAX_SEQ_LEN</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_beam_width</span> <span class="n">MAX_BEAM_WIDTH</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">max_num_tokens</span> <span class="n">MAX_NUM_TOKENS</span><span class="p">]</span>
<span class="p">[</span><span class="o">--</span><span class="n">opt_num_tokens</span> <span class="n">OPT_NUM_TOKENS</span><span class="p">]</span>

View File

@ -116,6 +116,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -110,6 +110,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -112,6 +112,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -341,6 +342,12 @@
<li class="toctree-l2"><a class="reference internal" href="advanced/speculative-decoding.html#lookahead-decoding">Lookahead Decoding</a></li>
</ul>
</li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a><ul>
<li class="toctree-l2"><a class="reference internal" href="advanced/disaggregated-service.html#usage">Usage</a></li>
<li class="toctree-l2"><a class="reference internal" href="advanced/disaggregated-service.html#benchmarks">Benchmarks</a></li>
<li class="toctree-l2"><a class="reference internal" href="advanced/disaggregated-service.html#troubleshooting-and-faq">Troubleshooting and FAQ</a></li>
</ul>
</li>
</ul>
</div>
<div class="toctree-wrapper compound" id="performance">

View File

@ -130,6 +130,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -125,6 +125,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -128,6 +128,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -132,6 +132,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>

View File

@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="current reference internal" href="#">Examples</a><ul>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -180,17 +181,17 @@
<div class="toctree-wrapper compound">
<p class="caption" role="heading"><span class="caption-text">Scripts</span></p>
<ul>
<li class="toctree-l1"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l1"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</div>

View File

@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -229,7 +230,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6b20620>
<jinja2.runtime.BlockReference object at 0x7da747644cb0>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -239,7 +240,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6b1eba0>
<jinja2.runtime.BlockReference object at 0x7da7434e8aa0>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -233,7 +234,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f69c8fb0>
<jinja2.runtime.BlockReference object at 0x7da747726240>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -237,7 +238,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6b218b0>
<jinja2.runtime.BlockReference object at 0x7da747794290>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -257,7 +258,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f69ca4e0>
<jinja2.runtime.BlockReference object at 0x7da747640230>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -247,7 +248,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6926630>
<jinja2.runtime.BlockReference object at 0x7da747725b20>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -238,7 +239,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f688f0e0>
<jinja2.runtime.BlockReference object at 0x7da7433417f0>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -251,7 +252,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f69cabd0>
<jinja2.runtime.BlockReference object at 0x7da747803fb0>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -232,7 +233,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6863a70>
<jinja2.runtime.BlockReference object at 0x7da7478103e0>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -250,7 +251,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f688f500>
<jinja2.runtime.BlockReference object at 0x7da747ce7230>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -253,7 +254,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6862ed0>
<jinja2.runtime.BlockReference object at 0x7da747c23170>
<div class="footer">
<p>
View File
@ -70,17 +70,17 @@
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generation with Quantization</a></li>
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
</ul>
</li>
@ -127,6 +127,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -260,7 +261,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f67373b0>
<jinja2.runtime.BlockReference object at 0x7da747cd2930>
<div class="footer">
<p>
View File
@ -123,6 +123,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -283,7 +284,7 @@ Refer to the <a class="reference external" href="https://github.com/NVIDIA/Tenso
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6818ef0>
<jinja2.runtime.BlockReference object at 0x7da74df5a930>
<div class="footer">
<p>

View File

@ -349,6 +349,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -1708,7 +1709,7 @@ changed, you should remove the caches manually.</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f6734770>
<jinja2.runtime.BlockReference object at 0x7da75a1d3f50>
<div class="footer">
<p>
Binary file not shown.
View File
@ -125,6 +125,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -226,7 +227,7 @@ Certain limitations might apply. Refer to the <a class="reference internal" href
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f702e4e0>
<jinja2.runtime.BlockReference object at 0x7da75a027fe0>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="current">
@ -253,7 +254,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f714cad0>
<jinja2.runtime.BlockReference object at 0x7da77c983920>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="current">
@ -746,7 +747,7 @@ The choices are specified with a YAML file like the following example (<code cla
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59fca95d30>
<jinja2.runtime.BlockReference object at 0x7da75a1a8350>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="current">
@ -475,7 +476,7 @@ increase runtime performance at the expense of reduced accuracy.</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59fcbc5c40>
<jinja2.runtime.BlockReference object at 0x7da7afe9b8f0>
<div class="footer">
<p>
View File
@ -23,7 +23,7 @@
<link rel="index" title="Index" href="../genindex.html" />
<link rel="search" title="Search" href="../search.html" />
<link rel="next" title="TensorRT-LLM Benchmarking" href="perf-benchmarking.html" />
<link rel="prev" title="Speculative Sampling" href="../advanced/speculative-decoding.html" />
<link rel="prev" title="Disaggregated-Service (experimental)" href="../advanced/disaggregated-service.html" />
</head>
<body class="wy-body-for-nav">
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul class="current">
@ -2764,14 +2765,14 @@ that can be compared with the table in the <a class="reference internal" href="#
</div>
</div>
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
<a href="../advanced/speculative-decoding.html" class="btn btn-neutral float-left" title="Speculative Sampling" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="../advanced/disaggregated-service.html" class="btn btn-neutral float-left" title="Disaggregated-Service (experimental)" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
<a href="perf-benchmarking.html" class="btn btn-neutral float-right" title="TensorRT-LLM Benchmarking" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
</div>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59fe3eeba0>
<jinja2.runtime.BlockReference object at 0x7da7afeda9f0>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -257,7 +258,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e6dcc9df0>
<jinja2.runtime.BlockReference object at 0x7da791508d40>
<div class="footer">
<p>
View File
@ -377,6 +377,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -4805,7 +4806,7 @@ function creates a constant tensor.</p></li>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5c64378950>
<jinja2.runtime.BlockReference object at 0x7da747a468a0>
<div class="footer">
<p>
View File
@ -266,6 +266,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -813,7 +814,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5c5b531fa0>
<jinja2.runtime.BlockReference object at 0x7da747bae420>
<div class="footer">
<p>
View File
@ -429,6 +429,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -1752,7 +1753,7 @@ ranges of the dimensions of when using TRT dynamic shapes.</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5c66337110>
<jinja2.runtime.BlockReference object at 0x7da79270e510>
<div class="footer">
<p>
View File
@ -119,6 +119,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -219,7 +220,7 @@ migrated to the centralized building script <cite>tensorrt_llm/commands/build.py
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f700e120>
<jinja2.runtime.BlockReference object at 0x7da79273be60>
<div class="footer">
<p>
View File
@ -118,6 +118,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -200,7 +201,7 @@ the quantized model as TRT-LLM checkpoint</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5c664b7f80>
<jinja2.runtime.BlockReference object at 0x7da792788a10>
<div class="footer">
<p>
View File
@ -376,6 +376,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -1906,7 +1907,7 @@ For example, word_dict[2] = [” I am happy”, “ I am sad”].</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f59f7097170>
<jinja2.runtime.BlockReference object at 0x7da792540e60>
<div class="footer">
<p>
View File
@ -122,6 +122,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -303,7 +304,7 @@ The model definition is a minimal example that shows some of the optimizations a
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e61f9f620>
<jinja2.runtime.BlockReference object at 0x7da791570ad0>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -307,7 +308,7 @@ Here some explanations on how these values affect the memory:</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e622e1430>
<jinja2.runtime.BlockReference object at 0x7da7915a73b0>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -756,7 +757,7 @@ are:</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e61f9ffe0>
<jinja2.runtime.BlockReference object at 0x7da7918d4b60>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -343,7 +344,7 @@ In addition, older architectures can have limitations for newer software release
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e6da9dac0>
<jinja2.runtime.BlockReference object at 0x7da7918664e0>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -463,7 +464,7 @@ dedicated MPI environment, not the one provided by your Slurm allocation.</p>
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e6da82cf0>
<jinja2.runtime.BlockReference object at 0x7da7915a7f20>
<div class="footer">
<p>
View File
@ -199,6 +199,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -1271,7 +1272,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e61d0f1a0>
<jinja2.runtime.BlockReference object at 0x7da79153cce0>
<div class="footer">
<p>
View File
@ -113,6 +113,7 @@
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
</ul>
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
<ul>
@ -180,7 +181,7 @@
<hr/>
<div role="contentinfo">
<jinja2.runtime.BlockReference object at 0x7f5e6e53b080>
<jinja2.runtime.BlockReference object at 0x7da791a1b560>
<div class="footer">
<p>
File diff suppressed because one or more lines are too long