mirror of
https://github.com/NVIDIA/TensorRT-LLM.git
synced 2026-01-14 06:27:45 +08:00
Update gh-pages (#2651)
This commit is contained in:
parent
4ad18fd144
commit
f11aeed624
File diff suppressed because one or more lines are too long
File diff suppressed because one or more lines are too long
@ -110,6 +110,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -232,7 +233,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6e6c6e40>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791a96e40>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -110,6 +110,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -2362,8 +2363,10 @@
|
||||
<span class="n">out</span> <span class="o">=</span> <span class="n">record</span><span class="o">.</span><span class="n">outputs</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>
|
||||
<span class="c1"># Let the result_handler determine the final output dtype.</span>
|
||||
<span class="c1"># NOTE: This will change the CompletionOutput._postprocess_result</span>
|
||||
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">nvtx</span><span class="o">.</span><span class="n">range_push</span><span class="p">(</span><span class="s2">"_result_handler"</span><span class="p">)</span>
|
||||
<span class="k">if</span> <span class="bp">self</span><span class="o">.</span><span class="n">_result_handler</span><span class="p">:</span>
|
||||
<span class="n">out</span> <span class="o">=</span> <span class="bp">self</span><span class="o">.</span><span class="n">_result_handler</span><span class="p">(</span><span class="n">record</span><span class="p">)</span>
|
||||
<span class="n">torch</span><span class="o">.</span><span class="n">cuda</span><span class="o">.</span><span class="n">nvtx</span><span class="o">.</span><span class="n">range_pop</span><span class="p">()</span>
|
||||
|
||||
<span class="c1"># TODO: Keep only the diff token_ids and text in streaming mode when</span>
|
||||
<span class="c1"># result_handler is not set</span>
|
||||
@ -2388,6 +2391,7 @@
|
||||
<span class="n">Input</span> <span class="o">=</span> <span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Input</span>
|
||||
<span class="n">Output</span> <span class="o">=</span> <span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Output</span>
|
||||
|
||||
<span class="nd">@nvtx_range</span><span class="p">(</span><span class="s2">"handle_single_input"</span><span class="p">)</span>
|
||||
<span class="k">async</span> <span class="k">def</span> <span class="nf">handle_single_input</span><span class="p">(</span>
|
||||
<span class="nb">input</span><span class="p">:</span> <span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Input</span><span class="p">,</span>
|
||||
<span class="n">batch</span><span class="p">:</span> <span class="n">List</span><span class="p">[</span><span class="n">ResponsePostprocessWorker</span><span class="o">.</span><span class="n">Output</span><span class="p">]):</span>
|
||||
@ -2469,7 +2473,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6dddfe00>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7918410d0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -110,6 +110,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -716,7 +717,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6dd86870>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791842210>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -110,6 +110,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -1889,7 +1890,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6de5f0e0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791af4950>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -110,6 +110,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../../../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -1349,7 +1350,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6e3f6270>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791a97050>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -4,6 +4,12 @@ Executor
|
||||
.. Here are files in the cpp/include/executor
|
||||
.. We manually add subsection to enable detailed description in the future
|
||||
.. It is also doable to automatically generate this file and list all the modules in the conf.py
|
||||
disaggServerUtil.h
|
||||
__________________
|
||||
|
||||
.. doxygenfile:: disaggServerUtil.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
executor.h
|
||||
__________
|
||||
|
||||
@ -28,12 +34,6 @@ _______
|
||||
.. doxygenfile:: types.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
disaggServerUtil.h
|
||||
__________________
|
||||
|
||||
.. doxygenfile:: disaggServerUtil.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
version.h
|
||||
_________
|
||||
|
||||
|
||||
@ -40,6 +40,12 @@ ________________
|
||||
.. doxygenfile:: decodingOutput.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
eagleBuffers.h
|
||||
______________
|
||||
|
||||
.. doxygenfile:: eagleBuffers.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
explicitDraftTokensBuffers.h
|
||||
____________________________
|
||||
|
||||
@ -112,6 +118,12 @@ __________
|
||||
.. doxygenfile:: ipcUtils.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
lookaheadBuffers.h
|
||||
__________________
|
||||
|
||||
.. doxygenfile:: lookaheadBuffers.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
lookaheadModule.h
|
||||
_________________
|
||||
|
||||
@ -172,6 +184,12 @@ _________
|
||||
.. doxygenfile:: request.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
runtimeDefaults.h
|
||||
_________________
|
||||
|
||||
.. doxygenfile:: runtimeDefaults.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
samplingConfig.h
|
||||
________________
|
||||
|
||||
@ -202,21 +220,3 @@ _____________
|
||||
.. doxygenfile:: worldConfig.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
eagleBuffers.h
|
||||
______________
|
||||
|
||||
.. doxygenfile:: eagleBuffers.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
lookaheadBuffers.h
|
||||
__________________
|
||||
|
||||
.. doxygenfile:: lookaheadBuffers.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
runtimeDefaults.h
|
||||
_________________
|
||||
|
||||
.. doxygenfile:: runtimeDefaults.h
|
||||
:project: TensorRT-LLM
|
||||
|
||||
|
||||
@ -58,13 +58,13 @@ In the code above, the `requestId` assigned to a request by different executors
|
||||
|
||||

|
||||
|
||||
An `orchestrator` is required in `disaggregated-service` to manage multiple executor instance and route requests to different executors, TRT-LLM provides class [DisaggExecutorOrchestrator](../../../cpp/include/tensorrt_llm/executor/disaggServerUtil.h) to help user to launch multiple executor instances, however, `DisaggExecutorOrchestrator` only routes requests to executors in a simple round-robin policy, users need to implement their own orchestrator for disaggregated-service based on their business.
|
||||
An `orchestrator` is required in `disaggregated-service` to manage multiple executor instances and route requests to different executors. TRT-LLM provides the class `DisaggExecutorOrchestrator` in `cpp/include/tensorrt_llm/executor/disaggServerUtil.h` to help users launch multiple executor instances; however, `DisaggExecutorOrchestrator` only routes requests to executors using a simple round-robin policy, so users need to implement their own orchestrator for disaggregated-service based on their business needs.
|
||||
|
||||
TRT-LLM currently implements kvCache transfer using `CUDA-aware MPI`, and all executor processes involved need to hold the same MPI world communicator. Therefore, TRT-LLM only supports launching multiple executors using `MPI`, and the `CommunicationMode` of the executors must be set to `kLEADER` or `kORCHESTRATOR` with `SpawnProcesses=false` for `disaggregated-service`. TRT-LLM will relax this restriction in a future version to manage executors with greater ease.
|
||||
|
||||
## Benchmarks
|
||||
|
||||
Please refer to [disaggServerBenchmark](../../../benchmarks/cpp/README.md#4.launch-C++-disaggServerBenchmark)
|
||||
Please refer to `benchmarks/cpp/disaggServerBenchmark.cpp` and `benchmarks/cpp/README.md`.
|
||||
|
||||
|
||||
## Troubleshooting and FAQ
|
||||
|
||||
@ -106,6 +106,7 @@ Welcome to TensorRT-LLM's Documentation!
|
||||
advanced/expert-parallelism.md
|
||||
advanced/kv-cache-reuse.md
|
||||
advanced/speculative-decoding.md
|
||||
advanced/disaggregated-service.md
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
@ -5,15 +5,15 @@ Examples
|
||||
:maxdepth: 2
|
||||
:caption: Scripts
|
||||
|
||||
llm_guided_decoding
|
||||
llm_inference
|
||||
llm_inference_async
|
||||
llm_inference_async_streaming
|
||||
llm_inference_customize
|
||||
llm_inference_distributed
|
||||
llm_logits_processor
|
||||
llm_quantization
|
||||
llm_guided_decoding
|
||||
llm_lookahead_decoding
|
||||
llm_medusa_decoding
|
||||
llm_multilora
|
||||
llm_quantization
|
||||
llm_auto_parallel
|
||||
|
||||
@ -21,7 +21,9 @@
|
||||
<script src="../_static/copybutton.js?v=65e89d2a"></script>
|
||||
<script src="../_static/js/theme.js"></script>
|
||||
<link rel="index" title="Index" href="../genindex.html" />
|
||||
<link rel="search" title="Search" href="../search.html" />
|
||||
<link rel="search" title="Search" href="../search.html" />
|
||||
<link rel="next" title="Overview" href="../performance/perf-overview.html" />
|
||||
<link rel="prev" title="Speculative Sampling" href="speculative-decoding.html" />
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
@ -100,7 +102,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../architecture/add-model.html">Adding a Model</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Advanced</span></p>
|
||||
<ul>
|
||||
<ul class="current">
|
||||
<li class="toctree-l1"><a class="reference internal" href="gpt-attention.html">Multi-Head, Multi-Query, and Group-Query Attention</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="gpt-runtime.html">C++ GPT Runtime</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="executor.html">Executor API</a></li>
|
||||
@ -111,6 +113,16 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1 current"><a class="current reference internal" href="#">Disaggregated-Service (experimental)</a><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="#usage">Usage</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="#benchmarks">Benchmarks</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="#troubleshooting-and-faq">Troubleshooting and FAQ</a><ul>
|
||||
<li class="toctree-l3"><a class="reference internal" href="#general-faqs">General FAQs</a></li>
|
||||
<li class="toctree-l3"><a class="reference internal" href="#debugging-faqs">Debugging FAQs</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -202,12 +214,12 @@ This feature is currently experimental, and the related API is subjected to chan
|
||||
<p>The generationExecutor will require data such as kvCache from the corresponding contextExecutor based on the <code class="docutils literal notranslate"><span class="pre">contextPhaseParams</span></code> attached to the request, so please make sure that the corresponding contextExecutor is not shut down before getting the generationExecutor’s response.</p>
|
||||
<p>In the code above, the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> assigned to a request by different executors may be different, it is the user’s responsibility to manage the mapping of the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> for context-only requests to the <code class="docutils literal notranslate"><span class="pre">requestId</span></code> for generation-only requests.</p>
|
||||
<p><img alt="disaggregated-service usage" src="../_images/disaggregated-service_usage.png" /></p>
|
||||
<p>An <code class="docutils literal notranslate"><span class="pre">orchestrator</span></code> is required in <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code> to manage multiple executor instance and route requests to different executors, TRT-LLM provides class <a class="reference download internal" download="" href="../_downloads/5b0862c3dcc6c1b12192056ed28ee5c1/disaggServerUtil.h"><span class="xref download myst">DisaggExecutorOrchestrator</span></a> to help user to launch multiple executor instances, however, <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> only routes requests to executors in a simple round-robin policy, users need to implement their own orchestrator for disaggregated-service based on their business.</p>
|
||||
<p>An <code class="docutils literal notranslate"><span class="pre">orchestrator</span></code> is required in <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code> to manage multiple executor instances and route requests to different executors. TRT-LLM provides the class <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> in <code class="docutils literal notranslate"><span class="pre">cpp/include/tensorrt_llm/executor/disaggServerUtil.h</span></code> to help users launch multiple executor instances; however, <code class="docutils literal notranslate"><span class="pre">DisaggExecutorOrchestrator</span></code> only routes requests to executors using a simple round-robin policy, so users need to implement their own orchestrator for disaggregated-service based on their business needs.</p>
|
||||
<p>TRT-LLM currently implements kvCache transfer using <code class="docutils literal notranslate"><span class="pre">CUDA-aware</span> <span class="pre">MPI</span></code>, and all executor processes involved need to hold the same MPI world communicator. Therefore, TRT-LLM only supports launching multiple executors using <code class="docutils literal notranslate"><span class="pre">MPI</span></code>, and the <code class="docutils literal notranslate"><span class="pre">CommunicationMode</span></code> of the executors must be set to <code class="docutils literal notranslate"><span class="pre">kLEADER</span></code> or <code class="docutils literal notranslate"><span class="pre">kORCHESTRATOR</span></code> with <code class="docutils literal notranslate"><span class="pre">SpawnProcesses=false</span></code> for <code class="docutils literal notranslate"><span class="pre">disaggregated-service</span></code>. TRT-LLM will relax this restriction in a future version to manage executors with greater ease.</p>
|
||||
</section>
|
||||
<section id="benchmarks">
|
||||
<h2>Benchmarks<a class="headerlink" href="#benchmarks" title="Link to this heading"></a></h2>
|
||||
<p>Please refer to <span class="xref myst">disaggServerBenchmark</span></p>
|
||||
<p>Please refer to <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/disaggServerBenchmark.cpp</span></code> and <code class="docutils literal notranslate"><span class="pre">benchmarks/cpp/README.md</span></code></p>
|
||||
</section>
|
||||
<section id="troubleshooting-and-faq">
|
||||
<h2>Troubleshooting and FAQ<a class="headerlink" href="#troubleshooting-and-faq" title="Link to this heading"></a></h2>
|
||||
@ -250,12 +262,15 @@ If version of UCX =1.18, set <code class="docutils literal notranslate"><span cl
|
||||
|
||||
</div>
|
||||
</div>
|
||||
<footer>
|
||||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||||
<a href="speculative-decoding.html" class="btn btn-neutral float-left" title="Speculative Sampling" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||||
<a href="../performance/perf-overview.html" class="btn btn-neutral float-right" title="Overview" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||||
</div>
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f714f6b0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747e8d0a0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -128,6 +128,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -289,7 +290,7 @@ the TensorRT-LLM C++ Executor API.</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f71dcbc0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747e36000>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -118,6 +118,7 @@
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -201,7 +202,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59fe3ca5d0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747e9ea80>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -147,6 +147,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -521,7 +522,7 @@ is computed as:</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f71f8080>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747d4d790>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -574,7 +575,7 @@ one.</p></li>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f71fe2a0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747ed2c30>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -124,6 +124,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -380,7 +381,7 @@ techniques to optimize the underlying graph. It provides a wrapper similar to P
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f71ead50>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c66870>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -396,7 +397,7 @@ The mandatory input tensors to create a valid <code class="docutils literal notr
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f708c9b0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c6e0f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -122,6 +122,7 @@
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -284,7 +285,7 @@ Assume vocaburlay size is 100, which means normal text token ids are in range [0
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f70951c0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747d4c5f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -122,6 +122,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -358,7 +359,7 @@ The following tensors are for a LoRA which has a <code class="docutils literal n
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f700eba0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c21dc0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -22,7 +22,7 @@
|
||||
<script src="../_static/js/theme.js"></script>
|
||||
<link rel="index" title="Index" href="../genindex.html" />
|
||||
<link rel="search" title="Search" href="../search.html" />
|
||||
<link rel="next" title="Overview" href="../performance/perf-overview.html" />
|
||||
<link rel="next" title="Disaggregated-Service (experimental)" href="disaggregated-service.html" />
|
||||
<link rel="prev" title="KV cache reuse" href="kv-cache-reuse.html" />
|
||||
</head>
|
||||
|
||||
@ -133,6 +133,7 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="#lookahead-decoding">Lookahead Decoding</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -612,13 +613,13 @@ However, similar to any new model, you can follow the same approach to define yo
|
||||
</div>
|
||||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||||
<a href="kv-cache-reuse.html" class="btn btn-neutral float-left" title="KV cache reuse" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||||
<a href="../performance/perf-overview.html" class="btn btn-neutral float-right" title="Overview" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||||
<a href="disaggregated-service.html" class="btn btn-neutral float-right" title="Disaggregated-Service (experimental)" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||||
</div>
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6f73590>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c04d40>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -111,6 +111,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -232,7 +233,7 @@ python3<span class="w"> </span>examples/summarize.py<span class="w"> </span><spa
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f703bd70>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747ba1f70>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -120,6 +120,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -271,7 +272,7 @@ python<span class="w"> </span>../summarize.py<span class="w"> </span>--engine_di
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f71f8560>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747b2fc80>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -132,6 +132,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -536,7 +537,7 @@ trtllm-build<span class="w"> </span>--checkpoint_dir<span class="w"> </span>./op
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f700ea80>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747bb12e0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -535,7 +536,7 @@ srun<span class="w"> </span><span class="se">\</span>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6e78950>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747bad2b0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -111,6 +111,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -456,7 +457,7 @@ The support for Qwen-1 is in <code class="docutils literal notranslate"><span cl
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f71ddcd0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c04710>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -116,6 +116,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -189,7 +190,7 @@ Server</a> to easily create web-based services for LLMs. TensorRT-LLM supports m
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f700f1a0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747a442f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -120,6 +120,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -368,7 +369,7 @@ The usage of this API looks like this:</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6e78c80>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c05070>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -326,7 +327,7 @@ ISL = Input Sequence Length
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6e743b0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747bc1130>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -278,7 +279,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f700f920>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747ac9550>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -270,7 +271,7 @@ TensorRT-LLM v0.5.0, TensorRT v9.1.0.4 | H200, H100 FP8. </sub></p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6e76cf0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747bc2b70>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -112,6 +112,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -235,7 +236,7 @@ ISL = Input Sequence Length
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5c66286a80>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747bc2ab0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -390,7 +391,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6d9abd0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747ce70e0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -121,6 +121,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -172,11 +173,14 @@
|
||||
<section id="trtllm-build">
|
||||
<h1>trtllm-build<a class="headerlink" href="#trtllm-build" title="Link to this heading"></a></h1>
|
||||
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">usage</span><span class="p">:</span> <span class="n">trtllm</span><span class="o">-</span><span class="n">build</span> <span class="p">[</span><span class="o">-</span><span class="n">h</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">checkpoint_dir</span> <span class="n">CHECKPOINT_DIR</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">model_config</span> <span class="n">MODEL_CONFIG</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">build_config</span> <span class="n">BUILD_CONFIG</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">model_config</span> <span class="n">MODEL_CONFIG</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">build_config</span> <span class="n">BUILD_CONFIG</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_file</span> <span class="n">MODEL_CLS_FILE</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_name</span> <span class="n">MODEL_CLS_NAME</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">output_dir</span> <span class="n">OUTPUT_DIR</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">model_cls_name</span> <span class="n">MODEL_CLS_NAME</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">output_dir</span> <span class="n">OUTPUT_DIR</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">max_batch_size</span> <span class="n">MAX_BATCH_SIZE</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">max_input_len</span> <span class="n">MAX_INPUT_LEN</span><span class="p">]</span> <span class="p">[</span><span class="o">--</span><span class="n">max_seq_len</span> <span class="n">MAX_SEQ_LEN</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">max_input_len</span> <span class="n">MAX_INPUT_LEN</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">max_seq_len</span> <span class="n">MAX_SEQ_LEN</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">max_beam_width</span> <span class="n">MAX_BEAM_WIDTH</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">max_num_tokens</span> <span class="n">MAX_NUM_TOKENS</span><span class="p">]</span>
|
||||
<span class="p">[</span><span class="o">--</span><span class="n">opt_num_tokens</span> <span class="n">OPT_NUM_TOKENS</span><span class="p">]</span>
|
||||
@ -551,7 +555,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6e75310>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747962a50>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -116,6 +116,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -261,7 +262,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6d926f0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747963e00>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -110,6 +110,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -7639,7 +7640,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59fdb6c200>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7915097f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -112,6 +112,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -341,6 +342,12 @@
|
||||
<li class="toctree-l2"><a class="reference internal" href="advanced/speculative-decoding.html#lookahead-decoding">Lookahead Decoding</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="advanced/disaggregated-service.html#usage">Usage</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="advanced/disaggregated-service.html#benchmarks">Benchmarks</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="advanced/disaggregated-service.html#troubleshooting-and-faq">Troubleshooting and FAQ</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
<div class="toctree-wrapper compound" id="performance">
|
||||
@ -439,7 +446,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6bd1d60>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7479f7e00>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -130,6 +130,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -336,7 +337,7 @@ relevant classes. The associated unit tests should also be consulted for underst
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6cb78f0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747794a10>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -125,6 +125,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -391,7 +392,7 @@ pip<span class="w"> </span>uninstall<span class="w"> </span>-y<span class="w"> <
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6b22780>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747732330>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -209,7 +210,7 @@ sudo<span class="w"> </span>apt-get<span class="w"> </span>-y<span class="w"> </
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6bd30b0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da74763b500>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -238,7 +239,7 @@ Please install CUDA toolkit when you see the following message when running Mode
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6b1fec0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747995550>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -252,7 +253,7 @@ and retry. Check the system path to make sure the latest version installed in <c
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6b1f8f0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7476403b0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -185,7 +186,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6a5c620>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747638e60>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -128,6 +128,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -313,7 +314,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6bd1f40>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747a44440>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -132,6 +132,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -319,7 +320,7 @@ Refer to the <a class="reference external" href="https://github.com/NVIDIA/Tenso
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6a5d5e0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747647ef0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="current reference internal" href="#">Examples</a><ul>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -180,17 +181,17 @@
|
||||
<div class="toctree-wrapper compound">
|
||||
<p class="caption" role="heading"><span class="caption-text">Scripts</span></p>
|
||||
<ul>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
@ -207,7 +208,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6d92840>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747641370>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -229,7 +230,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6b20620>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747644cb0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -239,7 +240,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6b1eba0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7434e8aa0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -233,7 +234,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f69c8fb0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747726240>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -237,7 +238,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6b218b0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747794290>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -257,7 +258,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f69ca4e0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747640230>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -247,7 +248,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6926630>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747725b20>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -238,7 +239,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f688f0e0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7433417f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -251,7 +252,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f69cabd0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747803fb0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -232,7 +233,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6863a70>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7478103e0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -250,7 +251,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f688f500>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747ce7230>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_quantization.html">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -253,7 +254,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6862ed0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747c23170>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -70,17 +70,17 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="index.html">LLM Examples Introduction</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="customization.html">Common Customizations</a></li>
|
||||
<li class="toctree-l1 current"><a class="reference internal" href="llm_api_examples.html">Examples</a><ul class="current">
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async.html">Generate Text Asynchronously</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_async_streaming.html">Generate Text in Streaming</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_customize.html">Generate text</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_inference_distributed.html">Distributed LLM Generation</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_logits_processor.html">Control generated text using logits post processor</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_guided_decoding.html">Generate text with guided decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_lookahead_decoding.html">Generate Text Using Lookahead Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_medusa_decoding.html">Generate Text Using Medusa Decoding</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_multilora.html">Generate text with multiple LoRA adapters</a></li>
|
||||
<li class="toctree-l2 current"><a class="current reference internal" href="#">Generation with Quantization</a></li>
|
||||
<li class="toctree-l2"><a class="reference internal" href="llm_auto_parallel.html">Automatic Parallelism with LLM</a></li>
|
||||
</ul>
|
||||
</li>
|
||||
@ -127,6 +127,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -260,7 +261,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f67373b0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747cd2930>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -123,6 +123,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -283,7 +284,7 @@ Refer to the <a class="reference external" href="https://github.com/NVIDIA/Tenso
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6818ef0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da74df5a930>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -349,6 +349,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -1708,7 +1709,7 @@ changed, you should remove the caches manually.</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f6734770>
|
||||
<jinja2.runtime.BlockReference object at 0x7da75a1d3f50>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
BIN
objects.inv
BIN
objects.inv
Binary file not shown.
@ -125,6 +125,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -226,7 +227,7 @@ Certain limitations might apply. Refer to the <a class="reference internal" href
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f702e4e0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da75a027fe0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="current">
|
||||
@ -253,7 +254,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f714cad0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da77c983920>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="current">
|
||||
@ -746,7 +747,7 @@ The choices are specified with a YAML file like the following example (<code cla
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59fca95d30>
|
||||
<jinja2.runtime.BlockReference object at 0x7da75a1a8350>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="current">
|
||||
@ -475,7 +476,7 @@ increase runtime performance at the expense of reduced accuracy.</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59fcbc5c40>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7afe9b8f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -23,7 +23,7 @@
|
||||
<link rel="index" title="Index" href="../genindex.html" />
|
||||
<link rel="search" title="Search" href="../search.html" />
|
||||
<link rel="next" title="TensorRT-LLM Benchmarking" href="perf-benchmarking.html" />
|
||||
<link rel="prev" title="Speculative Sampling" href="../advanced/speculative-decoding.html" />
|
||||
<link rel="prev" title="Disaggregated-Service (experimental)" href="../advanced/disaggregated-service.html" />
|
||||
</head>
|
||||
|
||||
<body class="wy-body-for-nav">
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul class="current">
|
||||
@ -2764,14 +2765,14 @@ that can be compared with the table in the <a class="reference internal" href="#
|
||||
</div>
|
||||
</div>
|
||||
<footer><div class="rst-footer-buttons" role="navigation" aria-label="Footer">
|
||||
<a href="../advanced/speculative-decoding.html" class="btn btn-neutral float-left" title="Speculative Sampling" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||||
<a href="../advanced/disaggregated-service.html" class="btn btn-neutral float-left" title="Disaggregated-Service (experimental)" accesskey="p" rel="prev"><span class="fa fa-arrow-circle-left" aria-hidden="true"></span> Previous</a>
|
||||
<a href="perf-benchmarking.html" class="btn btn-neutral float-right" title="TensorRT-LLM Benchmarking" accesskey="n" rel="next">Next <span class="fa fa-arrow-circle-right" aria-hidden="true"></span></a>
|
||||
</div>
|
||||
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59fe3eeba0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7afeda9f0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -257,7 +258,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6dcc9df0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791508d40>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -377,6 +377,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -4805,7 +4806,7 @@ function creates a constant tensor.</p></li>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5c64378950>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747a468a0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -266,6 +266,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -813,7 +814,7 @@ the number of tokens used for each task, should be equal to prompt_embedding_tab
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5c5b531fa0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da747bae420>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -429,6 +429,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -1752,7 +1753,7 @@ ranges of the dimensions of when using TRT dynamic shapes.</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5c66337110>
|
||||
<jinja2.runtime.BlockReference object at 0x7da79270e510>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -119,6 +119,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -219,7 +220,7 @@ migrated to the centralized building script <cite>tensorrt_llm/commands/build.py
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f700e120>
|
||||
<jinja2.runtime.BlockReference object at 0x7da79273be60>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -118,6 +118,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -200,7 +201,7 @@ the quantized model as TRT-LLM checkpoint</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5c664b7f80>
|
||||
<jinja2.runtime.BlockReference object at 0x7da792788a10>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -376,6 +376,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -1906,7 +1907,7 @@ For example, word_dict[2] = [” I am happy”, “ I am sad”].</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f59f7097170>
|
||||
<jinja2.runtime.BlockReference object at 0x7da792540e60>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -122,6 +122,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -303,7 +304,7 @@ The model definition is a minimal example that shows some of the optimizations a
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e61f9f620>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791570ad0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -307,7 +308,7 @@ Here some explanations on how these values affect the memory:</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e622e1430>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7915a73b0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -756,7 +757,7 @@ are:</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e61f9ffe0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7918d4b60>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -343,7 +344,7 @@ In addition, older architectures can have limitations for newer software release
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6da9dac0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7918664e0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="../advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -463,7 +464,7 @@ dedicated MPI environment, not the one provided by your Slurm allocation.</p>
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6da82cf0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da7915a7f20>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -199,6 +199,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -1271,7 +1272,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e61d0f1a0>
|
||||
<jinja2.runtime.BlockReference object at 0x7da79153cce0>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
@ -113,6 +113,7 @@
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/expert-parallelism.html">Expert Parallelism in TensorRT-LLM</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/kv-cache-reuse.html">KV cache reuse</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/speculative-decoding.html">Speculative Sampling</a></li>
|
||||
<li class="toctree-l1"><a class="reference internal" href="advanced/disaggregated-service.html">Disaggregated-Service (experimental)</a></li>
|
||||
</ul>
|
||||
<p class="caption" role="heading"><span class="caption-text">Performance</span></p>
|
||||
<ul>
|
||||
@ -180,7 +181,7 @@
|
||||
<hr/>
|
||||
|
||||
<div role="contentinfo">
|
||||
<jinja2.runtime.BlockReference object at 0x7f5e6e53b080>
|
||||
<jinja2.runtime.BlockReference object at 0x7da791a1b560>
|
||||
|
||||
<div class="footer">
|
||||
<p>
|
||||
|
||||
File diff suppressed because one or more lines are too long
Loading…
Reference in New Issue
Block a user